In [5]:
#Libraries
import pandas as pd                  # Data Manipulation
import numpy as np                   # Matrices support
import matplotlib.pyplot as plt      # Plotting
import seaborn as sns                # Statistical Visualization
from matplotlib import cm
from scipy import stats
from numpy.polynomial.polynomial import polyfit
from random import randint
import math

# Data Wrangling
In this notebook we convert the data to the appropriate formats by applying transformations, creating new variables, and filling in missing values.

The dataset has 15,781 records.

In [6]:
# Read the Excel workbook into the DataFrame `data`, which every later cell works on.
# The file name is given without a directory, so it is looked up relative to the working directory.
data = pd.read_excel('201713_ECOAS_Anonimizado.xlsx')

# Data Validation

In [7]:
# The variable 'Crn UV' is expected to contain only the value "No aplica", so it
# carries no information. Print its distinct values to confirm, then drop the column.
# (The parentheses matter: a bare `.unique` only references the method and never runs it.)
print(data['Crn UV'].unique())
data = data.drop(columns=['Crn UV'])

In [8]:
# 'Grupo Virtual' shows a single value ("NO") in the count printed below, so the column is dropped.
# Note: groupby leaves out rows whose key is NaN by default, so missing values never show up in this table.
print(
    data.groupby('Grupo Virtual', as_index=False)[['ID_Prof']].count()
)
data = data.drop(columns=['Grupo Virtual'])

  Grupo Virtual  ID_Prof
0            NO    15780


In [10]:
# Missing entries in "Tesis en título" are replaced with 0
data = data.fillna({'Tesis en título': 0})

# Number of rows per value of "Tesis en título" and their share of all rows (in percent)
tesis = (
    data.groupby('Tesis en título', as_index=False)[['ID_Prof']]
    .count()
    .assign(porcentaje=lambda counts: counts['ID_Prof'].div(len(data)).mul(100))
)
tesis.head()

Unnamed: 0,Tesis en título,ID_Prof,porcentaje
0,0.0,15236,96.54648
1,1.0,545,3.45352


In [11]:
# Replace NaN values with 0 in each of the columns listed below
nan_list = [
    'Book', 'Book Series', 'Conference Proceeding', 'Journal Q1/Q2', 'Journal Q3/Q4', 'Total Docs',
    'Book Norm.', 'Book Series Norm.', 'Conference Proceeding Norm.', 'Journal Q1/Q2 Norm.',
    'Journal Q3/Q4 Norm.', 'Total Docs. Norm.',
]

# One vectorized call over all listed columns instead of a column-by-column loop
data[nan_list] = data[nan_list].fillna(0)

In [12]:
# Creates the variable sni_sn

# Rename the column "Nivel SNI 2019" to "SNI" (re-assigned instead of modified in place)
data = data.rename(columns={'Nivel SNI 2019': 'SNI'})

# Table of the number of rows per SNI level and their share of all rows (in percent)
sni = data[['SNI', 'ID_Prof']].groupby(['SNI'], as_index=False).count()
sni['porcentaje'] = sni['ID_Prof'] / len(data) * 100
print(sni.head())

# About 10% of the rows have an SNI level (i.e. belong to researchers), so we create
# the variable sni_sn: 'SNI' when a level is present, 'NO SNI' when it is missing.
data['sni_sn'] = data['SNI'].isnull().map({True: 'NO SNI', False: 'SNI'})

  SNI  ID_Prof  porcentaje
0   1      827    5.240479
1   2      272    1.723592
2   3       95    0.601990
3   C      338    2.141816


In [13]:
#Creates the variable "calif", which is the average of the answers to the questions in the survey
# NOTE(review): the same column, '05 Prom', is added three times, so "calif" is just '05 Prom'
# (up to floating-point rounding). This looks like a copy-paste slip where three different
# question columns were intended -- TODO confirm the intended columns and fix.
data['calif'] = (data['05 Prom'] + data['05 Prom'] + data['05 Prom'])/3

In [14]:
# Creates the variable "prof_sn", which collapses "Nivel materia" into three groups:
# 'Profesional' and 'Preparatoria' are kept as they are; every other value
# (including missing ones) becomes 'Posgrado'.
data['prof_sn'] = data['Nivel materia'].where(
    data['Nivel materia'].isin(['Profesional', 'Preparatoria']),
    'Posgrado',
)

# Number of rows per group and their share of all rows (in percent)
profesional = data[['prof_sn', 'ID_Prof']].groupby(['prof_sn'], as_index=False).count()
profesional['porcentaje'] = profesional['ID_Prof'] / len(data) * 100
profesional

Unnamed: 0,prof_sn,ID_Prof,porcentaje
0,Posgrado,871,5.519295
1,Preparatoria,184,1.165959
2,Profesional,14726,93.314746


In [15]:
# Number of rows (non-null ID_Prof) per nationality, most frequent first,
# together with each nationality's share of all rows (in percent)
prof_nacionalidad = (
    data.groupby('Nacionalidad', as_index=False)[['ID_Prof']]
    .count()
    .sort_values('ID_Prof', ascending=False)
)
prof_nacionalidad['porcentaje'] = prof_nacionalidad['ID_Prof'].div(len(data)).mul(100)
prof_nacionalidad.head()

Unnamed: 0,Nacionalidad,ID_Prof,porcentaje
34,Mexicana,14866,94.201888
21,Estadounidense,122,0.773082
20,Española,88,0.557633
30,Italiana,74,0.468918
11,Colombiana,68,0.430898


In [17]:
# Creates the variable "mxn_sn" because there are 56 different nationalities:
# 'Mexicano' when "Nacionalidad" is 'Mexicana', 'Extranjero' for anything else
# (including missing values).
data['mxn_sn'] = data['Nacionalidad'].eq('Mexicana').map({True: 'Mexicano', False: 'Extranjero'})

In [18]:
# Convert the three derived indicator columns to the categorical dtype in one call
data = data.astype({
    'sni_sn': 'category',
    'prof_sn': 'category',
    'mxn_sn': 'category',
})