In [50]:
#Libraries
import pandas as pd                  # Data Manipulation
import numpy as np                   # Matrices support
import matplotlib.pyplot as plt      # Plotting
import seaborn as sns                # Statistical Visualization
from matplotlib import cm
from scipy import stats
from numpy.polynomial.polynomial import polyfit
from random import randint
import math

# Data Wrangling
In this notebook we reshape the original data into the structure needed for analysis: we rename columns, create new variables, drop unneeded variables, and fill in missing values.

The database has 15,781 records.

In [68]:
# Load the source Excel workbook into the DataFrame `data`; the cells below read and modify this name.
# The path is relative, so the file is resolved against the kernel's current working directory.
data = pd.read_excel('201713_ECOAS_Anonimizado.xlsx')

# Data Validation
Several columns contain a single value or are of no interest for the analysis, so we drop them.

In [69]:
# Columns removed before any further processing.
drop_variables = [
    'Ejercicio',
    'Grupo Virtual',
    'Crn UV',
]
data = data.drop(columns=drop_variables)

Now that we have the useful variables, we are going to give them the appropriate format.

In [71]:
#Columns in which a missing value (NaN) is replaced by 0
nan_list = [
    'Tesis en título',
    'Book',
    'Book Series',
    'Conference Proceeding',
    'Journal Q1/Q2',
    'Journal Q3/Q4',
    'Total Docs',
    'Book Norm.',
    'Book Series Norm.',
    'Conference Proceeding Norm.',
    'Journal Q1/Q2 Norm.',
    'Journal Q3/Q4 Norm.',
    'Total Docs. Norm.',
]

#Fill each listed column on its own; every other column is left untouched
for column in nan_list:
    data[column] = data[column].fillna(0)

## Creation of variables

In [73]:
#Short aliases for the original column headers, applied in a single in-place rename
column_aliases = {
    'ID_Prof': 'ID',
    'Nivel SNI 2019': 'SNI',
    #Per-question triples: '#Op' -> num, 'Prom' -> avg, 'Desv' -> std
    '05 #Op': '05num',
    '05 Prom': '05avg',
    '05 Desv': '05std',
    '06 #Op': '06num',
    '06 Prom': '06avg',
    '06 Desv': '06std',
    '08 #Op': '08num',
    '08 Prom': '08avg',
    '08 Desv': '08std',
    'Semestres Experiencia': 'experience',
    'Edad al semestre': 'age',
    'Grado Máximo Prof': 'grado',
}
#No alias is itself an original header, so one combined mapping matches renaming step by step
data.rename(columns=column_aliases, inplace=True)


In [74]:
#Builds a count table of SNI levels and the researcher flag "sni_yn".
#The column is expected to be named 'SNI' already: the rename cell above maps
#'Nivel SNI 2019' -> 'SNI', so that rename is not repeated here.

#Count table of SNI levels: non-null IDs per level and their share (in percent) of ALL rows.
#Rows whose SNI is missing do not form a group, but they still count in the denominator len(data).
#NOTE(review): this counts rows, not distinct IDs -- confirm whether an ID can appear on several rows.
sni = data[['SNI', 'ID']].groupby(['SNI'], as_index=False).count()
sni['percentage'] = sni['ID'] / len(data) * 100

#About 10% of professors are researchers (SNI), so the level is collapsed into a yes/no variable:
#'SNI' when a level is recorded, 'NO SNI' when the level is missing.
data['sni_yn'] = np.where(data['SNI'].notna(), 'SNI', 'NO SNI')

In [75]:
#Creates the variable "score": the plain average of the three per-question survey means
#(05avg, 06avg and 08avg). A row missing any of the three gets a missing score,
#because `+` propagates NaN instead of skipping it.
answers_total = data['05avg'] + data['06avg'] + data['08avg']
data['score'] = answers_total / 3

In [76]:
#Creates the variable "under_yn", the level of the class, with three labels:
#  'Profesional'  -> 'Undergraduate'
#  'Preparatoria' -> 'Highschool'
#  anything else  -> 'Graduate'   (meant to group Doctorado, Maestría and Especialidad)
#NOTE(review): a missing or unexpected 'Nivel materia' also ends up as 'Graduate' -- confirm this is intended.
class_level = data['Nivel materia']
data['under_yn'] = np.select(
    [class_level == 'Profesional', class_level == 'Preparatoria'],
    ['Undergraduate', 'Highschool'],
    default='Graduate',
)

#Count table of the class level: non-null IDs per label and their share (in percent) of all rows
profesional = data[['under_yn', 'ID']].groupby(['under_yn'], as_index=False).count()
profesional['percentage'] = profesional['ID'] / len(data) * 100

In [77]:
#Count table per nationality, largest group first: non-null IDs and their share (in percent) of all rows.
#NOTE(review): this counts rows, not distinct IDs -- confirm whether an ID can appear on several rows.
prof_nacionalidad = (
    data[['Nacionalidad', 'ID']]
    .groupby(['Nacionalidad'], as_index=False)
    .count()
    .sort_values(['ID'], ascending=False)
)
prof_nacionalidad['percentage'] = prof_nacionalidad['ID'] / len(data) * 100

#Because 94% of the professors are Mexican, nationality is collapsed into the variable "mxn_yn":
#'Mexican' when Nacionalidad is exactly 'Mexicana', 'Foreign' for every other value.
#NOTE(review): a missing Nacionalidad is therefore labelled 'Foreign' as well -- confirm this is intended.
data['mxn_yn'] = np.where(data['Nacionalidad'] == 'Mexicana', 'Mexican', 'Foreign')

In [78]:
#Columns stored as pandas categoricals: the three derived yes/no-style variables
#followed by four original columns.
categorical_columns = [
    'sni_yn',
    'under_yn',
    'mxn_yn',
    'Estatus PDHD (Habilidades Docentes)',
    'Género',
    'Indicador grupo terminal',
    'Profesor Titular',
]

#Convert one column at a time, in the order listed above
for column in categorical_columns:
    data[column] = data[column].astype('category')