In [1]:
# Escenario 3: Balancear la cantidad de registros según la variable target y usar solamente los registros completos.
# selecciono 150.000 registros por cada opción de mi target, 
# entonces son 150.000 con SI y 150.000 con NO (300.000 registros del total)
# solo se usan los datos que contienen todas las variables

In [2]:
# Cargar funciones de la librería de python data analysis
import pandas as pd 

# Leer csv con datos y cargar en el dataframe data
# data = pd.read_csv("data/analisis_desercion_2018.csv", sep=';', encoding = "ISO-8859-1") 
data2018 = pd.read_csv("data/analisis_desercion_2018.csv", sep=';', encoding = "ISO-8859-1") 
data2019 = pd.read_csv("data/analisis_desercion_2019.csv", sep=';', encoding = "ISO-8859-1") 
data2020 = pd.read_csv("data/analisis_desercion_2020.csv", sep=';', encoding = "ISO-8859-1") 

In [3]:
print ('Registros 2018: ' + str(data2018['estudiante_id'].count()))
print ('Registros 2019: ' + str(data2019['estudiante_id'].count()))
print ('Registros 2020: ' + str(data2020['estudiante_id'].count()))

Registros 2018: 1144031
Registros 2019: 1182570
Registros 2020: 1276005


In [4]:
# concatenar los 3 datasets
data = pd.concat([data2018,data2019,data2020])
# Preview de las 5 primeras filas de data 
data.head()

Unnamed: 0,estudiante_id,grado_curso,edad,tiene_sobreedad,tipo_documento,sexo,es_indigena,idioma,tipo_gestion_id,tipo_zona_id,...,cuantas_personas_vive,cuantas_piezas_dormir,recibe_beneficio,como_se_traslada_escuela,cuanto_tiempo_tarda,recibe_alimentacion,nivel_pobreza,matriculado_periodo_actual,estudiante_id_posible,desertor
0,3,11,13,0,1,1,0,Castellano,1,1,...,4,1,1,Caminando,Menos de 30 minutos,0,3.420555,1,,0
1,4,6,8,0,1,2,0,Guaraní,1,1,...,4,1,0,Caminando,Menos de 30 minutos,0,20.33361,1,,0
2,5,7,8,0,1,1,0,Castellano,1,1,...,4,2,0,Caminando,Menos de 30 minutos,0,23.43542,1,,0
3,78,6,7,0,1,1,0,Castellano,1,1,...,4,2,0,En moto,Menos de 30 minutos,0,11.50687,0,,1
4,13,7,8,0,1,1,0,Castellano,3,1,...,4,4,0,Caminando,Menos de 30 minutos,0,14.55257,1,,0


In [5]:
# Identificar variables categoricas
df = data
df.dtypes[df.dtypes=='object']

idioma                      object
como_se_traslada_escuela    object
cuanto_tiempo_tarda         object
dtype: object

In [6]:
# cargar codificador por etiquetas (LabelEncoder)
from sklearn import preprocessing

# Codificar cada variable categorica con su propio encoder --> utilizo .astype(str) para convertir los valores a string
leIdioma = preprocessing.LabelEncoder()
df["idioma"] = leIdioma.fit_transform(df["idioma"].astype(str))
leComoSeTrasladaEscuela = preprocessing.LabelEncoder()
df["como_se_traslada_escuela"] = leComoSeTrasladaEscuela.fit_transform(df["como_se_traslada_escuela"].astype(str))
leCuantoTiempoTarda = preprocessing.LabelEncoder()
df["cuanto_tiempo_tarda"] = leCuantoTiempoTarda.fit_transform(df["cuanto_tiempo_tarda"].astype(str))

df.head()

Unnamed: 0,estudiante_id,grado_curso,edad,tiene_sobreedad,tipo_documento,sexo,es_indigena,idioma,tipo_gestion_id,tipo_zona_id,...,cuantas_personas_vive,cuantas_piezas_dormir,recibe_beneficio,como_se_traslada_escuela,cuanto_tiempo_tarda,recibe_alimentacion,nivel_pobreza,matriculado_periodo_actual,estudiante_id_posible,desertor
0,3,11,13,0,1,1,0,0,1,1,...,4,1,1,0,1,0,3.420555,1,,0
1,4,6,8,0,1,2,0,1,1,1,...,4,1,0,0,1,0,20.33361,1,,0
2,5,7,8,0,1,1,0,0,1,1,...,4,2,0,0,1,0,23.43542,1,,0
3,78,6,7,0,1,1,0,0,1,1,...,4,2,0,2,1,0,11.50687,0,,1
4,13,7,8,0,1,1,0,0,3,1,...,4,4,0,0,1,0,14.55257,1,,0


In [17]:
# Carga decision tree, holdout split y metricas
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

###########################################################
### opcion de seleccionar filas que "estén completas" y con balanceo de valor target
df = df.loc[(df['cuantas_personas_vive'] != 0) 
                 & (df['cuantas_piezas_dormir'] != 0) 
                 & (df['como_se_traslada_escuela'] != 'SIN DATO')]
df1 = df[df['desertor'] == 1] 
df0 = df[df['desertor'] == 0] 

df1 = df1.iloc[:150000]
df0 = df0.iloc[:150000]

# concatenar los 2 df
dflimit = pd.concat([df0,df1])

# Split en train y test
X = dflimit.iloc[:,1:28]
y = dflimit['desertor']

###########################################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Entrenar decision tree con entropia, size minimo de nodo 200 y profundidad maxima 12
clf = DecisionTreeClassifier(criterion="entropy", min_samples_leaf=200, max_depth=12)
clf = clf.fit(X_train,y_train)

# Predecir con datos de test
y_pred = clf.predict(X_test)

print("\nAccuracy:",metrics.accuracy_score(y_test, y_pred))
metrics.precision_recall_fscore_support(y_test, y_pred, average=None)


Accuracy: 0.8476444444444444


(array([0.84520938, 0.85014643]),
 array([0.85283837, 0.84240368]),
 array([0.84900674, 0.84625734]),
 array([45202, 44798], dtype=int64))

In [18]:
# cargar exportador de grafos y funcion de llamada a sistema
from sklearn.tree import export_graphviz
# Exportar arbol
export_graphviz(clf, out_file="data/desercion.dot",  
                filled=True, rounded=True,
                special_characters=True, feature_names = X.columns,class_names = ['SI','NO'])


In [9]:
#pip install pydot
#conda install -c anaconda graphviz

In [19]:
import pydot
(graph,) = pydot.graph_from_dot_file('data/desercion.dot')
graph.write_png('data/desercion.png')

In [20]:
# Calcular matriz de confusion
#metrics.confusion_matrix(y_test, y_pred)
pd.crosstab(y_test, y_pred, 
            rownames=['actual'], 
            colnames=['pred'], margins=False, margins_name="Total")

pred,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,38550,6652
1,7060,37738


In [21]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.85      0.85      0.85     45202
           1       0.85      0.84      0.85     44798

    accuracy                           0.85     90000
   macro avg       0.85      0.85      0.85     90000
weighted avg       0.85      0.85      0.85     90000



In [22]:
# Obtener importancia de variables y vertificar variables mas relevantes
#clf.feature_importances_
fi = pd.DataFrame(zip(X.columns,clf.feature_importances_), columns=['feature','importance'])

fi[fi['importance'] > 0.0].sort_values(by=['importance'], ascending=False)

Unnamed: 0,feature,importance
12,aprobado_complementario,0.361531
3,tipo_documento,0.234196
1,edad,0.214504
0,grado_curso,0.058403
25,recibe_alimentacion,0.049329
11,aprobado_ordinario,0.020076
9,departamento_id,0.013258
2,tiene_sobreedad,0.012695
26,nivel_pobreza,0.007654
7,tipo_gestion_id,0.005071


In [14]:
from sklearn.model_selection import GridSearchCV

#Create GridSearchCV & kNN classifier
parameters = {'min_samples_leaf': [50, 100, 200, 300, 400, 500, 600], 
                 'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
             }

clf = GridSearchCV(estimator=DecisionTreeClassifier(criterion="entropy"), param_grid = parameters, cv=5)
clf.fit(X, y)
print("\nBest Validation Score: ", clf.best_score_)
print("\nBest Params: ",clf.best_params_)

clf.cv_results_


Best Validation Score:  0.8388433333333334

Best Params:  {'max_depth': 12, 'min_samples_leaf': 200}


{'mean_fit_time': array([0.57980051, 0.56493559, 0.56101313, 0.56939964, 0.57599959,
        0.57479973, 0.5621902 , 0.64381537, 0.64980021, 0.71753426,
        0.6536006 , 0.63559275, 0.63579965, 0.6339994 , 0.76619434,
        0.71639299, 0.71450582, 0.71138897, 0.71720591, 0.7171998 ,
        0.70599961, 0.79120007, 0.80591946, 0.80619946, 0.78839984,
        0.77840238, 0.790411  , 0.78338699, 0.86810098, 0.86080132,
        0.8857378 , 0.85911503, 0.85480018, 0.8529922 , 0.86139951,
        0.94180598, 0.93638239, 0.95039978, 0.92329364, 0.91200023,
        0.9158123 , 0.90940042, 1.01239891, 1.00440006, 1.0096024 ,
        0.9798079 , 0.97280035, 0.97140522, 0.94799995, 1.07599578,
        1.07901053, 1.04222369, 1.02719893, 1.04220009, 0.99440846,
        0.98280115, 1.15510292, 1.11320515, 1.09219966, 1.10340009,
        1.03701053, 1.02239923, 1.1420105 , 1.26599269, 1.33979988,
        1.12259326, 1.09200082, 1.0800343 , 1.0425992 , 1.01480846]),
 'std_fit_time': array([0.026