# Machine Learning

#### Imports

In [8]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import lightgbm as lgbm

import pickle

pd.set_option('display.max_columns', None)

## Carregar base de dados

In [9]:
PATH = '../data/processed/'

In [10]:
dataset = pd.read_csv(os.path.join(PATH, 'dataset.csv'), sep=',')
print(dataset.shape)
dataset.head()

(4748, 93)


Unnamed: 0,BOM,RUIM,DESEMPENHO_ESC,CD_ESCOLA,CAPITAL,AREA,ESTIMATED_POP,RURAL_URBAN,GVA_PUBLIC,GVA_TOTAL,GDP,GDP_CAPITA,COMP_H,COMP_O,COMP_P,COMP_Q,CLUSTER,DEPENDENCIAS_SALAS_AULA,DEPENDENCIAS_SALA_RECURSO,DEPENDENCIAS_TOT_SALAS_AULA,DEPENDENCIAS_CANTINA,DEPENDENCIAS_COPA,DEPENDENCIAS_REFEITORIO,DEPENDENCIAS_SALA_LEITURA,DEPENDENCIAS_TOT_SALA_LEITURA,DEPENDENCIAS_TOT_QUADRA,DEPENDENCIAS_SALA_PROF,DEPENDENCIAS_PATIO_COBERTO,DEPENDENCIAS_PATIO_DESCOBERTO,DEPENDENCIAS_TOT_VESTIARIO,DEPENDENCIAS_LAB_INFO,DEPENDENCIAS_LAB_CIENCIAS,DEPENDENCIAS_LAB_CIENCIA_FISICA_BIOLOGICA,DEPENDENCIAS_TOT_LAB_CIENCIA,DEPENDENCIAS_LAB_MULTIUSO,DEPENDENCIAS_OFICINA,DEPENDENCIAS_DORMITORIO,DEPENDENCIAS_SANITARIO_ADEQ_DEF,DEPENDENCIAS_SANITARIO_AL_MASC,DEPENDENCIAS_SANITARIO_AL_FEM,DEPENDENCIAS_TOT_SANITARIO_AL,DEPENDENCIAS_TOT_SANITARIO_FUNC,DEPENDENCIAS_DEPEND_ADEQ_DEF,DEPENDENCIAS_SALA_ED_FISICA,DEPENDENCIAS_SALA_PROG_ESC_FAMILIA,DEPENDENCIAS_BRINQUEDOTECA,DEPENDENCIAS_SALA_REFORCO,DEPENDENCIAS_AREA_SERVICO,DEPENDENCIAS_SALA_ATENDIMENTO,DEPENDENCIAS_SALA_ENTRETENIMENTO,FORMACAO_APERF/EXTENSIAOCULTURAL,FORMACAO_BACHARELADO/TECNIOLOGO,FORMACAO_DOUTORADO,FORMACAO_ENSINO_MEDIO,FORMACAO_ESPECIALIZACAO,FORMACAO_LICENCIATURA,FORMACAO_MESTRADO,FORMACAO_S/INFO,QTD_SERVIDORES,QTD_PROFESSORES,MEDIA_FORMACOES,QTD_FORMACAO_CONTINUADA,QTD_CARGOS_DISTINTOS,QTD_TOTAL_ALUNOS,QTD_CLASSES,MEDIA_ALUNOS_SALA,STD_ALUNOS_SALA,QTD_CLASSE_TIPO_ENSINO FUNDAMENTAL DE 9 ANOS,QTD_CLASSE_TIPO_ENSINO MEDIO,QTD_CLASSE_TIPO_CEL,QTD_ALUNOS_TIPO_ENSINO FUNDAMENTAL DE 9 ANOS,QTD_ALUNOS_TIPO_ENSINO MEDIO,QTD_ALUNOS_TIPO_CEL,JORNADA_QTD_DISCIPLINAS_mean,JORNADA_QTD_DISCIPLINAS_std,JORNADA_QTD_DISCIPLINAS_max,JORNADA_QTD_TOTAL_AULAS_mean,JORNADA_QTD_TOTAL_AULAS_std,JORNADA_QTD_TOTAL_AULAS_max,SERVIDORES_IDADE_mean,SERVIDORES_IDADE_std,SERVIDORES_TEMPO_CARGO_C_mean,SERVIDORES_TEMPO_CARGO_C_std,SERVIDORES_CAT_FUNCIONAL_A,SERVIDORES_CAT_FUNCIONAL_F,SERVIDORES_CAT_FUNCIONAL_N,SERVIDORES_CAT_FUNCIONAL_O,SERVIDORES_CAT_FUNCIONAL_P,MATEMATICA,LEITURA,ESCRITA,RELACAO_ALUNO_POR_SERVIDOR,RELACAO_ALUNO_POR_PROFESSOR
0,1.0,7.0,0,24,1,1521.11,12176866.0,Urbano,41902892.72,569910500.0,687035900.0,57071.43,19515.0,153.0,16030.0,22248.0,17,35,0,35,1,0,1,1,1,2,1,1,1,0,2,0,1,1,0,0,0,0,1,1,2,2,1,1,0,0,0,0,0,0,0.0,0.101695,0.008475,0.016949,0.016949,0.771186,0.084746,0.0,139,118.0,1.228814,13.0,6,2703,88,30.715909,7.214208,34.0,40.0,0.0,956.0,1229.0,0.0,3.486726,1.768348,10,48.849558,20.847756,108,46.691429,8.700472,10.725714,6.325839,0.794286,0.142857,0.0,0.062857,0.0,4.0,3.0,4.0,19.446043,22.90678
1,0.0,4.0,0,36,1,1521.11,12176866.0,Urbano,41902892.72,569910500.0,687035900.0,57071.43,19515.0,153.0,16030.0,22248.0,17,21,1,22,1,0,3,1,1,2,1,1,0,0,2,0,1,1,3,0,0,0,1,1,2,2,0,1,0,0,1,0,0,0,0.0,0.094118,0.011765,0.0,0.152941,0.729412,0.011765,0.0,92,85.0,1.305882,15.0,3,2164,69,31.362319,8.549116,34.0,29.0,0.0,1189.0,945.0,0.0,2.975,1.492386,8,46.525,19.308931,104,45.365385,8.625384,8.586538,5.766152,0.759615,0.173077,0.0,0.067308,0.0,4.0,3.0,4.0,23.521739,25.458824
2,0.0,2.0,0,48,1,1521.11,12176866.0,Urbano,41902892.72,569910500.0,687035900.0,57071.43,19515.0,153.0,16030.0,22248.0,17,19,0,19,1,0,0,1,1,1,1,1,0,0,2,0,2,2,1,0,0,0,3,3,6,3,0,0,0,0,0,0,0,0,0.0,0.089286,0.017857,0.0,0.017857,0.839286,0.035714,0.0,66,56.0,1.160714,4.0,4,1189,35,33.971429,5.695642,0.0,35.0,0.0,0.0,1189.0,0.0,2.125,0.489246,4,44.375,16.629409,64,45.729167,8.617582,9.25,7.293833,0.854167,0.104167,0.0,0.041667,0.0,4.0,3.0,4.0,18.015152,21.232143
3,0.0,2.0,0,59,1,1521.11,12176866.0,Urbano,41902892.72,569910500.0,687035900.0,57071.43,19515.0,153.0,16030.0,22248.0,17,11,0,11,0,0,0,1,1,1,1,0,2,0,1,0,0,1,0,0,0,0,1,2,3,3,0,1,0,0,0,0,0,0,0.0,0.090909,0.0,0.0,0.022727,0.818182,0.068182,0.0,49,44.0,1.204545,4.0,3,974,28,34.785714,5.166539,0.0,28.0,0.0,0.0,974.0,0.0,2.15,0.533494,4,42.5,16.195916,98,43.230769,8.446813,7.615385,5.994003,0.794872,0.128205,0.0,0.076923,0.0,4.0,3.0,4.0,19.877551,22.136364
4,3.0,1.0,1,61,1,1521.11,12176866.0,Urbano,41902892.72,569910500.0,687035900.0,57071.43,19515.0,153.0,16030.0,22248.0,17,15,1,16,0,1,1,1,1,1,1,1,0,2,1,0,0,0,0,0,0,0,1,1,2,5,0,0,0,0,0,0,0,0,0.0,0.039216,0.0,0.098039,0.039216,0.803922,0.019608,0.0,59,51.0,1.078431,3.0,4,909,33,27.545455,7.814613,30.0,0.0,0.0,897.0,0.0,0.0,2.488889,0.869227,4,48.755556,8.668531,64,47.788462,6.889803,11.961538,9.876837,0.653846,0.134615,0.0,0.211538,0.0,4.0,3.0,4.0,15.40678,17.823529


#### Remover a coluna de identificação *CD_ESCOLA* e os atributos *BOM* e *RUIM* que foram utilizados para construir a classe alvo
Cada linha representa uma escola, apesar da remoção de CD_ESCOLA, que é necessário já que este não é um atributo para predizer o desempenho escolar, é possível identificar o resultado da predição da escola com os índices do dataset, que permite localizar o *CD_ESCOLA* posteriormente

In [11]:
dataset.drop(['BOM', 'RUIM', 'CD_ESCOLA'], axis=1, inplace=True)

### Renomear a coluna *DESEMPENHO_ESC* para *DESEMPENHO*

In [12]:
dataset = dataset.rename(columns={'DESEMPENHO_ESC': 'DESEMPENHO'})

## Encodificação
Transformar os atributos categóricos para numéricos

### Transformar atributo CLUSTER para One Hot Encoder 

In [13]:
df_cluster_dummie = pd.get_dummies(dataset['CLUSTER'], prefix='CLUSTER_')
dataset = pd.concat([dataset.drop('CLUSTER', axis=1), df_cluster_dummie], axis=1)

### Transformar o atributo RURAL_URBAN para o tipo binario

- Urbano -> 1
- Intermediário Adjacente -> 0
- Rural Adjacente -> 0

In [14]:
dataset['RURAL_URBAN'].value_counts(normalize=True) * 100

Urbano                     89.406066
Rural Adjacente             6.234204
Intermediário Adjacente     4.359730
Name: RURAL_URBAN, dtype: float64

In [15]:
dataset['RURAL_URBAN'] = np.where(dataset['RURAL_URBAN'] == 'Urbano', 1, 0)

## Preenchimento dos dados faltantes

In [16]:
missing_values_cols = dataset.isnull().sum()
missing_values_cols[missing_values_cols > 0]

Series([], dtype: int64)

In [17]:
dataset['CAPITAL'] = dataset['CAPITAL'].fillna(0)

In [18]:
cols = dataset.columns

imputer = SimpleImputer(strategy='mean')
dataset = imputer.fit_transform(dataset)

dataset = pd.DataFrame(dataset, columns=cols)

## Separacao dos dados
- Treinamento: 80%
- Teste: 20%

In [19]:
X_train, X_test, y_train, y_test = train_test_split(dataset.drop('DESEMPENHO', axis=1), dataset['DESEMPENHO'], test_size=0.2, random_state=17)

## Modeling

### Validação Cruzada com 10 folds

In [20]:
scale_pos_weight = y_train[y_train==0].count() / y_train[y_train==1].count()

In [21]:
model = lgbm.LGBMClassifier(n_estimators=300, is_imbalanced=True, scale_pos_weight=scale_pos_weight, devide='gpu')
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10, n_jobs=-1)

print('Acuracia Media:', np.mean(scores))
print('Desvio Padrao:', np.std(scores))
print(scores)

Acuracia Media: 0.9212817664213304
Desvio Padrao: 0.011642302222870014
[0.91315789 0.93157895 0.90789474 0.91842105 0.91052632 0.93157895
 0.91578947 0.91315789 0.92348285 0.94722955]


### Avaliação no conjunto de teste

In [22]:
model = lgbm.LGBMClassifier(n_estimators=300, is_imbalanced=True, scale_pos_weight=scale_pos_weight, devide='gpu')
model.fit(X_train, y_train)
pred = model.predict(X_test)

In [23]:
print('Acc score:', accuracy_score(y_test, pred))
print('F1 score:', f1_score(y_test, pred))
print('AUC ROC:', roc_auc_score(y_test, pred))

Acc score: 0.9315789473684211
F1 score: 0.7840531561461794
AUC ROC: 0.9023666514037403


### Modelagem com o mesmo classificador, LGBM, porem removendo o atributo relacionado ao Ensino Medio
O ensino medio apresenta um SARESP extremamente desbalanceado, com um desempenho ruim em na maioria dos casos. O modelo conseguiu aprender que uma quantidade alta de alunos do ensino medio e/ou muitas turmas destas vao impactar negativamente no desempenho do SARESP escolar. Esses atributos serao removidos, para testar a performance do modelo sem estes, mais importante que isto, para aumentar a possibilidade do modelo explorar os outros atributos, o que sera extremamente util para analisar o impacto das caracteristicas.

In [24]:
cols_to_drop = ['QTD_ALUNOS_TIPO_ENSINO FUNDAMENTAL DE 9 ANOS', 'QTD_ALUNOS_TIPO_ENSINO MEDIO', 'QTD_ALUNOS_TIPO_CEL', 'QTD_CLASSE_TIPO_ENSINO MEDIO', 'QTD_CLASSE_TIPO_ENSINO FUNDAMENTAL DE 9 ANOS', 'QTD_CLASSE_TIPO_CEL']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(cols_to_drop + ['DESEMPENHO'], axis=1), dataset['DESEMPENHO'], test_size=0.2, random_state=17)

In [26]:
model2 = lgbm.LGBMClassifier(n_estimators=300, is_imbalanced=True, scale_pos_weight=scale_pos_weight, devide='gpu')
model2.fit(X_train, y_train)
pred = model2.predict(X_test)

In [27]:
print('Acc score:', accuracy_score(y_test, pred))
print('F1 score:', f1_score(y_test, pred))
print('AUC ROC:', roc_auc_score(y_test, pred))

Acc score: 0.92
F1 score: 0.7414965986394558
AUC ROC: 0.8682899237751501


### Modelagem com uma Arvore de Decisao simples (com profundidade pequena)
Esse modelo sera utilizado para descobrir regras que separam as escolas com desempenho ruins das boas.

In [28]:
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(cols_to_drop + ['DESEMPENHO'], axis=1), dataset['DESEMPENHO'], test_size=0.2, random_state=17)

In [29]:
tree = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=17)
scores = cross_val_score(estimator=tree, X=X_train, y=y_train, cv=10, n_jobs=-1)

print('Acuracia Media:', np.mean(scores))
print('Desvio Padrao:', np.std(scores))
print(scores)

Acuracia Media: 0.9044209137619775
Desvio Padrao: 0.01799611161097428
[0.91578947 0.91578947 0.92631579 0.92631579 0.89473684 0.88947368
 0.88684211 0.88947368 0.87598945 0.92348285]


In [30]:
tree = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=17)
tree.fit(X_train, y_train)
pred = tree.predict(X_test)

print('Acc score:', accuracy_score(y_test, pred))
print('F1 score:', f1_score(y_test, pred))
print('AUC ROC:', roc_auc_score(y_test, pred))

Acc score: 0.911578947368421
F1 score: 0.7123287671232876
AUC ROC: 0.8481967301424839


### Desempenho no conjunto de treinamento
Apenas para verificiar se nao houve overfitting

In [31]:
# idx = X_train['QTD_TOTAL_ALUNOS'].sort_values(ascending=False).head(380).index
y_train.value_counts(normalize=True)

0.0    0.837283
1.0    0.162717
Name: DESEMPENHO, dtype: float64

In [32]:
idx = X_train['QTD_TOTAL_ALUNOS'].sort_values(ascending=False).head(380).index
y_train.loc[idx].value_counts(normalize=True)

0.0    0.968421
1.0    0.031579
Name: DESEMPENHO, dtype: float64

## Exportar modelo

In [33]:
with open('../models/model_1.pkl', 'wb') as f:
    pickle.dump(model, f)
    
with open('../models/model_2.pkl', 'wb') as f:
    pickle.dump(model2, f)
    
with open('../models/model_tree.pkl', 'wb') as f:
    pickle.dump(tree, f)

### Conjunto de dados completo (final)

In [34]:
dataset.to_csv('../data/processed/dataset_final.csv', index=False)