# Feature Engineering
Criação de features e preparação dos dados para a etapa de Machine Learning.

#### Imports

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from unidecode import unidecode

# Do not truncate wide DataFrames: show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

## Load data

In [2]:
# Folder holding the processed CSV files (relative path, resolved against the working directory).
PATH = '../data/processed/'

### Dados das Escolas (SARESP)

In [3]:
# School-level SARESP results (file name suggests the 2018 edition).
df_escolas = pd.read_csv(
    os.path.join(PATH, 'escolas_saresp_2018.csv'),
    sep=',',
)
print(df_escolas.shape)  # (rows, columns)
df_escolas.head()

(20378, 14)


Unnamed: 0,DEPADM,DepBol,NomeDepBol,codRMet,CODESC,NOMESC,SERIE_ANO,cod_per,periodo,co_comp,ds_comp,medprof,PONTUACAO,DESEMPENHO
0,1,1,Rede Estadual,1,12,AYRES DE MOURA PROFESSOR,9º Ano EF,9,GERAL,1,LÍNGUA PORTUGUESA,263.1,BASICO,RUIM
1,1,1,Rede Estadual,1,12,AYRES DE MOURA PROFESSOR,9º Ano EF,9,GERAL,2,MATEMÁTICA,272.3,BASICO,RUIM
2,1,1,Rede Estadual,1,24,GAVIAO PEIXOTO BRIGADEIRO,3º Ano EF,9,GERAL,1,LÍNGUA PORTUGUESA,170.4,BASICO,RUIM
3,1,1,Rede Estadual,1,24,GAVIAO PEIXOTO BRIGADEIRO,3º Ano EF,9,GERAL,2,MATEMÁTICA,181.5,BASICO,RUIM
4,1,1,Rede Estadual,1,24,GAVIAO PEIXOTO BRIGADEIRO,5º Ano EF,9,GERAL,1,LÍNGUA PORTUGUESA,207.9,ADEQUADO,BOM


### Dados das Dependencias

In [4]:
# Per-school facilities table (rooms, labs, restrooms, ...), keyed by CODESC.
df_dependencias = pd.read_csv(
    os.path.join(PATH, 'dependencias.csv'),
    sep=',',
)
print(df_dependencias.shape)  # (rows, columns)
df_dependencias.head()

(5608, 38)


Unnamed: 0,CODESC,NOMESC,TIPOESC,TIPOESC_DESC,SITUACAO,SALAS_AULA,SALA_RECURSO,TOT_SALAS_AULA,CANTINA,COPA,REFEITORIO,SALA_LEITURA,TOT_SALA_LEITURA,TOT_QUADRA,SALA_PROF,PATIO_COBERTO,PATIO_DESCOBERTO,TOT_VESTIARIO,LAB_INFO,LAB_CIENCIAS,LAB_CIENCIA_FISICA_BIOLOGICA,TOT_LAB_CIENCIA,LAB_MULTIUSO,OFICINA,DORMITORIO,SANITARIO_ADEQ_DEF,SANITARIO_AL_MASC,SANITARIO_AL_FEM,TOT_SANITARIO_AL,TOT_SANITARIO_FUNC,DEPEND_ADEQ_DEF,SALA_ED_FISICA,SALA_PROG_ESC_FAMILIA,BRINQUEDOTECA,SALA_REFORCO,AREA_SERVICO,SALA_ATENDIMENTO,SALA_ENTRETENIMENTO
0,985429,CEL JTO A EE FLEURIDES CAVALINI MENECHINO PROFA,6,CEL,Ativa,5,0,5,0,0,1,1,1,2,1,1,0,0,1,0,0,0,0,0,0,0,1,1,2,2,1,0,0,0,0,0,0,0
1,31045,DURVALINO GRION PROF,8,EE,Ativa,9,0,9,0,1,0,1,1,1,1,2,0,0,1,0,0,0,0,0,0,1,1,1,2,2,1,0,0,0,0,0,0,0
2,31112,FLEURIDES CAVALLINI MENECHINO PROFA,8,EE,Ativa,12,1,13,0,1,1,1,1,1,1,2,1,2,1,0,0,0,0,0,0,1,2,3,5,2,0,1,0,0,0,0,0,0
3,30806,HELEN KELLER,8,EE,Ativa,16,2,18,1,1,1,1,1,1,1,2,0,0,1,0,0,0,0,0,0,1,2,2,4,2,1,1,1,0,0,0,0,0
4,31264,9 DE JULHO,8,EE,Ativa,21,0,21,0,1,1,1,1,2,1,1,1,2,1,0,0,3,0,0,0,0,1,1,2,2,0,0,1,0,0,0,0,0


### Dados da Formacao dos Servidores

In [5]:
# Staff education records; the school is identified by CIE_ESCOLA.
df_formacao_serv = pd.read_csv(
    os.path.join(PATH, 'formacao_servidores.csv'),
    sep=',',
)
print(df_formacao_serv.shape)  # (rows, columns)
df_formacao_serv.head()

(240465, 31)


Unnamed: 0,REGIAO_EXERC,DE_EXERC,CIE_ESCOLA,UA_EXERC,NOME_UA_EXERC,MUNICIPIO_EXERC,RG12,DI,CPF,NOME,QUADRO_C,CARGO_C,NM_CARGO_C,CATEG_C,DTIEXER_C,QUADRO_E,CARGO_E,NMCARGO_E,CATEG_E,FORMACAO,id_interno,FORMACAO_APERF/EXTENSIAOCULTURAL,FORMACAO_BACHARELADO/TECNIOLOGO,FORMACAO_DOUTORADO,FORMACAO_ENSINOFUNDAMENTAL,FORMACAO_ENSINOMIaDIO,FORMACAO_ESPECIALIZAIaIAO,FORMACAO_LICENCIATURA,FORMACAO_MESTRADO,FORMACAO_S/INFO,TITULACAO
0,INTERIOR,D.E.REG. RIBEIRAO PRETO,911306,79305,EE ROSANGELA BASILE-PROFA.,RIBEIRAO PRETO,164234.0,1.0,164234.0,164234.0,QM,6407.0,PROFESSOR EDUCACAO BASICA I,O,01/06/2016,QM,6407.0,PROFESSOR EDUCACAO BASICA I,O,LICENCIATURA,164234.0,0,0,0,0,0,0,1,0,0,LICENCIATURA
1,INTERIOR,D.E.REG. RIBEIRAO PRETO,911306,79305,EE ROSANGELA BASILE-PROFA.,RIBEIRAO PRETO,382145.0,1.0,382145.0,382145.0,QM,6407.0,PROFESSOR EDUCACAO BASICA I,A,08/02/1993,QM,6407.0,PROFESSOR EDUCACAO BASICA I,A,LICENCIATURA + BACHARELADO/TECNIOLOGO + ESPECI...,382145.0,0,1,0,0,0,1,1,0,0,ESPECIALIZACAO
2,INTERIOR,D.E.REG. RIBEIRAO PRETO,911306,79305,EE ROSANGELA BASILE-PROFA.,RIBEIRAO PRETO,345349.0,1.0,345349.0,345349.0,QM,6407.0,PROFESSOR EDUCACAO BASICA I,A,18/08/1994,QM,6407.0,PROFESSOR EDUCACAO BASICA I,A,LICENCIATURA + ESPECIALIZAIaIAO,345349.0,0,0,0,0,0,1,1,0,0,ESPECIALIZACAO
3,INTERIOR,D.E.REG. RIBEIRAO PRETO,911306,79305,EE ROSANGELA BASILE-PROFA.,RIBEIRAO PRETO,290900.0,1.0,290900.0,290900.0,QM,6407.0,PROFESSOR EDUCACAO BASICA I,O,03/04/2017,QM,6407.0,PROFESSOR EDUCACAO BASICA I,O,LICENCIATURA,290900.0,0,0,0,0,0,0,1,0,0,LICENCIATURA
4,INTERIOR,D.E.REG. RIBEIRAO PRETO,911306,79305,EE ROSANGELA BASILE-PROFA.,RIBEIRAO PRETO,316060.0,1.0,316060.0,316060.0,QM,6407.0,PROFESSOR EDUCACAO BASICA I,O,04/06/2018,QM,6407.0,PROFESSOR EDUCACAO BASICA I,O,LICENCIATURA,316060.0,0,0,0,0,0,0,1,0,0,LICENCIATURA


### Dados das Coordenadas

In [6]:
# School coordinates table; it also carries the SARESP fields and a CLUSTER label per row.
df_localizacao = pd.read_csv(
    os.path.join(PATH, 'localizacao.csv'),
    sep=',',
)
print(df_localizacao.shape)  # (rows, columns)
df_localizacao.head()

(1460, 35)


Unnamed: 0,nomedep,depadm,mun,codmun,de,CD_ESCOLA,CD_DIRETORIA,CD_DIRETORIA_ESTADUAL,CD_DIRETORIA_SUPVS_PROPR,NM_COMPLETO_ESCOLA,CD_UNIDADE,DS_ENDERECO,COMPLEMENTO,NUMERO,CD_ORIGEM_UNIDADE,CD_ORIGEM_ESCOLA,CD_ORIGEM_ENDERECO,LATITUDE,LONGITUDE,DEPADM,DepBol,NomeDepBol,codRMet,CODESC,NOMESC,SERIE_ANO,cod_per,periodo,co_comp,ds_comp,medprof,PONTUACAO,DESEMPENHO,Points,CLUSTER
0,ESTADUAL - SE,1,SAO PAULO,100,NORTE 1,36444,10101,10101,10101,GENESIO DE ALMEIDA MOURA DOUTOR,37806,DOMINGOS AREVALO,RUA,862,37806,36444,37866,-23.447,-46.6967,1,1,Rede Estadual,1,36444,GENESIO DE ALMEIDA MOURA DOUTOR,5º Ano EF,9,GERAL,2,MATEMÁTICA,181.6,BASICO,RUIM,POINT (-46.6967 -23.447),12
1,ESTADUAL - SE,1,SAO PAULO,100,LESTE 5,1582,10205,10205,10205,DOMINGOS FAUSTINO SARMIENTO,24902,21 DE ABRIL,RUA,970,24902,1582,24962,-23.542232,-46.605793,1,1,Rede Estadual,1,1582,DOMINGOS FAUSTINO SARMIENTO,5º Ano EF,9,GERAL,2,MATEMÁTICA,218.3,BASICO,RUIM,POINT (-46.6057931815944 -23.5422317000605),12
2,ESTADUAL - SE,1,SAO PAULO,100,CENTRO SUL,4388,10316,10316,10316,SEMINARIO NOSSA SENHORA DA GLORIA,37397,RUA MOREIRA DE GODOI,RUA,399,37397,4388,37457,-23.591604,-46.609698,1,1,Rede Estadual,1,4388,SEMINARIO NOSSA SENHORA DA GLORIA,5º Ano EF,9,GERAL,2,MATEMÁTICA,218.7,BASICO,RUIM,POINT (-46.6096977 -23.5916041),12
3,ESTADUAL - SE,1,SAO PAULO,100,CENTRO SUL,4424,10316,10316,10316,FRANCISCO DE ASSIS REYS,18610,RUA BOM PASTOR,RUA,1560,18610,4424,18670,-23.590326,-46.607238,1,1,Rede Estadual,1,4424,FRANCISCO DE ASSIS REYS,5º Ano EF,9,GERAL,2,MATEMÁTICA,252.4,ADEQUADO,BOM,POINT (-46.6072382 -23.590326),12
4,ESTADUAL - SE,1,SAO PAULO,100,CENTRO SUL,1739,10316,10316,10316,ANDRE DREYFUS PROFESSOR,12296,RUA PEDRO DE GODOI,RUA,170,12296,1739,12356,-23.587969,-46.582752,1,1,Rede Estadual,1,1739,ANDRE DREYFUS PROFESSOR,5º Ano EF,9,GERAL,2,MATEMÁTICA,227.6,ADEQUADO,BOM,POINT (-46.5827517 -23.5879693),12


## Criacao de atributos

### Atributos criados atraves da agregacao da Formacao dos Servidores 

In [7]:
## Keep only teaching-staff rows (QUADRO_C == 'QM') whose post is
## 'PROFESSOR EDUCACAO BASICA I'.
df_formacao_serv_QM = df_formacao_serv.loc[
    df_formacao_serv['QUADRO_C'].eq('QM')
    & df_formacao_serv['NM_CARGO_C'].eq('PROFESSOR EDUCACAO BASICA I')
]

# Raw count of each TITULACAO value per school: one row per CIE_ESCOLA, one
# column per TITULACAO. Combinations that never occur are filled with 0.
qtd_titulacao_escola = (
    df_formacao_serv_QM
    .groupby('CIE_ESCOLA')['TITULACAO']
    .value_counts()
    .unstack()
    .fillna(0)
)
qtd_titulacao_escola.head()

TITULACAO,APERF/EXTENSIAOCULTURAL,BACHARELADO/TECNIOLOGO,DOUTORADO,ENSINO_MEDIO,ESPECIALIZACAO,LICENCIATURA,MESTRADO,S/INFO
CIE_ESCOLA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.0,19.0,1.0,20.0,45.0,423.0,19.0,0.0
12,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
24,0.0,1.0,0.0,2.0,0.0,24.0,0.0,0.0
36,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0
48,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
## NORMALIZED VERSION

## Keep only teaching-staff rows (QUADRO_C == 'QM') whose post is
## 'PROFESSOR EDUCACAO BASICA I'.
df_formacao_serv_QM = df_formacao_serv.loc[
    df_formacao_serv['QUADRO_C'].eq('QM')
    & df_formacao_serv['NM_CARGO_C'].eq('PROFESSOR EDUCACAO BASICA I')
]

# Share of each TITULACAO value within every school (each row sums to 1 before
# filling). Combinations that never occur are filled with 0.
# NOTE: this reassigns qtd_titulacao_escola, so later cells see proportions.
qtd_titulacao_escola = (
    df_formacao_serv_QM
    .groupby('CIE_ESCOLA')['TITULACAO']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)
qtd_titulacao_escola.head()

TITULACAO,APERF/EXTENSIAOCULTURAL,BACHARELADO/TECNIOLOGO,DOUTORADO,ENSINO_MEDIO,ESPECIALIZACAO,LICENCIATURA,MESTRADO,S/INFO
CIE_ESCOLA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.0,0.036053,0.001898,0.037951,0.085389,0.802657,0.036053,0.0
12,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
24,0.0,0.037037,0.0,0.074074,0.0,0.888889,0.0,0.0
36,0.0,0.25,0.0,0.0,0.25,0.5,0.0,0.0
48,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
## Qtd de servidores por escola
qtd_servidores = df_formacao_serv_QM.groupby('CIE_ESCOLA').size()
qtd_servidores.name = 'QTD_SERVIDORES'

In [9]:
# Put the per-school TITULACAO columns and the staff count side by side
# (aligned on the CIE_ESCOLA index), then turn CIE_ESCOLA into a regular column.
df_features_formacao = (
    pd.concat([qtd_titulacao_escola, qtd_servidores], axis=1)
    .reset_index()
)
df_features_formacao.head()

Unnamed: 0,CIE_ESCOLA,APERF/EXTENSIAOCULTURAL,BACHARELADO/TECNIOLOGO,DOUTORADO,ENSINO_MEDIO,ESPECIALIZACAO,LICENCIATURA,MESTRADO,S/INFO,QTD_SERVIDORES
0,0,0.0,0.036053,0.001898,0.037951,0.085389,0.802657,0.036053,0.0,527
1,12,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4
2,24,0.0,0.037037,0.0,0.074074,0.0,0.888889,0.0,0.0,27
3,36,0.0,0.25,0.0,0.0,0.25,0.5,0.0,0.0,4
4,48,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1


## Selecao de atributos e juncao das bases

In [10]:
# Quick look at the first two rows of the facilities table before joining.
df_dependencias.head(2)

Unnamed: 0,CODESC,NOMESC,TIPOESC,TIPOESC_DESC,SITUACAO,SALAS_AULA,SALA_RECURSO,TOT_SALAS_AULA,CANTINA,COPA,REFEITORIO,SALA_LEITURA,TOT_SALA_LEITURA,TOT_QUADRA,SALA_PROF,PATIO_COBERTO,PATIO_DESCOBERTO,TOT_VESTIARIO,LAB_INFO,LAB_CIENCIAS,LAB_CIENCIA_FISICA_BIOLOGICA,TOT_LAB_CIENCIA,LAB_MULTIUSO,OFICINA,DORMITORIO,SANITARIO_ADEQ_DEF,SANITARIO_AL_MASC,SANITARIO_AL_FEM,TOT_SANITARIO_AL,TOT_SANITARIO_FUNC,DEPEND_ADEQ_DEF,SALA_ED_FISICA,SALA_PROG_ESC_FAMILIA,BRINQUEDOTECA,SALA_REFORCO,AREA_SERVICO,SALA_ATENDIMENTO,SALA_ENTRETENIMENTO
0,985429,CEL JTO A EE FLEURIDES CAVALINI MENECHINO PROFA,6,CEL,Ativa,5,0,5,0,0,1,1,1,2,1,1,0,0,1,0,0,0,0,0,0,0,1,1,2,2,1,0,0,0,0,0,0,0
1,31045,DURVALINO GRION PROF,8,EE,Ativa,9,0,9,0,1,0,1,1,1,1,2,0,0,1,0,0,0,0,0,0,1,1,1,2,2,1,0,0,0,0,0,0,0


In [15]:
# Attach the facilities columns to each location row. Inner join: rows whose
# CD_ESCOLA has no matching CODESC in df_dependencias are dropped.
# Both frames have CODESC/NOMESC, so pandas suffixes them with _x (left) and _y (right).
data = df_localizacao.merge(
    df_dependencias,
    how='inner',
    left_on='CD_ESCOLA',
    right_on='CODESC',
)

In [16]:
# Attach the staff-education features per school. Inner join: schools without
# a matching CIE_ESCOLA in df_features_formacao are dropped.
# NOTE: `data` is reassigned from itself, so re-running this cell alone merges
# the features a second time; re-run the previous merge cell first.
data = data.merge(
    df_features_formacao,
    how='inner',
    left_on='CD_ESCOLA',
    right_on='CIE_ESCOLA',
)

In [17]:
# Disabled: CLUSTER is already in `data`, because `data` was built from df_localizacao,
# which contains the CLUSTER column.
# data = pd.merge(data, df_localizacao[['CD_ESCOLA', 'CLUSTER']], on='CD_ESCOLA', how='inner')

In [18]:
# Inspect the first rows of the merged table.
data.head()

Unnamed: 0,nomedep,depadm,mun,codmun,de,CD_ESCOLA,CD_DIRETORIA,CD_DIRETORIA_ESTADUAL,CD_DIRETORIA_SUPVS_PROPR,NM_COMPLETO_ESCOLA,CD_UNIDADE,DS_ENDERECO,COMPLEMENTO,NUMERO,CD_ORIGEM_UNIDADE,CD_ORIGEM_ESCOLA,CD_ORIGEM_ENDERECO,LATITUDE,LONGITUDE,DEPADM,DepBol,NomeDepBol,codRMet,CODESC_x,NOMESC_x,SERIE_ANO,cod_per,periodo,co_comp,ds_comp,medprof,PONTUACAO,DESEMPENHO,Points,CLUSTER,CODESC_y,NOMESC_y,TIPOESC,TIPOESC_DESC,SITUACAO,SALAS_AULA,SALA_RECURSO,TOT_SALAS_AULA,CANTINA,COPA,REFEITORIO,SALA_LEITURA,TOT_SALA_LEITURA,TOT_QUADRA,SALA_PROF,PATIO_COBERTO,PATIO_DESCOBERTO,TOT_VESTIARIO,LAB_INFO,LAB_CIENCIAS,LAB_CIENCIA_FISICA_BIOLOGICA,TOT_LAB_CIENCIA,LAB_MULTIUSO,OFICINA,DORMITORIO,SANITARIO_ADEQ_DEF,SANITARIO_AL_MASC,SANITARIO_AL_FEM,TOT_SANITARIO_AL,TOT_SANITARIO_FUNC,DEPEND_ADEQ_DEF,SALA_ED_FISICA,SALA_PROG_ESC_FAMILIA,BRINQUEDOTECA,SALA_REFORCO,AREA_SERVICO,SALA_ATENDIMENTO,SALA_ENTRETENIMENTO,CIE_ESCOLA,APERF/EXTENSIAOCULTURAL,BACHARELADO/TECNIOLOGO,DOUTORADO,ENSINO_MEDIO,ESPECIALIZACAO,LICENCIATURA,MESTRADO,S/INFO,QTD_SERVIDORES
0,ESTADUAL - SE,1,SAO PAULO,100,NORTE 1,36444,10101,10101,10101,GENESIO DE ALMEIDA MOURA DOUTOR,37806,DOMINGOS AREVALO,RUA,862,37806,36444,37866,-23.447,-46.6967,1,1,Rede Estadual,1,36444,GENESIO DE ALMEIDA MOURA DOUTOR,5º Ano EF,9,GERAL,2,MATEMÁTICA,181.6,BASICO,RUIM,POINT (-46.6967 -23.447),12,36444,GENESIO DE ALMEIDA MOURA DOUTOR,8,EE,Ativa,19,0,19,0,0,1,1,1,2,1,0,0,0,1,0,0,0,0,0,0,0,1,1,2,0,0,1,0,0,0,0,0,0,36444,0.0,0.0,0.0,0.08,0.04,0.88,0.0,0.0,25
1,ESTADUAL - SE,1,SAO PAULO,100,LESTE 5,1582,10205,10205,10205,DOMINGOS FAUSTINO SARMIENTO,24902,21 DE ABRIL,RUA,970,24902,1582,24962,-23.542232,-46.605793,1,1,Rede Estadual,1,1582,DOMINGOS FAUSTINO SARMIENTO,5º Ano EF,9,GERAL,2,MATEMÁTICA,218.3,BASICO,RUIM,POINT (-46.6057931815944 -23.5422317000605),12,1582,DOMINGOS FAUSTINO SARMIENTO,8,EE,Ativa,14,1,15,1,0,2,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,3,3,6,3,0,1,1,0,0,0,0,0,1582,0.0,0.030303,0.0,0.0,0.090909,0.878788,0.0,0.0,33
2,ESTADUAL - SE,1,SAO PAULO,100,CENTRO SUL,4388,10316,10316,10316,SEMINARIO NOSSA SENHORA DA GLORIA,37397,RUA MOREIRA DE GODOI,RUA,399,37397,4388,37457,-23.591604,-46.609698,1,1,Rede Estadual,1,4388,SEMINARIO NOSSA SENHORA DA GLORIA,5º Ano EF,9,GERAL,2,MATEMÁTICA,218.7,BASICO,RUIM,POINT (-46.6096977 -23.5916041),12,4388,SEMINARIO NOSSA SENHORA DA GLORIA,8,EE,Ativa,26,0,26,1,0,1,1,1,2,1,2,1,0,1,0,1,1,0,0,0,0,3,3,6,2,0,1,0,0,0,0,0,0,4388,0.0,0.037037,0.0,0.296296,0.037037,0.62963,0.0,0.0,27
3,ESTADUAL - SE,1,SAO PAULO,100,CENTRO SUL,4424,10316,10316,10316,FRANCISCO DE ASSIS REYS,18610,RUA BOM PASTOR,RUA,1560,18610,4424,18670,-23.590326,-46.607238,1,1,Rede Estadual,1,4424,FRANCISCO DE ASSIS REYS,5º Ano EF,9,GERAL,2,MATEMÁTICA,252.4,ADEQUADO,BOM,POINT (-46.6072382 -23.590326),12,4424,FRANCISCO DE ASSIS REYS,8,EE,Ativa,11,1,12,1,0,1,1,1,2,1,1,0,0,1,0,0,0,0,0,0,1,1,1,2,0,0,2,0,1,0,0,0,0,4424,0.0,0.03125,0.0,0.09375,0.03125,0.84375,0.0,0.0,32
4,ESTADUAL - SE,1,SAO PAULO,100,CENTRO SUL,1739,10316,10316,10316,ANDRE DREYFUS PROFESSOR,12296,RUA PEDRO DE GODOI,RUA,170,12296,1739,12356,-23.587969,-46.582752,1,1,Rede Estadual,1,1739,ANDRE DREYFUS PROFESSOR,5º Ano EF,9,GERAL,2,MATEMÁTICA,227.6,ADEQUADO,BOM,POINT (-46.5827517 -23.5879693),12,1739,ANDRE DREYFUS PROFESSOR,8,EE,Ativa,8,1,9,0,0,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,1,1,2,2,0,1,0,0,0,0,0,0,1739,0.0,0.047619,0.0,0.095238,0.0,0.857143,0.0,0.0,21


In [19]:
# List every column of the merged table (presumably to choose which ones to discard).
data.columns

Index(['nomedep', 'depadm', 'mun', 'codmun', 'de', 'CD_ESCOLA', 'CD_DIRETORIA',
       'CD_DIRETORIA_ESTADUAL', 'CD_DIRETORIA_SUPVS_PROPR',
       'NM_COMPLETO_ESCOLA', 'CD_UNIDADE', 'DS_ENDERECO', 'COMPLEMENTO',
       'NUMERO', 'CD_ORIGEM_UNIDADE', 'CD_ORIGEM_ESCOLA', 'CD_ORIGEM_ENDERECO',
       'LATITUDE', 'LONGITUDE', 'DEPADM', 'DepBol', 'NomeDepBol', 'codRMet',
       'CODESC_x', 'NOMESC_x', 'SERIE_ANO', 'cod_per', 'periodo', 'co_comp',
       'ds_comp', 'medprof', 'PONTUACAO', 'DESEMPENHO', 'Points', 'CLUSTER',
       'CODESC_y', 'NOMESC_y', 'TIPOESC', 'TIPOESC_DESC', 'SITUACAO',
       'SALAS_AULA', 'SALA_RECURSO', 'TOT_SALAS_AULA', 'CANTINA', 'COPA',
       'REFEITORIO', 'SALA_LEITURA', 'TOT_SALA_LEITURA', 'TOT_QUADRA',
       'SALA_PROF', 'PATIO_COBERTO', 'PATIO_DESCOBERTO', 'TOT_VESTIARIO',
       'LAB_INFO', 'LAB_CIENCIAS', 'LAB_CIENCIA_FISICA_BIOLOGICA',
       'TOT_LAB_CIENCIA', 'LAB_MULTIUSO', 'OFICINA', 'DORMITORIO',
       'SANITARIO_ADEQ_DEF', 'SANITARIO_AL_MASC', 'SA

In [20]:
# Columns of the merged table that are discarded before export.
columns_to_remove = [
    # administrative, address and coordinate fields from the location table
    'nomedep',
    'depadm',
    'mun',
    'codmun',
    'de',
    'CD_DIRETORIA',
    'CD_DIRETORIA_ESTADUAL',
    'CD_DIRETORIA_SUPVS_PROPR',
    'NM_COMPLETO_ESCOLA',
    'CD_UNIDADE',
    'DS_ENDERECO',
    'COMPLEMENTO',
    'NUMERO',
    'CD_ORIGEM_UNIDADE',
    'CD_ORIGEM_ESCOLA',
    'CD_ORIGEM_ENDERECO',
    'LATITUDE',
    'LONGITUDE',
    # SARESP fields carried by the location table (left side of the merge, hence _x)
    'DEPADM',
    'DepBol',
    'NomeDepBol',
    'codRMet',
    'CODESC_x',
    'NOMESC_x',
    'SERIE_ANO',
    'cod_per',
    'periodo',
    'co_comp',
    'medprof',
    'Points',
    # identification fields from the facilities table (right side, hence _y)
    'CODESC_y',
    'NOMESC_y',
    'TIPOESC',
    'TIPOESC_DESC',
    'SITUACAO',
    # join key of the staff-education features (duplicates CD_ESCOLA after the inner merge)
    'CIE_ESCOLA',
]

In [21]:
# Build the modelling table as a new frame without the discarded columns; `data` itself is left untouched.
dataset = data.drop(columns=columns_to_remove)

In [22]:
# Check the final size (rows, columns) and preview the first rows.
print(dataset.shape)
dataset.head()

(1458, 47)


Unnamed: 0,CD_ESCOLA,ds_comp,PONTUACAO,DESEMPENHO,CLUSTER,SALAS_AULA,SALA_RECURSO,TOT_SALAS_AULA,CANTINA,COPA,REFEITORIO,SALA_LEITURA,TOT_SALA_LEITURA,TOT_QUADRA,SALA_PROF,PATIO_COBERTO,PATIO_DESCOBERTO,TOT_VESTIARIO,LAB_INFO,LAB_CIENCIAS,LAB_CIENCIA_FISICA_BIOLOGICA,TOT_LAB_CIENCIA,LAB_MULTIUSO,OFICINA,DORMITORIO,SANITARIO_ADEQ_DEF,SANITARIO_AL_MASC,SANITARIO_AL_FEM,TOT_SANITARIO_AL,TOT_SANITARIO_FUNC,DEPEND_ADEQ_DEF,SALA_ED_FISICA,SALA_PROG_ESC_FAMILIA,BRINQUEDOTECA,SALA_REFORCO,AREA_SERVICO,SALA_ATENDIMENTO,SALA_ENTRETENIMENTO,APERF/EXTENSIAOCULTURAL,BACHARELADO/TECNIOLOGO,DOUTORADO,ENSINO_MEDIO,ESPECIALIZACAO,LICENCIATURA,MESTRADO,S/INFO,QTD_SERVIDORES
0,36444,MATEMÁTICA,BASICO,RUIM,12,19,0,19,0,0,1,1,1,2,1,0,0,0,1,0,0,0,0,0,0,0,1,1,2,0,0,1,0,0,0,0,0,0,0.0,0.0,0.0,0.08,0.04,0.88,0.0,0.0,25
1,1582,MATEMÁTICA,BASICO,RUIM,12,14,1,15,1,0,2,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,3,3,6,3,0,1,1,0,0,0,0,0,0.0,0.030303,0.0,0.0,0.090909,0.878788,0.0,0.0,33
2,4388,MATEMÁTICA,BASICO,RUIM,12,26,0,26,1,0,1,1,1,2,1,2,1,0,1,0,1,1,0,0,0,0,3,3,6,2,0,1,0,0,0,0,0,0,0.0,0.037037,0.0,0.296296,0.037037,0.62963,0.0,0.0,27
3,4424,MATEMÁTICA,ADEQUADO,BOM,12,11,1,12,1,0,1,1,1,2,1,1,0,0,1,0,0,0,0,0,0,1,1,1,2,0,0,2,0,1,0,0,0,0,0.0,0.03125,0.0,0.09375,0.03125,0.84375,0.0,0.0,32
4,1739,MATEMÁTICA,ADEQUADO,BOM,12,8,1,9,0,0,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,1,1,2,2,0,1,0,0,0,0,0,0,0.0,0.047619,0.0,0.095238,0.0,0.857143,0.0,0.0,21


## Exportar dataset

In [23]:
# Write the modelling dataset into the same processed-data folder the inputs were
# read from. The path is built from PATH instead of repeating the literal, so
# changing the folder in the config cell affects reads and this write alike.
# index=False: the positional row index is not written to the file.
dataset.to_csv(os.path.join(PATH, 'dataset.csv'), index=False)