# Base ENEM 2016
## Predição de se o aluno é treineiro.

## Primeiro teste:

### * Somente a limpeza dos dados 
### * Sem balanceamento 
### * Regressão Logística

Score obtido: 87.921225

In [104]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Suppress every Python warning from here on. This also hides any warning that
# pandas or scikit-learn would emit in the cells below.
warnings.filterwarnings('ignore')

# Let pandas show up to 300 columns and 300 rows when a frame is displayed.
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

In [105]:
# Paths of the training and test CSV files.
file_train = "train.csv"
file_test = "test.csv"

# Read both files the same way; index_col=False tells pandas not to use any
# data column as the index.
df_raw_train, df_raw_test = (
    pd.read_csv(csv_path, index_col=False) for csv_path in (file_train, file_test)
)

# Notebook display: (rows, columns) of each raw frame.
df_raw_train.shape, df_raw_test.shape

((13730, 167), (4570, 43))

In [106]:
# Columns selected from the raw data, in selection order. The last entry is
# the target column, so `columns_used[:-1]` is the selection without it.
columns_used = [
    'NU_INSCRICAO',
    'CO_UF_RESIDENCIA',
    'SG_UF_RESIDENCIA',
    'NU_IDADE',
    'TP_SEXO',
    'TP_COR_RACA',
    'TP_NACIONALIDADE',
    'TP_ST_CONCLUSAO',
    'TP_ANO_CONCLUIU',
    'TP_ESCOLA',
    'TP_ENSINO',
    'TP_PRESENCA_CN',
    'TP_PRESENCA_CH',
    'TP_PRESENCA_LC',
    'NU_NOTA_CN',
    'NU_NOTA_CH',
    'NU_NOTA_LC',
    'TP_LINGUA',
    'TP_STATUS_REDACAO',
    'NU_NOTA_COMP1',
    'NU_NOTA_COMP2',
    'NU_NOTA_COMP3',
    'NU_NOTA_COMP4',
    'NU_NOTA_COMP5',
    'NU_NOTA_REDACAO',
    'Q001',
    'Q002',
    'Q006',
    'Q024',
    'Q025',
    'Q026',
    'Q027',
    'Q047',
    'IN_TREINEIRO',
]

# Columns handled as numeric values (their missing entries are filled later).
numerical_vars = [
    'NU_NOTA_CN',
    'NU_NOTA_CH',
    'NU_NOTA_LC',
    'NU_NOTA_COMP1',
    'NU_NOTA_COMP2',
    'NU_NOTA_COMP3',
    'NU_NOTA_COMP4',
    'NU_NOTA_COMP5',
    'NU_NOTA_REDACAO',
]

# Column the model is trained to predict.
target = ['IN_TREINEIRO']

In [107]:
# Select the columns of interest and leave out SG_UF_RESIDENCIA and TP_ENSINO.
# `drop(columns=...)` returns a new frame, so the assignments below write to an
# independent object rather than mutating a selection of the raw frame in place
# (the pattern pandas reports as SettingWithCopyWarning).
df_train = df_raw_train[columns_used].drop(columns=['SG_UF_RESIDENCIA', 'TP_ENSINO'])

# Same selection for the test frame, without the last entry (the target).
df_test = df_raw_test[columns_used[:-1]].drop(columns=['SG_UF_RESIDENCIA', 'TP_ENSINO'])

# Missing values in the numeric columns are replaced by 0.
df_train[numerical_vars] = df_train[numerical_vars].fillna(0)
df_test[numerical_vars] = df_test[numerical_vars].fillna(0)

In [108]:
# Start both cleaned frames with no columns, reusing the row index of the
# matching prepared frame so columns added later align row by row.
df_train_clean, df_test_clean = (
    pd.DataFrame(index=prepared.index) for prepared in (df_train, df_test)
)

In [109]:
# Add the NU_INSCRICAO column from the raw frames to the cleaned frames.
df_train_clean = df_train_clean.assign(NU_INSCRICAO=df_raw_train['NU_INSCRICAO'])
df_test_clean = df_test_clean.assign(NU_INSCRICAO=df_raw_test['NU_INSCRICAO'])

In [110]:
def create_encoder(column, prefix):
    """One-hot encode `column` and append the indicators to the cleaned frames.

    Reads the module-level `df_train` and `df_test` frames, builds one
    indicator column per distinct value of `column` in each of them, and
    rebinds the module-level `df_train_clean` and `df_test_clean` to new frames
    with those indicator columns appended on the right.

    Each indicator is named ``str(prefix) + str(value)``. Naming is delegated
    to `pd.get_dummies`, so the labels always match the generated columns.
    Relabelling by position from `sort_values().unique()` fails with a length
    mismatch when the column has missing values, because `get_dummies` skips
    NaN while `unique()` reports it.

    Train and test are encoded independently, so a value that occurs in only
    one of them produces an indicator in only that frame.

    Args:
        column: Name of a column present in both `df_train` and `df_test`.
        prefix: Text prepended to each category value to form the indicator
            column names.

    Returns:
        None. The result is published through the two module-level frames.
    """
    global df_train_clean
    global df_test_clean

    label_prefix = str(prefix)

    # prefix_sep='' keeps the names as prefix + value, with no extra separator.
    train_dummies = pd.get_dummies(df_train[column], prefix=label_prefix, prefix_sep='')
    test_dummies = pd.get_dummies(df_test[column], prefix=label_prefix, prefix_sep='')

    df_train_clean = pd.concat([df_train_clean, train_dummies], axis=1)
    df_test_clean = pd.concat([df_test_clean, test_dummies], axis=1)

In [111]:
# Categorical columns to one-hot encode, mapped to the prefix used for the
# generated indicator column names. Insertion order is the encoding order.
# Note that the three TP_PRESENCA_* prefixes carry no trailing underscore.
categorical_vars = {
    'CO_UF_RESIDENCIA': 'co_uf_',
    'TP_SEXO': 'sexo_',
    'TP_COR_RACA': 'raca_',
    'TP_ST_CONCLUSAO': 'tp_st_con_',
    'TP_ANO_CONCLUIU': 'tp_ano_con_',
    'TP_ESCOLA': 'tp_esc_',
    'TP_PRESENCA_CN': 'tp_pres_cn',
    'TP_PRESENCA_CH': 'tp_pres_ch',
    'TP_PRESENCA_LC': 'tp_pres_lc',
    'TP_LINGUA': 'tp_ling_',
    'Q001': 'q001_',
    'Q002': 'q002_',
    'Q006': 'q006_',
    'Q024': 'q024_',
    'Q025': 'q025_',
    'Q026': 'q026_',
    'Q047': 'q047_',
}

In [112]:
# Encode each categorical column in the order the mapping declares them; every
# call appends its indicator columns to the cleaned train and test frames.
for column in categorical_vars:
    prefix = categorical_vars[column]
    create_encoder(column, prefix)

In [113]:
# Numeric columns that are passed to the model without encoding.
numerical_vars = [
    'NU_NOTA_CN',
    'NU_NOTA_CH',
    'NU_NOTA_LC',
    'NU_NOTA_COMP1',
    'NU_NOTA_COMP2',
    'NU_NOTA_COMP3',
    'NU_NOTA_COMP4',
    'NU_NOTA_COMP5',
    'NU_NOTA_REDACAO',
]

# Append those columns to the right of what the cleaned frames already hold.
train_numeric = df_train.loc[:, numerical_vars]
df_train_clean = pd.concat([df_train_clean, train_numeric], axis=1)

test_numeric = df_test.loc[:, numerical_vars]
df_test_clean = pd.concat([df_test_clean, test_numeric], axis=1)

In [115]:
# Training features: every column of the cleaned frame except NU_INSCRICAO.
# Dropping it by name avoids relying on an indicator column called 'co_uf_11'
# existing and sitting immediately after it.
X_train = df_train_clean.drop(columns=['NU_INSCRICAO'])

# Training target, taken from the prepared (not the encoded) frame.
y_train = df_train['IN_TREINEIRO']

In [116]:
# Test features: every column of the cleaned frame except NU_INSCRICAO.
# Dropping it by name avoids a KeyError when the test data has no 'co_uf_11'
# indicator column.
X_test = df_test_clean.drop(columns=['NU_INSCRICAO'])

# Notebook display: shapes of the feature and target objects.
X_train.shape, y_train.shape, X_test.shape

((13730, 122), (13730,), (4570, 120))

In [117]:
# Give the training features exactly the test columns, in the test order.
# Train and test are one-hot encoded separately, so their indicator columns can
# differ. `reindex` drops train-only columns and adds any test-only column as
# all zeros (no training row has that category) instead of raising KeyError.
X_train_comp_X_test = X_train.reindex(columns=X_test.columns, fill_value=0)

In [118]:
# Notebook display: shapes of the aligned training features, the target and
# the test features.
X_train_comp_X_test.shape, y_train.shape, X_test.shape

((13730, 120), (13730,), (4570, 120))

In [119]:
# Fit a logistic-regression classifier, with scikit-learn's default settings,
# on the training features aligned to the test columns.
# NOTE(review): warnings are silenced globally in this file, so a solver
# convergence warning, if one is raised here, would not be shown — confirm.
regressor = LogisticRegression()
regressor.fit(X_train_comp_X_test, y_train)

LogisticRegression()

In [97]:
# Predict the target for every test row with the fitted model.
y_pred = regressor.predict(X_test)

In [98]:
# One-column frame holding the NU_INSCRICAO value of every test row.
df_result_insc = pd.DataFrame(df_test_clean['NU_INSCRICAO'])

In [99]:
# Place the predictions next to the identifiers. concat(axis=1) aligns rows on
# the index, and the prediction frame gets a fresh 0..n-1 index, so this relies
# on df_result_insc carrying that same index.
# NOTE(review): rounding to 3 decimals presumably has no effect if predict()
# returns integer class labels — confirm.
resultado = pd.concat([df_result_insc, pd.DataFrame(np.round(y_pred,3))], axis=1)
# Replace the index with a plain 0..n-1 range, discarding the old one.
resultado.reset_index(inplace=True, drop=True)

In [100]:
# Name the two columns: the identifier and the predicted target.
resultado.columns=['NU_INSCRICAO', 'IN_TREINEIRO']

In [103]:
# Write the result to answer.csv without the index column.
resultado.to_csv("answer.csv", index=False)