# Base ENEM 2016
## Predição se o aluno é treineiro. 
## Segundo teste:
### * Limpeza dos dados
### * Uso do balanceamento
### * Algoritmos:
        * Regressão Logística (Score obtido: 96.083151)
        * Decision tree (Score obtido: 96.105033)
        * Random Forest (Score obtido: 96.630197)

In [92]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Silence every warning for the rest of the session. This also hides pandas
# and scikit-learn diagnostics.
warnings.filterwarnings(action='ignore')

# Let the notebook show up to 300 columns and 300 rows of a frame.
pd.options.display.max_columns = 300
pd.options.display.max_rows = 300

In [93]:
# Input files, resolved relative to the working directory.
file_train = "train.csv"
file_test = "test.csv"

# index_col=False tells pandas not to use the first CSV column as the index.
df_raw_train, df_raw_test = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (file_train, file_test)
)

# Notebook display: (rows, columns) of each raw frame.
df_raw_train.shape, df_raw_test.shape

((13730, 167), (4570, 43))

In [94]:
# Columns selected from the raw files. The last entry is the target column.
columns_used = [
    'NU_INSCRICAO',
    'CO_UF_RESIDENCIA',
    'SG_UF_RESIDENCIA',
    'NU_IDADE',
    'TP_SEXO',
    'TP_COR_RACA',
    'TP_NACIONALIDADE',
    'TP_ST_CONCLUSAO',
    'TP_ANO_CONCLUIU',
    'TP_ESCOLA',
    'TP_ENSINO',
    'TP_PRESENCA_CN',
    'TP_PRESENCA_CH',
    'TP_PRESENCA_LC',
    'NU_NOTA_CN',
    'NU_NOTA_CH',
    'NU_NOTA_LC',
    'TP_LINGUA',
    'TP_STATUS_REDACAO',
    'NU_NOTA_COMP1',
    'NU_NOTA_COMP2',
    'NU_NOTA_COMP3',
    'NU_NOTA_COMP4',
    'NU_NOTA_COMP5',
    'NU_NOTA_REDACAO',
    'Q001',
    'Q002',
    'Q006',
    'Q024',
    'Q025',
    'Q026',
    'Q027',
    'Q047',
    'IN_TREINEIRO',
]

# Score columns that are treated as numeric features.
numerical_vars = [
    'NU_NOTA_CN',
    'NU_NOTA_CH',
    'NU_NOTA_LC',
    'NU_NOTA_COMP1',
    'NU_NOTA_COMP2',
    'NU_NOTA_COMP3',
    'NU_NOTA_COMP4',
    'NU_NOTA_COMP5',
    'NU_NOTA_REDACAO',
]

# Column to predict, kept as a one-element list.
target = ['IN_TREINEIRO']

In [95]:
# Select the working columns and leave out the two unused ones.
# drop() is called without inplace=True and the result is copied explicitly,
# so the later column assignments act on independent frames instead of on
# objects derived from the raw data (the SettingWithCopy pattern).
df_train = (
    df_raw_train[columns_used]
    .drop(columns=['SG_UF_RESIDENCIA', 'TP_ENSINO'])
    .copy()
)

# The last entry of columns_used (the target) is not selected from the test file.
df_test = (
    df_raw_test[columns_used[:-1]]
    .drop(columns=['SG_UF_RESIDENCIA', 'TP_ENSINO'])
    .copy()
)

# Missing scores are replaced with 0.
df_train[numerical_vars] = df_train[numerical_vars].fillna(0)
df_test[numerical_vars] = df_test[numerical_vars].fillna(0)

In [96]:
# Start both encoded frames empty, sharing the row index of the selected data.
df_train_clean, df_test_clean = (
    pd.DataFrame(index=selected.index) for selected in (df_train, df_test)
)

In [97]:
# Carry the registration number over from the raw files as the first column.
df_train_clean = df_train_clean.assign(NU_INSCRICAO=df_raw_train['NU_INSCRICAO'])
df_test_clean = df_test_clean.assign(NU_INSCRICAO=df_raw_test['NU_INSCRICAO'])

In [98]:
def create_encoder(column, prefix):
    """One-hot encode ``column`` and append the result to the clean frames.

    Reads the module-level ``df_train`` and ``df_test`` frames and rebinds the
    module-level ``df_train_clean`` and ``df_test_clean`` frames with the new
    indicator columns appended on the right.

    Each indicator column is named ``prefix`` immediately followed by the
    category value. Train and test are encoded independently, so a category
    that is absent from one frame produces no column in that frame.

    Args:
        column: Name of the categorical column, present in both frames.
        prefix: Text placed in front of every category value.

    Returns:
        None. The result is stored in the module-level clean frames.
    """
    global df_train_clean
    global df_test_clean

    # Let get_dummies build the labels. Naming the columns from
    # sort_values().unique() fails when the column holds missing values:
    # unique() reports NaN, while get_dummies emits no column for it, so the
    # label list ends up longer than the frame.
    train_dummies = pd.get_dummies(df_train[column], prefix=str(prefix), prefix_sep='')
    test_dummies = pd.get_dummies(df_test[column], prefix=str(prefix), prefix_sep='')

    df_train_clean = pd.concat([df_train_clean, train_dummies], axis=1)
    df_test_clean = pd.concat([df_test_clean, test_dummies], axis=1)

In [99]:
# Categorical column -> prefix used for its indicator columns.
# NOTE: the three TP_PRESENCA_* prefixes have no trailing underscore, unlike
# the other entries.
categorical_vars = {
    'CO_UF_RESIDENCIA': 'co_uf_',
    'TP_SEXO': 'sexo_',
    'TP_COR_RACA': 'raca_',
    'TP_ST_CONCLUSAO': 'tp_st_con_',
    'TP_ANO_CONCLUIU': 'tp_ano_con_',
    'TP_ESCOLA': 'tp_esc_',
    'TP_PRESENCA_CN': 'tp_pres_cn',
    'TP_PRESENCA_CH': 'tp_pres_ch',
    'TP_PRESENCA_LC': 'tp_pres_lc',
    'TP_LINGUA': 'tp_ling_',
    'Q001': 'q001_',
    'Q002': 'q002_',
    'Q006': 'q006_',
    'Q024': 'q024_',
    'Q025': 'q025_',
    'Q026': 'q026_',
    'Q047': 'q047_',
}

In [100]:
# Encode every categorical column, in the order the mapping lists them.
for column in categorical_vars:
    prefix = categorical_vars[column]
    create_encoder(column, prefix)

In [101]:
# Score columns, appended as they are after the indicator columns.
numerical_vars = [
    'NU_NOTA_CN',
    'NU_NOTA_CH',
    'NU_NOTA_LC',
    'NU_NOTA_COMP1',
    'NU_NOTA_COMP2',
    'NU_NOTA_COMP3',
    'NU_NOTA_COMP4',
    'NU_NOTA_COMP5',
    'NU_NOTA_REDACAO',
]

train_scores = df_train[numerical_vars]
test_scores = df_test[numerical_vars]

df_train_clean = pd.concat([df_train_clean, train_scores], axis='columns')
df_test_clean = pd.concat([df_test_clean, test_scores], axis='columns')

In [102]:
# Features: every encoded column except the registration number.
# Dropping the id column by name avoids slicing from the hard-coded label
# 'co_uf_11', which only exists when that category occurs in the data and
# only works while it is the first encoded column.
X_train = df_train_clean.drop(columns=['NU_INSCRICAO'])

# Target: the flag being predicted, taken from the selected training data.
y_train = df_train['IN_TREINEIRO']

In [103]:
# Test features: every encoded column except the registration number.
# Dropping by name avoids a KeyError when the test data has no 'co_uf_11'
# column to start a label slice from.
X_test = df_test_clean.drop(columns=['NU_INSCRICAO'])

# Notebook display: shapes of the training features, target and test features.
X_train.shape, y_train.shape, X_test.shape

((13730, 122), (13730,), (4570, 120))

In [104]:
# Give the training features exactly the columns (and column order) of the
# test features. reindex() keeps the shared columns unchanged and, unlike plain
# column selection, does not raise when the test set holds a category that
# never occurs in training: that column is created and filled with 0.
X_train_comp_X_test = X_train.reindex(columns=X_test.columns, fill_value=0)

In [105]:
# Notebook display: shapes of the aligned training features, target and test features.
tuple(data.shape for data in (X_train_comp_X_test, y_train, X_test))

((13730, 120), (13730,), (4570, 120))

In [106]:
# Notebook display: number of training rows per target class.
df_train.loc[:, 'IN_TREINEIRO'].value_counts()

0    11947
1     1783
Name: IN_TREINEIRO, dtype: int64

## Aplicando SMOTE

In [107]:
# Oversample the minority class of the training data.
# A fixed random_state makes the synthetic samples, and therefore every model
# trained on them, reproducible between runs.
smote = SMOTE(sampling_strategy="minority", random_state=0)
X_smote, y_smote = smote.fit_resample(X_train_comp_X_test, y_train)

In [108]:
# Logistic regression with default settings, trained on the resampled data.
# NOTE: despite its name, `regressor` holds a classifier.
regressor = LogisticRegression()
regressor.fit(X=X_smote, y=y_smote)

LogisticRegression()

In [39]:
# Predicted class for every test row, from the logistic regression model.
y_pred = regressor.predict(X=X_test)

In [40]:
# One-column frame holding the registration number of every test row.
df_result_insc = df_test_clean['NU_INSCRICAO'].to_frame()

In [41]:
# Pair every registration number with its prediction by position.
# The index is reset before the predictions are attached: joining two frames
# side by side aligns them on index labels, so resetting afterwards would be
# too late to prevent a mismatch. The predictions are class labels, so no
# rounding is applied.
resultado = df_result_insc.reset_index(drop=True)
resultado['IN_TREINEIRO'] = y_pred

In [42]:
# Column names of the submission file.
submission_columns = ['NU_INSCRICAO', 'IN_TREINEIRO']
resultado.columns = submission_columns

In [46]:
# Write the submission without the row index. Score obtained: 96.083151
resultado.to_csv(path_or_buf="answer.csv", index=False)

## Decision Tree

In [64]:
# Decision tree limited to two levels, trained on the resampled data.
# A fixed random_state makes the fitted tree reproducible between runs.
decision_tree = DecisionTreeClassifier(max_depth=2, random_state=0)
decision_tree_fitted = decision_tree.fit(X_smote, y_smote)

In [65]:
# Predicted class for every test row, from the decision tree.
y_pred = decision_tree.predict(X=X_test)

In [66]:
# Pair every registration number with its prediction by position.
# The index is reset before the predictions are attached: joining two frames
# side by side aligns them on index labels, so resetting afterwards would be
# too late to prevent a mismatch. The predictions are class labels, so no
# rounding is applied.
resultado = df_result_insc.reset_index(drop=True)
resultado['IN_TREINEIRO'] = y_pred

In [67]:
# Name the submission columns, then write the file without the row index.
submission_columns = ['NU_INSCRICAO', 'IN_TREINEIRO']
resultado.columns = submission_columns
resultado.to_csv(path_or_buf="answer.csv", index=False)  # Score obtained: 96.105033

## Random Forest

In [112]:
# Random forest of 500 trees, trained on the resampled data.
# A fixed random_state makes the fitted forest reproducible between runs.
random_forest = RandomForestClassifier(n_estimators=500, random_state=0)
random_forest.fit(X_smote, y_smote)

RandomForestClassifier(n_estimators=500)

In [113]:
# Predicted class for every test row, from the random forest.
y_pred = random_forest.predict(X=X_test)

In [114]:
# Pair every registration number with its prediction by position.
# The index is reset before the predictions are attached: joining two frames
# side by side aligns them on index labels, so resetting afterwards would be
# too late to prevent a mismatch. The predictions are class labels, so no
# rounding is applied.
resultado = df_result_insc.reset_index(drop=True)
resultado['IN_TREINEIRO'] = y_pred

In [115]:
# Name the submission columns, then write the file without the row index.
submission_columns = ['NU_INSCRICAO', 'IN_TREINEIRO']
resultado.columns = submission_columns
resultado.to_csv(path_or_buf="answer.csv", index=False)  # Score obtained: 96.630197

## Random Forest - parametrização

In [109]:
# Random forest with explicit settings, trained on the resampled data.
random_forest = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=1,
    random_state=0,
    class_weight="balanced",
    n_jobs=6,
)
random_forest.fit(X=X_smote, y=y_smote)


RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=6,
                       random_state=0)

In [110]:
# Predicted class for every test row, from the parametrised random forest.
y_pred = random_forest.predict(X=X_test)

In [111]:
# Pair every registration number with its prediction by position.
# The index is reset before the predictions are attached: joining two frames
# side by side aligns them on index labels, so resetting afterwards would be
# too late to prevent a mismatch. The predictions are class labels, so no
# rounding is applied.
resultado = df_result_insc.reset_index(drop=True)
resultado['IN_TREINEIRO'] = y_pred

# Name the submission columns, then write the file without the row index.
resultado.columns = ['NU_INSCRICAO', 'IN_TREINEIRO']
resultado.to_csv("answer.csv", index=False)  # Score obtained: 96.586433