In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from imblearn.over_sampling import (RandomOverSampler,ADASYN,BorderlineSMOTE,
                                    KMeansSMOTE,SMOTE,SVMSMOTE)

from imblearn.under_sampling import (RandomUnderSampler,CondensedNearestNeighbour,
                                     EditedNearestNeighbours,
                                    RepeatedEditedNearestNeighbours,
                                    NeighbourhoodCleaningRule,AllKNN,TomekLinks)
from imblearn.pipeline import Pipeline

In [2]:
# Load the raw training data from train.csv (path is relative to the notebook's working directory).
df = pd.read_csv("train.csv")

In [17]:
def sampling_predict(df,func):
    """Fit a resampler + XGBoost pipeline and print hold-out metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame that also contains the ``IN_TREINEIRO`` target column.
    func : callable
        Zero-argument callable (e.g. a sampler class) whose return value is
        used as the ``'Resampling'`` step of the pipeline.

    Returns
    -------
    None
        The classification report and confusion matrix are printed.
    """
    # Build the two pipeline steps in the same order as they are applied.
    steps = [('Resampling', func()), ('XGBClassifier', XGBClassifier())]
    sampler_then_model = Pipeline(steps)

    target = df.IN_TREINEIRO
    features = df.drop("IN_TREINEIRO", axis=1)

    # Stratified 70/30 split with a fixed seed so runs are comparable.
    feats_fit, feats_eval, target_fit, target_eval = train_test_split(
        features,
        target,
        test_size=0.3,
        stratify=target,
        random_state=42,
    )

    sampler_then_model.fit(feats_fit, target_fit)
    target_hat = sampler_then_model.predict(feats_eval)

    report = classification_report(target_eval, target_hat)
    matrix = confusion_matrix(target_eval, target_hat)
    print('Classifcation report:\n', report)
    print('Confusion matrix:\n', matrix)

In [37]:
def treino(df,model=None,submit=False):
    """Train, evaluate, or predict with a classifier, depending on the input.

    Three modes, selected by ``submit`` and by whether ``df`` contains the
    ``IN_TREINEIRO`` target column:

    * ``submit`` truthy: fit ``model`` on all rows of ``df`` and return it.
    * target present, ``submit`` falsy: fit on a stratified 70% split, print the
      classification report and confusion matrix for the remaining 30%, and
      return the fitted model.
    * target absent: return ``model.predict(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Features, optionally including the ``IN_TREINEIRO`` column.
    model : estimator with ``fit``/``predict``, optional
        Classifier to use. When omitted, a fresh ``XGBClassifier()`` is created
        for the two training modes, so ``treino(df)`` and
        ``treino(df, submit=True)`` work without an explicit model.
    submit : bool, default False
        Fit on the full data set instead of holding out a validation split.

    Returns
    -------
    The fitted model (training modes) or the predictions (prediction mode).

    Raises
    ------
    KeyError
        If ``submit`` is truthy but ``df`` has no ``IN_TREINEIRO`` column.
    ValueError
        If ``df`` has no target column and no model was given to predict with.
    """
    alvo = "IN_TREINEIRO"

    if alvo not in df:
        if submit:
            raise KeyError(f"'{alvo}' column is required when submit=True")
        if model is None:
            raise ValueError("a fitted model is required to predict on unlabeled data")
        return model.predict(df)

    if model is None:
        model = XGBClassifier()

    X, y = df.drop(alvo,axis=1),df[alvo]

    if submit:
        # Final fit: use every labeled row, no hold-out evaluation.
        model.fit(X,y)
        return model

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,stratify=y, random_state=42)
    model.fit(X_train,y_train)
    predicted = model.predict(X_test)
    print('Classifcation report:\n', classification_report(y_test, predicted))
    print('Confusion matrix:\n', confusion_matrix(y_test, predicted))
    return model

In [4]:
def drop_modcat(df,ct=None):
    """Drop identifier columns and one-hot encode the categorical variables.

    The input frame is left untouched: the identifier columns are removed from
    a new frame rather than in place, so the function can be called repeatedly
    on the same ``df`` (re-running the cell no longer raises ``KeyError``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the identifier columns ``NU_INSCRICAO`` and
        ``CO_UF_RESIDENCIA`` and every categorical column listed below.
    ct : optional
        Currently unused; kept only so existing calls that pass it still work.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the identifier columns and with each categorical column
        replaced by its dummy indicators (first level dropped).

    Raises
    ------
    KeyError
        If any identifier or categorical column is missing from ``df``.
    """
    anom_columns = ["NU_INSCRICAO","CO_UF_RESIDENCIA"]
    cat_variables = ["SG_UF_RESIDENCIA","TP_SEXO","Q001","Q002","Q006","Q024","Q025","Q026","Q027","Q047"]

    # NOTE: the previous version had a standardisation branch after the return
    # statement; it was unreachable and referenced an undefined name, so it was
    # removed rather than silently activated.
    sem_ids = df.drop(columns=anom_columns)
    return pd.get_dummies(sem_ids, columns=cat_variables, drop_first=True)

In [5]:
# Summarise the raw training frame: row count, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13730 entries, 0 to 13729
Columns: 167 entries, Unnamed: 0 to Q050
dtypes: float64(28), int64(79), object(60)
memory usage: 17.5+ MB


In [6]:
# Columns kept for modelling: identifiers, demographics, presence flags,
# scores, questionnaire answers and the IN_TREINEIRO target.
colunas_modelo = [
    'NU_INSCRICAO', 'CO_UF_RESIDENCIA', 'SG_UF_RESIDENCIA', 'NU_IDADE', 'TP_SEXO', 'TP_COR_RACA',
    'TP_NACIONALIDADE', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA', 'TP_ENSINO', "IN_TREINEIRO",
    'TP_DEPENDENCIA_ADM_ESC', 'IN_BAIXA_VISAO', 'IN_CEGUEIRA', 'IN_SURDEZ', 'IN_DISLEXIA', 'IN_DISCALCULIA',
    'IN_SABATISTA', 'IN_GESTANTE', 'IN_IDOSO', 'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC', "TP_PRESENCA_MT",
    'NU_NOTA_CN', 'NU_NOTA_CH',
    'NU_NOTA_LC', 'TP_LINGUA', 'TP_STATUS_REDACAO', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3',
    'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO', 'Q001',
    'Q002', 'Q006', 'Q024', 'Q025', 'Q026', 'Q027', 'Q047',
]
# .copy() makes the selection an independent frame, so later in-place edits do
# not operate on a slice of the loaded data (which can trigger pandas'
# SettingWithCopyWarning).
df = df[colunas_modelo].copy()

In [7]:
# Training frame with identifier columns removed and categorical columns one-hot encoded.
df_dum = drop_modcat(df)

In [15]:
# Hold-out evaluation of a default XGBoost classifier. The model is passed
# explicitly so the call matches treino's (df, model, submit) signature.
treino(df_dum, XGBClassifier())

Classifcation report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3584
           1       0.98      0.98      0.98       535

    accuracy                           0.99      4119
   macro avg       0.99      0.99      0.99      4119
weighted avg       0.99      0.99      0.99      4119

Confusion matrix:
 [[3575    9]
 [  12  523]]


XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [28]:
# Compare resampling strategies. The sampler classes are referenced directly
# instead of being looked up with eval() on their names, and the loop variable
# is no longer called `model`, which would shadow the classifier variable.
under = [RandomUnderSampler]
over = [RandomOverSampler]
for sampler in under + over:
    print("\n"+sampler.__name__+"\n")
    sampling_predict(df_dum,sampler)


RandomUnderSampler

Classifcation report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      3584
           1       0.92      0.99      0.95       535

    accuracy                           0.99      4119
   macro avg       0.96      0.99      0.97      4119
weighted avg       0.99      0.99      0.99      4119

Confusion matrix:
 [[3538   46]
 [   5  530]]

RandomOverSampler

Classifcation report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3584
           1       0.97      0.98      0.98       535

    accuracy                           0.99      4119
   macro avg       0.98      0.99      0.99      4119
weighted avg       0.99      0.99      0.99      4119

Confusion matrix:
 [[3569   15]
 [  10  525]]


In [33]:
# Final model: fit a default XGBoost classifier on every labeled row. The model
# is passed explicitly so the call matches treino's (df, model, submit) signature.
model = treino(df_dum, XGBClassifier(), submit=True)

In [40]:
df_test = pd.read_csv("test.csv")
df_dum1 = drop_modcat(df_test.copy())
# Train and test are one-hot encoded separately, so a category that appears in
# only one of them produces different dummy columns. Align the test frame to
# the training features (same columns, same order). Any column missing from
# the test frame is filled with 0 -- this assumes such columns are dummy
# indicators; verify if a raw feature could be absent from test.csv.
colunas_treino = df_dum.columns.drop("IN_TREINEIRO")
df_dum1 = df_dum1.reindex(columns=colunas_treino, fill_value=0)
preds = treino(df_dum1,model)

In [42]:
# Build the submission file: one row per test registration id with its
# predicted IN_TREINEIRO label.
z = pd.Series(data=preds, name="IN_TREINEIRO")
inscricoes = df_test["NU_INSCRICAO"]
df_entrega = pd.concat(objs=[inscricoes, z], axis="columns")
df_entrega.to_csv(path_or_buf="answer.csv", index=False)