<a href="https://colab.research.google.com/github/MathMachado/eDreams/blob/master/eDreams_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install & Load Main Python libraries



https://towardsdatascience.com/a-deep-dive-into-imbalanced-data-over-sampling-f1167ed74b5

https://towardsdatascience.com/from-zero-to-hero-in-xgboost-tuning-e48b59bfaf58

https://www.datacamp.com/community/tutorials/xgboost-in-python

https://towardsdatascience.com/how-to-calibrate-undersampled-model-scores-8f3319c1ea5b

https://towardsdatascience.com/probability-calibration-for-imbalanced-dataset-64af3730eaab

In [0]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load dataframes: training & test sample

In [0]:
# NOTE(review): both URLs carry an access token in the query string. Tokens
# should not be committed with a notebook — TODO confirm whether these are
# still valid and move the data location to configuration.
url_train= "https://raw.githubusercontent.com/MathMachado/eDreams/master/Dataframes/train.csv?token=AGDJQ6YO3T3X2CYDGGUPRO252KACE"
url_test= "https://raw.githubusercontent.com/MathMachado/eDreams/master/Dataframes/test.csv?token=AGDJQ623H4TWE2QKQWENGA252KAD6"

# Read both samples with identical options so they can be stacked and prepared together
df_train= pd.read_csv(url_train, sep= ";", index_col= 'ID', parse_dates = ['DEPARTURE', 'ARRIVAL'])
df_test= pd.read_csv(url_test, sep= ";", index_col= 'ID', parse_dates = ['DEPARTURE', 'ARRIVAL'])

# Resetting the test sample indices: new IDs start at 50000 and the end of the
# range follows the actual number of test rows instead of a hard-coded 80000.
# (presumably the training IDs are all below 50000 — verify against the data)
df_test['ID']= range(50000, 50000 + len(df_test))
df_test= df_test.set_index('ID')

# Merge train and test. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported way to stack the two frames.
df = pd.concat([df_train, df_test], sort= True)

# Records training and test dataframe indexes to separate these dataframes later
train_index = df_train.index
test_index = df_test.index

In [0]:
df.shape

In [0]:
df.head()

In [0]:
df.tail()

In [0]:
df_test.head()

In [0]:
df_test.tail()

In [0]:
df.info()

# Data Preparation

In [0]:
df_T= df.copy()

# WEBSITE packs two codes into one string: the first 2 characters are the company...
# NOTE(review): astype(str) turns a missing WEBSITE into the text 'nan' — confirm that is intended.
df_T['COMPANY']= df_T['WEBSITE'].str[:2].astype(str)

# ...and everything after them is the country. An open-ended slice is used:
# the previous upper bound, len(df['WEBSITE']), is the number of rows in the
# frame rather than a string length and only worked by accident.
df_T['COUNTRY']= df_T['WEBSITE'].str[2:].astype(str)

df_T.head()

In [0]:
df_T['COMPANY'].value_counts() 

There's no 'TL' company, so I'll replace 'TL' with 'MV'.

In [0]:
df_T['COMPANY']= df_T['COMPANY'].replace('TL', 'MV')
df_T['COMPANY'].value_counts() 

In [0]:
df_T['COUNTRY'].value_counts() 

In [0]:
# Fix the three-letter country abbreviations (Poland, France, DEC, DKC) by
# mapping each one to its two-letter code in a single replace call.
df_T['COUNTRY'] = df_T['COUNTRY'].replace({
    'PLC': 'PL',
    'FRC': 'FR',
    'DEC': 'DE',
    'DKC': 'DK',
})

df_T['COUNTRY'].value_counts()

In [0]:
#df_T['COUNTRY']= df_T['COUNTRY'].replace(['PLC', 'DEC', 'DKC', 'FRC'], 'MV')
#df_T['COUNTRY'].value_counts() 

In [0]:
df_T['COUNTRY']= df_T['COUNTRY'].replace(['UK'], 'GB')
df_T['COUNTRY'].value_counts() 

## Treating date variables
> Since there is no information regarding the year of the transaction, I will assume that the transactions are from 2018 or 2019. I will assign the year conveniently from the analysis of the variables DEPARTURE and ARRIVAL.

In [0]:
df2 = df_T.copy()

# NOTE(review): DEPARTURE and ARRIVAL are listed in parse_dates when the CSVs
# are read; the string concatenation below assumes they are still text — confirm.
# Each entry is (new column, source column, year suffix to append).
year_variants = [
    ('DEPARTURE_WITH_YEAR', 'DEPARTURE', '/2018'),
    ('ARRIVAL_WITH_YEAR', 'ARRIVAL', '/2018'),
    ('ARRIVAL_WITH_YEAR_FIXED', 'ARRIVAL', '/2019'),
]
for new_col, src_col, year_suffix in year_variants:
    # Append the assumed year, then parse the result as a datetime
    df2[new_col] = pd.to_datetime(df2[src_col] + year_suffix)

df2.head()

In [0]:
df2['MONTH_DEPARTURE']= df2['DEPARTURE_WITH_YEAR'].dt.month
df2= df2.drop(['DEPARTURE_WITH_YEAR', 'ARRIVAL_WITH_YEAR', 'ARRIVAL_WITH_YEAR_FIXED', 'WEBSITE', 'DEPARTURE','ARRIVAL','TIMESTAMP'], axis= 1)
df2.head()

In [0]:
# Converting column DISTANCE to numeric by keeping only the text before the first ","
# NOTE(review): if the comma is a decimal separator this drops the fractional part — confirm that is acceptable.
df3= df2.copy()
# n=1 caps the split at two pieces, so the two-column assignment below cannot
# fail when a value happens to contain more than one comma.
df3[['DISTANCE_2','DISTANCE_REST']] = df3['DISTANCE'].str.split(",", n=1, expand=True)
df3['DISTANCE_2']= pd.to_numeric(df3['DISTANCE_2'])
df3[['HAUL_TYPE','DISTANCE','DISTANCE_2','DISTANCE_REST']].head(10)

In [0]:
# Discard the text columns and keep the numeric distance under the original column name
df3 = (
    df3
    .drop(columns=['DISTANCE_REST', 'DISTANCE'])
    .rename(columns={'DISTANCE_2': 'DISTANCE'})
)
df3.head()

In [0]:
# Checking Missing Values
df3.isna().sum()

Let's treat Missing Values in DISTANCE and DEVICE below:

In [0]:
# Replacing NaN's from DISTANCE with the column median.
# NOTE(review): df3 descends from the stacked train+test frame, so the median
# is computed over both samples — confirm this is intended.
df3['DISTANCE'] = df3['DISTANCE'].fillna(df3['DISTANCE'].median())

# Replacing NaN's of DEVICE with 'NO_DEVICE'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas warns about and which
# does not update the parent frame under Copy-on-Write.
df3['DEVICE'] = df3['DEVICE'].fillna('NO_DEVICE')

df3.isna().sum()

# Binning numeric features

In [0]:
df3.head()

In [0]:
df4 = df3.copy()

# Cut DISTANCE into 10 bins labelled 1..10, then drop the raw column
bucket_labels = list(range(1, 11))
df4['DISTANCE_BUCKET'] = pd.cut(df4['DISTANCE'], bins=10, labels=bucket_labels)
df4 = df4.drop(columns=['DISTANCE'])
df4['DISTANCE_BUCKET'].value_counts()

In [0]:
df4.head()

In [0]:
df5 = df4.copy()

# Encode the True/False columns (the EXTRA_BAGGAGE target included) as 1/0;
# values that are neither True nor False come out of map() as NaN.
d_Var_Target = {True: 1, False: 0}
for flag_col in ('EXTRA_BAGGAGE', 'SMS', 'TRAIN'):
    df5[flag_col] = df5[flag_col].map(d_Var_Target)
df5.head()

In [0]:
#from google.colab import drive
#drive.mount('/content/drive')

In [0]:
#!cp ./sample_data/FeatureTools.csv /content/drive/My\ Drive/

In [0]:
#feature_matrix= df_train= pd.read_csv('/content/drive/My Drive/FeatureTools.csv', sep= ",", index_col= 'ID')
#feature_matrix.head()

In [0]:
#feature_matrix.shape

In [0]:
#feature_matrix.head()

In [0]:
#df6= feature_matrix.copy()
df6 = df5.copy()

from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column, each with its own freshly fitted encoder
columns_to_encode = [
    'DEVICE',
    'HAUL_TYPE',
    'PRODUCT',
    'TRIP_TYPE',
    'COMPANY',
    'COUNTRY',
    'DISTANCE_BUCKET',
]
for cat_col in columns_to_encode:
    df6[cat_col] = LabelEncoder().fit_transform(df6[cat_col])

#df6.reset_index('ID', inplace=True)
df6.head()

## Treating categorical variables

In [0]:
l_Vars_Obj= list(df6.select_dtypes(include=['category', 'object']).columns)
l_Vars_Obj

In [0]:
#for col in l_Vars_Obj:
#    df11[col]= LabelEncoder().fit_transform(df11[col])

# Modeling

## Train/Test Split

### Balancing the training sample

In [0]:
from collections import Counter

In [0]:
def calibration(data, train_pop, target_pop, sampled_train_pop, sampled_target_pop):
    """Rescale scores produced on a re-sampled set to the original class ratio.

    The computation only uses two ratios: ``target_pop / train_pop`` and
    ``sampled_target_pop / sampled_train_pop`` (presumably the positive-class
    rate before and after re-sampling — verify against the caller).

    Parameters
    ----------
    data : number or array-like supporting arithmetic
        Score(s) to rescale; the formula is applied element-wise.
    train_pop, target_pop : number
        Total count and positive count for the original population.
    sampled_train_pop, sampled_target_pop : number
        Total count and positive count for the re-sampled population.

    Returns
    -------
    Same type as ``data``: the rescaled score(s). When both ratios are equal
    the input is returned unchanged (up to floating-point rounding).
    """
    original_rate = target_pop / train_pop
    sampled_rate = sampled_target_pop / sampled_train_pop

    # Weight of the positive and the negative outcome after undoing the re-sampling
    positive_term = data * original_rate / sampled_rate
    negative_term = (1 - data) * (1 - original_rate) / (1 - sampled_rate)

    # The previous version stored the result in one name and returned another
    # (undefined) one, so every call raised NameError.
    return positive_term / (negative_term + positive_term)

In [0]:
print(sorted(Counter(df6['EXTRA_BAGGAGE']).items()))

In [0]:
# NOTE(review): calibration() rescales its first argument using class ratios,
# but df6 is the encoded feature frame rather than model scores, and df7 is
# not used again in the visible notebook — confirm whether this cell is needed.
df7= calibration(df6, 50000, 500, 10000, 500)

In [0]:
#df11.to_csv('df11.csv')

In [0]:
pd.set_option('display.max_columns', None)

In [0]:
# Keep only the rows that came from the training file, then separate the
# EXTRA_BAGGAGE target from the feature columns.
train_rows = df6.loc[train_index]
y = train_rows['EXTRA_BAGGAGE']
X = train_rows.drop(columns='EXTRA_BAGGAGE')

In [0]:
X.head()

In [0]:
X.shape

In [0]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import RandomOverSampler

# Hold out 20% of the labelled rows for evaluation.
# random_state makes the split reproducible across kernel restarts (same value
# as i_Seed, which is only defined further down the notebook); stratify keeps
# the EXTRA_BAGGAGE class ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 20111974, stratify= y)
X_train.shape

In [0]:
# applying SMOTE to our data and checking the class counts
# random_state is pinned (same value the RandomOverSampler cell uses) so the
# synthetic samples are identical on every run.
X_Resampled1, y_Resampled1 = SMOTE(random_state=0).fit_resample(X_train, y_train)
print(sorted(Counter(y_Resampled1).items()))

In [0]:
# applying ADASYN and checking the class counts
# random_state is pinned (same value the RandomOverSampler cell uses) so the
# synthetic samples are identical on every run.
X_Resampled2, y_Resampled2 = ADASYN(random_state=0).fit_resample(X_train, y_train)
print(sorted(Counter(y_Resampled2).items()))

In [0]:
# applying BorderlineSMOTE and checking the class counts
# random_state is pinned (same value the RandomOverSampler cell uses) so the
# synthetic samples are identical on every run.
X_Resampled3, y_Resampled3 = BorderlineSMOTE(random_state=0).fit_resample(X_train, y_train)
print(sorted(Counter(y_Resampled3).items()))

In [0]:
# applying RandomOverSampler (seeded with random_state=0) and checking the class counts
X_Resampled4, y_Resampled4 = RandomOverSampler(random_state=0).fit_resample(X_train, y_train)
print(sorted(Counter(y_Resampled4).items()))

In [0]:
# NOTE(review): model_results is not defined anywhere in the visible notebook,
# and X_train.shape is a tuple while calibration() divides by its second
# argument — this cell looks unfinished; confirm the intended inputs
# (e.g. predicted probabilities and a row count such as X_train.shape[0]).
calibration(model_results, X_train.shape, 500, 10000, 500)

In [0]:
from sklearn.model_selection import train_test_split

# Train on each re-sampled set but evaluate on the untouched hold-out from the
# first split. Splitting the re-sampled data a second time puts synthetic or
# duplicated minority rows into the evaluation set and inflates the scores.
# Wrapping in a DataFrame keeps the feature names aligned with X_test whether
# fit_resample returned an array or a frame.
X_train1, y_train1 = pd.DataFrame(X_Resampled1, columns= X_train.columns), y_Resampled1
X_train2, y_train2 = pd.DataFrame(X_Resampled2, columns= X_train.columns), y_Resampled2
X_train3, y_train3 = pd.DataFrame(X_Resampled3, columns= X_train.columns), y_Resampled3
X_train4, y_train4 = pd.DataFrame(X_Resampled4, columns= X_train.columns), y_Resampled4

# Every variant is scored against the same real (not re-sampled) rows
X_test1, y_test1 = X_test, y_test
X_test2, y_test2 = X_test, y_test
X_test3, y_test3 = X_test, y_test
X_test4, y_test4 = X_test, y_test

Next, we will apply the following estimators / classifiers to the training sample:

# Evaluation
> As I'll submit a binary output, I need to use the F1-Score, as suggested in the challenge. First, let's understand what the F1-Score metric is:

* **Precision**: When the model predicts positive, how often is it correct? A low precision can also indicate a large number of False Positives.

    $Precision= \frac{TruePositives}{TruePositives + FalsePositives}$

* **Recall**: Recall is the number of True Positives divided by the sum of the True Positives and the False Negatives. Put another way, it is the number of correct positive predictions divided by the number of positive class values in the test data. It is also called Sensitivity or the True Positive Rate. A low recall indicates many False Negatives.

    $Recall= \frac{TruePositives}{TruePositives + FalseNegatives}$

* **F1 Score**: F1 score conveys the balance between the precision and the recall.

    $F1= 2*\frac{Precision*Recall}{Precision+Recall}$

Source: [Classification Accuracy is Not Enough: More Performance Measures You Can Use](https://machinelearningmastery.com/classification-accuracy-is-not-enough-more-performance-measures-you-can-use/)

## Interpretation
> A good F1 score means that you have low false positives and low false negatives, so you’re correctly identifying real threats and you are not disturbed by false alarms. 
>> An F1 score is considered perfect when it’s 1, while the model is a total failure when it’s 0.

In [0]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import confusion_matrix

i_Seed= 20111974
i_CV= 10

In [0]:
def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=False,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize= (8,8),
                          cmap='Blues'):
    '''
    Plot a confusion matrix as an annotated Seaborn heatmap.

    Arguments
    ---------
    cf:            confusion matrix to be passed in (numpy array, e.g. from sklearn's confusion_matrix)
    group_names:   List of strings that represent the labels row by row to be shown in each square.
                   Ignored unless its length equals the number of cells in cf.
    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'.
                   Only used when xyticks is True.
    count:         If True, show the raw number in each square. Default is True.
    percent:       If True, show each cell as a share of the grand total. Default is True.
    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.
    xyticks:       If True, show x and y ticks. Default is False.
    xyplotlabels:  If True, show 'True label' and 'Predicted label' on the figure. Default is True.
    sum_stats:     If True, display summary statistics below the figure. Default is True.
    figsize:       Tuple representing the figure size. Default is (8,8); pass None to use the
                   matplotlib rcParams value.
    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html

    Returns
    -------
    None. The heatmap is drawn on a new matplotlib figure.
    '''

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    n_cells = cf.size
    blanks = [''] * n_cells

    if group_names and len(group_names) == n_cells:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(value) for value in cf.flatten()/np.sum(cf)]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels,group_counts,group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0],cf.shape[1])


    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is sum of diagonal divided by total observations
        accuracy  = np.trace(cf) / float(np.sum(cf))

        # If it is a binary confusion matrix, show some more stats
        if len(cf)==2:
            true_pos      = cf[1,1]
            predicted_pos = np.sum(cf[:,1])
            actual_pos    = np.sum(cf[1,:])

            # Report 0 instead of dividing by zero when the model predicts no
            # positives or the data contains none.
            precision = true_pos / predicted_pos if predicted_pos else 0.0
            recall    = true_pos / actual_pos if actual_pos else 0.0
            pr_sum    = precision + recall
            # Local name kept distinct from sklearn's f1_score imported by the notebook
            f1        = 2*precision*recall / pr_sum if pr_sum else 0.0
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(accuracy,precision,recall,f1)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""


    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories if xyticks is False
        categories=False


    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf,annot=box_labels,fmt="",cmap=cmap,cbar=cbar,xticklabels=categories,yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

In [0]:
# Helper that runs GridSearchCV for a model and evaluates the tuned estimator
def GridSearchOptimizer(modelo, Model_Opt, d_Parametros, X_train, y_train, X_test, y_test, cv= i_CV, scoring= 'accuracy'):
    """Tune ``modelo`` with GridSearchCV, then report on the best estimator.

    Steps: grid search on (X_train, y_train); print the best parameters;
    cross-validate the best estimator on the training data; print feature
    importances when the estimator exposes them; plot the confusion matrix
    of its predictions on (X_test, y_test).

    Parameters
    ----------
    modelo : estimator
        Estimator instance to tune. Configure it (e.g. its random_state)
        before passing it in; the tuned model is this estimator with the best
        parameters applied.
    Model_Opt : str
        Label of the model being tuned ('Model_DT2', 'Model_RF2', 'Model_AB2',
        'Model_GB2' or 'Model_XGB2'). It only selects the banner that is
        printed; any other value is accepted and prints no banner.
    d_Parametros : dict
        Parameter grid handed to GridSearchCV.
    X_train, y_train : training features and labels.
    X_test, y_test : evaluation features and labels.
    cv : int or CV splitter
        Cross-validation setting used for both the grid search and the
        follow-up cross_val_score.
    scoring : str
        Metric used for both the grid search and the follow-up
        cross_val_score. Defaults to 'accuracy'.

    Returns
    -------
    (best_estimator, best_params) : the estimator refit by GridSearchCV on the
    full training data with the best parameters, and that parameter dict.
    """
    Model_GridSearchCV = GridSearchCV(modelo, d_Parametros, cv= cv, n_jobs= -1, verbose= 10, scoring= scoring)
    Model_GridSearchCV.fit(X_train, y_train)

    # Parameters that optimise the classification:
    print(f'\nParametros otimizados: {Model_GridSearchCV.best_params_}')

    # Banner announcing which model family was tuned
    d_Banners = {'Model_DT2': 'DecisionTreeClassifier',
                 'Model_RF2': 'RandomForestClassifier',
                 'Model_AB2': 'AdaBoostClassifier',
                 'Model_GB2': 'GradientBoostingClassifier',
                 'Model_XGB2': 'XGBoostingClassifier'}
    if Model_Opt in d_Banners:
        print(f'\n{d_Banners[Model_Opt]} *********************************************************************************************************')

    # GridSearchCV (refit=True by default) has already refit the estimator on
    # the whole training set with the best parameters, so it is used directly
    # instead of rebuilding a classifier by hand from best_params_.
    Model_Opt = Model_GridSearchCV.best_estimator_

    # Cross-validation of the tuned model
    print(f'\n********* CROSS-VALIDATION ***********')
    a_Scores_CV = cross_val_score(Model_Opt, X_train, y_train, cv= cv, scoring= scoring)
    print(f'Média das Acurácias calculadas pelo CV....: {100*round(a_Scores_CV.mean(),4)}')
    print(f'std médio das Acurácias calculadas pelo CV: {100*round(a_Scores_CV.std(),4)}')

    # Predictions with the tuned parameters
    y_pred = Model_Opt.predict(X_test)

    # Feature importances (only for estimators that expose them)
    if hasattr(Model_Opt, 'feature_importances_'):
        print(f'\n********* IMPORTÂNCIA DAS COLUNAS ***********')
        a_Importances = Model_Opt.feature_importances_
        # Use the frame's column names when available, positions otherwise
        l_Feature_Names = list(getattr(X_train, 'columns', range(len(a_Importances))))
        df_Importance= pd.DataFrame(list(zip(l_Feature_Names, a_Importances)), columns= ['coluna', 'importancia'])
        df_Importance= df_Importance.sort_values(by= ['importancia'], ascending=False)
        print(df_Importance)

    # Confusion matrix
    print(f'\n********* CONFUSION MATRIX - PARAMETER TUNNING ***********')
    cf_matrix = confusion_matrix(y_test, y_pred)
    cf_labels = ['True Neg','False Pos','False Neg','True Pos']
    cf_categories = ['Zero', 'One']
    make_confusion_matrix(cf_matrix, group_names= cf_labels, categories= cf_categories)

    return Model_Opt, Model_GridSearchCV.best_params_

# XGBoosting

In [0]:
from xgboost import XGBClassifier
import xgboost as xgb

# Instancia...
Model_XGB= XGBClassifier(learning_rate=0.01,  
                      subsample = 0.8,
                      objective='binary:logistic', 
                      max_depth= 3)

In [0]:
Model_XGB.fit(X_train1, y_train1)
y_pred1 = Model_XGB.predict(X_test1)
accuracy_score(y_test1, y_pred1)

In [0]:
Model_XGB.fit(X_train2, y_train2)
y_pred2 = Model_XGB.predict(X_test2)
accuracy_score(y_test2, y_pred2)

In [0]:
Model_XGB.fit(X_train3, y_train3)
y_pred3 = Model_XGB.predict(X_test3)
accuracy_score(y_test3, y_pred3)

In [0]:
Model_XGB.fit(X_train4, y_train4)
y_pred4 = Model_XGB.predict(X_test4)
accuracy_score(y_test4, y_pred4)

In [0]:
# Calculate feature importances
importances = Model_XGB.feature_importances_
importances[0:45]

In [0]:
# Most important columns
# NOTE(review): the columns are picked by hard-coded position; the largest
# index (28) requires X_train to have at least 29 columns — confirm against
# the current feature set.
# NOTE(review): X_test is rebound to the reduced frame here, so running this
# cell a second time applies the positional selection to the already-reduced
# frame.
X2= X_train.iloc[:, [0,3,6,7,8,9,14,20,22,27,28]]
X_test= X_test.iloc[:, [0,3,6,7,8,9,14,20,22,27,28]]
l_Cols= X2.columns
X2.shape

In [0]:
y_train.shape

In [0]:
X_Resampled22, y_Resampled22 = BorderlineSMOTE().fit_resample(X2, y_train)

In [0]:
# Restore the feature names on the re-sampled data. The Index is passed as-is:
# wrapping it in a list ([X2.columns]) makes pandas build MultiIndex columns,
# which no longer match the plain column names of X_test.
X_Resampled22= pd.DataFrame(X_Resampled22, columns= X2.columns)
y_Resampled22= pd.DataFrame(y_Resampled22)

In [0]:
Model_XGB.fit(X_Resampled22, y_Resampled22)
y_pred = Model_XGB.predict(X_test)
accuracy_score(y_test, y_pred)

In [0]:
# Cross-validation with i_CV folds on the re-sampled, column-reduced training data.
# X_train22 was never defined and y_train2 belongs to the ADASYN split, so the
# features and labels produced together by BorderlineSMOTE are used instead;
# np.ravel flattens the one-column label frame to the 1-D shape scikit-learn expects.
# NOTE(review): the folds are drawn from already re-sampled rows, so synthetic
# samples can land in validation folds — consider re-sampling inside each fold.
a_Scores_CV = cross_val_score(Model_XGB, X_Resampled22, np.ravel(y_Resampled22), cv= i_CV)
print(f'Média das Acurácias calculadas pelo CV....: {100*round(a_Scores_CV.mean(),4)}')
print(f'std médio das Acurácias calculadas pelo CV: {100*round(a_Scores_CV.std(),4)}')

In [0]:
# Dicionário de parâmetros para XGBoost:
d_Parametros_XGB = {'min_child_weight': [1,3,5,7],
                    'n_estimators': [100,250,500,1000],
                    'reg_lambda': [1, 1.5, 2, 3, 4.5],
                    'reg_alpha': [0, 0.5, 1],
                    'gamma': [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
                    'subsample': [0.2, 0.4, 0.5, 0.6, 0.7],
                    'max_depth': [2,4,7,10],
                    'learning_rate': [0.1, 0.01, 0.001]}

In [0]:
# Invoke the grid-search helper
# NOTE(review): X_test was narrowed to a positional column subset in an earlier
# cell while X_train still holds every column, so the two frames passed here
# have different feature sets — confirm which one is intended.
# NOTE(review): d_Parametros_XGB spans 4*4*5*3*7*5*4*3 = 100,800 parameter
# combinations, each fitted once per CV fold.
Model_XGB, best_params= GridSearchOptimizer(Model_XGB, 'Model_XGB2', d_Parametros_XGB, X_train, y_train, X_test, y_test, cv= i_CV)

In [0]:
# As the procedure above took 372 minutes to run, Model_XGB2 is built below from
# hard-coded parameters instead of re-running the search.
# NOTE(review): these values do not line up with d_Parametros_XGB —
# 'colsample_bytree' is not a key of that grid and learning_rate 0.51 is not
# one of its candidate values; confirm where they came from.
# NOTE(review): this assignment overwrites the best_params returned by
# GridSearchOptimizer, and Model_XGB2 is only instantiated here — no fit call
# for it appears in the visible notebook.
best_params= {'colsample_bytree': 0.8, 'gamma': 0.5, 'learning_rate': 0.51, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.6}

Model_XGB2= XGBClassifier(min_child_weight= best_params['min_child_weight'],
                          gamma= best_params['gamma'],
                          subsample= best_params['subsample'],
                          colsample_bytree= best_params['colsample_bytree'],
                          max_depth= best_params['max_depth'],
                          learning_rate= best_params['learning_rate'], 
                          random_state= i_Seed)

# Submitting my model

In [0]:
# Select the rows whose EXTRA_BAGGAGE is missing (presumably the unlabelled
# test rows — verify) as the set to predict for the submission.
# NOTE(review): df10 and l_Vars are not defined anywhere in the visible
# notebook; the last prepared frame above is df6 — confirm the intended names.
X_Val= df10[l_Vars][df10['EXTRA_BAGGAGE'].isna()]
y_Val= df10[['EXTRA_BAGGAGE']][df10['EXTRA_BAGGAGE'].isna()]

print(X_Val.shape, y_Val.shape)

In [0]:
# Store the predictions as a new EXTRA_BAGGAGE column on X_Val, then extract
# that column as a one-column frame for the submission.
# NOTE(review): clf is not defined in the visible notebook; the estimators
# created above are named Model_XGB and Model_XGB2 — confirm which one should
# produce the submission.
X_Val['EXTRA_BAGGAGE'] = clf.predict(X_Val)
y_pred_submission= X_Val[['EXTRA_BAGGAGE']]
y_pred_submission.head()

In [0]:
# Convert the 0/1 predictions to booleans for the submission file.
# assign() builds a new frame: writing a column into y_pred_submission directly
# would modify a selection taken from X_Val and trigger SettingWithCopyWarning.
y_pred_submission= y_pred_submission.assign(
    EXTRA_BAGGAGE= y_pred_submission['EXTRA_BAGGAGE'].map({0.0: False, 1.0: True})
)
y_pred_submission.head()

In [0]:
y_pred_submission.to_csv(r'eDreams_Submission.csv')