# DIPLODATOS 2020
# Aprendizaje Supervisado - Trabajo practico final

Detalles del objetivo del TP en el sgte link https://github.com/DiploDatos/AprendizajeSupervisado/tree/master/practico

### Grupo: Martin Basmadjian - Adrian Lucero

In [52]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection    import train_test_split
from sklearn.ensemble           import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection    import GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes        import GaussianNB

from sklearn.linear_model       import SGDClassifier, Perceptron
from sklearn.dummy              import DummyClassifier
from sklearn.tree               import DecisionTreeClassifier
from sklearn.neighbors          import KNeighborsClassifier
from sklearn.metrics            import accuracy_score

import math
from scipy import stats

### PARTE 1: Armado del dataset

In [53]:
def first_n_digits(num, n):
    return num // 10 ** (int(math.log(num, 10)) - n + 1)

In [54]:
#transformamos los datos con la función utilizada en la notebook "baseline"
def transform_data(train_data_fname, test_data_fname):
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # we  get the TripType for the train set. To do that, we group by VisitNumber and
    # then we get the max (or min or avg)
    y = df_train.groupby(["VisitNumber", "Weekday"], as_index=False).max().TripType

    # we remove the TripType now, and concat training and testing data
    # the concat is done so that we have the same columns for both datasets
    # after one-hot encoding
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])
    
#    new_upc = []
#    for i in df['Upc']:
#        if not np.isnan(i):
#            i = first_n_digits(i, 4)
#        new_upc.append(i)
#    df['Upc'] = new_upc
        
    # one-hot encoding for the DepartmentDescription
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)
    
    

    # now we add the groupby values
    df = df.groupby(["VisitNumber", "Weekday"], as_index=False).sum()
    
    # finally, we do one-hot encoding for the Weekday
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # get train and test back
    print(df_train)
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]
    
    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

In [55]:
def print_results(df, dict, rows=5):
  df = df.sort_values(by='rank_test_score').head(rows)
  param = ['rank_test_score', 'mean_test_score']
  for i in dict.keys():
    param.append('param_'+str(i))
  new_df = df.loc[:, param]
  return new_df

In [56]:
X, y, XX, yy = transform_data('https://raw.githubusercontent.com/martinbas/AprendizajeSupervisado/master/practico/data/train.csv', 
                              'https://raw.githubusercontent.com/martinbas/AprendizajeSupervisado/master/practico/data/test.csv')

        VisitNumber Weekday           Upc  ScanCount DepartmentDescription  \
0                 5  Friday  6.811315e+10         -1    FINANCIAL SERVICES   
1                 9  Friday  1.070081e+09          1   IMPULSE MERCHANDISE   
2                 9  Friday  3.107000e+03          1               PRODUCE   
3                 9  Friday  4.011000e+03          1               PRODUCE   
4                10  Friday  6.414410e+09          1           DSD GROCERY   
...             ...     ...           ...        ...                   ...   
453406       191344  Sunday  7.315096e+10          1                BEAUTY   
453407       191344  Sunday  6.505300e+10          1              WIRELESS   
453408       191344  Sunday  7.918131e+09          1                BEAUTY   
453409       191347  Sunday  4.190008e+09          1                 DAIRY   
453410       191347  Sunday  3.800060e+09          1     GROCERY DRY GOODS   

        FinelineNumber  is_train_set  
0               1000.0  

In [57]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

### PARTE 2: Sondeo de modelos
### Realizamos predicciones con varios modelos con parámetros por defecto 

In [58]:
rm  = 42
dict_models   =  {'GaussianNB':                 GaussianNB(),
                  'AdaBoostClassifier':         AdaBoostClassifier(random_state=rm),
                  'RandomForestClassifier':     RandomForestClassifier(random_state=rm),
                  'KNeighborsClassifier':       KNeighborsClassifier(),
                  'Dummy_frequent':             DummyClassifier(strategy='most_frequent'),
                  'Dummy_stratified':           DummyClassifier(strategy= 'stratified', random_state=rm),
                  #'GradientBoostingClassifier': GradientBoostingClassifier(random_state=rm)
                  }

In [59]:
# probamos con algunos modelos con valores por defecto
# a partir de los resultados obtenidos decidimos optimizar los parámetros de
# los siguientes algoritmos: AdaBoost, RandomForestClassifier, KNeighborsClassifier,
# GradientBoostingClassifier
dict_acc = {}
c = 0
for i in dict_models.keys():
  model =       dict_models[i]
  model.fit(X_train, y_train)
  y_pred =      model.predict(X_valid)
  dict_acc[i] = accuracy_score(y_valid, y_pred)
  print(i, round(dict_acc[i], 2))
  c += 1

GaussianNB 0.19
AdaBoostClassifier 0.2
RandomForestClassifier 0.7
KNeighborsClassifier 0.3
Dummy_frequent 0.13
Dummy_stratified 0.06


### PARTE 3: Optimizacion de parámetros: para elejir aquellos con mejor performance 

### KNeighborsClassifier: optimización de parámetros

In [60]:
# generamos un diccionario para realizar una búsqueda por grilla para el algoritmo de vecinos más cercanos
dict_kn_grid = {  'n_neighbors':    [5            , 20          , 40    , 60  ],
                  'weights':        [ 'uniform'   , 'distance'                ],
                  'p':              [ 1           , 2                         ]
                }

In [61]:
model     = KNeighborsClassifier()
clf       = GridSearchCV(model, dict_kn_grid, scoring='accuracy', cv=3)
search    = clf.fit(X_train, y_train)
search_df = pd.DataFrame(search.cv_results_)

In [62]:
print_results(search_df, dict_kn_grid, 20)

Unnamed: 0,rank_test_score,mean_test_score,param_n_neighbors,param_weights,param_p
8,1,0.310401,40,uniform,1
4,2,0.309186,20,uniform,1
10,3,0.308866,40,uniform,2
6,4,0.308014,20,uniform,2
12,5,0.307396,60,uniform,1
14,6,0.306202,60,uniform,2
13,7,0.303943,60,distance,1
9,8,0.301897,40,distance,1
15,9,0.301066,60,distance,2
11,10,0.299446,40,distance,2


In [63]:
# generamos un diccionario para realizar una búsqueda en grilla para el algoritmo de vecinos más cercanos
# con los mejores parámetros encontrados en la búsqueda en grilla anterior para estimar el mejor número
# de vecinos más cercanos

dict_kn_grid = {  'n_neighbors':    [ 50           , 80          , 100     , 150     , 200      , 300 ],
                  'weights':        [ 'distance'                                                      ],
                  'p':              [ 1                                                               ]
                }

In [64]:
model     = KNeighborsClassifier()
clf       = GridSearchCV(model, dict_kn_grid, scoring='accuracy', cv=3)
search    = clf.fit(X_train, y_train)
search_df = pd.DataFrame(search.cv_results_)

In [65]:
print_results(search_df, dict_kn_grid)

Unnamed: 0,rank_test_score,mean_test_score,param_n_neighbors,param_weights,param_p
5,1,0.311189,300,distance,1
4,2,0.309484,200,distance,1
3,3,0.30876,150,distance,1
2,4,0.306735,100,distance,1
1,5,0.305499,80,distance,1


In [66]:
# definimos el diccionario de variables elejidas para el algoritmo de vecinos más cercanos
dict_kn_selected = {'n_neighbors':150, 'weights':'distance',  'p':1}

### RandomForestClassifier: optimización de parámetros

In [67]:
# generamos un diccionario para realizar una búsqueda por grilla 
dict_forest_grid = {'n_estimators':        [ 10                                                    ],
                    'criterion':           [ 'gini'      , 'entropy'                               ],
                    'max_features':        [ 'sqrt'      , 'log2'    , None    , 0.5     ,     0.1 ],
                    'bootstrap':           [ True        , False                                   ],
                    'warm_start':          [ False                                           ],
                    'class_weight':        [ None        , 'balanced', 'balanced_subsample'        ],
                    'max_samples':         [ None        , 0.1       , 0.5                         ]
                    }

In [68]:
model     = RandomForestClassifier(random_state=42)
clf       = RandomizedSearchCV(model, dict_forest_grid, scoring='accuracy', cv=2)
search    = clf.fit(X_train, y_train)
search_df = pd.DataFrame(search.cv_results_)

In [69]:
print_results(search_df, dict_forest_grid)

Unnamed: 0,rank_test_score,mean_test_score,param_n_estimators,param_criterion,param_max_features,param_bootstrap,param_warm_start,param_class_weight,param_max_samples
1,1,0.657481,10,gini,,True,False,,
5,2,0.634484,10,entropy,0.1,False,False,,
2,3,0.633355,10,gini,0.1,True,False,balanced,0.5
4,4,0.62598,10,entropy,0.1,False,False,balanced_subsample,0.5
9,4,0.62598,10,entropy,0.1,False,False,balanced_subsample,


In [70]:
# generamos un diccionario para realizar una búsqueda en grilla para el algoritmo de RandomForestClassifier
# con los mejores parámetros encontrados en la búsqueda aleatoria anterior para optimizar un poco más algunos
# de ellos
dict_forest_grid = {'n_estimators':        [ 10                                                         ],
                    'criterion':           [ 'gini'       ,                                             ],
                    'max_features':        [ 'sqrt'       ,         0.5                                 ],
                    'bootstrap':           [ True                                                       ],
                    'warm_start':          [ False                                                      ],
                    'class_weight':        [ 'balanced_subsample'                                       ],
                    'max_samples':         [ None        ,          0.1       , 0.05   , 0.15    , 0.2  ]
                    }

In [71]:
model     = RandomForestClassifier()
clf       = GridSearchCV(model, dict_forest_grid, scoring='accuracy', cv=2)
search    = clf.fit(X_train, y_train)
search_df = pd.DataFrame(search.cv_results_)

In [72]:
print_results(search_df, dict_forest_grid)

Unnamed: 0,rank_test_score,mean_test_score,param_n_estimators,param_criterion,param_max_features,param_bootstrap,param_warm_start,param_class_weight,param_max_samples
5,1,0.654923,10,gini,0.5,True,False,balanced_subsample,
0,2,0.643308,10,gini,sqrt,True,False,balanced_subsample,
8,3,0.634506,10,gini,0.5,True,False,balanced_subsample,0.15
9,4,0.634463,10,gini,0.5,True,False,balanced_subsample,0.2
4,5,0.62191,10,gini,sqrt,True,False,balanced_subsample,0.2


In [73]:
# generamos un diccionario con los parámetros seleccionados imputando 300 estimadores arbitrariamente
dict_forest_selected = {  'n_estimators':        [ 300                    ],
                          'criterion':           [ 'gini'                 ],
                          'max_features':        [ 0.5                    ],
                          'bootstrap':           [ True                   ],
                          'warm_start':          [ False                  ],
                          'class_weight':        [ 'Balanced_subsample'   ],
                          'max_samples':         [ None                   ]
                        }

### GaussianNB: optimización de parámetros

In [74]:
# generamos un diccionario para realizar una búsqueda por grilla tomando sólo 5 estimadores para no generar
# tiempos de ejecución muy elevados
dict_gau_grid = { 'var_smoothing': [ 1e-8, 1e-9, 1e-10, 1e-11, 1e-12 ],
                 }

In [75]:
model     = GaussianNB()
clf       = GridSearchCV(model, dict_gau_grid, scoring='accuracy', cv=3)
search    = clf.fit(X_train, y_train)
search_df = pd.DataFrame(search.cv_results_)

In [76]:
print_results(search_df, dict_gau_grid)

Unnamed: 0,rank_test_score,mean_test_score,param_var_smoothing
4,1,0.190516,1e-12
3,2,0.190175,1e-11
0,3,0.190111,1e-08
1,3,0.190111,1e-09
2,3,0.190111,1e-10


In [77]:
dict_gau_selected = {'var_smoothing': 1e-10}

### AdaBoostClassifier: optimización de parámetros

In [78]:
# generamos un diccionario para realizar una búsqueda por grilla 
dict_ada_grid = { 'base_estimator': [ RandomForestClassifier()        ],
                  'n_estimators':   [ 50                      , 100   ],
                  'learning_rate':  [ 0.1                     , 1     ],
                  }

In [79]:
model     = AdaBoostClassifier(random_state=42)
clf       = GridSearchCV(model, dict_ada_grid, scoring='accuracy', cv=2)
search    = clf.fit(X_train, y_train)
search_df = pd.DataFrame(search.cv_results_)

In [80]:
print_results(search_df, dict_ada_grid, 20)

Unnamed: 0,rank_test_score,mean_test_score,param_base_estimator,param_n_estimators,param_learning_rate
0,1,0.683674,RandomForestClassifier(),50,0.1
1,1,0.683674,RandomForestClassifier(),100,0.1
2,1,0.683674,RandomForestClassifier(),50,1.0
3,1,0.683674,RandomForestClassifier(),100,1.0


In [81]:
dict_ada_selected = { 'base_estimator': [ RandomForestClassifier()  ],
                      'n_estimators':   [ 5                         ],
                      'learning_rate':  [ 0.1                       ],
                      'algorithm':      [ 'SAMME'                   ]
                     }

### Gradient Boosting Classifier

In [82]:
# generamos un diccionario para realizar una búsqueda en grilla probando unicamente distintos valores
# de tasa de aprendizaje cercanos a su valor por defecto (0.1), el cuál genero un buen desempeño en la prueba 
# anterior para evitar tiempos de ejecución muy elevados
dict_grad_grid = {'learning_rate':  [ 0.05   , 0.1 , 0.15]
                  }

In [83]:
model     = GradientBoostingClassifier(random_state=42)
clf       = GridSearchCV(model, dict_grad_grid, scoring='accuracy', cv=2)
search    = clf.fit(X_train, y_train)
search_df = pd.DataFrame(search.cv_results_)

In [84]:
print_results(search_df, dict_grad_grid)

Unnamed: 0,rank_test_score,mean_test_score,param_learning_rate
1,1,0.692178,0.1
0,2,0.677856,0.05
2,3,0.677366,0.15


### PARTE 4: Voting
### Generamos un modelo mediante VotingClassifier con los parámetros optimizados de los modelos anteriores

In [85]:
model   =       KNeighborsClassifier(n_neighbors=150, weights='distance',  p=1)
clf_1   =       model.fit(X_train, y_train)
y_pred1 =       clf_1.predict(X_valid)
acc1     =       accuracy_score(y_valid, y_pred1)
acc1

0.32284051917052065

In [86]:
model   =   RandomForestClassifier(n_estimators=100, max_features=0.5, 
                                   class_weight='balanced_subsample')
clf_2   =   model.fit(X_train, y_train)
y_pred2 =   clf_2.predict(X_valid)
acc2    =   accuracy_score(y_valid, y_pred2)
acc2

0.692774379631011

In [87]:
model   =   GaussianNB(var_smoothing=1e-10)
clf_3   =   model.fit(X_train, y_train)
y_pred3 =   clf_3.predict(X_valid)
acc3    =   accuracy_score(y_valid, y_pred3)
acc3

0.1861355611915063

In [88]:
model   =   AdaBoostClassifier(base_estimator=RandomForestClassifier(),
                               n_estimators=5, learning_rate=0.1, algorithm='SAMME.R', random_state=42)
clf_4   =   model.fit(X_train, y_train)
y_pred4 =   clf_4.predict(X_valid)
acc4     =   accuracy_score(y_valid, y_pred4)
acc4

0.7030682778855238

In [89]:
# Generamos un modelo mediante VotingClassifier con los parámetros optimizados de los modelos anteriores
# y realizamos la predicción mediante la votación "hard" ajustando los pesos del voto según la comparación
# de los porcentajes de accuracy obtenidos en cada modelo
acc_total = acc1 + acc2 + acc3 + acc4
model     =   VotingClassifier(estimators=[('f1', clf_1),
                                           ('f2', clf_2),
                                           ('f3', clf_3),
                                           ('f4', clf_4),
                                           ],
                               voting='hard', weights=[acc1/acc_total,
                                                       acc2/acc_total,
                                                       acc3/acc_total,
                                                       acc4/acc_total])
clf_v     =   model.fit(X_train, y_train)
y_pred_v  =   clf_v.predict(X_valid)
acc =         accuracy_score(y_valid, y_pred_v)
acc

0.7010791188025263

### PARTE 5: Generacion del archivo para ser subido a Kaggle

In [92]:
yy =         clf_v.predict(XX)
submission = pd.DataFrame(list(zip(XX.VisitNumber, yy)), 
                          columns=["VisitNumber", "TripType"])
submission.to_csv('tp_AS_basmadjian_lucero.csv', header=True, index=False)

In [93]:
#from google.colab import files
#
#files.download('tp_AS_basmadjian_lucero.csv')