In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.linear_model    import SGDClassifier, Perceptron
from sklearn.dummy           import DummyClassifier
from sklearn.tree            import DecisionTreeClassifier
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.metrics         import accuracy_score
from sklearn import svm

import math
from scipy import stats

In [2]:
def first_n_digits(num, n):
    return num // 10 ** (int(math.log(num, 10)) - n + 1)

In [3]:
#transformamos los datos con la función utilizada en la notebook "baseline"
def transform_data(train_data_fname, test_data_fname):
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # we  get the TripType for the train set. To do that, we group by VisitNumber and
    # then we get the max (or min or avg)
    y = df_train.groupby(["VisitNumber", "Weekday"], as_index=False).max().TripType

    # we remove the TripType now, and concat training and testing data
    # the concat is done so that we have the same columns for both datasets
    # after one-hot encoding
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])
    
    # one-hot encoding for the DepartmentDescription
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)

    # now we add the groupby values
    df = df.groupby(["VisitNumber", "Weekday"], as_index=False).sum()
    
    # finally, we do one-hot encoding for the Weekday
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # get train and test back
    print(df_train)
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]
    
    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

In [4]:
def print_results(df, rows=5):
  df = df.sort_values(by='rank_test_score').head(rows)
  param = ['rank_test_score', 'mean_test_score']
  for i in dict_tree.keys():
    param.append('param_'+str(i))
  new_df = df.loc[:, param]
  return new_df

In [5]:
X, y, XX, yy = transform_data('https://raw.githubusercontent.com/martinbas/AprendizajeSupervisado/master/practico/data/train.csv', 
                              'https://raw.githubusercontent.com/martinbas/AprendizajeSupervisado/master/practico/data/test.csv')

        VisitNumber Weekday           Upc  ScanCount DepartmentDescription  \
0                 5  Friday  6.811315e+10         -1    FINANCIAL SERVICES   
1                 9  Friday  1.070081e+09          1   IMPULSE MERCHANDISE   
2                 9  Friday  3.107000e+03          1               PRODUCE   
3                 9  Friday  4.011000e+03          1               PRODUCE   
4                10  Friday  6.414410e+09          1           DSD GROCERY   
...             ...     ...           ...        ...                   ...   
453406       191344  Sunday  7.315096e+10          1                BEAUTY   
453407       191344  Sunday  6.505300e+10          1              WIRELESS   
453408       191344  Sunday  7.918131e+09          1                BEAUTY   
453409       191347  Sunday  4.190008e+09          1                 DAIRY   
453410       191347  Sunday  3.800060e+09          1     GROCERY DRY GOODS   

        FinelineNumber  is_train_set  
0               1000.0  

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
random_state  = 42
dict_models   =  {'DecisionTreeClassifier':  DecisionTreeClassifier(random_state=random_state),
                  'KNeighborsClassifier':    KNeighborsClassifier(),
                  'SGDClassifier':           SGDClassifier(random_state=random_state),
                  'Perceptron':              Perceptron(),
                  'Dummy_frequent':          DummyClassifier(strategy='most_frequent'),
                  'Dummy_stratified':        DummyClassifier(strategy= 'stratified', random_state=random_state)
                  }

In [10]:
# probamos con algunos modelos con valores por defecto
dict_acc = {}
c = 0
for i in dict_models.keys():
  model =       dict_models[i]
  model.fit(X_train, y_train)
  y_pred =      model.predict(X_valid)
  dict_acc[i] = accuracy_score(y_valid, y_pred)
  print(i, round(dict_acc[i], 2))
  c += 1

DecisionTreeClassifier 0.6
KNeighborsClassifier 0.3
SGDClassifier 0.02
Perceptron 0.1
Dummy_frequent 0.13
Dummy_stratified 0.06


In [11]:
dict_tree = {'criterion':             ['gini', 'entropy'],
             'min_samples_split':     [2, 10, 50, 100, 200, 500],
             'max_features':          [None, 'auto', 'sqrt', 'log2'],
            }

In [12]:
# hacemos una búsqueda aleatoria de parámetros con el mejor modelo observado en el punto anterior (DecisionTreeClassifier)
model =     DecisionTreeClassifier(random_state=42)
clf =       RandomizedSearchCV(model, dict_tree, scoring='accuracy', cv=3)
search =    clf.fit(X_train, y_train)
search_df = pd.DataFrame(search.cv_results_)

In [13]:
print_results(search_df)

Unnamed: 0,rank_test_score,mean_test_score,param_criterion,param_min_samples_split,param_max_features
7,1,0.630435,gini,200,
1,2,0.616155,entropy,100,
4,3,0.606095,gini,10,
5,4,0.586509,entropy,500,
0,5,0.539066,gini,10,log2


In [14]:
# A partir de los resultados anteriores volvemos a hacer una búsqueda de parámetros
# pero esta vez a través de grilla de parámetros para profundizar el ajuste de los mejores parámetros
dict_tree_grid = {'criterion'         : ['gini', 'entropy'],
                  'max_features'      : [None, 'auto'],
                  'min_samples_split' : [25, 50, 75, 100, 150, 200],
                  'ccp_alpha'         : [0, 0.5, 1]
                  }

In [15]:
model =      DecisionTreeClassifier(random_state=42)
clf =        GridSearchCV(model, dict_tree_grid, scoring='accuracy', cv=3)
search =     clf.fit(X_train, y_train)
search_df =  pd.DataFrame(search.cv_results_)
best_tree =  search.best_estimator_
best_param = search.best_params_

In [16]:
print_results(search_df)
# los mejores parámetros para "criterion" y "max_features" son gini y None respectivamente

Unnamed: 0,rank_test_score,mean_test_score,param_criterion,param_min_samples_split,param_max_features
2,1,0.635763,gini,75,
3,2,0.634676,gini,100,
1,3,0.632396,gini,50,
5,4,0.630435,gini,200,
4,5,0.629881,gini,150,


In [17]:
# generamos un random forest con los parámetros anteriores
model =      RandomForestClassifier(min_samples_split=75, max_features= None, 
                                    random_state=42)
clf_forest = model.fit(X_train, y_train)
y_pred =     clf_forest.predict(X_valid)
acc =        accuracy_score(y_valid, y_pred)
acc

0.6777562285543787

In [18]:
# generamos un random forest con los parámetros anteriores
model =      RandomForestClassifier(min_samples_split=25, max_features= "auto", 
                                    random_state=42)
clf_forest = model.fit(X_train, y_train)
y_pred =     clf_forest.predict(X_valid)
acc =        accuracy_score(y_valid, y_pred)
print(acc)

0.6952110995076831


In [19]:
model =      RandomForestClassifier(min_samples_split=25, bootstrap=False,
                                    random_state=42)
clf_forest = model.fit(X_train, y_train)
y_pred =     clf_forest.predict(X_valid)
acc =        accuracy_score(y_valid, y_pred)
print(acc)

0.7000845392610274


In [20]:
model =      RandomForestClassifier(min_samples_split=25, bootstrap=False,
                                    random_state=42)
clf_forest = model.fit(X, y)
yy =         clf_forest.predict(XX)
submission = pd.DataFrame(list(zip(XX.VisitNumber, yy)), 
                          columns=["VisitNumber", "TripType"])

In [None]:
from google.colab import files

submission.to_csv('tp_AS_basmadjian_lucero.csv', header=True, index=False)
files.download('tp_AS_basmadjian_lucero.csv')