In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.linear_model    import SGDClassifier, Perceptron
from sklearn.dummy           import DummyClassifier
from sklearn.tree            import DecisionTreeClassifier
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.metrics         import accuracy_score

import math
from scipy import stats

In [2]:
def first_n_digits(num, n):
    return num // 10 ** (int(math.log(num, 10)) - n + 1)

In [3]:
#transformamos los datos con la función utilizada en la notebook "baseline"
def transform_data(train_data_fname, test_data_fname):
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # we  get the TripType for the train set. To do that, we group by VisitNumber and
    # then we get the max (or min or avg)
    y = df_train.groupby(["VisitNumber", "Weekday"], as_index=False).max().TripType

    # we remove the TripType now, and concat training and testing data
    # the concat is done so that we have the same columns for both datasets
    # after one-hot encoding
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])
    
    new_upc = []
    for i in df['Upc']:
        if not np.isnan(i):
            i = first_n_digits(i, 4)
        new_upc.append(i)
    df['Upc'] = new_upc
        
    # one-hot encoding for the DepartmentDescription
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)
    
    

    # now we add the groupby values
    df = df.groupby(["VisitNumber", "Weekday"], as_index=False).sum()
    
    # finally, we do one-hot encoding for the Weekday
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # get train and test back
    print(df_train)
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]
    
    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

In [4]:
def print_results(df, rows=5):
  df = df.sort_values(by='rank_test_score').head(rows)
  param = ['rank_test_score', 'mean_test_score']
  for i in dict_tree.keys():
    param.append('param_'+str(i))
  new_df = df.loc[:, param]
  return new_df

In [5]:
X, y, XX, yy = transform_data('https://raw.githubusercontent.com/martinbas/AprendizajeSupervisado/master/practico/data/train.csv', 
                              'https://raw.githubusercontent.com/martinbas/AprendizajeSupervisado/master/practico/data/test.csv')

        VisitNumber Weekday  ...  FinelineNumber  is_train_set
0                 5  Friday  ...          1000.0             1
1                 9  Friday  ...           115.0             1
2                 9  Friday  ...           103.0             1
3                 9  Friday  ...          5501.0             1
4                10  Friday  ...          2008.0             1
...             ...     ...  ...             ...           ...
453406       191344  Sunday  ...          3405.0             1
453407       191344  Sunday  ...          1712.0             1
453408       191344  Sunday  ...          3405.0             1
453409       191347  Sunday  ...          1512.0             1
453410       191347  Sunday  ...          3600.0             1

[453411 rows x 7 columns]


In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
random_state  = 42
dict_models   =  {'DecisionTreeClassifier':  DecisionTreeClassifier(random_state=random_state),
                  'KNeighborsClassifier':    KNeighborsClassifier(),
                  'SGDClassifier':           SGDClassifier(random_state=random_state),
                  'Perceptron':              Perceptron(),
                  'Dummy_frequent':          DummyClassifier(strategy='most_frequent'),
                  'Dummy_stratified':        DummyClassifier(strategy= 'stratified', random_state=random_state)
                  }

In [12]:
# probamos con algunos modelos con valores por defecto
dict_acc = {}
c = 0
for i in dict_models.keys():
  model =       dict_models[i]
  model.fit(X_train, y_train)
  y_pred =      model.predict(X_valid)
  dict_acc[i] = accuracy_score(y_valid, y_pred)
  print(i, round(dict_acc[i], 2))
  c += 1

DecisionTreeClassifier 0.61
KNeighborsClassifier 0.26
SGDClassifier 0.16
Perceptron 0.07
Dummy_frequent 0.13
Dummy_stratified 0.06


In [13]:
dict_tree = {'criterion':             ['gini', 'entropy'],
             'min_samples_split':     [2, 10, 50, 100, 200, 500],
             'max_features':          [None, 'auto', 'sqrt', 'log2'],
            }

In [14]:
# hacemos una búsqueda aleatoria de parámetros con el mejor modelo observado en el punto anterior (DecisionTreeClassifier)
model =     DecisionTreeClassifier(random_state=42)
clf =       RandomizedSearchCV(model, dict_tree, scoring='accuracy', cv=3)
search =    clf.fit(X_train, y_train)
search_df = pd.DataFrame(search.cv_results_)

In [15]:
print_results(search_df)

Unnamed: 0,rank_test_score,mean_test_score,param_criterion,param_min_samples_split,param_max_features
4,1,0.595801,gini,2,
5,2,0.585166,gini,10,sqrt
7,3,0.583078,entropy,10,
9,4,0.566922,entropy,50,auto
6,5,0.564003,entropy,100,sqrt


In [16]:
# A partir de los resultados anteriores volvemos a hacer una búsqueda de parámetros
# pero esta vez a través de grilla de parámetros para profundizar el ajuste de los mejores parámetros
dict_tree_grid = {'criterion'         : ['gini', 'entropy'],
                  'max_features'      : [None, 'auto'],
                  'min_samples_split' : [25, 50, 75, 100, 150, 200],
                  'ccp_alpha'         : [0, 0.5, 1]
                  }

In [17]:
model =      DecisionTreeClassifier(random_state=42)
clf =        GridSearchCV(model, dict_tree_grid, scoring='accuracy', cv=3)
search =     clf.fit(X_train, y_train)
search_df =  pd.DataFrame(search.cv_results_)
best_tree =  search.best_estimator_
best_param = search.best_params_

In [18]:
print_results(search_df)
# los mejores parámetros para "criterion" y "max_features" son gini y None respectivamente

Unnamed: 0,rank_test_score,mean_test_score,param_criterion,param_min_samples_split,param_max_features
2,1,0.64058,gini,75,
3,2,0.638512,gini,100,
1,3,0.638086,gini,50,
4,4,0.634506,gini,150,
5,5,0.632225,gini,200,


In [None]:
# generamos un random forest con los parámetros anteriores
model =      RandomForestClassifier(min_samples_split=75, max_features= None, 
                                    random_state=42)
clf_forest = model.fit(X_train, y_train)
y_pred =     clf_forest.predict(X_valid)
acc =        accuracy_score(y_valid, y_pred)
acc

In [None]:
model =      RandomForestClassifier(min_samples_split=75, max_features= None, 
                                    random_state=42)
clf_forest = model.fit(X, y)
yy =         clf_forest.predict(XX)
submission = pd.DataFrame(list(zip(XX.VisitNumber, yy)), 
                          columns=["VisitNumber", "TripType"])

In [None]:
from google.colab import files

submission.to_csv('tp_AS_basmadjian_lucero.csv', header=True, index=False)
files.download('tp_AS_basmadjian_lucero.csv')