In [1]:
# Import the required packages
import os
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils import resample
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Raise the display limits so large frames are shown without row/column elision.
pd.set_option('display.max_rows', 20000)
pd.set_option('display.max_columns', 20000)
# None disables column-width truncation. The previous value (-1) has been
# deprecated since pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation notices from pandas / scikit-learn.
warnings.filterwarnings("ignore")
import datetime

  pd.set_option('display.max_colwidth', -1)


In [2]:
# Values used to back-fill training rows whose Upc is missing. According to the
# original author's inspection of the data, those rows all belong to Pharmacy RX.
_PHARMACY_RX_FINELINE = 4822
_PHARMACY_RX_UPC = 30169183702


def _with_visit_max_upc(df):
    """Return a copy of ``df`` whose Upc is the largest Upc seen in the row's VisitNumber."""
    # One grouped pass instead of filtering the whole frame once per visit.
    return df.assign(Upc=df.groupby("VisitNumber")["Upc"].transform("max"))


def transform_data(train_data_fname, test_data_fname):
    """Build per-visit feature matrices from the raw train and test CSV files.

    Processing steps:

    * Train rows without a DepartmentDescription are dropped; train rows
      without a Upc get the Pharmacy RX FinelineNumber / Upc constants.
    * Test rows without a DepartmentDescription get Upc 0; remaining test rows
      without a Upc get the Pharmacy RX Upc.
    * In both sets every row's Upc is replaced by the maximum Upc of its visit,
      and FinelineNumber is dropped.
    * Train and test are concatenated so the one-hot encodings of
      DepartmentDescription and Weekday share the same columns, then rows are
      summed per (VisitNumber, Weekday, Upc).

    Parameters
    ----------
    train_data_fname : path of the training CSV. The code reads the columns
        TripType, VisitNumber, Weekday, Upc, DepartmentDescription and
        FinelineNumber; any other numeric column is summed per visit.
    test_data_fname : path of the test CSV (same columns, without TripType).

    Returns
    -------
    X : DataFrame with one row per training visit (VisitNumber and Upc are
        kept as columns).
    y : Series with the maximum TripType of each (VisitNumber, Weekday) group,
        in sorted group order, which matches the row order of ``X``.
    XX : DataFrame with one row per test visit, same columns as ``X``.
    yy : always ``None`` (the test set has no labels).

    Raises
    ------
    ValueError
        If the number of training rows and labels differ, e.g. because a
        VisitNumber spans several weekdays or occurs in both files.
    """
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # ---- TRAIN -------------------------------------------------------------
    # Explicit copy so the .loc writes below act on an independent frame.
    clean_df = df_train.dropna(subset=['DepartmentDescription']).copy()

    # Rows with a missing Upc are completed with the Pharmacy RX values.
    missing_upc = clean_df.Upc.isna()
    clean_df.loc[missing_upc, "FinelineNumber"] = _PHARMACY_RX_FINELINE
    clean_df.loc[missing_upc, "Upc"] = _PHARMACY_RX_UPC

    df_total = _with_visit_max_upc(clean_df).drop(columns=["FinelineNumber"])

    # groupby sorts its keys, so the labels come out in the same order as the
    # aggregated feature rows built further down.
    y = (
        df_total.groupby(["VisitNumber", "Weekday"])["TripType"]
        .max()
        .reset_index(drop=True)
    )
    df_total = df_total.drop(columns=["TripType"])

    # ---- TEST --------------------------------------------------------------
    # Test rows cannot be dropped, so a missing department is marked with Upc 0.
    df_test.loc[df_test.DepartmentDescription.isna(), "Upc"] = 0
    df_test.loc[df_test.Upc.isna(), "Upc"] = _PHARMACY_RX_UPC

    test_total = _with_visit_max_upc(df_test).drop(columns=["FinelineNumber"])

    # ---- SHARED ENCODING ---------------------------------------------------
    # Encode train and test together so both end up with identical columns.
    temp_concat = pd.concat([df_total, test_total])
    temp_concat = pd.get_dummies(temp_concat, columns=["DepartmentDescription"], dummy_na=True)
    temp_concat = temp_concat.groupby(["VisitNumber", "Weekday", "Upc"], as_index=False).sum()
    temp_concat = pd.get_dummies(temp_concat, columns=["Weekday"], dummy_na=True)

    # is_train_set was summed per visit: it is 0 only for visits made of test rows.
    df_train = temp_concat[temp_concat.is_train_set != 0]
    df_test = temp_concat[temp_concat.is_train_set == 0]

    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    if len(X) != len(y):
        raise ValueError(
            "Feature/label mismatch: %d training rows but %d labels" % (len(X), len(y))
        )

    return X, y, XX, yy

In [3]:
# Build the train features/labels (X, y) and the test features (XX); yy is a
# placeholder for the test predictions.
X, y, XX, yy = transform_data(
    train_data_fname="../data/train.csv",
    test_data_fname="../data/test.csv",
)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training visits for validation (fixed seed for reproducibility).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [6]:
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Baseline: a random forest with default hyper-parameters and a fixed seed.
clf = RandomForestClassifier(random_state=2)
clf.fit(X_train, y_train);

# Per-class precision / recall / F1 on the held-out validation split.
print(classification_report(y_true=y_valid, y_pred=clf.predict(X_valid)))

# Disabled hyper-parameter search, kept for reference:
# rf = RandomForestClassifier(max_features='auto', oob_score=True, random_state=1, n_jobs=-1)
# param_grid = {
#     "criterion": ["gini", "entropy"],
#     "min_samples_leaf": [1, 5, 10],
#     "min_samples_split": [2, 4, 10, 12, 16],
#     "n_estimators": [50, 100, 400, 700, 1000],
# }
# tree_clf = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)
# tree_clf.fit(X_train, y_train)
# best_tree_clf = tree_clf.best_estimator_


RandomForestClassifier(random_state=2)

              precision    recall  f1-score   support

           3       0.92      0.95      0.94       785
           4       0.38      0.12      0.19        82
           5       0.71      0.83      0.76       935
           6       0.76      0.74      0.75       260
           7       0.66      0.67      0.66      1173
           8       0.78      0.88      0.83      2525
           9       0.73      0.78      0.75      2038
          12       0.40      0.06      0.11        65
          14       0.00      0.00      0.00         1
          15       0.55      0.38      0.45       205
          18       0.57      0.31      0.40       126
          19       0.56      0.31      0.40        91
          20       0.57      0.50      0.54       115
          21       0.72      0.63      0.67       142
          22       0.46      0.29      0.36       198
          23       0.64      0.56      0.60        25
          24       0.59      0.53      0.56       538
          25       0.61    

In [7]:
# Leaderboard of tuned classifiers; created here when no earlier cell defined it,
# so the notebook still runs from a fresh kernel.
if "results" not in globals():
    results = pd.DataFrame(columns=["clf", "best_acc"])

# tree_clf / best_tree_clf only exist when the grid search has actually been run;
# without this guard the cell raises NameError on Restart & Run All.
if "tree_clf" in globals() and "best_tree_clf" in globals():
    print('Best Random Forest accuracy: ', tree_clf.best_score_)
    print(best_tree_clf)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    new_row = pd.DataFrame([{'clf': best_tree_clf, 'best_acc': tree_clf.best_score_}])
    results = new_row if results.empty else pd.concat([results, new_row], ignore_index=True)

    print('The best classifier so far is: ')
    # Cast to float so idxmax also works when the column has object dtype.
    print(results.loc[results['best_acc'].astype(float).idxmax()]['clf'])
else:
    print('Grid search results not found (tree_clf / best_tree_clf undefined); skipping tuned-model report.')

              precision    recall  f1-score   support

           3       0.78      0.92      0.84       785
           4       0.00      0.00      0.00        82
           5       0.67      0.84      0.75       935
           6       0.55      0.60      0.57       260
           7       0.65      0.66      0.65      1173
           8       0.61      0.88      0.72      2525
           9       0.64      0.71      0.67      2038
          12       0.45      0.08      0.13        65
          14       0.00      0.00      0.00         1
          15       0.44      0.39      0.41       205
          18       0.40      0.33      0.36       126
          19       0.38      0.03      0.06        91
          20       0.41      0.50      0.45       115
          21       0.52      0.62      0.57       142
          22       0.34      0.42      0.38       198
          23       0.17      0.16      0.16        25
          24       0.54      0.66      0.59       538
          25       0.61    

In [8]:
# Sanity check: train and test matrices must have the same number of columns.
(X.shape, XX.shape)

((66917, 80), (28645, 80))

In [7]:
# Predict the trip type of every test visit with the fitted random forest.
# Alternative kept from the original: yy = results.clf.iloc[0].predict(XX)
yy = clf.predict(X=XX)

In [8]:
# Pair each test VisitNumber with its predicted TripType, one row per visit.
submission = pd.DataFrame(
    data=list(zip(XX["VisitNumber"], yy)),
    columns=["VisitNumber", "TripType"],
)

In [9]:
# Write the predictions with a header row and without the DataFrame index.
submission.to_csv(
    path_or_buf="submission_randomForest.csv",
    index=False,
    header=True,
)