# Diplodatos Kaggle Competition Grupo 23

We present this peace of code to create the baseline for the competition, and as an example of how to deal with these kind of problems. The main goals are that you:

1. Learn
1. Try different models and see which one fits the best the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [20]:
# Import the required packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Read the *original* dataset...

In [21]:
original_df = pd.read_csv('../data/train.csv')

In [22]:
original_df.columns

Index(['TripType', 'VisitNumber', 'Weekday', 'Upc', 'ScanCount',
       'DepartmentDescription', 'FinelineNumber'],
      dtype='object')

**TripType** is the column that we should predict. That column is not present in the test set

In [4]:
original_df.describe()

Unnamed: 0,TripType,VisitNumber,Upc,ScanCount,FinelineNumber
count,453411.0,453411.0,450559.0,453411.0,450559.0
mean,58.027039,95909.846115,30752430000.0,1.108584,3727.366554
std,155.97337,55399.801147,91639400000.0,0.707029,2779.958546
min,3.0,5.0,834.0,-10.0,0.0
25%,27.0,49390.0,3400004000.0,1.0,1404.0
50%,39.0,96781.0,7056082000.0,1.0,3352.0
75%,40.0,143930.0,30132010000.0,1.0,5501.0
max,999.0,191347.0,978970700000.0,71.0,9998.0


In [23]:
def transform_data(train_data_fname, test_data_fname):
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # we  get the TripType for the train set. To do that, we group by VisitNumber and
    # then we get the max (or min or avg)
    y = df_train.groupby(["VisitNumber", "Weekday"], as_index=False).max().TripType

    # we remove the TripType now, and concat training and testing data
    # the concat is done so that we have the same columns for both datasets
    # after one-hot encoding
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])
    
    # the next three operations are the ones we have just presented in the previous lines
    
    # drop the columns we won't use (it may be good to use them somehow)
    df = df.drop(["Upc", "FinelineNumber"], axis=1)

    # one-hot encoding for the DepartmentDescription
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)

    # now we add the groupby values
    df = df.groupby(["VisitNumber", "Weekday"], as_index=False).sum()
    
    # finally, we do one-hot encoding for the Weekday
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # get train and test back
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]
    
    import re
    df_train = df_train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
    
    
    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

Load the data...

In [24]:
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

In [25]:
print(X.shape)
print(y.shape)
print(XX.shape)
print(yy)

(67029, 79)
(67029,)
(28645, 79)
None


In [26]:
X.columns=pd.io.parsers.ParserBase({'names':X.columns})._maybe_dedup_names(X.columns)
# renombrando con .1 la columna con nombre repetido
#'DepartmentDescription_MENSWEAR',
#'DepartmentDescription_MENSWEAR.1'

In [27]:
# split training dataset into train and "validation" 
# (we won't be using validation set in this example, because of the cross-validation;
# but it could be useful for you depending on your approach)
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

In [10]:
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(53623, 79)
(53623,)
(13406, 79)
(13406,)


In [28]:
# results dataframe is used to store the computed results
results = pd.DataFrame(columns=('clf', 'best_acc'))

In [29]:
# we will use a DesicionTree to classify and GridSearch to determine the parameters
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

tree_param = {'criterion':('gini', 'entropy'), 'min_samples_leaf':(1, 2, 5),
              'min_samples_split':(2, 3, 5, 10, 50, 100)}
tree = DT(random_state=42)
tree_clf = GridSearchCV(tree, tree_param, cv=3, scoring='accuracy') #scoring='balanced_accuracy')
tree_clf.fit(X_train, y_train)
best_tree_clf = tree_clf.best_estimator_

In [30]:
print('Best Decision Tree accuracy: ', tree_clf.best_score_)
print(best_tree_clf)
results = results.append({'clf': best_tree_clf, 'best_acc': tree_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Best Decision Tree accuracy:  0.6308859671155743
DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=100,
                       random_state=42)
The best classifier so far is: 
DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=100,
                       random_state=42)


**And finally**, we predict the unknown label for the testing set

In [39]:
X.shape, XX.shape

((67029, 79), (28645, 79))

In [40]:
yy = results.clf.iloc[0].predict(XX)

The last thing we do is generating a file that should be *submitted* on kaggle

In [43]:
submission = pd.DataFrame(list(zip(XX.VisitNumber, yy)), columns=["VisitNumber", "TripType"])

In [44]:
submission.to_csv("../data/submission_grupo23.csv", header=True, index=False)

### Best LIGHTGBM accuracy:  0.6892005384492798

In [35]:
# results dataframe is used to store the computed results
results2 = pd.DataFrame(columns=('Lightgbm', 'best_acc'))

In [48]:
import lightgbm as lgb
from sklearn.metrics import (roc_curve, auc, accuracy_score)
from sklearn.model_selection import GridSearchCV

#results_lgb = pd.DataFrame(columns=('lgb', 'best_acc'))


params = {
          'boosting_type': 'gbdt',
          'max_depth' : -3,
          'objective': 'binary',
          'nthread': 3, # Updated from nthread
          'num_leaves': 400,#31
          'learning_rate': 0.2,
          'max_bin': 1600,
          'subsample_for_bin': 200000,#200000
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 0.1,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,#0.001
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error'
         }

  
gridParams = {
    'learning_rate': [0.01],
    'n_estimators': [1700],
    'num_leaves': [1400], #31,50,100, 300, 500,
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [42], # Updated from 'seed' probar 314
    'colsample_bytree' : [1.0],
    'subsample' : [1],
    'reg_alpha' : [1], #1,1.2, 1e-1,
    'reg_lambda' : [50], #1,1.4, 5, 10, 20,
    }                                     


mdl = lgb.LGBMClassifier(         
          boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = 3, # Updated from 'nthread'
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight']
          )
                                                        

# Create the grid
grid = GridSearchCV(mdl, gridParams,
                    verbose=1,
                    cv=3,
                    n_jobs=-1,
                    scoring='accuracy')
# Run the grid
grid.fit(X_train, y_train)

best_lgb_clf = grid.best_estimator_
print('Best LIGHTGBM accuracy: ', grid.best_score_)
print(best_lgb_clf)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  5.3min finished


Best LIGHTGBM accuracy:  0.6892005384492798
LGBMClassifier(learning_rate=0.01, max_bin=1600, max_depth=-3,
               min_child_samples=5, min_child_weight=1, min_split_gain=0.5,
               n_estimators=1700, n_jobs=3, num_leaves=1400, objective='binary',
               random_state=42, reg_alpha=1, reg_lambda=50, scale_pos_weight=1,
               subsample=1, subsample_freq=1)


In [52]:
predictions = grid.predict(XX)

In [53]:
submission = pd.DataFrame(list(zip(XX.VisitNumber, predictions)), columns=["VisitNumber", "TripType"])
submission.to_csv("../data/submission_LightGBM.csv", header=True, index=False)