# Diplodatos Kaggle Competition

We present this piece of code to create the baseline for the competition, and as an example of how to deal with this kind of problem. The main goals are that you:

1. Learn
1. Try different models and see which one best fits the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [None]:
# Import the required packages
import warnings

#import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
def transform_data(train_data_fname, test_data_fname, pharmacy_fineline=4822.0):
    """Build per-visit feature matrices from the raw train and test CSV files.

    Both files are read with ``pd.read_csv``, stacked, cleaned and aggregated
    to one row per ``(VisitNumber, Weekday)`` pair, then split back into their
    train and test parts.

    Parameters
    ----------
    train_data_fname : str or file-like
        Location of the training CSV. Must contain a ``TripType`` column next
        to the feature columns.
    test_data_fname : str or file-like
        Location of the test CSV with the same feature columns.
    pharmacy_fineline : float, default 4822.0
        Value written into ``FinelineNumber`` where it is missing and the
        department is ``'PHARMACY RX'``.
        NOTE(review): the default is carried over unchanged from the earlier
        hard-coded value; its origin is not documented here - confirm.

    Returns
    -------
    X : pandas.DataFrame
        One row per training visit, sorted by ``(VisitNumber, Weekday)``,
        indexed 0..n-1.
    y : pandas.Series
        Largest ``TripType`` of each training visit, in the same order and
        with the same index as ``X``.
    XX : pandas.DataFrame
        One row per test visit, indexed 0..m-1.
    yy : None
        Placeholder returned for the test labels; always ``None``.
    """
    visit_keys = ["VisitNumber", "Weekday"]

    train_raw = pd.read_csv(train_data_fname)
    test_raw = pd.read_csv(test_data_fname)

    # Only the label column is aggregated: taking max() over the whole frame
    # would also reduce every feature column just to throw the result away.
    y = train_raw.groupby(visit_keys)["TripType"].max().reset_index(drop=True)

    # Tag each row with its origin so the two sets can be separated again
    # after the shared preprocessing.
    stacked = pd.concat(
        [
            train_raw.drop(columns="TripType").assign(is_train_set=1),
            test_raw.assign(is_train_set=0),
        ]
    )
    stacked = stacked.drop_duplicates(keep="first", ignore_index=True)

    stacked["Upc"] = stacked["Upc"].fillna(0)

    missing_pharmacy = stacked["FinelineNumber"].isna() & (
        stacked["DepartmentDescription"] == "PHARMACY RX"
    )
    stacked.loc[missing_pharmacy, "FinelineNumber"] = pharmacy_fineline

    stacked = pd.get_dummies(stacked, columns=["DepartmentDescription"], dummy_na=False)

    # One row per visit; every remaining column is summed, so is_train_set
    # becomes a row count (> 0) for training visits and stays 0 for test ones.
    visits = stacked.groupby(visit_keys, as_index=False).sum()

    # Collapse the day name into a weekend flag, then one-hot encode it.
    visits["Weekday"] = visits["Weekday"].isin(["Saturday", "Sunday"]).astype(int)
    visits = pd.get_dummies(visits, columns=["Weekday"], dummy_na=False)

    # Reset the index so X lines up with y by label as well as by position
    # (the combined frame interleaves train and test visits).
    is_train = visits["is_train_set"] != 0
    X = visits.loc[is_train].drop(columns="is_train_set").reset_index(drop=True)
    XX = visits.loc[~is_train].drop(columns="is_train_set").reset_index(drop=True)
    yy = None

    return X, y, XX, yy

Load the data...

In [None]:
# Raw competition files hosted in the course repository
train_url = "https://raw.githubusercontent.com/DiploDatos/AprendizajeSupervisado/master/practico/data/train.csv"
test_url = "https://raw.githubusercontent.com/DiploDatos/AprendizajeSupervisado/master/practico/data/test.csv"

# X / y: training features and labels; XX: test features; yy: None placeholder
X, y, XX, yy = transform_data(train_url, test_url)

Create the model and evaluate it

In [None]:
# Sanity check: features and labels must have the same number of rows
X.shape, y.shape

((67029, 74), (67029,))

In [None]:
# Hold out 20% of the rows to estimate accuracy on data the model never saw
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Hyper-parameter grid explored by the search below
xgb_params = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [2000],
    'max_depth': [3, 5, 7, 9],
    'gamma': [0, 1],
    'subsample': [0.7, 1],
    'colsample_bytree': [0.7, 1],
}

# Fit the search itself: fitting a bare XGBClassifier() would leave the grid
# unused and train with default hyper-parameters only.
# NOTE: 64 candidates x 3 folds (plus the final refit) with 2000 trees each is
# slow - shrink the grid while iterating.
clf = GridSearchCV(XGBClassifier(), xgb_params, cv=3)
clf.fit(X_train, y_train)

# Later cells predict with `model`, so bind it to the refitted best estimator
model = clf.best_estimator_
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Predict the held-out rows and report the share of correctly labelled visits
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 69.80%


In [None]:
# Predict on the competition test features; this rebinds yy, which
# transform_data returned as None.
yy = model.predict(XX)

Exportamos Resultados


In [None]:
# Pair every test visit id with its predicted trip type, one row per visit
xgb_rows = list(zip(XX["VisitNumber"], yy))
submission9 = pd.DataFrame(xgb_rows, columns=["VisitNumber", "TripType"])

In [None]:
# Write submission9 as CSV with a header row and without the index column.
# NOTE(review): to_csv does not create missing folders, so "sample_data/" must
# already exist in the working directory - confirm for the target environment.
submission9.to_csv("sample_data/submission_xgb3.csv", header=True, index=False)

---

In [None]:
# Separate 25% hold-out split for the random-forest experiment
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

# Keys carry the step name that make_pipeline derives from the estimator class
parameters = {
    'randomforestclassifier__n_estimators' : [50,100,120],
    # Overrides the random_state=0 passed to the constructor below
    'randomforestclassifier__random_state' : [2],
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated and is
    # rejected by recent scikit-learn releases
    'randomforestclassifier__max_features' : ['sqrt'],
    'randomforestclassifier__criterion'    : ['gini','entropy']
}

# Use a dedicated name so an already fitted `model` from another cell is not
# overwritten by this pipeline.
rf_pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=0))
randomtree_clf = GridSearchCV(rf_pipeline, parameters, scoring='accuracy', cv=3)

# Silence library warnings only while the search is running
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    randomtree_clf.fit(X_train, y_train)

best_tree_clf = randomtree_clf.best_estimator_
best_tree_clf

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=2,
                                        verbose=0, warm_start=False))],
         verbose=Fals

In [None]:
# best_score_ is the mean cross-validated score of the best parameter setting;
# the searched estimator is a random-forest pipeline, not a single decision tree.
print('Best Random Forest accuracy: ', randomtree_clf.best_score_)

Best Decision Tree accuracy:  0.689542678681546


In [None]:
# Predict the validation rows with the refitted best pipeline and report accuracy
y_pred = randomtree_clf.predict(X_valid)
predictions = list(map(round, y_pred))

accuracy = accuracy_score(y_valid, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 69.70%


In [None]:
# Predict on the competition test features with the fitted grid search;
# this rebinds yy, replacing whatever an earlier cell stored there.
yy = randomtree_clf.predict(XX)

Exportamos Resultados


In [None]:
# Pair every test visit id with its predicted trip type, one row per visit
rf_rows = list(zip(XX["VisitNumber"], yy))
submission5 = pd.DataFrame(rf_rows, columns=["VisitNumber", "TripType"])

In [None]:
# Write submission5 as CSV with a header row and without the index column.
# NOTE(review): to_csv does not create missing folders, so "sample_data/" must
# already exist in the working directory - confirm for the target environment.
submission5.to_csv("sample_data/submission_randomforest5.csv", header=True, index=False)