# Diplodatos Kaggle Competition

We present this piece of code to create the baseline for the competition, and as an example of how to deal with this kind of problem. The main goals are that you:

1. Learn
1. Try different models and see which one best fits the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [1]:
# Import the required packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Raise pandas' display limits so wide/long frames are printed in full.
pd.set_option('display.max_rows', 20000)
pd.set_option('display.max_columns', 100000)
# None means "no limit". The former -1 sentinel is deprecated (it triggers a
# FutureWarning) and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
import warnings
warnings.filterwarnings("ignore")

  pd.set_option('display.max_colwidth', -1)


Read the *original* dataset...

In [2]:
# Load the raw training CSV; the path is relative to the working directory.
original_df = pd.read_csv('../data/train.csv')

### **Missing values**

In [3]:
# fraction of rows that hold a non-missing value, per column
original_df.notna().mean()

TripType                 1.000000
VisitNumber              1.000000
Weekday                  1.000000
Upc                      0.993710
ScanCount                1.000000
DepartmentDescription    0.997856
FinelineNumber           0.993710
dtype: float64

There are `nan`s in the `Upc`, `DepartmentDescription` and `FinelineNumber` columns; let us find them...

In [24]:
# For every row, the number of non-missing Upc entries recorded for that
# row's VisitNumber. The count is computed here directly because no
# module-level `products_per_visit` lookup exists when the notebook is run
# from top to bottom.
original_df['num_of_products_for_VisitNumber'] = (
    original_df.groupby('VisitNumber')['Upc'].transform('count')
)

In [4]:
def flot_to_str(obj):
    """
    Convert a Upc code from float to a digit string.

    Meant to be applied element-wise to the "Upc" column, e.g.
    ``df['Upc'].apply(flot_to_str)``.

    Parameters:
        obj: a single Upc value, typically a float such as 4011.0.
    Return:
        The text before the first '.' of ``str(obj)``. A result that is 10
        characters long gets one trailing '0' appended and a result that is
        4 characters long gets seven, so both end up 11 characters long; any
        other length is returned unchanged. Missing values (None / NaN) and
        the legacy 'np.nan' marker yield None.
    """
    # The previous guard compared against the *string* 'np.nan' inside a
    # `while` that always returned on its first pass, so a real NaN was
    # never detected and came back as the text 'nan'. Test for missing
    # values explicitly instead.
    if pd.isna(obj) or obj == 'np.nan':
        return None

    digits = str(obj).split('.')[0]
    if len(digits) == 10:
        return digits + '0'
    if len(digits) == 4:
        return digits + '0000000'
    return digits

In [5]:
def company(upcData):
    """
    Return the company code from a given Upc code.

    Parameters:
        upcData: one Upc code as a string (a single element of the 'Upc'
            column after it has been converted to text).
    Return:
        The first six characters of the code. When those are '000000', the
        character fifth from the end is returned instead. -9999 is returned
        for values that cannot be sliced (e.g. None or a number).
    """
    try:
        code = upcData[:6]
        if code == '000000':
            # This branch used to reference an undefined name `x`; the
            # resulting NameError was swallowed by a bare `except`, so every
            # zero-prefixed code silently became -9999.
            return upcData[-5]
        return code
    except (TypeError, LookupError):
        # Not a sliceable value: keep the numeric sentinel callers expect.
        return -9999


In [6]:
def prodct(upcData):
    """
    Return the product part of a given Upc code.

    Parameters:
        upcData: one Upc code as a string (a single element of the 'Upc'
            column after it has been converted to text).
    Return:
        Everything after the first six characters of the code (an empty
        string when the code has six characters or fewer). -9999 is returned
        for values that cannot be sliced (e.g. None or a number).
    """
    try:
        return upcData[6:]
    except (TypeError, LookupError):
        # Not a sliceable value: keep the numeric sentinel callers expect.
        return -9999

----
### **Now we create the function...**

In [8]:
def transform_data(train_data_fname, test_data_fname):
    """
    Build per-visit feature matrices from the raw train and test CSV files.

    Parameters:
        train_data_fname: path of the training CSV. The code reads its
            TripType, VisitNumber, Weekday, Upc, DepartmentDescription and
            FinelineNumber columns.
        test_data_fname: path of the test CSV (same columns; presumably
            without TripType - verify against the data).
    Return:
        A tuple (X, y, XX, yy):
        X  - features, one row per training (VisitNumber, Weekday) group;
        y  - TripType of each training group (maximum over the group's rows);
        XX - features, one row per test group, same columns as X;
        yy - always None (placeholder for the test labels).
    """
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # Missing Upc values are replaced by 0 (the rows are kept, not dropped).
    df_train.loc[df_train.Upc.isna(), "Upc"] = 0
    df_test.loc[df_test.Upc.isna(), "Upc"] = 0

    # One TripType per visit: group by visit and take the max. Only the
    # TripType column is aggregated, so the string columns are never compared.
    # The result is ordered by the group keys, like the feature rows below.
    y = (
        df_train.groupby(["VisitNumber", "Weekday"])["TripType"]
        .max()
        .reset_index(drop=True)
    )

    # Remove TripType and concatenate train and test so that both end up
    # with the same columns after one-hot encoding.
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])

    # New columns based on Upc: per-visit count of Upc entries, and the
    # company code cut into 50 equal-width bins.
    products_per_visit = dict(df.groupby(['VisitNumber'])['Upc'].count())
    df['num_of_products_for_VisitNumber'] = df['VisitNumber'].apply(
        lambda visit: products_per_visit.get(visit, 0)
    )
    df['company_code'] = (
        df['Upc'].apply(flot_to_str).apply(company).apply(pd.to_numeric)
    )
    df['company_code_cat'] = pd.cut(df['company_code'], bins=50, labels=False)

    # FinelineNumber cut into 50 equal-width bins.
    df['FinelineCat'] = pd.cut(df['FinelineNumber'], bins=50, labels=False)

    # Drop the raw columns that have been replaced by derived ones.
    df = df.drop(["Upc", "FinelineNumber", "company_code"], axis=1)

    # One-hot encode the department, the per-visit product count and the
    # fineline bin; dummy_na adds an indicator column for missing values.
    df = pd.get_dummies(
        df,
        columns=[
            "DepartmentDescription",
            "num_of_products_for_VisitNumber",
            "FinelineCat",
        ],
        dummy_na=True,
    )

    # Collapse to one row per (VisitNumber, Weekday) by summing every column.
    # is_train_set becomes the number of training rows in the group, so it
    # is non-zero exactly for training visits.
    df = df.groupby(["VisitNumber", "Weekday"], as_index=False).sum()

    # Finally, one-hot encode the Weekday.
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # Split back into train and test.
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]

    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

Load the data...

In [9]:
# X, y: training features and labels; XX: test features; yy is None here.
X, y, XX, yy = transform_data('../data/train.csv',"../data/test.csv")

In [10]:
# (rows, columns) of the training feature matrix
X.shape

(67029, 230)

Create the model and evaluate it

In [12]:
# Split the training dataset into train (70%) and validation (30%) parts.
# The fixed random_state makes the split reproducible. The model is fitted
# on the train part and the classification report is computed on the
# validation part.
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

# Pipeline: StandardScaler followed by LinearSVC, fitted on the train split.
# random_state is fixed so repeated runs give the same model.
clf = make_pipeline(StandardScaler(),
                     LinearSVC(random_state=0, tol=1e-5))
clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc', LinearSVC(random_state=0, tol=1e-05))])

In [14]:
from sklearn.metrics import classification_report

# Per-class precision, recall, f1-score and support on the validation split.
print(classification_report(y_valid, clf.predict(X_valid)))

              precision    recall  f1-score   support

           3       0.79      0.97      0.87       794
           4       0.61      0.40      0.49        67
           5       0.78      0.86      0.82       976
           6       0.73      0.84      0.78       260
           7       0.66      0.71      0.68      1205
           8       0.72      0.90      0.80      2540
           9       0.65      0.79      0.71      1968
          12       0.45      0.10      0.17        49
          14       0.00      0.00      0.00         1
          15       0.50      0.33      0.40       204
          18       0.54      0.40      0.46       121
          19       0.64      0.43      0.51        75
          20       0.61      0.55      0.57       130
          21       0.61      0.59      0.60       155
          22       0.57      0.41      0.48       206
          23       0.67      0.77      0.71        26
          24       0.61      0.57      0.59       581
          25       0.69    

**And finally**, we predict the unknown label for the testing set

In [15]:
# Shapes of the training and test feature matrices, side by side
X.shape, XX.shape

((67029, 230), (28645, 230))

In [16]:
# Predict a label for every row of the test features with the fitted pipeline
yy = clf.predict(XX)

The last thing we do is generate a file that should be *submitted* on Kaggle

In [17]:
# Pair each test VisitNumber with the label predicted for it
visit_label_pairs = list(zip(XX.VisitNumber, yy))
submission = pd.DataFrame(visit_label_pairs, columns=["VisitNumber", "TripType"])

In [18]:
# Write the submission with a header row and without the DataFrame index
submission.to_csv("submission_manu_SVC.csv", header=True, index=False)