In [1]:
# Import the required packages
import os
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils import resample
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
pd.set_option('display.max_rows', 20000)
pd.set_option('display.max_columns', 20000)
pd.set_option('display.max_colwidth', -1)
import warnings
warnings.filterwarnings("ignore")
import datetime

  pd.set_option('display.max_colwidth', -1)


In [2]:
def transform_data(train_data_fname, test_data_fname):
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0
    
    # Comienzo con los datos de TRAIN
    # -------------------------------------------------------------------------
    
    # Elimino los datos que tienen Nan en el Department Description
    clean_df = df_train.dropna(subset=['DepartmentDescription'])

    # Los datos que tienen Nan en Upc son todos de Pharmacy RX
    # Luego los completo con esos valores, obtenidos por inspección
    clean_df.loc[clean_df.Upc.isna(), "FinelineNumber"] = 4822
    clean_df.loc[clean_df.Upc.isna(), "Upc"] = 30169183702
    
#    unique_tt = df_train.TripType.unique()
#    lista_df = []
#    for i in unique_tt:
#        iterator = clean_df[clean_df.TripType == i]
#        max = iterator.Upc.value_counts().idxmax()
#        iterator.Upc = max
#        lista_df.append(iterator)

    unique_vn = df_train.VisitNumber.unique()
    lista_df = []
    for i in unique_vn:
        iterator = clean_df[clean_df.VisitNumber == i]
        max = iterator.Upc.max()
        iterator.Upc = max
        lista_df.append(iterator)


    df_total = pd.concat(lista_df)
    df_total = df_total.drop(["FinelineNumber"], axis=1)
    y = df_total.groupby(["VisitNumber", "Weekday"], as_index=False).max().TripType

    df_total = df_total.drop("TripType", axis=1)
    
    # VISITNUMBER, DEPARTMENT, UPC, WEEKDAY
    
    #--------------------------------------------------------------------------
    
    df_test.loc[df_test.DepartmentDescription.isna(), "Upc"] = 0
    df_test.loc[df_test.Upc.isna(), "Upc"] = 30169183702
    unique_vn = df_test.VisitNumber.unique()
    lista_df = []
    for i in unique_vn:
        iterator = df_test[df_test.VisitNumber == i]
        max = iterator.Upc.max()
        iterator.Upc = max
        lista_df.append(iterator)

    test_total = pd.concat(lista_df)
    
    test_total = test_total.drop(["FinelineNumber"], axis=1)
    
    # VISITNUMBER, DEPARTMENT, UPC, WEEKDAY
    
    temp_concat = pd.concat([df_total, test_total])
    temp_concat = pd.get_dummies(temp_concat, columns=["DepartmentDescription"], dummy_na=True)
    temp_concat = temp_concat.groupby(["VisitNumber", "Weekday", "Upc"], as_index=False).sum()
    
    temp_concat = pd.get_dummies(temp_concat, columns=["Weekday"], dummy_na=True)
    
    df_train = temp_concat[temp_concat.is_train_set != 0]
    df_test = temp_concat[temp_concat.is_train_set == 0]

    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)


    return X, y, XX, yy

In [3]:
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
results = pd.DataFrame(columns=('clf', 'best_acc'))

In [22]:
X_train

Unnamed: 0,VisitNumber,Upc,ScanCount,DepartmentDescription_1-HR PHOTO,DepartmentDescription_ACCESSORIES,DepartmentDescription_AUTOMOTIVE,DepartmentDescription_BAKERY,DepartmentDescription_BATH AND SHOWER,DepartmentDescription_BEAUTY,DepartmentDescription_BEDDING,DepartmentDescription_BOOKS AND MAGAZINES,DepartmentDescription_BOYS WEAR,DepartmentDescription_BRAS & SHAPEWEAR,DepartmentDescription_CAMERAS AND SUPPLIES,"DepartmentDescription_CANDY, TOBACCO, COOKIES",DepartmentDescription_CELEBRATION,DepartmentDescription_COMM BREAD,DepartmentDescription_CONCEPT STORES,DepartmentDescription_COOK AND DINE,DepartmentDescription_DAIRY,DepartmentDescription_DSD GROCERY,DepartmentDescription_ELECTRONICS,DepartmentDescription_FABRICS AND CRAFTS,DepartmentDescription_FINANCIAL SERVICES,DepartmentDescription_FROZEN FOODS,DepartmentDescription_FURNITURE,"DepartmentDescription_GIRLS WEAR, 4-6X AND 7-14",DepartmentDescription_GROCERY DRY GOODS,DepartmentDescription_HARDWARE,DepartmentDescription_HEALTH AND BEAUTY AIDS,DepartmentDescription_HOME DECOR,DepartmentDescription_HOME MANAGEMENT,DepartmentDescription_HORTICULTURE AND ACCESS,DepartmentDescription_HOUSEHOLD CHEMICALS/SUPP,DepartmentDescription_HOUSEHOLD PAPER GOODS,DepartmentDescription_IMPULSE MERCHANDISE,DepartmentDescription_INFANT APPAREL,DepartmentDescription_INFANT CONSUMABLE HARDLINES,DepartmentDescription_JEWELRY AND SUNGLASSES,DepartmentDescription_LADIES SOCKS,DepartmentDescription_LADIESWEAR,DepartmentDescription_LARGE HOUSEHOLD GOODS,DepartmentDescription_LAWN AND GARDEN,"DepartmentDescription_LIQUOR,WINE,BEER",DepartmentDescription_MEAT - FRESH & FROZEN,DepartmentDescription_MEDIA AND GAMING,DepartmentDescription_MENS WEAR,DepartmentDescription_MENSWEAR,DepartmentDescription_OFFICE SUPPLIES,DepartmentDescription_OPTICAL - FRAMES,DepartmentDescription_OPTICAL - LENSES,DepartmentDescription_OTHER DEPARTMENTS,DepartmentDescription_PAINT AND ACCESSORIES,DepartmentDescription_PERSONAL CARE,DepartmentDescription_PETS AND SUPPLIES,DepartmentDescription_PHARMACY OTC,DepartmentDescription_PHARMACY RX,DepartmentDescription_PLAYERS AND ELECTRONICS,DepartmentDescription_PLUS AND MATERNITY,DepartmentDescription_PRE PACKED DELI,DepartmentDescription_PRODUCE,DepartmentDescription_SEAFOOD,DepartmentDescription_SEASONAL,DepartmentDescription_SERVICE DELI,DepartmentDescription_SHEER HOSIERY,DepartmentDescription_SHOES,DepartmentDescription_SLEEPWEAR/FOUNDATIONS,DepartmentDescription_SPORTING GOODS,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,DepartmentDescription_nan,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday,Weekday_nan
16430,32945,7.650110e+09,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
67689,135532,4.897014e+11,11,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
38646,77378,6.460077e+10,3,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
17535,35152,7.874206e+09,-4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
47984,96227,6.053886e+10,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52968,106308,8.860280e+10,15,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,4,2,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
8813,17484,1.292628e+09,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
78351,156816,1.484474e+09,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1230,2407,6.811316e+10,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [21]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train, y_train)

ValueError: Negative values in data passed to MultinomialNB (input X)

In [15]:
print(classification_report(y_valid, clf.predict(X_valid)))

Best SGDC accuracy:  0.6134796409183801
Pipeline(steps=[('scaler', StandardScaler()),
                ('SGDClassifier',
                 SGDClassifier(loss='log', n_jobs=-1, random_state=0))])
The best classifier so far is: 
Pipeline(steps=[('scaler', StandardScaler()),
                ('SGDClassifier',
                 SGDClassifier(loss='log', n_jobs=-1, random_state=0))])


In [16]:
X.shape, XX.shape

((66917, 80), (28645, 80))

In [17]:
yy = results.clf.iloc[0].predict(XX)

In [18]:
submission = pd.DataFrame(list(zip(XX.VisitNumber, yy)), columns=["VisitNumber", "TripType"])

In [19]:
submission.to_csv("submission_sgd.csv", header=True, index=False)