In [3]:
# Import the required packages
import os
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils import resample
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
pd.set_option('display.max_rows', 20000)
pd.set_option('display.max_columns', 20000)
pd.set_option('display.max_colwidth', -1)
import warnings
warnings.filterwarnings("ignore")
import datetime

  pd.set_option('display.max_colwidth', -1)


In [4]:
def transform_data(train_data_fname, test_data_fname):
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0
    
    # Comienzo con los datos de TRAIN
    # -------------------------------------------------------------------------
    
    # Elimino los datos que tienen Nan en el Department Description
    clean_df = df_train.dropna(subset=['DepartmentDescription'])

    # Los datos que tienen Nan en Upc son todos de Pharmacy RX
    # Luego los completo con esos valores, obtenidos por inspección
    clean_df.loc[clean_df.Upc.isna(), "FinelineNumber"] = 4822
    clean_df.loc[clean_df.Upc.isna(), "Upc"] = 30169183702
    
#    unique_tt = df_train.TripType.unique()
#    lista_df = []
#    for i in unique_tt:
#        iterator = clean_df[clean_df.TripType == i]
#        max = iterator.Upc.value_counts().idxmax()
#        iterator.Upc = max
#        lista_df.append(iterator)

    unique_vn = df_train.VisitNumber.unique()
    lista_df = []
    for i in unique_vn:
        iterator = clean_df[clean_df.VisitNumber == i]
        max = iterator.Upc.max()
        iterator.Upc = max
        lista_df.append(iterator)


    df_total = pd.concat(lista_df)
    df_total = df_total.drop(["FinelineNumber"], axis=1)
    y = df_total.groupby(["VisitNumber", "Weekday"], as_index=False).max().TripType

    df_total = df_total.drop("TripType", axis=1)
    
    # VISITNUMBER, DEPARTMENT, UPC, WEEKDAY
    
    #--------------------------------------------------------------------------
    
    df_test.loc[df_test.DepartmentDescription.isna(), "Upc"] = 0
    df_test.loc[df_test.Upc.isna(), "Upc"] = 30169183702
    unique_vn = df_test.VisitNumber.unique()
    lista_df = []
    for i in unique_vn:
        iterator = df_test[df_test.VisitNumber == i]
        max = iterator.Upc.max()
        iterator.Upc = max
        lista_df.append(iterator)

    test_total = pd.concat(lista_df)
    
    test_total = test_total.drop(["FinelineNumber"], axis=1)
    
    # VISITNUMBER, DEPARTMENT, UPC, WEEKDAY
    
    temp_concat = pd.concat([df_total, test_total])
    temp_concat = pd.get_dummies(temp_concat, columns=["DepartmentDescription"], dummy_na=True)
    temp_concat = temp_concat.groupby(["VisitNumber", "Weekday", "Upc"], as_index=False).sum()
    
    temp_concat = pd.get_dummies(temp_concat, columns=["Weekday"], dummy_na=True)
    
    df_train = temp_concat[temp_concat.is_train_set != 0]
    df_test = temp_concat[temp_concat.is_train_set == 0]

    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)


    return X, y, XX, yy

In [5]:
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
results = pd.DataFrame(columns=('clf', 'best_acc'))

In [13]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

sgdc = SGDClassifier()

sgdc_params = {
  #  'loss':['log','hinge','perceptron'],
    'penalty':['l2'],
    'alpha':[0.0001]
}

sgdc_gs = GridSearchCV(sgdc, sgdc_params, cv=5)#, verbose=1, n_jobs=1)

#Here I fit the model to my dataset
sgdc_gs.fit(X_train, y_train)

best_sgd = sgdc_gs.best_estimator_

GridSearchCV(cv=5, estimator=SGDClassifier(),
             param_grid={'alpha': [0.0001], 'penalty': ['l2']})

In [14]:
print('Best SGDC accuracy: ', sgdc_gs.best_score_)
print(best_sgd)
results = results.append({'clf': best_sgd, 'best_acc': sgdc_gs.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Best SGDC accuracy:  0.025576300514652178
SGDClassifier()
The best classifier so far is: 
SGDClassifier(alpha=0.001, penalty='l1')


In [11]:
X.shape, XX.shape

((66917, 80), (28645, 80))

In [12]:
yy = results.clf.iloc[0].predict(XX)

In [None]:
submission = pd.DataFrame(list(zip(XX.VisitNumber, yy)), columns=["VisitNumber", "TripType"])

In [None]:
submission.to_csv("submission_sgd.csv", header=True, index=False)