# Table of Contents
[1.Stochastic Gradient Descent](#sgd)

[2.Gaussian Naive Bayes](#gnb)

[3.Logistic Regression](#lr)

[4.Bernoulli Naive Bayes](#bnb)

[5.Hyper Params](#hyper_params)

In [1]:
import datetime as dt
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.ensemble.voting_classifier import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

%matplotlib notebook

In [2]:
# Location of the Kaggle input files, relative to the notebook.
# Built with os.path.join so the separator is right on every platform
# (a literal ".\\data" only resolves on Windows).
DATA_DIR = os.path.join(".", "data")

TRAINING_FILE = os.path.join(DATA_DIR, "train.csv")
TEST_FILE = os.path.join(DATA_DIR, "test.csv")
SUBMISSION_FILE = os.path.join(DATA_DIR, "sampleSubmission.csv")

In [38]:
class SFCrimesUtils(object):
    """Stateless helpers for loading, preparing, predicting and exporting crime data.

    All methods are static; the class only serves as a namespace.
    """

    @staticmethod
    def get_df(fileName):
        """Read a CSV file and add calendar columns derived from ``Dates``.

        Parameters
        ----------
        fileName : str
            Path of a CSV file containing a ``Dates`` column.

        Returns
        -------
        pandas.DataFrame
            The file contents plus ``year``, ``month``, ``day``, ``hour`` and
            ``minute`` columns extracted from ``Dates``.
        """
        df = pd.read_csv(fileName, parse_dates=["Dates"])
        dates = df["Dates"].dt
        for part in ("year", "month", "day", "hour", "minute"):
            df[part] = getattr(dates, part)
        return df

    @staticmethod
    def format_df(df):
        """Select the modelling columns from a frame produced by ``get_df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain ``DayOfWeek``, ``hour``, ``X`` and ``Y``; ``Category``
            is optional.

        Returns
        -------
        pandas.DataFrame
            A new frame with, in order: the one-hot encoded ``DayOfWeek``
            columns, ``hour``, ``Category`` (only if present in ``df``),
            ``X`` and ``Y``.
        """
        df_train = pd.get_dummies(df[["DayOfWeek"]])
        df_train["hour"] = df["hour"]
        if "Category" in df.columns:
            df_train["Category"] = df["Category"]
        df_train["X"] = df["X"]
        df_train["Y"] = df["Y"]
        return df_train

    @staticmethod
    def get_train_test_data(df, size=0, rdm_state=0):
        """Split a formatted frame into feature/target arrays.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame containing a ``Category`` column (the target); every other
            column is used as a feature.
        size : int or float
            Forwarded to ``train_test_split`` as ``test_size``.
        rdm_state : int
            Forwarded to ``train_test_split`` as ``random_state``.

        Returns
        -------
        tuple
            ``(X_train, X_test, Y_train, Y_test)``.
        """
        Y = df.Category.values
        X = df.drop(labels="Category", axis=1).values
        return tuple(train_test_split(X, Y, test_size=size, random_state=rdm_state))

    @staticmethod
    def scale_data(X, sc=None, fit=True):
        """Transform ``X`` with a scaler, optionally fitting it first.

        Parameters
        ----------
        X : array-like
            Data to transform.
        sc : scaler or None
            Object exposing ``fit`` and ``transform``; a new ``StandardScaler``
            is created when ``None``.
        fit : bool
            When true the scaler is fitted on ``X`` before transforming.

        Returns
        -------
        tuple
            ``(transformed_X, scaler)``.
        """
        if sc is None:
            sc = StandardScaler()
        if fit:
            sc.fit(X)
        return sc.transform(X), sc

    @staticmethod
    def generate_csv(csv_target_name, csv_template_name, clf, Y_Pred):
        """Write predictions to a CSV laid out like the template file.

        Parameters
        ----------
        csv_target_name : str
            Path of the CSV file to write.
        csv_template_name : str
            Path of the template CSV; its first column is used as the index
            and its remaining columns must match ``clf.classes_`` in order.
        clf : estimator
            Fitted classifier; only ``classes_`` is read.
        Y_Pred : array-like
            One row of values per template row, one column per class.

        Returns
        -------
        None
            Nothing is written (a message is printed instead) when the
            template columns differ from ``clf.classes_``.
        """
        df_sub = pd.read_csv(csv_template_name, index_col=0)
        columns = df_sub.columns.values
        # Compare as lists so a different number of classes is reported as a
        # mismatch instead of failing inside an element-wise comparison.
        if list(columns) != list(clf.classes_):
            print("columns from submission %s different from prediction %s"
                  % (columns, clf.classes_))
            return
        # Build the whole frame at once: a row-by-row ``iloc`` assignment is
        # very slow and treats index labels as positions, which is only
        # correct when the template index is exactly 0..n-1.
        df_out = pd.DataFrame(Y_Pred, index=df_sub.index, columns=df_sub.columns)
        df_out.to_csv(csv_target_name)

    @staticmethod
    def predict(clf, test_file_name, scaler):
        """Load a test file and return ``clf.predict_proba`` for it.

        Parameters
        ----------
        clf : estimator
            Fitted classifier exposing ``predict_proba``.
        test_file_name : str
            Path of the CSV file to predict on.
        scaler : scaler or None
            Already fitted scaler applied (without refitting) to the features;
            no scaling is done when ``None``.

        Returns
        -------
        The value returned by ``clf.predict_proba``.
        """
        df_test = SFCrimesUtils.format_df(SFCrimesUtils.get_df(test_file_name))
        X_test = df_test.values
        if scaler is not None:
            X_test, _ = SFCrimesUtils.scale_data(X_test, scaler, fit=False)
        return clf.predict_proba(X_test)

    @staticmethod
    def plot_conf_matrix(Y_test, Y_pred):
        """Draw the confusion matrix of ``Y_pred`` against ``Y_test``.

        Parameters
        ----------
        Y_test : array-like
            True labels; their unique values define the matrix axes.
        Y_pred : array-like
            Predicted labels.
        """
        labels = np.unique(Y_test)
        # ``labels`` is passed by keyword: recent scikit-learn releases no
        # longer accept it positionally.
        cm = confusion_matrix(Y_test, Y_pred, labels=labels)
        plt.figure(figsize=(10, 10))
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title('Confusion matrix')
        plt.colorbar()
        tick_marks = np.arange(len(labels))
        plt.xticks(tick_marks, labels, rotation=90)
        plt.yticks(tick_marks, labels)
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        # Lay out last so the axis labels are included in the computation.
        plt.tight_layout()

In [4]:
train = SFCrimesUtils.get_df(TRAINING_FILE)

In [5]:
train = SFCrimesUtils.format_df(train)

In [6]:
TEST_RATIO = 0
%time X_train , X_test, Y_train, Y_test = SFCrimesUtils.get_train_test_data(train,TEST_RATIO, 0)

Wall time: 562 ms


<a id='sgd'></a>
# Stochastic Gradient Descent

In [7]:
# Standardise the features, then fit a logistic model with stochastic
# gradient descent. The seed is fixed so the scores below can be reproduced;
# without it every run shuffles the data differently.
sgd = Pipeline([
    ('scl', StandardScaler()),
    ('clf', SGDClassifier(loss="log", n_iter=5, random_state=0)),
])

In [8]:
%time scores_sgd = cross_val_score(estimator=sgd, X=X_train, y=Y_train, cv=3, scoring='log_loss')

Wall time: 1min 28s


In [9]:
# Per-fold scores, then their mean and standard deviation.
print("SCORES %s" % (scores_sgd,))
print("MEAN %f / STD: %f" % (scores_sgd.mean(), scores_sgd.std()))

SCORES [-2.73088742 -2.79413319 -2.72143317]
MEAN -2.748818 / STD: 0.032274


<a id='gnb'></a>
# Gaussian Naive Bayes

In [10]:
# Standardise the features, then fit Gaussian Naive Bayes.
gnb = Pipeline(steps=[("scl", StandardScaler()), ("clf", GaussianNB())])

In [11]:
%time scores_gnb = cross_val_score(estimator= gnb, X=X_train, y=Y_train, cv=3, scoring='log_loss')

Wall time: 25.6 s


In [12]:
# Per-fold scores, then their mean and standard deviation.
print("SCORES %s" % (scores_gnb,))
print("MEAN %f / STD: %f" % (scores_gnb.mean(), scores_gnb.std()))

SCORES [-16.40102654 -13.27895076 -15.60691481]
MEAN -15.095631 / STD: 1.324864


<a id='lr'></a>
# Logistic Regression

In [27]:
# Standardise the features, then fit a logistic regression with default settings.
lr = Pipeline(steps=[("scl", StandardScaler()), ("clf", LogisticRegression())])

In [14]:
%time scores_lr = cross_val_score(estimator= lr, X=X_train, y=Y_train, cv=3, scoring='log_loss')

Wall time: 10min 8s


In [15]:
# Per-fold scores, then their mean and standard deviation.
print("SCORES %s" % (scores_lr,))
print("MEAN %f / STD: %f" % (scores_lr.mean(), scores_lr.std()))

SCORES [-2.65541547 -2.65497172 -2.65567417]
MEAN -2.655354 / STD: 0.000290


<a id='bnb'></a>
# Bernoulli Naive Bayes

In [16]:
# Standardise the features, then fit Bernoulli Naive Bayes.
bnb = Pipeline(steps=[("scl", StandardScaler()), ("clf", BernoulliNB())])

In [17]:
%time scores_bnb = cross_val_score(estimator= bnb, X=X_train, y=Y_train, cv=3, scoring='log_loss')

Wall time: 30.7 s


In [18]:
# Per-fold scores, then their mean and standard deviation.
print("SCORES %s" % (scores_bnb,))
print("MEAN %f / STD: %f" % (scores_bnb.mean(), scores_bnb.std()))

SCORES [-2.63956975 -2.63966411 -2.63930682]
MEAN -2.639514 / STD: 0.000151


<a id='hyper_params'></a>
# Hyper Params

In [22]:
# Grid of candidate values for the `C` parameter, consumed by the grid search.
params = [{'C': [0.001, 0.01, 0.1, 1]}]

In [30]:
# 2-fold grid search over `params`, scored by log loss, on a bare
# LogisticRegression (no scaling step in front of it).
gs = GridSearchCV(LogisticRegression(), params, scoring="log_loss", cv=2)

In [31]:
%time gs.fit(X_train, Y_train)

GridSearchCV(cv=2, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.001, 0.01, 0.1, 1]}], pre_dispatch='2*n_jobs',
       refit=True, scoring='log_loss', verbose=0)

In [32]:
# Interpolate with `%`: passing the value as a second print argument leaves
# the "%s" placeholder unformatted (and prints a tuple under Python 2).
print("Best score %s" % gs.best_score_)
# Wrap the dict in a tuple so it is formatted as one value.
print("Best params %s" % (gs.best_params_,))

('Best score %s', -2.6645798095558706)
('Best params %s', {'C': 1})


In [33]:
%time scores_best_estimator = cross_val_score(estimator=gs.best_estimator_, X=X_train, y=Y_train, cv=3, scoring='log_loss')

Wall time: 7min 52s


In [34]:
# Per-fold scores, then their mean and standard deviation.
print("SCORES %s" % (scores_best_estimator,))
print("MEAN %f / STD: %f" % (scores_best_estimator.mean(), scores_best_estimator.std()))

SCORES [-2.66460594 -2.66399112 -2.66422581]
MEAN -2.664274 / STD: 0.000253


In [35]:
%time Y_pred_kaggle = SFCrimesUtils.predict(gs.best_estimator_, TEST_FILE, None)

Wall time: 5.19 s


In [40]:
# Display the predicted class probabilities returned by SFCrimesUtils.predict.
Y_pred_kaggle

array([[  1.12876722e-03,   8.62255637e-02,   8.42608254e-05, ...,
          8.51252044e-02,   4.22392191e-02,   1.16215695e-02],
       [  1.12961242e-03,   8.62146138e-02,   8.42779857e-05, ...,
          8.51200608e-02,   4.22359904e-02,   1.16345891e-02],
       [  9.19583009e-04,   8.87519043e-02,   8.66870086e-05, ...,
          8.75852077e-02,   4.35763154e-02,   8.01535142e-03],
       ..., 
       [  3.48298354e-03,   9.14697084e-02,   1.09085277e-03, ...,
          3.26441121e-02,   5.26651476e-02,   1.37403327e-02],
       [  2.95866465e-03,   9.61491588e-02,   1.15003096e-03, ...,
          3.43250412e-02,   5.55068728e-02,   1.00039005e-02],
       [  3.38352619e-03,   9.25144087e-02,   1.10140242e-03, ...,
          3.30011890e-02,   5.32720718e-02,   1.29879318e-02]])

In [39]:
TARGET_CSV_NAME = os.path.join(DATA_DIR, "submission_best_model_with_hyper_params.csv")
%time SFCrimesUtils.generate_csv(TARGET_CSV_NAME, SUBMISSION_FILE, gs.best_estimator_, Y_pred_kaggle)

Wall time: 3min 12s
