# Table of Contents
[1.Stochastic Gradient Descent](#sgd)

[2.Gaussian Naive Bayes](#gnb)

[3.Logistic Regression](#lr)

[4.Bernoulli Naive Bayes](#bnb)

[5.Meta Classifier](#meta_classifier)

In [25]:
import datetime as dt
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.ensemble.voting_classifier import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

%matplotlib notebook

In [2]:
# All input and output files live in a "data" folder relative to the working
# directory. The path is assembled with os.path so the separator matches the
# host OS instead of being hard-coded to the Windows backslash.
DATA_DIR = os.path.join(os.curdir, "data")

TRAINING_FILE = os.path.join(DATA_DIR, "train.csv")
TEST_FILE = os.path.join(DATA_DIR, "test.csv")
SUBMISSION_FILE = os.path.join(DATA_DIR, "sampleSubmission.csv")

In [3]:
class SFCrimesUtils(object):
    """Stateless helpers used throughout the notebook.

    Groups loading, feature building, train/test splitting, scaling,
    prediction on the test file, submission writing and confusion-matrix
    plotting. Every member is a static method.
    """

    @staticmethod
    def get_df(fileName):
        """Read a CSV file and add date-part columns.

        Parameters
        ----------
        fileName : str
            Path of a CSV file containing a ``Dates`` column.

        Returns
        -------
        pandas.DataFrame
            The file content with ``Dates`` parsed as datetimes plus the
            derived ``year``, ``month``, ``day``, ``hour`` and ``minute``
            columns.
        """
        df = pd.read_csv(fileName, parse_dates=["Dates"])
        for part in ("year", "month", "day", "hour", "minute"):
            df[part] = getattr(df["Dates"].dt, part)
        return df

    @staticmethod
    def format_df(df):
        """Build the feature frame used by the models.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with ``DayOfWeek``, ``hour``, ``X`` and ``Y`` columns and,
            optionally, a ``Category`` column.

        Returns
        -------
        pandas.DataFrame
            One-hot encoded ``DayOfWeek`` columns followed by ``hour``,
            ``Category`` (only when present in ``df``), ``X`` and ``Y``.
        """
        df_train = pd.get_dummies(df[["DayOfWeek"]])
        df_train["hour"] = df["hour"]
        # The target is only carried over when the input has one.
        if "Category" in df.columns:
            df_train["Category"] = df["Category"]
        df_train["X"] = df["X"]
        df_train["Y"] = df["Y"]
        return df_train

    @staticmethod
    def get_train_test_data(df, size=0, rdm_state=0):
        """Split a formatted frame into train/test feature and target arrays.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame holding a ``Category`` column (the target); every other
            column is used as a feature.
        size : int or float
            Forwarded to ``train_test_split`` as ``test_size``.
        rdm_state : int
            Forwarded to ``train_test_split`` as ``random_state``.

        Returns
        -------
        tuple
            ``(X_train, X_test, Y_train, Y_test)``.
        """
        Y = df["Category"].values
        X = df.drop(labels="Category", axis=1).values
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=size, random_state=rdm_state)
        return (X_train, X_test, Y_train, Y_test)

    @staticmethod
    def scale_data(X, sc=None, fit=True):
        """Transform ``X`` with a scaler, optionally fitting it first.

        Parameters
        ----------
        X : array-like
            Data to scale.
        sc : scaler or None
            Object exposing ``fit`` and ``transform``. A new
            ``StandardScaler`` is created when ``None``.
        fit : bool
            When true, ``sc`` is fitted on ``X`` before transforming. When
            false the scaler is used as given, so it must already be fitted.

        Returns
        -------
        tuple
            ``(transformed X, scaler)``.
        """
        if sc is None:
            sc = StandardScaler()
        if fit:
            sc.fit(X)
        return sc.transform(X), sc

    @staticmethod
    def generate_csv(csv_target_name, csv_template_name, clf, Y_Pred):
        """Write predicted probabilities using the layout of a template CSV.

        Parameters
        ----------
        csv_target_name : str
            Path of the gzip-compressed CSV to write.
        csv_template_name : str
            Path of the template CSV; its first column is used as the index
            and its remaining columns must match ``clf.classes_`` in order.
        clf : object
            Fitted classifier exposing ``classes_``.
        Y_Pred : array-like
            Probabilities, one row per template row and one column per class.

        Returns
        -------
        None
            Nothing is written (a message is printed instead) when the
            template columns differ from ``clf.classes_``.

        Raises
        ------
        ValueError
            If ``Y_Pred`` does not have the same shape as the template.
        """
        df_sub = pd.read_csv(csv_template_name, index_col=0)
        columns = df_sub.columns.values
        # Comparing as lists also covers the case where the counts differ.
        if list(columns) != list(clf.classes_):
            print("columns from submission %s different from prediction %s"
                  % (columns, clf.classes_))
            return
        probabilities = np.asarray(Y_Pred)
        if probabilities.shape != df_sub.shape:
            raise ValueError(
                "predictions have shape %s but the submission template expects %s"
                % (probabilities.shape, df_sub.shape))
        # Build the whole frame at once: rows are matched by position, so the
        # template index does not have to be 0..n-1, and no per-row loop runs.
        df_out = pd.DataFrame(probabilities, index=df_sub.index,
                              columns=df_sub.columns)
        df_out.to_csv(csv_target_name, compression="gzip")

    @staticmethod
    def predict(clf, test_file_name, scaler):
        """Predict class probabilities for every row of a test CSV.

        Parameters
        ----------
        clf : object
            Fitted classifier exposing ``predict_proba``.
        test_file_name : str
            Path of the test CSV, loaded with ``get_df`` and ``format_df``.
        scaler : scaler or None
            Already-fitted scaler applied (without refitting) to the features
            before prediction; pass ``None`` to feed the raw features.

        Returns
        -------
        The value returned by ``clf.predict_proba``.
        """
        df_test = SFCrimesUtils.format_df(SFCrimesUtils.get_df(test_file_name))
        X_test = df_test.values
        if scaler is not None:
            X_test, _ = SFCrimesUtils.scale_data(X_test, scaler, fit=False)
        return clf.predict_proba(X_test)

    @staticmethod
    def plot_conf_matrix(Y_test, Y_pred):
        """Draw the confusion matrix of ``Y_pred`` against ``Y_test``.

        Parameters
        ----------
        Y_test : array-like
            True labels; their unique values define the matrix axes.
        Y_pred : array-like
            Predicted labels.
        """
        labels = np.unique(Y_test)
        # ``labels`` is passed by keyword: recent scikit-learn releases no
        # longer accept it positionally, and older ones accept both forms.
        cm = confusion_matrix(Y_test, Y_pred, labels=labels)
        plt.figure(figsize=(10, 10))
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title('Confusion matrix')
        plt.colorbar()
        tick_marks = np.arange(len(labels))
        plt.xticks(tick_marks, labels, rotation=90)
        plt.yticks(tick_marks, labels)
        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')

In [4]:
# Load the raw training file; get_df also derives the date-part columns.
train = SFCrimesUtils.get_df(fileName=TRAINING_FILE)

In [5]:
# Replace the raw frame by its model-ready features.
# NOTE: `train` is rebound here, so this cell cannot be re-run without first
# re-running the loading cell (format_df needs the raw DayOfWeek column).
train = SFCrimesUtils.format_df(df=train)

In [7]:
TEST_RATIO = 0
%time X_train , X_test, Y_train, Y_test = SFCrimesUtils.get_train_test_data(train,TEST_RATIO, 0)

Wall time: 548 ms


<a id='sgd'></a>
# Stochastic Gradient Descent

In [11]:
# Standardise the features, then fit a logistic-loss linear model with SGD.
# random_state is pinned so that the stochastic optimiser gives the same
# scores on every Restart & Run All.
sgd = Pipeline([
    ('scl', StandardScaler()),
    ('clf', SGDClassifier(loss="log", n_iter=5, random_state=0)),
])

In [12]:
%time scores_sgd = cross_val_score(estimator=sgd, X=X_train, y=Y_train, cv=3, scoring='log_loss')

Wall time: 2min 14s


In [13]:
# Per-fold scores, then their mean and standard deviation.
print("SCORES %s" % (scores_sgd,))
print("MEAN %f / STD: %f" % (scores_sgd.mean(), scores_sgd.std()))

SCORES [-2.71303367 -2.79768186 -2.84409673]
MEAN -2.784937 / STD: 0.054260


<a id='gnb'></a>
# Gaussian Naive Bayes

In [14]:
# Gaussian naive Bayes fitted on standardised features.
gnb = Pipeline(steps=[
    ("scl", StandardScaler()),
    ("clf", GaussianNB()),
])

In [16]:
%time scores_gnb = cross_val_score(estimator= gnb, X=X_train, y=Y_train, cv=3, scoring='log_loss')

Wall time: 46.4 s


In [17]:
# Per-fold scores, then their mean and standard deviation.
print("SCORES %s" % (scores_gnb,))
print("MEAN %f / STD: %f" % (scores_gnb.mean(), scores_gnb.std()))

SCORES [-16.40102654 -13.27895076 -15.60691481]
MEAN -15.095631 / STD: 1.324864


<a id='lr'></a>
# Logistic Regression

In [18]:
# Logistic regression (library defaults) fitted on standardised features.
lr = Pipeline(steps=[
    ("scl", StandardScaler()),
    ("clf", LogisticRegression()),
])

In [19]:
%time scores_lr = cross_val_score(estimator= lr, X=X_train, y=Y_train, cv=3, scoring='log_loss')

Wall time: 10min 36s


In [20]:
# Per-fold scores, then their mean and standard deviation.
print("SCORES %s" % (scores_lr,))
print("MEAN %f / STD: %f" % (scores_lr.mean(), scores_lr.std()))

SCORES [-2.65541547 -2.65497172 -2.65567417]
MEAN -2.655354 / STD: 0.000290


<a id='bnb'></a>
# Bernoulli Naive Bayes

In [26]:
# Bernoulli naive Bayes (library defaults) fitted on standardised features.
bnb = Pipeline(steps=[
    ("scl", StandardScaler()),
    ("clf", BernoulliNB()),
])

In [27]:
%time scores_bnb = cross_val_score(estimator= bnb, X=X_train, y=Y_train, cv=3, scoring='log_loss')

Wall time: 43.8 s


In [28]:
# Per-fold scores, then their mean and standard deviation.
print("SCORES %s" % (scores_bnb,))
print("MEAN %f / STD: %f" % (scores_bnb.mean(), scores_bnb.std()))

SCORES [-2.63956975 -2.63966411 -2.63930682]
MEAN -2.639514 / STD: 0.000151


<a id='meta_classifier'></a>
# Meta Classifier

In [35]:
# Soft-voting ensemble: averages the class probabilities of the three
# pipelines defined above and of a shallow random forest, equally weighted.
# The forest is seeded so that the ensemble is reproducible across runs.
meta_clf = VotingClassifier(
    estimators=[
        ('lr', lr),
        ('sgd', sgd),
        ('bnb', bnb),
        ("rfc", RandomForestClassifier(min_samples_split=2, n_estimators=10,
                                       criterion="entropy", max_depth=5,
                                       random_state=0)),
    ],
    voting="soft",
    weights=[0.25, 0.25, 0.25, 0.25],
)

In [None]:
%time scores_meta = cross_val_score(estimator= meta_clf, X=X_train, y=Y_train, cv=3, scoring='log_loss')

In [None]:
# Per-fold scores, then their mean and standard deviation.
print("SCORES %s" % (scores_meta,))
print("MEAN %f / STD: %f" % (scores_meta.mean(), scores_meta.std()))

In [None]:
%time Y_pred_kaggle = SFCrimesUtils.predict(meta_clf, TEST_FILE, sc)

In [None]:
TARGET_CSV_NAME = os.path.join(DATA_DIR, "submission_meta_clf.csv")
%time SFCrimesUtils.generate_csv(TARGET_CSV_NAME, SUBMISSION_FILE, meta_clf, Y_pred_kaggle)