In [6]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
%matplotlib notebook

In [3]:
# Directory layout, relative to the notebook's working directory.
# os.path.join is used instead of a hard-coded ".\\" prefix so the same
# notebook resolves its files on non-Windows hosts as well.
DATA_DIR = os.path.join(".", "data")
SUBMISSION_DIR = os.path.join(".", "submission")

# File names.
TRAINING_FILE_NAME = "train.csv"
TEST_FILE_NAME = "test.csv"
SUBMISSION_FILE_NAME = "sampleSubmission.csv"

# Full paths.
TRAINING_FILE_FULL_NAME = os.path.join(DATA_DIR, TRAINING_FILE_NAME)
TEST_FILE_FULL_NAME = os.path.join(DATA_DIR, TEST_FILE_NAME)
# NOTE(review): the sample submission is looked up in DATA_DIR, not in
# SUBMISSION_DIR -- presumably it is the template file; confirm.
SUBMISSION_FILE_FULL_NAME = os.path.join(DATA_DIR, SUBMISSION_FILE_NAME)

# Fraction of the training rows held out by the model cells, which read
# TEST_RATIO when splitting the data. It was not defined anywhere, so the
# notebook could not run on a fresh kernel.
# NOTE(review): 0.3 is an assumed value -- confirm the ratio that was
# actually used for the recorded results.
TEST_RATIO = 0.3

In [None]:
class Processor(object):
    """Namespace of stateless helpers used by the model cells.

    Every method is a ``@staticmethod``: loading and encoding the CSV data,
    splitting and scaling it, writing a submission file and plotting a
    confusion matrix.
    """

    @staticmethod
    def get_df(fileName):
        """Read a CSV file and expand its ``Dates`` column into calendar parts.

        Parameters
        ----------
        fileName : str
            Path of a CSV file containing a ``Dates`` column.

        Returns
        -------
        pandas.DataFrame
            The file content plus ``year``, ``month``, ``day``, ``hour`` and
            ``minute`` columns derived from ``Dates``.
        """
        df = pd.read_csv(fileName, parse_dates=["Dates"])
        dates = df["Dates"].dt
        for part in ("year", "month", "day", "hour", "minute"):
            df[part] = getattr(dates, part)
        return df

    @staticmethod
    def format_df(df):
        """Build the feature frame from a frame shaped like ``get_df``'s output.

        ``DayOfWeek`` is one-hot encoded; ``year``, ``month``, ``hour``,
        ``minute``, ``X`` and ``Y`` are copied unchanged. ``Category`` is
        copied only when the input has it. The ``day`` column is not carried
        over.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with at least the columns named above.

        Returns
        -------
        pandas.DataFrame
            New frame; column order is dummies, year, month, hour, minute,
            [Category], X, Y.
        """
        df_train = pd.get_dummies(df[["DayOfWeek"]])
        for name in ("year", "month", "hour", "minute"):
            df_train[name] = df[name]
        if "Category" in df.columns:
            df_train["Category"] = df["Category"]
        for name in ("X", "Y"):
            df_train[name] = df[name]
        return df_train

    @staticmethod
    def get_train_test_data(df, size=0, rdm_state=0):
        """Split a formatted frame into train/test feature and target arrays.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame containing a ``Category`` column (the target); every other
            column is used as a feature.
        size : float or int
            Forwarded as ``test_size`` to ``train_test_split``.
            NOTE(review): the default of 0 is forwarded as-is; callers are
            expected to pass an explicit ratio.
        rdm_state : int
            Forwarded as ``random_state`` to ``train_test_split``.

        Returns
        -------
        tuple
            ``(X_train, X_test, Y_train, Y_test)``.
        """
        Y = df["Category"].values
        X = df.drop(labels="Category", axis=1).values
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=size, random_state=rdm_state)
        return (X_train, X_test, Y_train, Y_test)

    @staticmethod
    def scale_data(X, sc=None, fit=True):
        """Standardise ``X`` and return the scaler that was used.

        Parameters
        ----------
        X : array-like
            Data to transform.
        sc : scaler or None
            Scaler to reuse; a new ``StandardScaler`` is created when None.
        fit : bool
            When true the scaler is fitted on ``X`` before transforming. With
            ``fit=False`` the scaler passed in ``sc`` must already be fitted.

        Returns
        -------
        tuple
            ``(X_scaled, scaler)``.
        """
        if sc is None:
            sc = StandardScaler()
        if fit:
            sc.fit(X)
        return sc.transform(X), sc

    @staticmethod
    def generate_csv(csv_target_name, csv_template_name, Y):
        """Write a one-hot submission file shaped like a template CSV.

        The template's first column is used as the index and its remaining
        column names as the set of categories. Row ``k`` of the output has a
        1 in the column named ``Y[k]`` and 0 everywhere else. Rows are
        matched to ``Y`` by position, so the template's Ids do not have to
        start at 0.

        Parameters
        ----------
        csv_target_name : str
            Path of the CSV file to write.
        csv_template_name : str
            Path of the template CSV.
        Y : sequence
            Predicted category for each template row, in row order. Entries
            beyond the number of template rows are ignored.

        Raises
        ------
        IndexError
            If ``Y`` has fewer entries than the template has rows.
        ValueError
            If an entry of ``Y`` is not one of the template's columns.
        """
        df_sub = pd.read_csv(csv_template_name, index_col=0)
        columns = df_sub.columns
        n_rows = len(df_sub)

        predicted = np.asarray(Y)[:n_rows]
        if len(predicted) < n_rows:
            raise IndexError(
                "Y has %d entries but the template has %d rows"
                % (len(predicted), n_rows))

        # Column position of every predicted label; -1 marks an unknown one.
        col_positions = columns.get_indexer(predicted)
        unknown = col_positions < 0
        if unknown.any():
            raise ValueError(
                "%r is not a column of the template" % (predicted[unknown][0],))

        # Build the whole one-hot matrix at once instead of assigning the
        # frame row by row, which is very slow on large templates.
        one_hot = np.zeros((n_rows, len(columns)), dtype=int)
        one_hot[np.arange(n_rows), col_positions] = 1

        result = pd.DataFrame(one_hot, index=df_sub.index, columns=columns)
        result.to_csv(csv_target_name)

    @staticmethod
    def plot_conf_matrix(Y_test, Y_pred):
        """Draw the confusion matrix of ``Y_pred`` against ``Y_test``.

        The rows and columns are the sorted unique values of ``Y_test``. The
        figure is created through the pyplot state machine; nothing is
        returned.

        Parameters
        ----------
        Y_test : array-like
            True labels.
        Y_pred : array-like
            Predicted labels.
        """
        labels = np.unique(Y_test)
        # ``labels`` is keyword-only in current scikit-learn releases.
        cm = confusion_matrix(Y_test, Y_pred, labels=labels)
        plt.figure(figsize=(15, 15))
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title('Confusion matrix')
        plt.colorbar()
        tick_marks = np.arange(len(labels))
        plt.xticks(tick_marks, labels, rotation=90)
        plt.yticks(tick_marks, labels)
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        # Called last so the axis labels are included in the layout.
        plt.tight_layout()

## I Linear kernel SVM

In [None]:
p_svm = p_operations = Pipeline([('scl', StandardScaler()),
                                 ('clf', SVC(kernel="linear",C=1.0, random_state=0))
                                ])
df_train = Processor.get_df(TRAINING_FILE_FULL_NAME)
df_train = Processor.format_df(df_train)
%time X_train , X_test, Y_train, Y_test = Processor.get_train_test_data(df_train,TEST_RATIO,0)
%time scores = cross_val_score(estimator=p_svm, X=X_train, y=Y_train, cv=3, n_jobs=1)
print("acuracy %s", scores)
print("mean %f / std: %f", np.mean(scores),np.std(scores))

## II SVM with RBF kernel

In [12]:
p_svm_rbf = p_operations = Pipeline([('scl', StandardScaler()),
                                     ('clf', SVC(kernel="rbf",C=10.0, gamma=0.1, random_state=0))
                                    ])
df_train = Processor.get_df(TRAINING_FILE_FULL_NAME)
df_train = Processor.format_df(df_train)
%time X_train , X_test, Y_train, Y_test = Processor.get_train_test_data(df_train,TEST_RATIO,0)
%time scores_svm_rbf = cross_val_score(estimator=p_svm_rbf, X=X_train, y=Y_train, cv=3, n_jobs=1)
print("acuracy %s", scores_svm_rbf)
print("mean %f / std: %f", np.mean(scores_svm_rbf),np.std(scores_svm_rbf))

Wall time: 301 ms
('Missclassifications: %d', 233587)
('Acuracy: %f', 0.11323576865402502)


## III Linear SVM

In [None]:
p_svm_linear = p_operations = Pipeline([('scl', StandardScaler()),
                                        ('clf', SVC(C=1))
                                       ])
df_train = Processor.get_df(TRAINING_FILE_FULL_NAME)
df_train = Processor.format_df(df_train)
%time X_train , X_test, Y_train, Y_test = Processor.get_train_test_data(df_train,TEST_RATIO,0)
%time scores_svm_linear= cross_val_score(estimator=p_svm_linear, X=X_train, y=Y_train, cv=3, n_jobs=1)
print("acuracy %s", scores_svm_linear)
print("mean %f / std: %f", np.mean(scores_svm_linear),np.std(scores_svm_linear))