In [616]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict
import pylab as pl
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [617]:
data = pd.read_excel("C:\Users\Niku\Documents\ML Project\data7.5k.xlsx")
print data.shape
data['Date'] = pd.to_datetime(data['Date'])
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['DateOnly'] = data['Date'].dt.day
data_frame = data[['DayOfWeek', 'PdDistrict', 'X', 'Y', 'Year', 'Month', 'DateOnly', 'Time', 'Category']]
print data.shape

(75000, 13)
(75000, 16)


In [618]:
Categories = list(enumerate(sorted(np.unique(data_frame['Category']))))
DaysOfWeeks = list(enumerate(sorted(np.unique(data_frame['DayOfWeek']))))
PdDistricts = list(enumerate(sorted(np.unique(data_frame['PdDistrict']))))
CategoriesDict = {name: i for i, name in Categories}
DaysOfWeeksDict = {name: i for i, name in DaysOfWeeks}
PdDistrictsDict = {name: i for i, name in PdDistricts}
data_frame.Category = data_frame.Category.map(lambda x: CategoriesDict[x]).astype(int)
data_frame.DayOfWeek = data_frame.DayOfWeek.map(lambda x: DaysOfWeeksDict[x]).astype(int)
data_frame.PdDistrict = data_frame.PdDistrict.map(lambda x: PdDistrictsDict[x]).astype(int)
data_frame.X = data_frame.X.map(lambda x: "%.2f" % round(x, 2)).astype(float)
data_frame.Y = data_frame.Y.map(lambda x: "%.2f" % round(x, 2)).astype(float)
print data_frame.head()

   DayOfWeek  PdDistrict       X      Y  Year  Month  DateOnly      Time  \
0          6           0 -122.40  37.72  2003      1         1  00:30:00   
1          6           0 -122.38  37.73  2003      1         1  00:01:00   
2          6           2 -122.45  37.71  2003      1         1  00:01:00   
3          6           0 -122.39  37.73  2003      1         1  07:18:00   
4          6           8 -122.48  37.76  2003      1         1  00:01:00   

   Category  
0        16  
1        13  
2        12  
3        35  
4        31  


In [619]:
X = data_frame[['DayOfWeek', 'PdDistrict', 'X', 'Y', 'Year', 'Month', 'DateOnly']]
y = data_frame[['Category']]
X = np.matrix(X)
y = np.matrix(y)

In [620]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state=1)
print X_train.shape
print X_test.shape
print y_test.shape
print type(y_train)

(37500L, 7L)
(37500L, 7L)
(37500L, 1L)
<class 'numpy.matrixlib.defmatrix.matrix'>


In [621]:
def g(z):
    temp = 1 + np.exp(-z)
    temp2 = 1/temp
    return temp2

In [622]:
def gradient(theta, X, y, alpha):
    X = np.matrix(X)
    y = np.matrix(y)
    theta = np.matrix(theta)
    hofx = g(X*theta.T)
    err = hofx - y
    term1 = ((X.T * err) / len(X)).T 
    term2 = ((alpha / len(X)) * theta)
    grad = term1 + term2
    zero_intercept = np.multiply(err, X[:,0])
    grad[0, 0] = np.sum(zero_intercept)
    return np.array(grad).ravel()

In [623]:
def costFunction(theta, X, y, alpha):
    theta = np.matrix(theta)
    #hofx = g(X*theta.T)
    #term1 = np.multiply(y, np.log(hofx))
    #term2 = np.multiply((1-y), np.log(1-hofx))
    #par_to_regul = theta[:,1:theta.shape[1]]
    #sqr_term = np.power(par_to_regul, 2)
    #regul_term = alpha*len(X)*np.sum(sqr_term)
    #cost = np.sum(((-term1)+(-term2))/len(X)) + regul_term
    first = np.multiply(-y, np.log(g(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - g(X * theta.T)))
    reg = (alpha / 2 * len(X)) * np.sum(np.power(theta[:,1:theta.shape[1]], 2))
    return np.sum(first - second) / (len(X)) + reg
    #return cost

In [624]:
from scipy.optimize import minimize

def one_vs_all(X, y, num_labels, learning_rate):
    rows = X.shape[0]
    params = X.shape[1]
    
    # k X (n + 1) array for the parameters of each of the k classifiers
    all_theta = np.zeros((num_labels, params + 1))
    
    # insert a column of ones at the beginning for the intercept term
    X = np.insert(X, 0, values=np.ones(rows), axis=1)
    
    # labels are 1-indexed instead of 0-indexed
    for i in range(1, num_labels + 1):
        theta = np.zeros(params + 1)
        y_i = np.array([1 if label == i else 0 for label in y])
        y_i = np.reshape(y_i, (rows, 1))
        
        # minimize the objective function
        fmin = minimize(fun=costFunction, x0=theta, args=(X, y_i, learning_rate), method='TNC', jac=gradient)
        all_theta[i-1,:] = fmin.x
    
    return all_theta

In [625]:
rows = X.shape[0]
params = X.shape[1]
y = np.matrix(y)
all_theta = np.zeros((31, params + 1))

X = np.insert(X, 0, values=np.ones(rows), axis=1)

theta = np.zeros(params + 1)

y_0 = np.array([1 if label == 0 else 0 for label in y])
y_0 = np.reshape(y_0, (rows, 1))

X.shape, y_0.shape, theta.shape, all_theta.shape

((75000L, 8L), (75000L, 1L), (8L,), (31L, 8L))

In [626]:
all_theta = one_vs_all(X_train, y_train, y_train.max(), 1)

  from ipykernel import kernelapp as app


In [627]:
def predict_all(X, all_theta):
    rows = X.shape[0]
    params = X.shape[1]
    num_labels = all_theta.shape[0]
    
    # same as before, insert ones to match the shape
    X = np.insert(X, 0, values=np.ones(rows), axis=1)
    
    # convert to matrices
    X = np.matrix(X)
    all_theta = np.matrix(all_theta)
    
    # compute the class probability for each class on each training instance
    h = g(X * all_theta.T)
    
    # create array of the index with the maximum probability
    h_argmax = np.argmax(h, axis=1)
    
    # because our array was zero-indexed we need to add one for the true label prediction
    h_argmax = h_argmax + 1
    
    return h_argmax

In [628]:
y_pred = predict_all(X_test, all_theta)
correct = [1 if a == b else 0 for (a, b) in zip(y_pred, y_test)]
accuracy = (sum(map(int, correct)) / float(len(correct)))
print 'accuracy = {0}%'.format(accuracy * 100)

accuracy = 17.9093333333%
