In [47]:
%matplotlib inline
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
import scipy.stats as st
import random
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from patsy import dmatrix, dmatrices
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import log_loss
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.cross_validation import StratifiedKFold

In [69]:
cur_dir = os.path.dirname('__file__')

train = pd.read_csv(os.path.join(cur_dir, "data", "train.csv"))
test = pd.read_csv(os.path.join(cur_dir, "data", "test.csv"))
#mapdata = np.loadtxt(os.path.join(cur_dir, "data", "sf_map.txt"))

In [9]:
def get_random_subset(df, n=5000):
    sub = random.sample(xrange(len(df)), min(n, len(df)))
    return df.iloc[sub]

def preprocess(df):
    res = df.copy()
    res = res[res.X != res.X.max()]
    datetimes = res.Dates.apply(get_datetime)
    res['Hour'] = datetimes.apply(lambda dt: dt.hour)
    res['Month'] = datetimes.apply(lambda dt: dt.month)
    res['Hour_Minutes'] = datetimes.apply(lambda dt: dt.hour + dt.minute / 60.0)
    res['Minutes_Since_03'] = datetimes.apply(lambda dt: (dt-datetime(2003, 1, 1)).total_seconds() / 60)
    res['Minutes_Since_New_Year'] = datetimes.apply(lambda dt: (dt-datetime(dt.year, 1, 1)).total_seconds() / 60)
    res['DOW'] = train.DayOfWeek.apply(lambda x: dow.index(x))
    res['Street_Corner'] = res['Address'].apply(lambda x: 1 if '/' in x else 0)
    return res

def get_datetime(s):
    dt = datetime.strptime(s, "%Y-%m-%d %H:%M:%S")
    return dt

dow = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def isNight(hour):
    if hour in [0, 1, 2, 3, 4, 5, 6, 19, 20, 21, 22, 23]:
        return "Night"
    else:
        return "Day"

In [43]:
def train_models(models, x_train, y_train):
    for m in models:
        m.fit(x_train, y_train)
        
def get_x_y_matrices(df, formula, target):
    
    x_vals = dmatrix(formula, data=df, return_type='dataframe')
    y_vals = df[target]

    return filter_infrequent(x_vals, y_vals)
    
def filter_infrequent(x, y, threshold=3):
    counts = y.value_counts()
    for cat, count in counts.iteritems():
        if count < 3:
            x = x[y != cat]
            y = y[y != cat]
            
    return x, y

In [67]:
class ProbaRandomForestClassifier(RandomForestClassifier):
    def predict(self, X):
        return self.predict_proba(X)
        
class ProbaLogisticRegression(LogisticRegression):
    def predict(self, X):
        return self.predict_proba(X)

In [10]:
train_df = preprocess(get_random_subset(train, 50000))
# train_df = preprocess(train)

In [None]:
formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'

x_vals = dmatrix(formula_ml, data=train_df, return_type='dataframe')
y_vals = train_df.Category
min_leaves = np.round_(np.logspace(1, 3.5, num=6)) # values from 10 to 3100

parameters = {'min_samples_leaf':min_leaves}

clf = GridSearchCV(RandomForestClassifier(), parameters, scoring='log_loss')

clf.fit(x_vals, y_vals)

for f in clf.grid_scores_:
    print f

In [None]:
formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'

x_vals = dmatrix(formula_ml, data=train_df, return_type='dataframe')
y_vals = train_df.Category

vals = y_vals.value_counts()

for index, value in vals.iteritems():
    if value < 3:
        x_vals = x_vals[y_vals != index]
        y_vals = y_vals[y_vals != index]
    

Cs = np.logspace(-4, 4, 9)

parameters = {'C':Cs}

clf = GridSearchCV(LogisticRegression(), parameters, scoring='log_loss')

clf.fit(x_vals, y_vals)

for f in clf.grid_scores_:
    print f

In [None]:
for f in lg.C_:
    print f

In [36]:
import numpy as np
from sklearn.cross_validation import train_test_split

formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'
training, validation = train_test_split(train_df, train_size=.60)
x_train = dmatrix(formula_ml, data=training)
y_train = training.Category
x_vals = dmatrix(formula_ml, data=train_df, return_type='dataframe')
y_vals = train_df.Category

#print "X_train"
#print x_train
#print "y_train"
#print y_train
#print "X_test"
#print x_vals
#print "y_test"
print y_vals


874473                 OTHER OFFENSES
476903                    DRUNKENNESS
231569                       BURGLARY
718600                  VEHICLE THEFT
404343                        ROBBERY
693644                 OTHER OFFENSES
501631                 OTHER OFFENSES
677702                        ASSAULT
433030                 OTHER OFFENSES
550500                 OTHER OFFENSES
456631                        ASSAULT
182996                  DRUG/NARCOTIC
260023                    WEAPON LAWS
313872                  LARCENY/THEFT
236883                   NON-CRIMINAL
368391                  LARCENY/THEFT
139474                   NON-CRIMINAL
853606                  LARCENY/THEFT
252541                  LARCENY/THEFT
309341                 MISSING PERSON
193629                  VEHICLE THEFT
284984                        ASSAULT
153323                        ASSAULT
662383                   NON-CRIMINAL
813852                  VEHICLE THEFT
795919                  VEHICLE THEFT
95238       

In [39]:
training, validation = train_test_split(train_df, train_size=.60)

formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'

#x_validation = dmatrix(formula_ml, data=validation, return_type='dataframe')
#y_validation = validation.Category

x_validation = x_vals[y_vals.isin(y_train.values)]

scores = []
predictions = []

alg = RandomForestClassifier(min_samples_leaf=1000)
alg.fit(x_train, y_train)
y_validation = validation.Category
y_validation = y_validation[y_validation.isin(y_train.values)]
mlb = MultiLabelBinarizer(classes=alg.classes_)
y_validation = mlb.fit_transform(np.array([y_validation]).T)
print 'xval shape'
print x_validation.shape
predictions = np.array(alg.predict_proba(x_validation))
print 'yval shape ' 
print y_vals.shape
print 'predictions shape'
print predictions.shape

score = log_loss(y_vals, predictions)
correctly_predicted = []
incorrectly_predicted = []

print predictions

#for i in y_vals:
#    if y_vals[i] == predictions[i]:
#        correctly_predicted.append(y_vals)
#    else:
#        incorrectly_preddicted.append(y_vals)
 


print "Score:"
print score
print "Correct"
print correctly_predicted
print "Incorrect"
print incorrectly_predicted

xval shape
(49998, 21)
yval shape 
(49998,)
predictions shape
(49998, 37)
[[  8.67519343e-04   9.38270587e-02   5.99304005e-04 ...,   2.98354831e-02
    6.95966312e-02   1.00060140e-02]
 [  6.10715257e-04   7.08819186e-02   7.36100490e-05 ...,   7.47285559e-02
    5.62110565e-02   1.05121221e-02]
 [  1.15968952e-03   8.51756706e-02   4.02806065e-04 ...,   3.41000274e-02
    4.70512923e-02   7.47854661e-03]
 ..., 
 [  2.97791215e-04   6.20912780e-02   1.96460262e-04 ...,   6.64092453e-02
    4.97820562e-02   7.44324034e-03]
 [  2.16380366e-03   8.31998714e-02   4.40749673e-04 ...,   2.99980835e-02
    5.27462788e-02   7.23740974e-03]
 [  1.55807307e-03   9.55013788e-02   3.40205769e-04 ...,   7.26742190e-02
    3.97856710e-02   8.66926275e-03]]


KeyError: 'OTHER OFFENSES'

In [42]:
clf = GridSearchCV(RandomForestClassifier(), parameters, scoring='log_loss')

clf.fit(x_train, y_train)

for f in clf.grid_scores_:
    print f   

mean: -3.30316, std: 0.09639, params: {'min_samples_leaf': 10.0}
mean: -2.73423, std: 0.03827, params: {'min_samples_leaf': 32.0}
mean: -2.57255, std: 0.00132, params: {'min_samples_leaf': 100.0}
mean: -2.57350, std: 0.00424, params: {'min_samples_leaf': 316.0}
mean: -2.60365, std: 0.00290, params: {'min_samples_leaf': 1000.0}
mean: -2.65727, std: 0.00932, params: {'min_samples_leaf': 3162.0}


In [83]:
formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'

x_vals, y_vals = get_x_y_matrices(train_df, formula_ml, 'Category')

clf_forest = ProbaRandomForestClassifier(min_samples_leaf=300)

predictions = cross_val_predict(clf_forest, x_vals, y_vals)
crimes = y_vals.unique()
scores = []

for crime in crimes:
    predictdf = pd.DataFrame(predictions, index = y_vals.index)
    predictdf = predictdf[y_vals == crime]
    yvals_subset = y_vals[y_vals == crime]
    
    
    predictions_matrix = predictdf.as_matrix()
    print predictdf.shape()
    #print y_vals[y_vals == crime].shape
    #print y_vals.shape
    scores.append(log_loss(y_vals[y_vals == crime], predictdf.as_matrix()))
    

    
  
    
#plt(crimes, scores)

TypeError: 'tuple' object is not callable

In [71]:
import gzip, csv
ids = test.Id.values
print test.describe()

alg = RandomForestClassifier(min_samples_leaf=1000)
alg.fit(x_train, y_train)
x_validation = x_vals[y_vals.isin(y_train.values)]
predictions = np.array(alg.predict_proba(x_validation))


labels =['Id']
for i in alg.classes_:
    labels.append(i)
with gzip.open('sfcrime.csv.gz', 'wb') as outf:
    fo =csv.writer(outf, lineterminator = '\n' )
    fo.writerow(labels)
    
    for i, pred in enumerate(predictions):
        fo.writerow([i] + list(pred))

                  Id              X              Y
count  884262.000000  884262.000000  884262.000000
mean   442130.500000    -122.422693      37.771476
std    255264.596206       0.030985       0.484824
min         0.000000    -122.513642      37.707879
25%    221065.250000    -122.433069      37.752374
50%    442130.500000    -122.416517      37.775421
75%    663195.750000    -122.406959      37.784353
max    884261.000000    -120.500000      90.000000
