# Let's Combine Models!

### Joe and Keenan

In this notebook, we are going to try combining our two most successful models in order to get a better score

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import log_loss
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
import numpy as np
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier

%matplotlib inline

#Load Data with pandas, and parse the first column into datetime
train=pd.read_csv('train.csv', parse_dates = ['Dates'])
test=pd.read_csv('test.csv', parse_dates = ['Dates'])

# Keep only the ones that are within +3 to -3 standard deviations in the column 'Data'.
train = train[np.abs(train.X-train.X.mean())<=(3*train.X.std())] 
train = train[np.abs(train.Y-train.Y.mean())<=(3*train.Y.std())] 

train.head()


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
#Convert crime labels to numbers
le_crime = preprocessing.LabelEncoder()
crime = le_crime.fit_transform(train.Category)
 
#Get binarized weekdays, districts, and hours.
days = pd.get_dummies(train.DayOfWeek)
district = pd.get_dummies(train.PdDistrict)
hour = train.Dates.dt.hour
hour = pd.get_dummies(hour) 
x = train.X
y = train.Y
 
#Build new array
train_data = pd.concat([hour, days, district, x, y], axis=1)
train_data['crime']=crime
 
#Repeat for test data
days = pd.get_dummies(test.DayOfWeek)
district = pd.get_dummies(test.PdDistrict)
 
hour = test.Dates.dt.hour
hour = pd.get_dummies(hour) 
x = test.X
y = test.Y
 
test_data = pd.concat([hour, days, district, x, y], axis=1)

print train_data.head()
print test_data.head()

   0  1  2  3  4  5  6  7  8  9  ...    MISSION  NORTHERN  PARK  RICHMOND  \
0  0  0  0  0  0  0  0  0  0  0  ...          0         1     0         0   
1  0  0  0  0  0  0  0  0  0  0  ...          0         1     0         0   
2  0  0  0  0  0  0  0  0  0  0  ...          0         1     0         0   
3  0  0  0  0  0  0  0  0  0  0  ...          0         1     0         0   
4  0  0  0  0  0  0  0  0  0  0  ...          0         0     1         0   

   SOUTHERN  TARAVAL  TENDERLOIN           X          Y  crime  
0         0        0           0 -122.425892  37.774599     37  
1         0        0           0 -122.425892  37.774599     21  
2         0        0           0 -122.424363  37.800414     21  
3         0        0           0 -122.426995  37.800873     16  
4         0        0           0 -122.438738  37.771541     16  

[5 rows x 44 columns]
   0  1  2  3  4  5  6  7  8  9    ...      INGLESIDE  MISSION  NORTHERN  \
0  0  0  0  0  0  0  0  0  0  0    ...          

In [33]:
features = ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
 'Wednesday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION',
 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'X', 'Y']

# Add in hours of the day into the features
features2 = [x for x in range(0,24)]
features = features + features2

print features

['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'X', 'Y', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]


In [7]:
training, validation = train_test_split(train_data, train_size=.5)

model = BernoulliNB()
model.fit(training[features], training['crime'])
predicted = np.array(model.predict_proba(validation[features]))
print "Naive Bayes: " , log_loss(validation['crime'], predicted) 

Naive Bayes:  2.58264260269


In [44]:
# algorithms = [BernoulliNB(), features, LogisticRegression(C=.01), features]
features = features[0:7]
print features
algorithms = [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), features, LogisticRegression(random_state=1), features]
full_predictions = []

for alg, features in algorithms:
    alg.fit(training[features], training['crime'])
    predictions = np.array(alg.predict_proba(validation[features]))[:,1]
    full_predictions.append(predictions)
    
print full_predictions




['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']


ValueError: need more than 0 values to unpack