# Model Iteration 1 
To start, we are going to implement a linear regression set for the data. Then we will iterate, and improve, the model.

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

data = pd.read_csv("train.csv")

First things first, we are going to attempt to use dummy variables to define new columns for more of our data. Using dummies instead of reassignment could increase precision (maybe?).

In [None]:
dummy_predictors = ["Monday", "Tuesday"]

In [13]:
dow = {
    "Monday" : 0,
    "Tuesday" : 1,
    "Wednesday" : 2,
    "Thursday" : 3,
    "Friday" : 4,
    "Saturday" : 5,
    "Sunday" : 6
}
data["DOW"] = data.DayOfWeek.map(dow)

Next step is similar, but now with Police Districts so we can just deal with ints instead of strings.

In [14]:
pds = {
    "SOUTHERN" : 0,
    "MISSION" : 1,
    "NORTHERN" : 2,
    "BAYVIEW" : 3,
    "CENTRAL" : 4,
    "TERNDERLOIN" : 5,
    "INGLESIDE" : 6,
    "TARAVAL" : 7,
    "PARK" : 8,
    "RICHMOND" : 9
}
data["pds"] = data.PdDistrict.map(pds)
# for crimes without PD, use "Other" : 10
data["pds"] = data["pds"].fillna(10)

Next, we are going to use X and Y coordinates as they are pretty continuous. There are some outlying data points, so let's clean that up. First, we check for those outlying data points (which we happen to know are at the north pole).

In [15]:
for val in data.Y:
    if val == 90.0:
        print val
        
for val in data.X:
    if val == -120.5:
        print val

90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5
-120.5


Then we replace those points for more reasonable numbers. Running the code below, then the code above will produce 0 outlying values (because we got rid of them).

In [16]:
data.X.replace(-120.5, data["X"].median(), inplace = True)
data.Y.replace(90, data["Y"].median(), inplace = True)

The data cleaning train has no brakes. Now we are going to clean up date/time.

In [17]:
def parse_date(Dates):
    """ Convert a date in YYYY-MM-DD HH:MM:SS to a tuple
        containing year, month, day, and hours each expressed
        as an integer. Used from Paul Ruvolo's example in bikeshare kaggle dataset

        >>> parse_date("2014-04-05 14:00:00")
        (2014, 4, 5, 14)
    """
    return int(Dates[0:4]), int(Dates[5:7]), int(Dates[8:10]), int(Dates[11:13])

# "now we use the lambda functions" - Mack
data["Year"] = data.Dates.apply(lambda x: parse_date(x)[0])
data["Month"] = data.Dates.apply(lambda x: parse_date(x)[1])
data["Hour"] = data.Dates.apply(lambda x: parse_date(x)[3])

And so now we are going to do the predictive thingamajig. We drew inspiration from this script (https://www.kaggle.com/sonuk7/sf-crime/prediction-with-bernoulinb/code)

In [18]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn import cross_validation

cats = data.Category.values
cleanData = data.drop(["Address","Category","Dates","Descript","X", "Y","Resolution", "DayOfWeek", "PdDistrict"], axis=1)

model = GaussianNB()
# model = BernoulliNB()
model.fit(cleanData.dropna(), cats)

scores = cross_validation.cross_val_score(model, cleanData, data["Category"], cv = 3)
print scores

[ 0.19815578  0.2056471   0.08445377]


Wow that's a bad score :\ I think? Not sure really. Let's generate a test file starting with clean test data to mimic our cleaned training data


In [19]:
import gzip, csv

test = pd.read_csv('test.csv')
test["DOW"] = test.DayOfWeek.map(dow)

test["pds"] = test.PdDistrict.map(pds)
# for crimes without PD, use "Other" : 10
test["pds"] = test["pds"].fillna(10)

test.X.replace(-120.5, test["X"].median(), inplace = True)
test.Y.replace(90, test["Y"].median(), inplace = True)

test["Year"] = test.Dates.apply(lambda x: parse_date(x)[0])
test["Month"] = test.Dates.apply(lambda x: parse_date(x)[1])
test["Hour"] = test.Dates.apply(lambda x: parse_date(x)[3])
                                
idx = test.Id.values

cleanTest = test.drop(["Id","Address","Dates","X", "Y", "DayOfWeek", "PdDistrict"], axis=1)

# model = BernoulliNB()
model = GaussianNB()
model.fit(cleanData.dropna(), cats)

predicted = model.predict_proba(cleanTest)
#labels =  "ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS".split(',')
labels =['Id']
for i in model.classes_:
    labels.append(i)
with gzip.open('bernoulinb.csv.gz', 'wb') as outf:
    fo =csv.writer(outf, lineterminator = '\n' )
    fo.writerow(labels)
    
    for i, pred in enumerate(predicted):
        fo.writerow([i] + list(pred))
predictions = model.predict(cleanTest)

# submission = pd.DataFrame({
#        "Id" : cleanTest["Id"],
#        "Category" : predictions
#    })

# submission.to_csv("CYOA1.csv", index=False)

KeyError: 'Id'