In [1]:
import numpy as np
import pandas as pd
import itertools
from __future__ import division
from sklearn.tree import tree, DecisionTreeClassifier, export_graphviz

%matplotlib inline
pd.set_option("display.max_columns", 500)
pd.set_option("max_rows", 1000)

In [2]:
filePath = 'datasets/NYPD_Motor_Vehicle_Collisions.csv'
colls = pd.read_csv(filePath)

In [8]:
def encode_column(df, target_column):
#     print "copying dataset"
    df_mod = df.copy()
#     print "finding uniques"
    targets = pd.Series(df_mod[target_column].unique())
#     print "mapping to ints"
    map_to_int = {name: n for n, name in enumerate(targets)}
#     print "replacing in data set"
    df_mod[target_column+"_encoded"] = df_mod[target_column].replace(map_to_int)
#     df_mod[target_column+"_encoded"] = df_mod[target_column].replace(targets[targets == target_column])
    return (df_mod, targets)

def train_tree(prediction, features, dataset):
    clf = tree.DecisionTreeClassifier()
    print "TRAINING WITH %d SAMPLES" % len(dataset) 
    X = np.array(dataset[features])
    Y = np.array(list(itertools.chain(*dataset[[prediction]].values)))
    return clf.fit(X, Y)

def test_tree(clf, test_data, features):
    return clf.predict(test_data[features])

def convert_encoded_district_to_str(preditions):
    return map(lambda p: districts[p], preditions)

def test_prediction(target_label, clf, test_data, features, encoded_map):
    corrects = 0
    predictions = test_tree(clf, test_data[features], features)
    for i in range(0, len(predictions)):
        if predictions[i] == test_data.iloc[i][target_label]:
            corrects += 1
    print "FOUND %d CORRECT PREDICTIONS" % corrects
    return corrects / len(predictions)

In [9]:
target_label = 'ZIP CODE'

data = colls[pd.notnull(colls[target_label])]

# Encoding target/label column
# mdata, target = encode_column(data, 'ZIP CODE')
mdata, target = encode_column(data, target_label)

# Encode the feature columns
mdata, _ = encode_column(mdata, 'BOROUGH')
mdata, _ = encode_column(mdata, 'CONTRIBUTING FACTOR VEHICLE 1')
mdata, _ = encode_column(mdata, 'VEHICLE TYPE CODE 1')


# Splitting date and time into month and hour
mdata['HOUR'] = data.TIME.str.split(':').str.get(0)
mdata['MONTH'] = data.DATE.str.split('/').str.get(1)

copying dataset
finding uniques
mapping to ints
replacing in data set
copying dataset
finding uniques
mapping to ints
replacing in data set
copying dataset
finding uniques
mapping to ints
replacing in data set
copying dataset
finding uniques
mapping to ints
replacing in data set


In [10]:
# Features for prediction
features = ['MONTH', 'HOUR', 'BOROUGH_encoded', 'VEHICLE TYPE CODE 1_encoded']

# Split data set into training and test data
train_data = mdata.head(int(mdata.BOROUGH.count() * 0.75))
test_data = mdata.tail(int(mdata.BOROUGH.count() * 0.25))

target_label_encoded = target_label+'_encoded'

# Train the decision tree
clf = train_tree(target_label_encoded, features, train_data)

# Test the decision tree
print "Prediction accuracy %f" % test_prediction(target_label_encoded, clf, test_data, features, target)

TRAINING WITH 438514 SAMPLES
FOUND 7309 CORRECT PREDICTIONS
Prediction accuracy 0.050003


# Decision Tree Log
## Finding ZIP CODE with HOUR, MONTH, BOROUGH
- TRAINING WITH 438514 SAMPLES
- FOUND 7884 CORRECT PREDICTIONS
- Prediction accuracy 0.053937
## Finding ZIP CODE with HOUR, MONTH, BOROUGH, CFV1, VTC1
- TRAINING WITH 526217 SAMPLES
- FOUND 3243 CORRECT PREDICTIONS
- Prediction accuracy 0.055466
## Finding VTC1 with HOUR, MONTH, BOROUGH, CFV1
- TRAINING WITH 438514 SAMPLES
- FOUND 77135 CORRECT PREDICTIONS
- Prediction accuracy 0.527704

In [None]:
colls[colls['DATE'] == "03/08/2015"].head(1000).sort_values(by='TIME')