## Crime Classification

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#load dataset
train_df = pd.read_csv('train.csv',parse_dates=['Dates'],index_col=False)
test_df = pd.read_csv('test.csv',parse_dates=['Dates'],index_col=False)

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 878049 entries, 0 to 878048
Data columns (total 9 columns):
Dates         878049 non-null datetime64[ns]
Category      878049 non-null object
Descript      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
Resolution    878049 non-null object
Address       878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: datetime64[ns](1), float64(2), object(6)
memory usage: 67.0+ MB


In [6]:
train_df = train_df.drop(['Descript', 'Resolution', 'Address'], axis = 1)

In [7]:
test_df = test_df.drop(['Address'], axis = 1)

In [8]:
def feature_engineering(data):
    data['Day'] = data['Dates'].dt.day
    data['Month'] = data['Dates'].dt.month
    data['Year'] = data['Dates'].dt.year
    data['Hour'] = data['Dates'].dt.hour
    data['Minute'] = data['Dates'].dt.minute
    data['DayOfWeek'] = data['Dates'].dt.dayofweek
    data['WeekOfYear'] = data['Dates'].dt.weekofyear
    return data

In [9]:
train_df = feature_engineering(train_df)

In [10]:
test_df = feature_engineering(test_df)

In [11]:
from sklearn.preprocessing import LabelEncoder

In [13]:
enc = LabelEncoder()
train_df['PdDistrict'] = enc.fit_transform(train_df['PdDistrict'])

In [14]:
category_encoder = LabelEncoder()
category_encoder.fit(train_df['Category'])
train_df['CategoryEncoded'] = category_encoder.transform(train_df['Category'])
print(category_encoder.classes_)

['ARSON' 'ASSAULT' 'BAD CHECKS' 'BRIBERY' 'BURGLARY' 'DISORDERLY CONDUCT'
 'DRIVING UNDER THE INFLUENCE' 'DRUG/NARCOTIC' 'DRUNKENNESS' 'EMBEZZLEMENT'
 'EXTORTION' 'FAMILY OFFENSES' 'FORGERY/COUNTERFEITING' 'FRAUD' 'GAMBLING'
 'KIDNAPPING' 'LARCENY/THEFT' 'LIQUOR LAWS' 'LOITERING' 'MISSING PERSON'
 'NON-CRIMINAL' 'OTHER OFFENSES' 'PORNOGRAPHY/OBSCENE MAT' 'PROSTITUTION'
 'RECOVERED VEHICLE' 'ROBBERY' 'RUNAWAY' 'SECONDARY CODES'
 'SEX OFFENSES FORCIBLE' 'SEX OFFENSES NON FORCIBLE' 'STOLEN PROPERTY'
 'SUICIDE' 'SUSPICIOUS OCC' 'TREA' 'TRESPASS' 'VANDALISM' 'VEHICLE THEFT'
 'WARRANTS' 'WEAPON LAWS']


In [15]:
enc = LabelEncoder()
test_df['PdDistrict'] = enc.fit_transform(test_df['PdDistrict'])

In [17]:
print(train_df.columns)
print(test_df.columns)

Index([u'Dates', u'Category', u'DayOfWeek', u'PdDistrict', u'X', u'Y', u'Day',
       u'Month', u'Year', u'Hour', u'Minute', u'WeekOfYear',
       u'CategoryEncoded'],
      dtype='object')
Index([u'Id', u'Dates', u'DayOfWeek', u'PdDistrict', u'X', u'Y', u'Day',
       u'Month', u'Year', u'Hour', u'Minute', u'WeekOfYear'],
      dtype='object')


In [19]:
x_cols = list(train_df.columns[2:12].values)
x_cols.remove('Minute')
print(x_cols)

['DayOfWeek', 'PdDistrict', 'X', 'Y', 'Day', 'Month', 'Year', 'Hour', 'WeekOfYear']


In [34]:
train_df.head()

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,X,Y,Day,Month,Year,Hour,Minute,WeekOfYear,CategoryEncoded
0,2015-05-13 23:53:00,WARRANTS,2,4,-122.425892,37.774599,13,5,2015,23,53,20,37
1,2015-05-13 23:53:00,OTHER OFFENSES,2,4,-122.425892,37.774599,13,5,2015,23,53,20,21
2,2015-05-13 23:33:00,OTHER OFFENSES,2,4,-122.424363,37.800414,13,5,2015,23,33,20,21
3,2015-05-13 23:30:00,LARCENY/THEFT,2,4,-122.426995,37.800873,13,5,2015,23,30,20,16
4,2015-05-13 23:30:00,LARCENY/THEFT,2,5,-122.438738,37.771541,13,5,2015,23,30,20,16


In [22]:
from sklearn.cross_validation import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(train_df[x_cols], train_df['CategoryEncoded'], 
                                                    test_size=0.3, random_state=42)

In [35]:
from sklearn.ensemble import RandomForestClassifier
clf_rfc = RandomForestClassifier(n_estimators = 10)

from sklearn.ensemble import AdaBoostClassifier
clf_ada = AdaBoostClassifier(n_estimators = 10)

from sklearn.tree import DecisionTreeClassifier
clf_dtc = DecisionTreeClassifier()

In [39]:
clf_rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [40]:
preds=clf_rfc.predict(X_test)

In [41]:
from sklearn.metrics import accuracy_score
accuracy_score(preds,y_test)

0.26787768350321733

In [42]:
test_df['predictions'] = clf_rfc.predict(test_df[x_cols])

In [43]:
def field_to_columns(data, field, new_columns):
    for i in range(len(new_columns)):
        data[new_columns[i]] = (data[field] == new_columns[i]).astype(int)
    return data


In [44]:
test_df['Category'] = category_encoder.inverse_transform(test_df['predictions'])

In [45]:
categories = list(category_encoder.classes_)

In [46]:
test_df = field_to_columns(test_df, 'Category', categories)

In [47]:
import time
PREDICTIONS_FILENAME_PREFIX = 'predictions_'
PREDICTIONS_FILENAME = PREDICTIONS_FILENAME_PREFIX + time.strftime('%Y%m%d-%H%M%S') + '.csv'

In [48]:
submission_cols = [test_df.columns[0]]+list(test_df.columns[14:])
print(PREDICTIONS_FILENAME)
test_df[submission_cols].to_csv(PREDICTIONS_FILENAME, index = False)

predictions_20160426-014501.csv
