In [1]:
import pandas as pd
import numpy as np
import zipfile

# Load the training set straight from the zipped CSV. The context managers
# close the archive and the extracted member once the frame has been read,
# instead of leaving both file handles open for the life of the kernel.
with zipfile.ZipFile('train.csv.zip') as z_train:
    with z_train.open('train.csv') as train_csv:
        train = pd.read_csv(train_csv, parse_dates=['Dates'], index_col=False)

In [2]:
def make_binary_fields(df, field):
    """
    Add one 0/1 indicator column to df for every distinct value of df[field].

    Each new column is named after the value it represents; it holds 1 on the
    rows where df[field] equals that value and 0 on all other rows.
    df is modified in place.

    returns the new field names as a list, in order of first appearance

    ex
    make_binary_fields(df, 'DayOfWeek')
    will create new fields
    Monday, Tuesday, Wednesday, Thursday, Friday, Saturday and Sunday
    where
    df['Monday'] will have value 1 for all Mondays and 0 for the rest
    """
    source = df[field]
    fields = []
    for new_field in source.unique():
        # Compare against the untouched source column and store the result as
        # integers. Overwriting a copy of the column with 0 and then 1 goes
        # wrong when a category value is itself 0 or 1, and leaves the new
        # column with a mixed object dtype.
        df[new_field] = (source == new_field).astype(int)
        fields.append(new_field)
    return fields

def time_trim(data):
    """
    Break the 'Dates' timestamp column into separate calendar columns.

    Adds Day, Month, Year, Hour, Minute and WeekOfYear (ISO week number) to
    data in place and returns the same DataFrame.

    DayOfWeek is deliberately not derived here.
    """
    stamps = data['Dates'].dt
    data['Day'] = stamps.day
    data['Month'] = stamps.month
    data['Year'] = stamps.year
    data['Hour'] = stamps.hour
    data['Minute'] = stamps.minute
    if hasattr(stamps, 'isocalendar'):
        # .dt.weekofyear was deprecated and later removed from pandas;
        # isocalendar().week is the replacement. It returns a nullable
        # unsigned dtype, so cast back to what weekofyear produced:
        # plain ints, or floats with NaN when some dates are missing.
        week = stamps.isocalendar().week
        week = week.astype('float64') if week.isna().any() else week.astype('int64')
    else:
        # Older pandas without isocalendar(): keep the original accessor.
        week = stamps.weekofyear
    data['WeekOfYear'] = week
    return data

In [3]:
def make_season(df):
    """
    Make new field name Season
    and binary fields for each season
    Has to happen after making 'Month' field
    spring: month 2, 3, 4
    summer: month 5, 6, 7
    autumn: month 8, 9, 10
    winter: month 11, 12, 1

    df is modified in place. Months outside 1-12 get no season label (NaN).

    returns the list of season field names created by make_binary_fields
    """
    # Translate every month in one pass. Writing season names into a copy of
    # the integer Month column and then comparing that half-converted column
    # against numbers relies on string-vs-int ordering, which only Python 2
    # tolerates.
    season_by_month = {
        11: 'Winter', 12: 'Winter', 1: 'Winter',
        2: 'Spring', 3: 'Spring', 4: 'Spring',
        5: 'Summer', 6: 'Summer', 7: 'Summer',
        8: 'Autumn', 9: 'Autumn', 10: 'Autumn',
    }
    df['Season'] = df['Month'].map(season_by_month)
    seasons = make_binary_fields(df, 'Season')
    return seasons

def convert_data_to_int(df, field):
    """
    Converts fields into numbers

    Every distinct value of df[field] is replaced by a 1-based integer code,
    numbered in order of first appearance. df is modified in place and
    nothing is returned. Missing values are left missing.
    """
    values = df[field]
    # Build the whole value -> code table from the original column before
    # writing anything. Assigning codes into the column while still matching
    # against it lets an already-assigned code be mistaken for a later raw
    # value (e.g. [2, 1] would end up as [2, 2]).
    codes = {value: code for code, value in enumerate(values.unique(), start=1)}
    df[field] = values.map(codes, na_action='ignore')
    return
        

In [None]:
# Add the calendar columns first (make_season reads 'Month'), then build the
# 0/1 indicator columns; each call returns the names of the columns it added.
train = time_trim(train)
seasons = make_season(train)
dow, pdd, categories = (
    make_binary_fields(train, column)
    for column in ('DayOfWeek', 'PdDistrict', 'Category')
)

In [None]:
# Model inputs: the numeric calendar columns, followed by the district,
# season and weekday indicator columns (in that order).
predictors = (
    ['Day', 'Month', 'Year', 'Hour', 'Minute', 'WeekOfYear']
    + list(pdd)
    + list(seasons)
    + list(dow)
)

In [30]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import LabelEncoder
import scipy as sp

In [7]:
# Expected crime-category labels in sorted order
# (presumably the class list used by the Kaggle SF-crime competition).
sfkaggle = ['ARSON','ASSAULT','BAD CHECKS','BRIBERY','BURGLARY','DISORDERLY CONDUCT',
            'DRIVING UNDER THE INFLUENCE','DRUG/NARCOTIC','DRUNKENNESS','EMBEZZLEMENT','EXTORTION',
            'FAMILY OFFENSES','FORGERY/COUNTERFEITING','FRAUD','GAMBLING','KIDNAPPING','LARCENY/THEFT',
            'LIQUOR LAWS','LOITERING','MISSING PERSON','NON-CRIMINAL','OTHER OFFENSES','PORNOGRAPHY/OBSCENE MAT',
            'PROSTITUTION','RECOVERED VEHICLE','ROBBERY','RUNAWAY','SECONDARY CODES','SEX OFFENSES FORCIBLE',
            'SEX OFFENSES NON FORCIBLE','STOLEN PROPERTY','SUICIDE','SUSPICIOUS OCC','TREA','TRESPASS','VANDALISM',
            'VEHICLE THEFT','WARRANTS','WEAPON LAWS']
# Sanity check: prints True when the sorted training labels match the list
# exactly. Written as a call so the cell runs on Python 2 and Python 3 alike.
print(sorted(train.Category.unique()) == sfkaggle)

True


In [23]:
def logloss(y,p):
    """
    Multiclass logarithmic loss.

    information derived from following sources
    https://www.kaggle.com/wiki/LogarithmicLoss
    https://www.kaggle.com/c/sf-crime/details/evaluation

    y: true class of each row, given as the integer column index into p.
       Any 1-D array-like works (pandas Series, numpy array, list); a Series
       is read positionally, its index labels are ignored.
    p: 2-D array-like of predicted scores, one row per sample and one column
       per class. Rows are rescaled to sum to 1 before scoring.

    returns the mean negative log of the probability given to the true class

    raises ValueError if p is not 2-D, is empty, or y and p differ in length
    """
    eps = 1e-15
    # Force floats so integer input is not hit by floor division on Python 2.
    p = np.asarray(p, dtype=float)
    labels = np.asarray(y).ravel()
    if p.ndim != 2:
        raise ValueError("p must be a 2-D array of per-class probabilities")
    n_rows = p.shape[0]
    if n_rows == 0:
        raise ValueError("logloss is undefined for empty input")
    if labels.shape[0] != n_rows:
        raise ValueError("y and p must contain the same number of rows")

    # Normalise each row, then clip away exact 0 and 1 so the log stays finite.
    p = p/p.sum(axis=1)[:,np.newaxis]
    p = np.clip(p, eps, 1-eps)

    # Select the probability of the true class on every row in one indexing
    # step instead of a Python-level loop over the rows.
    true_class_prob = p[np.arange(n_rows), labels]
    return -np.log(true_class_prob).mean()

In [9]:
# Encode the crime category as an integer class id; enc stays fitted so the
# ids can be mapped back to category names later.
enc = LabelEncoder()
train['CategoryEncoded'] = enc.fit_transform(train['Category'])

In [24]:
# Feature matrix and encoded target used by every model below.
x = train[predictors]
y = train['CategoryEncoded']
# Stratified 50/50 hold-out split. The fixed seed makes the split, and every
# score computed from it, repeatable across kernel restarts.
xtr, xtest, ytr, ytest = cross_validation.train_test_split(
    x, y, test_size=0.5, stratify=np.array(y), random_state=1)

In [25]:
# Baseline model: logistic regression with scikit-learn's default settings,
# fitted on the training part of the split. The bare fit() call is the last
# expression, so the notebook displays the fitted estimator's parameters.
alg = LogisticRegression()
alg.fit(xtr, ytr)
# Disabled earlier experiment (cross-validated scoring), kept for reference:
#scores = cross_validation.cross_val_score(alg, train[predictors], train[categories])
#print(scores.mean())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
# Per-class probabilities for the held-out rows, scored with the multiclass
# log loss. print is written as a call so the cell runs on Python 2 and 3.
prediction = alg.predict_proba(xtest)
print(logloss(ytest, prediction))

2.63790122535


In [34]:
import sklearn as sk
# `import sklearn` alone does not load the `tree` submodule; sk.tree would only
# resolve if some other import happened to pull it in first. Import it
# explicitly so this cell works regardless of what ran before it.
import sklearn.tree

# Shallow decision tree (depth 4, at least 4 samples per leaf).
alg = sk.tree.DecisionTreeClassifier(max_depth=4, min_samples_leaf=4)
alg.fit(xtr, ytr)
prediction = alg.predict_proba(xtest)
# Bare last expression so the notebook displays the log loss.
logloss(ytest,prediction)

2.5611908178510951

In [None]:
# Small gradient-boosted ensemble (10 trees of depth 3) with a fixed seed.
alg = GradientBoostingClassifier(n_estimators=10, max_depth=3, random_state=1)
# fit() returns the estimator itself, so training and prediction can be chained.
prediction = alg.fit(xtr, ytr).predict_proba(xtest)
# Bare last expression so the notebook displays the log loss.
logloss(ytest, prediction)