In [1]:
import pandas as pd
import numpy as np
import zipfile

# Load the training set straight out of the zipped CSV.
# Both the archive and the member stream are opened in `with` blocks so their
# file handles are released as soon as the frame has been read into memory.
with zipfile.ZipFile('train.csv.zip') as z_train:
    with z_train.open('train.csv') as train_csv:
        train = pd.read_csv(train_csv, parse_dates=['Dates'], index_col=False)

In [2]:
def make_binary_fields(df, field):
    """
    Add one 0/1 indicator column to ``df`` per distinct value of ``df[field]``.

    Each new column is named after the value it represents and holds 1 on the
    rows where ``df[field]`` equals that value and 0 everywhere else.
    ``df`` is modified in place; the original ``field`` column is kept.

    df: DataFrame to extend
    field: name of the column whose distinct values become new columns

    returns the new column names as a list, in order of first appearance

    ex
    make_binary_fields(df, 'DayOfWeek')
    will create new fields
    Monday, Tuesday, Wednesday, Thursday, Friday, Saturday and Sunday
    where
    df['Monday'] will have value 1 for all Mondays and 0 for the rest
    """
    fields = []
    # Keep a handle on the source column so every indicator is computed from
    # the untouched original values.
    source = df[field]
    for new_field in source.unique():
        # Build the indicator from a boolean mask instead of copying the
        # column and overwriting it in two passes: the two-pass version turns
        # every row into 1 when the value itself is 0, and leaves the column
        # with object dtype.
        df[new_field] = (source == new_field).astype(int)
        fields.append(new_field)
    return fields

def time_trim(data):
    """
    Split the 'Dates' timestamp column into separate calendar columns.

    Adds 'Day', 'Month', 'Year', 'Hour', 'Minute' and 'WeekOfYear' (ISO week
    number) to ``data`` in place.

    data: DataFrame with a datetime-typed 'Dates' column

    returns the same DataFrame object that was passed in
    """
    stamps = data['Dates'].dt
    data['Day'] = stamps.day
    data['Month'] = stamps.month
    data['Year'] = stamps.year
    data['Hour'] = stamps.hour
    data['Minute'] = stamps.minute
    #data['DayOfWeek'] = stamps.dayofweek
    if hasattr(stamps, 'isocalendar'):
        # `Series.dt.weekofyear` no longer exists in current pandas;
        # `isocalendar().week` yields the same ISO week numbers. It comes back
        # as a nullable UInt32 column, so cast it to the plain NumPy dtype the
        # old accessor produced (float only when missing dates are present).
        week = stamps.isocalendar().week
        data['WeekOfYear'] = week.astype('float64' if week.isna().any() else 'int64')
    else:
        # Older pandas without `isocalendar()` on the .dt accessor.
        data['WeekOfYear'] = stamps.weekofyear
    return data

In [3]:
def make_season(df):
    """
    Add a 'Season' column plus one 0/1 indicator column per season.

    Has to happen after making the 'Month' field.
    spring: month 2, 3, 4
    summer: month 5, 6, 7
    autumn: month 8, 9, 10
    winter: month 11, 12, 1 (any month value above 10 also counts as winter)

    Rows whose month matches none of the ranges (e.g. missing values) keep
    their month value in 'Season'.

    df: DataFrame with a numeric 'Month' column; modified in place

    returns the list of season indicator column names created by
    make_binary_fields
    """
    month = df['Month']
    # Work out every mask from the numeric month *before* any label is
    # written. Comparing a column that already holds season strings against
    # integers raises TypeError on Python 3.
    winter = (month > 10) | (month == 1)
    spring = (month > 1) & (month <= 4)
    summer = (month > 4) & (month <= 7)
    autumn = (month > 7) & (month <= 10)

    # Object dtype so string labels can be stored without upcasting an
    # integer column.
    season = month.astype(object)
    season.loc[winter] = 'Winter'
    season.loc[spring] = 'Spring'
    season.loc[summer] = 'Summer'
    season.loc[autumn] = 'Autumn'
    df['Season'] = season

    seasons = make_binary_fields(df, 'Season')
    return seasons

def convert_data_to_int(df, field):
    """
    Replace the values of ``df[field]`` with integer codes, in place.

    Distinct values are numbered 1, 2, 3, ... in order of first appearance.
    Missing values are left missing (they still occupy a position in the
    numbering, so the codes of the remaining values are unaffected).

    df: DataFrame to modify
    field: name of the column to encode

    returns None
    """
    codes = {}
    for position, value in enumerate(df[field].unique()):
        if not pd.isnull(value):
            codes[value] = position + 1
    # Apply the whole mapping in a single pass. Rewriting the column one value
    # at a time corrupts columns that already contain small integers: a row
    # relabelled to code k is relabelled again when the loop reaches value k.
    df[field] = df[field].map(codes)
    return
        

In [4]:
# Order matters: time_trim() adds the 'Month' column that make_season() reads.
# All three calls add their columns to `train` in place.
time_trim(train)
# `seasons` / `dow` hold the names of the indicator columns that were created.
seasons = make_season(train)
dow = make_binary_fields(train, 'DayOfWeek')
# Disabled experiments: indicator columns for police district and crime category.
#pdd = make_binary_fields(train, 'PdDistrict')
#categories = make_binary_fields(train, 'Category')

In [5]:
# Feature columns for the model: the calendar fields, then the season indicators.
predictors = [
    'Day',
    'Month',
    'Year',
    'Hour',
    'Minute',
    'WeekOfYear',
] + list(seasons)
# Optional extra indicator groups, currently switched off:
#predictors.extend(pdd)
#predictors.extend(dow)

In [6]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import LabelEncoder
import scipy as sp

In [7]:
# Crime categories in the alphabetical order used for the Kaggle submission columns.
sfkaggle = ['ARSON','ASSAULT','BAD CHECKS','BRIBERY','BURGLARY','DISORDERLY CONDUCT',
            'DRIVING UNDER THE INFLUENCE','DRUG/NARCOTIC','DRUNKENNESS','EMBEZZLEMENT','EXTORTION',
            'FAMILY OFFENSES','FORGERY/COUNTERFEITING','FRAUD','GAMBLING','KIDNAPPING','LARCENY/THEFT',
            'LIQUOR LAWS','LOITERING','MISSING PERSON','NON-CRIMINAL','OTHER OFFENSES','PORNOGRAPHY/OBSCENE MAT',
            'PROSTITUTION','RECOVERED VEHICLE','ROBBERY','RUNAWAY','SECONDARY CODES','SEX OFFENSES FORCIBLE',
            'SEX OFFENSES NON FORCIBLE','STOLEN PROPERTY','SUICIDE','SUSPICIOUS OCC','TREA','TRESPASS','VANDALISM',
            'VEHICLE THEFT','WARRANTS','WEAPON LAWS']
# Sanity check: the training data contains exactly these categories.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(sorted(train.Category.unique()) == sfkaggle)

True


In [8]:
def score(r, p):
    """
    Placeholder for the Kaggle evaluation metric; not implemented yet.

    r: result (the true labels)
    p: prediction from training

    Currently ignores both arguments and returns None.

    Metric description, from
    https://www.kaggle.com/c/sf-crime/details/evaluation

    Submissions are scored with the multi-class logarithmic loss. Every
    incident carries exactly one true class, and a submission supplies one
    predicted probability per class for each incident:

        logloss = -(1/N) * sum_{i=1..N} sum_{j=1..M} y_ij * log(p_ij)

    N is the number of cases in the test set,
    M is the number of class labels,
    log is the natural logarithm,
    y_ij is 1 if observation i is in class j and 0 otherwise,
    p_ij is the predicted probability that observation i belongs to class j.

    The probabilities of one incident need not sum to one: each row is divided
    by its row sum before scoring. To avoid the extremes of the log function,
    each probability p is replaced with max(min(p, 1 - 10^-15), 10^-15).
    """
    return None

def logloss(act, pred):
    """
    Binary logarithmic loss.

    https://www.kaggle.com/wiki/LogarithmicLoss

    act: array-like of true labels (0 or 1)
    pred: array-like of predicted probabilities for label 1, same shape as act

    Probabilities are clipped to [1e-15, 1 - 1e-15] so the logarithm stays
    finite.

    returns the mean negative log-likelihood over the first axis
    raises ZeroDivisionError when act is empty

    Note: this is the two-class formula; for class indices scored against a
    matrix of per-class probabilities use log_loss instead.
    """
    epsilon = 1e-15
    act = np.asarray(act, dtype=float)
    # NumPy is called directly: the `scipy.maximum` / `scipy.log` style
    # aliases of NumPy functions are deprecated in SciPy.
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    n = len(act)
    if n == 0:
        raise ZeroDivisionError("logloss() needs at least one observation")
    ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred), axis=0)
    ll = ll * -1.0 / n
    return ll

def log_loss(y_true,y_pred):
    """
    Multi-class logarithmic loss.

    y_true: array-like of class indices, one per observation; each value is
        truncated to int and used as a column index into y_pred
    y_pred: 2-D array-like with one row per observation and one column per
        class, holding the predicted probabilities

    Each row of y_pred is rescaled to sum to one and clipped to
    [1e-15, 1 - 1e-15] before the logarithm is taken.

    returns the mean negative log-probability assigned to the true classes
    raises ZeroDivisionError when y_true is empty
    """
    eps = 1e-15
    # Convert to plain arrays so rows are addressed by position. Indexing a
    # pandas Series with y_true[i] is label-based and breaks once the index
    # has been shuffled or subset (e.g. after a train/test split).
    labels = np.asarray(y_true).astype(int)
    probs = np.asarray(y_pred, dtype=float)
    probs = probs / probs.sum(axis=1)[:, np.newaxis]
    probs = np.clip(probs, eps, 1 - eps)
    # Pick, for every observation, the probability given to its true class.
    picked = probs[np.arange(len(labels)), labels]
    return -float(np.log(picked).sum()) / len(labels)

In [9]:
# Turn the crime category names into integer class labels; `enc` is kept so the
# integers can be mapped back to category names later.
enc = LabelEncoder()
train['CategoryEncoded'] = enc.fit_transform(train['Category'])

In [None]:
x = train[predictors]
y = train['CategoryEncoded']
# 50/50 hold-out split, stratified so each category appears in both halves in
# the same proportion. The fixed random_state makes the split (and therefore
# the score computed from it) reproducible across kernel restarts.
xtr, xtest, ytr, ytest = cross_validation.train_test_split(
    x, y, test_size = 0.5, stratify = np.array(y), random_state = 42)

In [None]:
alg = LogisticRegression()
alg.fit(xtr, ytr)

prediction = alg.predict_proba(xtest)
# Score with the multi-class log_loss: `prediction` has one probability column
# per class, which the two-class logloss() formula cannot handle. The labels
# are passed as a plain array so they are read by position, not by the
# shuffled index that `ytest` carries over from the split.
log_loss(np.asarray(ytest), prediction)
#scores = cross_validation.cross_val_score(alg, train[predictors], train[categories])
#print(scores.mean())