# Соревнование: San Francisco Crime Classification

_Given time and location, you must predict the category of crime that occurred. Kaggle is hosting this competition for the machine learning community to use for fun and practice._

https://www.kaggle.com/c/sf-crime

## 1. Подготовка работы. Загрузка библиотек и настройка отображения
Импорты и настроийки, которые необходимы для шаблона.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime

%matplotlib inline

## 2. Загрузка, очистка данных.

Загрузка тренировочных данных из csv:

In [28]:
train_data = pd.read_csv('data/train.csv')

Удаляем преступления с аномальными координатами.

In [29]:
train_data = train_data[(train_data['X']!=-120.5) | (train_data['Y']!=90.0)]

Немного преобразуем данные, конвертируем строку дата в DateTime объект, а другие столбцы конвертируем в категориальные переменные.

In [8]:
train_data.Dates = pd.to_datetime(train_data.Dates)
train_data.Category = train_data.Category.astype('category')
train_data.Descript = train_data.Descript.astype('category')
train_data.DayOfWeek = train_data.DayOfWeek.astype('category')
train_data.PdDistrict = train_data.PdDistrict.astype('category')
train_data.Resolution = train_data.Resolution.astype('category')
train_data.Address = train_data.Address.astype('category')

Разобьем нашу дату на час, день месяца, месяц и год, чтобы получить больше информации:

In [11]:
def convert_time(data):
    data['Hour'] = data.Dates.apply(lambda x: x.hour)
    data['Day'] = data.Dates.apply(lambda x: x.day)
    data['Month'] = data.Dates.apply(lambda x: x.month)
    data['Year'] = data.Dates.apply(lambda x: x.year)
    return data
train_data = convert_time(train_data)

## 3. Построение дополнительных признаков.

Напишем дополнительные функции, чтобы преобразовать наши данные.

In [16]:
from sklearn import preprocessing

In [17]:
fiature_converters = dict()

def feature_binarizer(data, feature_name, fit=False):
    if(fit):
        fiature_converters[feature_name]=preprocessing.LabelBinarizer()
        #fiature_converters[feature_name]=preprocessing.OneHotEncoder()
        fiature_converters[feature_name].fit(data[feature_name].values)
    return fiature_converters[feature_name].transform(data[feature_name].values)

def conv_street(street):
    if street.find('/') != -1:
        return map(str.strip, street.split('/'))    
    pos = street.find('Block of ')
    if pos != -1:        
        return [street[pos+9:]]    
    return [street]

def build_streets(data):
    streets = set()
    for x in data.Address[0:10]:
        streets |= set(conv_street(x))
    return streets

def append_streets(adr_column, streets):
    streets_cols = np.zeros((len(adr_column),len(streets)), dtype=int)
    for i, street in enumerate(streets):
        for j, address in enumerate(adr_column):
            if address.find(street) != -1:
                streets_cols[j,i] = 1
    return streets_cols

streets = build_streets(train_data)

coords_scaler = preprocessing.MinMaxScaler()
def transform_coords(data,fit=False):
    if fit:
        coords_scaler.fit(data)
    return coords_scaler.transform(data)

category_enc = None
def conv_category(data, fit=False):
    if fit:
        category_enc = preprocessing.LabelEncoder()
        category_enc.fit(data)
    return category_enc.transform(data)

coords_net = None
coords_net_size = 50
def coords_to_net(coords,fit=False,net_size=50):
    if fit:
        coords_net = np.array([[np.min(coords[:,0]), np.max(coords[:,0])],
                               [np.min(coords[:,1]), np.max(coords[:,1])]])
        coords_net_size = net_size
    result = np.zeros((coords.shape[0],2))
    x_scale = coords_net[0,1] - coords_net[0,0]
    y_scale = coords_net[1,1] - coords_net[1,0]
    for i in range(0,coords.shape[0]):
        x = int(((coords[i,0]-coords_net[0,0])/x_scale)*(net_size-1))
        y = int(((coords[i,1]-coords_net[1,0])/y_scale)*(net_size-1))
        result[i] = [x,y]
    return result

In [18]:
X_train = np.hstack((transform_coords(train_data[['X', 'Y']].values,True),
                     feature_binarizer(train_data,'Hour',True),
                     feature_binarizer(train_data,'Day',True),
                     feature_binarizer(train_data,'Month',True),
                     feature_binarizer(train_data,'Year',True),
                     feature_binarizer(train_data,'DayOfWeek',True),
                     #feature_binarizer(train_data,'Address',True),
                     append_streets(train_data.Address, streets),
                     feature_binarizer(train_data,'PdDistrict',True)))

In [19]:
X_train2 = coords_to_net(train_data[['X', 'Y']].values,True)

In [20]:
X_train3 = np.hstack((transform_coords(train_data[['X', 'Y']].values,True),
                     feature_binarizer(train_data,'Hour',True),
                     feature_binarizer(train_data,'Month',True),
                     feature_binarizer(train_data,'Year',True),
                     feature_binarizer(train_data,'DayOfWeek',True),
                     feature_binarizer(train_data,'PdDistrict',True)))

In [21]:
X_train4 = train_data[['X', 'Y']].values

In [22]:
y_train = conv_category(train_data.Category.values, True)

## 4. Обучение

In [23]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import log_loss

def cross_evaluate_model(clf,X_train,y_train):
    start_time = datetime.datetime.now()
    scores = cross_val_score(clf, X_train, y_train, cv=5,scoring='log_loss')
    duration = (datetime.datetime.now() - start_time).total_seconds()

    print('Classifier: {0}'.format(type(clf)))
    print('Time elapsed: {0}'.format(duration))
    print('average score (5 folds): {0}'.format(scores.mean()))
    
def evaluate_model(clf,X,y,test_size=0.8):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42)
    
    start_time = datetime.datetime.now()
    clf.fit(X_train,y_train)
    fit_duration = (datetime.datetime.now() - start_time).total_seconds()
    
    start_time = datetime.datetime.now()
    y_pred_train = clf.predict_proba(X_train)
    y_pred_test = clf.predict_proba(X_test)
    predict_duration = (datetime.datetime.now() - start_time).total_seconds()
    
    train_score = log_loss(y_train,y_pred_train)
    test_score = log_loss(y_test,y_pred_test)
    
    print('Classifier: {0}'.format(type(clf)))
    print('Fit time: {0:.1f} sec, Predict time: {1:.1f} sec, All time: {0:.1f} sec'.format(
            fit_duration, predict_duration, fit_duration + predict_duration))    
    print('Train Score: {0:.2f}'.format(train_score))
    print('Test Score: {0:.2f}'.format(test_score))
    if hasattr(clf, 'feature_importances_'):
        print('Feature importances: {0}'.format(clf.feature_importances_))
    if hasattr(clf, 'coef_'):
        print('Feature importances: {0}'.format(clf.coef_))

In [24]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression
evaluate_model(LogisticRegression(random_state=241), X_train, y_train)

Classifier: <class 'sklearn.linear_model.logistic.LogisticRegression'>
Fit time: 109.6 sec, Predict time: 2.9 sec, All time: 109.6 sec
Train Score: 2.54
Test Score: 2.56
Feature importances: [[ 0.4260425  -2.37900431 -0.18095444 ..., -0.55165875 -0.5371095
  -0.38077906]
 [ 0.63097472 -1.32164378 -0.03395671 ..., -0.18995058 -0.27449741
   0.1203483 ]
 [-0.90251563 -1.14900262  1.00811082 ...,  0.07159907 -0.87106772
  -0.29468388]
 ..., 
 [-1.09656555  0.25015721 -0.3082151  ..., -0.66563084  0.08317785
  -1.7097407 ]
 [ 0.32787852 -0.72066001 -0.28781827 ...,  0.15783213 -0.84877703
   0.65009328]
 [ 1.19305218 -1.51775111 -0.14500234 ..., -0.60010389 -0.30176007
  -0.16925273]]


In [32]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression
evaluate_model(LogisticRegression(random_state=241), X_train3, y_train)

Classifier: <class 'sklearn.linear_model.logistic.LogisticRegression'>
Fit time: 75.7 sec, Predict time: 3.1 sec, All time: 75.7 sec
Train Score: 2.55
Test Score: 2.56
Feature importances: [[ 0.40320172 -2.39265506 -0.19692019 ..., -0.53739474 -0.53571149
  -0.39546545]
 [ 0.59028407 -1.29495601 -0.03281479 ..., -0.18601102 -0.27169606
   0.09844458]
 [-0.91578802 -1.17026874  1.02206048 ...,  0.08327702 -0.87614901
  -0.32907348]
 ..., 
 [-1.0920884   0.24138742 -0.31489525 ..., -0.68077315  0.06549942
  -1.6716455 ]
 [ 0.31065281 -0.7553505  -0.2997602  ...,  0.16615282 -0.86128215
   0.68044639]
 [ 1.20047539 -1.51811054 -0.14228482 ..., -0.61395144 -0.31355415
  -0.17782009]]


In [None]:
# KNeighbors
from sklearn.neighbors import KNeighborsClassifier
evaluate_model(KNeighborsClassifier(n_neighbors=5, n_jobs=4), X_train, y_train)

In [None]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier
evaluate_model(RandomForestClassifier(n_estimators=30, max_depth=10, n_jobs=4,
                                      verbose=True, random_state=241), X_train, y_train)

In [26]:
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
evaluate_model(GradientBoostingClassifier(n_estimators=10, max_depth=10,
                                          verbose=True, random_state=241), X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1      545703.7535           89.66m
         2      504936.3204           80.10m
         3      476422.2271           71.12m
         4      454011.9280           60.83m
         5      435188.5451           50.63m
         6      419436.6596           40.40m
         7      405828.1451           30.31m
         8      393996.1933           20.23m
         9      383566.9101           10.10m
        10      374426.4463            0.00s
Classifier: <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>
Fit time: 6058.2 sec, Predict time: 50.4 sec, All time: 6058.2 sec
Train Score: 2.13
Test Score: 2.68
Feature importances: [  1.71809100e-01   1.90983281e-01   6.04780814e-03   3.90130414e-03
   5.07280191e-03   3.25905392e-03   2.57038197e-03   2.07971129e-03
   3.86272932e-03   4.12840414e-03   5.90287469e-03   5.75341267e-03
   6.75773339e-03   7.43860086e-03   6.60888515e-03   8.32867424e-03
   7.03182287e-03   5

In [None]:
# XGBoost
import xgboost as xgb
evaluate_model(xgb.XGBClassifier(n_estimators=30, max_depth=10, learning_rate=0.2,
                                 silent=False, randome_state=241), X_train, y_train)

## 5. Напишем свой классификатор

In [64]:
from sklearn.base import BaseEstimator, ClassifierMixin
from collections import defaultdict, Counter

class MyClassifier(BaseEstimator, ClassifierMixin):
    """Predicts the majority class of its training data."""
    def __init__(self, net_size = 50, n_max=5, verbose=False):
        self.net_size_ = net_size
        self.n_max_ = n_max
        self.verbose_ = verbose

    def fit(self, X, y):
        self.n_classes_ = np.max(y) + 1
        self.min_x_ = np.min(X[:,0])
        self.max_x_ = np.max(X[:,0])
        self.min_y_ = np.min(X[:,1])
        self.max_y_ = np.max(X[:,1])
        self.x_scale_ = self.max_x_ - self.min_x_
        self.y_scale_ = self.max_y_ - self.min_y_
        net = defaultdict(list)
        self.net_weights_ = dict()
        for i in range(0,X.shape[0]):
            x_ = np.max([np.floor((X[i,0] - self.min_x_)/self.x_scale_ * self.net_size_),self.net_size_-1])
            y_ = np.max([np.floor((X[i,1] - self.min_y_)/self.y_scale_ * self.net_size_),self.net_size_-1])
            
            net[(x_,y_)].append(y[i])
        
        for k,v in net.items():
            cnt = Counter(v).most_common(self.n_max_)
            self.net_weights_[k] = np.array([float(cnt[x])/len(v) if x in cnt else 0.0 for x in range(0,self.n_classes_)])
            
        majority_cnt = Counter(y)
        self.majority_ = np.array([float(majority_cnt[x])/len(v) if x in majority_cnt else 0.0
                                   for x in range(0,self.n_classes_)])

        return self
    def predict_proba(self, X):
        result = np.empty((X.shape[0],self.n_classes_))
        
        for i in range(0,X.shape[0]):
            x_ = np.max([np.floor((X[i,0] - self.min_x_)/self.x_scale_ * self.net_size_),self.net_size_-1])
            y_ = np.max([np.floor((X[i,1] - self.min_y_)/self.y_scale_ * self.net_size_),self.net_size_-1])
            
            if (x_,y_) in self.net_weights_:
                result[i] = self.net_weights_[(x_,y_)]
            else:
                result[i] = self.majority_
                
        return result

In [93]:
from sklearn.base import BaseEstimator, ClassifierMixin

class MeanClassifier(BaseEstimator, ClassifierMixin):
    """Predicts the majority class of its training data."""
    def __init__(self, classifiers, weights=None):
        self.classifiers_ = classifiers
        if weights:
            self.weights_ = weights
        else:
            self.weights_ = [1.0/len(classifiers)]*len(classifiers)

    def fit(self, X, y):
        for clf in self.classifiers_:
            clf.fit(X,y)

        return self
    def predict_proba(self, X):
        predicts = [clf.predict_proba(X) for clf in self.classifiers_]
        
        result = np.zeros(predicts[0].shape)
        
        for i, w in enumerate(self.weights_):
            result += w*predicts[i]
                
        return result

In [94]:
evaluate_model(MeanClassifier(
        [MyClassifier(net_size=100, n_max=15),
         MyClassifier(net_size=50, n_max=10),
         MyClassifier(net_size=25, n_max=10)]),
               X_train4, y_train, test_size=0.5)

Classifier: <class '__main__.MeanClassifier'>
Fit time: 35.9 sec, Predict time: 87.0 sec, All time: 35.9 sec
Train Score: 3.66
Test Score: 3.66
