# Соревнование: San Francisco Crime Classification

_Given time and location, you must predict the category of crime that occurred. Kaggle is hosting this competition for the machine learning community to use for fun and practice._

https://www.kaggle.com/c/sf-crime

## 1. Подготовка работы. Загрузка библиотек и настройка отображения
Импорты и настройки, которые необходимы для шаблона.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime

%matplotlib inline

## 2. Загрузка, очистка данных.

Загрузка тренировочных и тестовых данных из csv:

In [2]:
# Training set, read with the default positional index.
train_data = pd.read_csv('data/train.csv')
# Test set: the 'Id' column is used as the index rather than kept as a regular column.
test_data = pd.read_csv('data/test.csv', index_col='Id')

In [3]:
# Categorical inputs; each one later receives a label-encoded "<name>_Conv" column.
nonnumeric_columns = ['DayOfWeek', 'PdDistrict', 'Address']
# Raw columns taken from both train and test: the timestamp, the categoricals, X and Y.
feature_columns_to_use = ['Dates'] + nonnumeric_columns + ['X', 'Y']

In [4]:
# Stack the train and test feature rows (train first) so encoders and derived
# features are computed over both sets at once.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` produces the same
# frame and likewise keeps each input's original index labels.
data = pd.concat([train_data[feature_columns_to_use], test_data[feature_columns_to_use]])

In [5]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column into a sibling "<column>_Conv" column.
# The single encoder is refit for each column, so once the loop finishes `le`
# only remembers the classes of the last column processed.
le = LabelEncoder()
for feature in nonnumeric_columns:
    encoded_column = feature + "_Conv"
    data[encoded_column] = le.fit_transform(data[feature])

In [6]:
# Replace the textual target with integer labels. The fitted encoder is kept
# because `cat_conv.classes_` later names the prediction columns.
cat_conv = LabelEncoder()
cat_conv.fit(train_data['Category'])
train_data['Category'] = cat_conv.transform(train_data['Category'])

In [7]:
def convert_time(data):
    """Add calendar features derived from the ``Dates`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime-typed ``Dates`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the columns ``Min``, ``Hour``, ``Day``,
        ``Month``, ``Year`` and ``WeekNumber`` (ISO week of the year) added.
    """
    dates = data['Dates']
    # Vectorised ``.dt`` accessors avoid one Python-level call per row.
    data['Min'] = dates.dt.minute
    data['Hour'] = dates.dt.hour
    data['Day'] = dates.dt.day
    data['Month'] = dates.dt.month
    data['Year'] = dates.dt.year
    # The ISO week stays a per-element call: its vectorised spelling differs
    # between pandas releases, while ``Timestamp.isocalendar()`` is stable.
    data['WeekNumber'] = dates.apply(lambda ts: ts.isocalendar()[1])
    return data

# Parse the timestamp strings, then derive the calendar feature columns from them.
data['Dates'] = pd.to_datetime(data['Dates'])
data = convert_time(data)

In [8]:
from collections import defaultdict
from sklearn.decomposition import PCA

def conv_street(street):
    """Extract the street name(s) mentioned in an address string.

    Three address shapes are handled:

    * ``"A / B"`` (contains a slash) -> ``["A", "B"]``, each part stripped;
    * ``"... Block of A"``           -> ``["A"]``;
    * anything else                  -> ``[street]`` unchanged.

    Parameters
    ----------
    street : str
        Raw address text.

    Returns
    -------
    list of str
        Always a real list, so callers may iterate over it more than once.
    """
    if '/' in street:
        # A list rather than ``map(...)``: on Python 3 ``map`` is a one-shot
        # iterator and a second pass over the result would see nothing.
        return [part.strip() for part in street.split('/')]
    marker = 'Block of '
    pos = street.find(marker)
    if pos != -1:
        # Keep only what follows the marker.
        return [street[pos + len(marker):]]
    return [street]

def get_streets(adr_column, n_components=10):
    """Compress per-address street membership into a few PCA components.

    Each address is split into street names with ``conv_street``. A boolean
    indicator matrix (one row per address, one column per distinct street) is
    built and reduced with PCA.

    Parameters
    ----------
    adr_column : sequence of str
        Address strings; must support ``len()`` and iteration.
    n_components : int, optional
        Number of principal components to keep (default 10).

    Returns
    -------
    pandas.DataFrame
        One row per address, in input order, with columns ``PCA_0`` ..
        ``PCA_{n_components - 1}`` and a fresh 0..N-1 index.

    Notes
    -----
    The indicator matrix is dense (``len(adr_column)`` x number of streets), so
    memory use grows with both. NOTE(review): this may be prohibitive on the
    full data set - confirm before running.
    """
    street_to_addr = defaultdict(list)
    for row, address in enumerate(adr_column):
        # list(): tolerate a conv_street that returns a one-shot iterator.
        for street in list(conv_street(address)):
            street_to_addr[street].append(row)

    # Every street seen is a key of the mapping; sorting fixes the column
    # order so repeated runs build the same matrix.
    streets = sorted(street_to_addr)

    streets_cols = np.zeros((len(adr_column), len(streets)), dtype=bool)
    for col, street in enumerate(streets):
        # Mark all rows that mention this street in one indexed assignment.
        streets_cols[street_to_addr[street], col] = True

    pca = PCA(n_components=n_components)
    pca.fit(streets_cols)

    component_names = ["PCA_{}".format(i) for i in range(n_components)]
    return pd.DataFrame(pca.transform(streets_cols), columns=component_names)

In [None]:
# Street-membership PCA features, one row per row of `data` (see get_streets).
streets = get_streets(data.Address)

In [None]:
# Attach the street PCA columns to the feature frame, matching rows by position.
# * `streets` has a fresh 0..N-1 index while `data` still carries the stacked
#   train/test labels, so `streets` is given `data`'s index before joining.
# * `ignore_index=True` is dropped: with axis=1 it relabels every column as
#   0..k-1, and the later `data[feature_columns]` lookups by name would fail.
# * Columns already present from a previous run of this cell are replaced
#   rather than duplicated.
assert len(streets) == len(data), "street features must have one row per data row"
streets_aligned = streets.copy()
streets_aligned.index = data.index
kept_columns = [col for col in data.columns if col not in set(streets_aligned.columns)]
data = pd.concat([data[kept_columns], streets_aligned], axis=1)

In [10]:
# Columns fed to the model. WeekNumber is used only to choose hold-out rows.
feature_columns = ['Min', 'Hour', 'Day', 'Month', 'Year',
                   'DayOfWeek_Conv', 'PdDistrict_Conv', 'Address_Conv', 'X', 'Y']

n_train = train_data.shape[0]
# Training rows were stacked first, so they are the first n_train rows of `data`.
train_part = data[0:n_train]
# Weeks whose number is a multiple of 10 form the evaluation set; the rest train.
is_eval = train_part.WeekNumber % 10 == 0
is_train = train_part.WeekNumber % 10 != 0

# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
train_X = train_part[feature_columns][is_train].values
eval_X = train_part[feature_columns][is_eval].values

test_X = data[feature_columns][n_train:].values

# The masks are indexed like `train_data`, so they select the matching targets.
train_y = train_data[is_train]['Category']
eval_y = train_data[is_eval]['Category']

In [11]:
# Hold-out (features, labels) pair, passed to fit() as `eval_set`.
evals = [(eval_X, eval_y)]

In [12]:
import xgboost as xgb

# Wall-clock timer covering model construction and training.
start_time = datetime.datetime.now()

# Model hyper-parameters.
gbm = xgb.XGBClassifier(
    max_depth=10,
    n_estimators=1000,
    learning_rate=0.05,
    nthread=4,
    silent=False,
)
# Train while tracking multi-class log-loss on the hold-out set, with early
# stopping configured for 10 rounds.
gbm.fit(
    train_X,
    train_y,
    eval_metric='mlogloss',
    eval_set=evals,
    early_stopping_rounds=10,
    verbose=True,
)

elapsed = datetime.datetime.now() - start_time
duration = elapsed.total_seconds()
print('Time elapsed: {0}'.format(duration))

Will train until validation_0 error hasn't decreased in 10 rounds.
[0]	validation_0-mlogloss:3.533614
[1]	validation_0-mlogloss:3.434867
[2]	validation_0-mlogloss:3.353081
[3]	validation_0-mlogloss:3.282965
[4]	validation_0-mlogloss:3.222069
[5]	validation_0-mlogloss:3.167467
[6]	validation_0-mlogloss:3.118550
[7]	validation_0-mlogloss:3.073960
[8]	validation_0-mlogloss:3.033144
[9]	validation_0-mlogloss:2.995590
[10]	validation_0-mlogloss:2.960981
[11]	validation_0-mlogloss:2.928759
[12]	validation_0-mlogloss:2.898824
[13]	validation_0-mlogloss:2.870691
[14]	validation_0-mlogloss:2.844501
[15]	validation_0-mlogloss:2.819782
[16]	validation_0-mlogloss:2.796278
[17]	validation_0-mlogloss:2.774411
[18]	validation_0-mlogloss:2.753544
[19]	validation_0-mlogloss:2.733886
[20]	validation_0-mlogloss:2.715266
[21]	validation_0-mlogloss:2.697665
[22]	validation_0-mlogloss:2.680903
[23]	validation_0-mlogloss:2.665045
[24]	validation_0-mlogloss:2.649951
[25]	validation_0-mlogloss:2.635643
[26]	va

Time elapsed: 7753.783476


Stopping. Best iteration:
[366]	validation_0-mlogloss:2.253899



In [13]:
# Class-probability predictions for the test rows.
predictions = gbm.predict_proba(test_X)

In [14]:
# `save_predicts` labels the rows with the test ids and the columns with
# `cat_conv.classes_`, so it needs the full (n_test, n_classes) matrix.
# Splitting the columns in two, keeping the second half and transposing
# (presumably left over from a binary-classification recipe) cannot produce
# that layout. Keep the matrix as returned and verify its shape instead;
# this also makes the cell safe to re-run.
predictions = np.asarray(predictions)
assert predictions.shape == (test_X.shape[0], len(cat_conv.classes_)), predictions.shape

In [15]:
def save_predicts(name, predictions, index, columns, float_format='%.6f'):
    """Write a prediction matrix to ``./output/<name>.csv.gz``.

    Parameters
    ----------
    name : str
        File stem of the output file.
    predictions : array-like
        2-D values, one row per entry of ``index`` and one column per entry
        of ``columns``.
    index : sequence
        Row labels; written under the header ``Id``.
    columns : sequence
        Column labels.
    float_format : str, optional
        Format applied to floating point values (default ``'%.6f'``).

    The ``./output`` directory is created when it does not exist yet.
    """
    import os  # local import: the notebook's import cell does not provide it

    out_dir = './output'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    df = pd.DataFrame(predictions, index=index, columns=columns)
    df.index.name = 'Id'
    df.to_csv('{}/{}.csv.gz'.format(out_dir, name), compression='gzip', float_format=float_format)

In [16]:
# Rows after the training rows in `data` are the test rows; their index labels
# become the 'Id' column of the saved file.
test_ids = data.index[train_data.shape[0]:]
save_predicts('xgb', predictions, test_ids, cat_conv.classes_)

In [20]:
try:
    # Current scikit-learn releases no longer ship `sklearn.externals.joblib`;
    # the standalone package provides the same `dump` function.
    import joblib
except ImportError:
    # Older scikit-learn releases that still bundle joblib.
    from sklearn.externals import joblib


def save_model(name, model):
    """Persist ``model`` to ``models/<name>.pkl`` with joblib.

    Parameters
    ----------
    name : str
        File stem of the pickle file.
    model : object
        Object to serialise.

    The ``models`` directory is created when it does not exist yet.
    """
    import os  # local import: the notebook's import cell does not provide it

    out_dir = 'models'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    joblib.dump(model, '{}/{}.pkl'.format(out_dir, name))

In [21]:
# Store the trained classifier under the name 'xgb6'.
save_model('xgb6', gbm)