# Соревнование: San Francisco Crime Classification

_Given time and location, you must predict the category of crime that occurred. Kaggle is hosting this competition for the machine learning community to use for fun and practice._

https://www.kaggle.com/c/sf-crime

## 1. Подготовка работы. Загрузка библиотек и настройка отображения
Импорты и настройки, которые необходимы для шаблона.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime

%matplotlib inline

## 2. Загрузка, очистка данных.

Загрузка тренировочных и тестовых данных из CSV:

In [2]:
# Load the competition data from the local `data/` directory.
# The training set keeps the default integer index.
train_data = pd.read_csv('data/train.csv')
# The test set uses its 'Id' column as the index.
test_data = pd.read_csv('data/test.csv', index_col='Id')

In [3]:
# Raw input columns taken from both the training and the test CSV.
feature_columns_to_use = [
    'Dates',
    'DayOfWeek',
    'PdDistrict',
    'Address',
    'X',
    'Y',
]

# Subset of the columns above that is label-encoded into integers.
nonnumeric_columns = [
    'DayOfWeek',
    'PdDistrict',
    'Address',
]

In [4]:
# Stack the training rows first and the test rows after them so that both
# sets go through identical preprocessing. DataFrame.append was removed in
# pandas 2.0; pd.concat keeps the same row order and the original index
# labels of each part.
data = pd.concat([train_data[feature_columns_to_use], test_data[feature_columns_to_use]])

In [5]:
from sklearn.preprocessing import LabelEncoder

# Replace every non-numeric feature column with integer codes.
# `data` holds the training and the test rows together, so each column is
# encoded with a single mapping shared by both sets. The same encoder object
# is refit for every column.
le = LabelEncoder()
for feature in nonnumeric_columns:
    data[feature] = le.fit_transform(data[feature])

In [6]:
# Encode the target labels as integers. The fitted encoder is kept in
# `cat_conv` because its `classes_` are used later as the submission columns.
cat_conv = LabelEncoder()
cat_conv.fit(train_data['Category'])
train_data['Category'] = cat_conv.transform(train_data['Category'])

In [7]:
def convert_time(data):
    """Add calendar features derived from the ``Dates`` column.

    The columns ``Hour``, ``Day``, ``Month``, ``Year`` and ``WeekNumber``
    (ISO week number) are added to ``data`` in place.

    Args:
        data: DataFrame with a ``Dates`` column holding datetimes, or values
            that ``pd.to_datetime`` can parse. The ``Dates`` column itself is
            left untouched.

    Returns:
        The same DataFrame object that was passed in, with the new columns.
    """
    # Coerce once so the vectorised `.dt` accessors below are always available;
    # this is a no-op for a column that is already datetime64.
    dates = pd.to_datetime(data['Dates'])

    # Vectorised accessors instead of one Python-level lambda call per row.
    data['Hour'] = dates.dt.hour
    data['Day'] = dates.dt.day
    data['Month'] = dates.dt.month
    data['Year'] = dates.dt.year

    # Kept element-wise on purpose: isocalendar() on the individual timestamps
    # does not depend on which pandas version is installed.
    data['WeekNumber'] = dates.apply(lambda stamp: stamp.isocalendar()[1])
    return data

# Parse the timestamp strings into datetimes, then derive the calendar
# feature columns from them.
data['Dates'] = pd.to_datetime(data['Dates'])
data = convert_time(data)

In [8]:
# Model inputs: the calendar parts derived from 'Dates', the label-encoded
# categorical columns and the raw X / Y coordinates.
feature_columns = ['Hour', 'Day', 'Month', 'Year', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y']

# `data` holds the training rows first and the test rows after them, so the
# first `n_train` rows are the labelled ones.
n_train = train_data.shape[0]
train_part = data.iloc[:n_train]

# Rows whose week number is divisible by 4 are held out for validation;
# all other training rows are used for fitting.
is_train = train_part['WeekNumber'] % 4 != 0
is_eval = train_part['WeekNumber'] % 4 == 0

# DataFrame.as_matrix() was removed in pandas 1.0; `.values` returns the same
# NumPy array.
train_X = train_part.loc[is_train, feature_columns].values
eval_X = train_part.loc[is_eval, feature_columns].values

test_X = data.iloc[n_train:][feature_columns].values

# The masks carry the index labels of the training rows, so they select the
# matching rows of `train_data`.
train_y = train_data.loc[is_train, 'Category']
eval_y = train_data.loc[is_eval, 'Category']

In [9]:
# Held-out (features, labels) pair passed to the classifier as `eval_set`.
evals = [
    (eval_X, eval_y),
]

In [None]:
import xgboost as xgb

# Time the whole fit with wall-clock timestamps.
start_time = datetime.datetime.now()

# Gradient-boosted classifier configuration.
# NOTE(review): `nthread`, `silent` and the fit-time `eval_metric` /
# `early_stopping_rounds` arguments follow the xgboost API this notebook was
# written against - confirm against the installed xgboost version.
gbm = xgb.XGBClassifier(
    max_depth=5,
    n_estimators=300,
    learning_rate=0.05,
    nthread=4,
    silent=False,
)

# Fit on the training split and evaluate on the held-out split in `evals`.
# The value returned by fit() is bound back to `gbm`, as in the chained call.
gbm = gbm.fit(
    train_X,
    train_y,
    eval_metric='mlogloss',
    eval_set=evals,
    early_stopping_rounds=10,
    verbose=True,
)

duration = (datetime.datetime.now() - start_time).total_seconds()
print('Time elapsed: {0}'.format(duration))

In [None]:
# Class-probability predictions from the fitted model for the test feature matrix.
predictions = gbm.predict_proba(test_X)

In [None]:
# `predictions` is later written out by save_predicts with the test index as
# rows and cat_conv.classes_ as columns, so it has to stay a full
# (test rows x categories) matrix. The previous
# np.split(predictions, 2, axis=1)[1].T - a leftover from a two-class setup -
# kept only the second half of the columns and transposed them, and np.split
# raises when the column count is not divisible by 2. The probability matrix
# is therefore passed through unchanged.
predictions = np.asarray(predictions)

In [None]:
# Display the dimensions of the prediction array as a sanity check before saving.
predictions.shape

In [21]:
def save_predicts(name, predictions, index, columns, float_format='%.6f'):
    """Write predictions to ``./output/<name>.csv.gz`` as a gzip-compressed CSV.

    Args:
        name: File name stem; the file is written to
            ``./output/<name>.csv.gz``.
        predictions: 2-D data for the frame, one row per entry of ``index``
            and one column per entry of ``columns``.
        index: Row labels; written under the header ``Id``.
        columns: Column labels of the output file.
        float_format: Format string applied to floating point values.

    Returns:
        None. The only effect is the file written to disk (and the output
        directory being created when it is missing).
    """
    # Imported locally so the function does not depend on an import elsewhere
    # in the notebook.
    import os

    path = './output/{}.csv.gz'.format(name)
    # to_csv does not create missing directories; without this the write
    # fails when ./output does not exist yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    df = pd.DataFrame(predictions, index=index, columns=columns)
    df.index.name = 'Id'
    df.to_csv(path, compression='gzip', float_format=float_format)

In [22]:
# Write the submission: rows are labelled with the index of the test part of
# `data` (everything after the training rows), columns with the original
# category names.
save_predicts(
    'xgb4',
    predictions,
    data.index[train_data.shape[0]:],
    cat_conv.classes_,
)