In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
data = pd.read_csv('data/train.csv')

In [3]:
data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [72]:
test = pd.read_csv('data/test.csv', index_col=0)

In [73]:
test.head()

Unnamed: 0_level_0,Dates,DayOfWeek,PdDistrict,Address,X,Y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [69]:
# конвертируем PdDistrict в массив бинарных признаков
def append_districts(distr_column):
    enc = preprocessing.LabelBinarizer()
    enc.fit(distr_column)
    districts = enc.transform(distr_column)
    return districts

In [102]:
# коневертируем строковые даты в объекты datetime
data.Dates = pd.to_datetime(data.Dates)
test.Dates = pd.to_datetime(test.Dates)

In [26]:
# эта функция выдергивает названия улиц из Address и возвращает списком
def conv_street(street):
    if street.find('/') != -1:
        return map(str.strip, street.split('/'))
    
    pos = street.find('Block of ')
    if pos != -1:
        
        return [street[pos+9:]]
    
    return [street]

In [35]:
# строим список всех улиц
streets = set()
for x in data.Address[0:10]:
    streets |= set(conv_street(x))

In [54]:
# конвертируем Address в набор бинарных признаков, каждый соответствует улице
def append_streets(adr_column, streets):
    streets_cols = np.zeros((len(adr_column),len(streets)), dtype=int)
    for i, street in enumerate(streets):
        for j, address in enumerate(adr_column):
            if address.find(street) != -1:
                streets_cols[j,i] = 1
    return streets_cols

In [56]:
append_streets(data.Address,streets)[0:10]

array([[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]])

In [67]:
# конвертируем день недели в бинарный признак
def append_day_of_week(day_column):
    days = {'Sunday':0, 'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 'Friday':5, 'Saturday':6}
    days_cols = np.zeros((len(day_column),len(days)),dtype=int)
    for i, day in enumerate(day_column):
        days_cols[i,days[day]] = 1
    return days_cols

In [58]:
conv_day_of_week(data.DayOfWeek)[0:10]

array([[0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0]])

In [96]:
# конвертируем дату в бинарные признаки по часам, дню, месяцу и году
def append_times(date_column):
    hours_cols = np.zeros((len(date_column),24),dtype=int)
    day_cols = np.zeros((len(date_column),31),dtype=int)
    month_cols = np.zeros((len(date_column),12),dtype=int)
    year_cols = np.zeros((len(date_column),13),dtype=int)
    for i, date in enumerate(date_column):
        hours_cols[i,date.hour] = 1
        day_cols[i,date.day-1] = 1
        month_cols[i,date.month-1] = 1
        year_cols[i,date.year-2003] = 1
    return np.hstack((hours_cols,day_cols,month_cols,year_cols))

In [79]:
# конвертирер категорий в числовые признаки
category_enc = preprocessing.LabelEncoder()
category_enc.fit(data.Category)

LabelEncoder()

In [99]:
X_train = data[['X', 'Y']].values
X_train = np.hstack((X_train,
                     append_times(data.Dates),
                     append_day_of_week(data.DayOfWeek),
                     append_streets(data.Address,streets),
                     append_districts(data.PdDistrict)))

In [80]:
y_train = category_enc.transform(data.Category)

In [103]:
X_test = test[['X', 'Y']].values
X_test = np.hstack((X_test,
                    append_times(test.Dates),
                    append_day_of_week(test.DayOfWeek),
                    append_streets(test.Address,streets),
                    append_districts(test.PdDistrict)))

In [82]:
from sklearn.ensemble import GradientBoostingClassifier

In [108]:
clf = GradientBoostingClassifier(n_estimators=50, verbose=1)

In [None]:
clf.fit(X_train,y_train)

      Iter       Train Loss   Remaining Time 
         1     2906381.1536          268.32m
         2     2773591.7348          261.57m
         3     2677401.7928          256.23m
         4     2603497.7155          250.63m
         5     2546031.4881          245.42m
         6     2498351.3305          239.61m
         7     2459085.2586          236.60m
         8     2426578.4044          244.14m
         9     2398570.7251          254.63m
        10     2374933.7061          260.57m

In [110]:
clf

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=1,
              warm_start=False)

In [122]:
from sklearn.externals import joblib
out = joblib.dump(clf, 'rf_clf_1.pkl') 

In [112]:
y_pred = clf.predict_proba(X_test)

In [123]:
def build_predicts_matrix(pred, labels):
    m = np.zeros((len(pred),len(labels)))
    for i,x in enumerate(pred):
        for j,v in enumerate(x):
            m[i,j] = v
    return m

In [124]:
result = build_predicts_matrix(y_pred, list(category_enc.classes_))
df = pd.DataFrame(result,columns=list(category_enc.classes_))
df.index.name = 'Id'
df.to_csv('gb_out.csv.gz',compression='gzip')