In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the training set from the local data directory.
data = pd.read_csv('data/train.csv')

In [5]:
# Preview the first rows of the training frame.
data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [6]:
from sklearn import preprocessing

In [7]:
# Load the test set, using its first CSV column as the frame index.
test = pd.read_csv('data/test.csv', index_col=0)

In [8]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0_level_0,Dates,DayOfWeek,PdDistrict,Address,X,Y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [9]:
def append_districts(distr_column):
    """Binarize a column of district labels.

    A new ``LabelBinarizer`` is fit on ``distr_column`` on every call and the
    same column is then transformed, so the set and order of output columns
    are determined solely by the labels present in the argument.

    Returns the array produced by ``LabelBinarizer.transform``.
    """
    return preprocessing.LabelBinarizer().fit_transform(distr_column)

In [10]:
# Parse the 'Dates' column of both frames into datetime values (replaces the column in place).
data['Dates'] = pd.to_datetime(data['Dates'])
test['Dates'] = pd.to_datetime(test['Dates'])

In [11]:
def conv_street(street):
    """Extract the street name(s) from an address string.

    Three address shapes are recognised:

    * ``'A ST / B ST'``   -> ``['A ST', 'B ST']`` (split on '/', whitespace stripped)
    * ``'100 Block of A ST'`` -> ``['A ST']`` (everything after 'Block of ')
    * anything else        -> ``[street]`` unchanged

    Parameters
    ----------
    street : str
        Raw address text.

    Returns
    -------
    list of str
        Always a concrete list, so every branch has the same return type
        (a bare ``map(...)`` would be a lazy iterator on Python 3).
    """
    if '/' in street:
        return [part.strip() for part in street.split('/')]

    marker = 'Block of '
    pos = street.find(marker)
    if pos != -1:
        # Skip past the marker itself to keep only the street name.
        return [street[pos + len(marker):]]

    return [street]

In [12]:
# Street vocabulary: the distinct street names found in the first 10 training addresses only.
streets = set()
for address in data.Address[0:10]:
    streets.update(conv_street(address))

In [13]:
def append_streets(adr_column, streets):
    """Build a 0/1 indicator matrix of which known streets each address names.

    Each address is parsed into its street name(s) and matched *exactly*
    against ``streets``. Plain substring search would also flag, for example,
    '23RD ST' as containing the street '3RD ST'.

    Parameters
    ----------
    adr_column : sized iterable of str
        Address strings, one per output row.
    streets : sized iterable of str
        Street vocabulary (expected to hold unique names). Column ``i`` of
        the result corresponds to the ``i``-th item yielded when iterating
        ``streets``.

    Returns
    -------
    numpy.ndarray of int, shape (len(adr_column), len(streets))
        ``result[j, i] == 1`` when address ``j`` names street ``i``.
    """
    marker = 'Block of '

    def _street_names(address):
        # Same parsing rules as conv_street: intersections are split on '/',
        # block addresses keep only the text after 'Block of '.
        if '/' in address:
            return [part.strip() for part in address.split('/')]
        pos = address.find(marker)
        if pos != -1:
            return [address[pos + len(marker):]]
        return [address]

    # One dict lookup per parsed name instead of scanning every street for
    # every address.
    column_of = {street: i for i, street in enumerate(streets)}
    streets_cols = np.zeros((len(adr_column), len(streets)), dtype=int)
    for j, address in enumerate(adr_column):
        for name in _street_names(address):
            i = column_of.get(name)
            if i is not None:
                streets_cols[j, i] = 1
    return streets_cols

In [14]:
# Preview the street indicators for the first 10 rows; slice the input first so
# only those 10 addresses are encoded rather than the whole column.
append_streets(data.Address[0:10], streets)

array([[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]])

In [15]:
def append_day_of_week(day_column):
    """One-hot encode weekday names.

    Columns are ordered Sunday (0) through Saturday (6). Returns an integer
    array of shape (len(day_column), 7) with a single 1 per row. A name that
    is not one of the seven weekday names raises ``KeyError``.
    """
    weekday_names = ('Sunday', 'Monday', 'Tuesday', 'Wednesday',
                     'Thursday', 'Friday', 'Saturday')
    position = {name: idx for idx, name in enumerate(weekday_names)}
    picks = np.array([position[day] for day in day_column], dtype=int)
    # Row k of the identity matrix is the one-hot vector for weekday k.
    return np.eye(len(weekday_names), dtype=int)[picks]

In [17]:
def append_times(date_column, first_year=2003, n_years=13):
    """One-hot encode the hour, day-of-month, month and year of each timestamp.

    Parameters
    ----------
    date_column : sized iterable
        Datetime-like objects exposing ``.hour``, ``.day``, ``.month`` and
        ``.year``.
    first_year : int, default 2003
        Year mapped to the first year column.
    n_years : int, default 13
        Number of year columns, covering ``first_year`` through
        ``first_year + n_years - 1``.

    Returns
    -------
    numpy.ndarray of int, shape (len(date_column), 24 + 31 + 12 + n_years)
        Horizontally stacked blocks in the order hour, day, month, year.

    Raises
    ------
    IndexError
        If a timestamp's year falls outside the configured range. Without
        this check a year below ``first_year`` would give a negative index
        and silently set a column at the end of the year block.
    """
    n_rows = len(date_column)
    hours_cols = np.zeros((n_rows, 24), dtype=int)
    day_cols = np.zeros((n_rows, 31), dtype=int)
    month_cols = np.zeros((n_rows, 12), dtype=int)
    year_cols = np.zeros((n_rows, n_years), dtype=int)
    for i, date in enumerate(date_column):
        year_idx = date.year - first_year
        if not 0 <= year_idx < n_years:
            raise IndexError(
                'year %d is outside the encoded range %d-%d'
                % (date.year, first_year, first_year + n_years - 1))
        hours_cols[i, date.hour] = 1
        day_cols[i, date.day - 1] = 1      # days are 1-based
        month_cols[i, date.month - 1] = 1  # months are 1-based
        year_cols[i, year_idx] = 1
    return np.hstack((hours_cols, day_cols, month_cols, year_cols))

In [18]:
# Fit a label encoder on the training categories; it is used below to turn
# category names into integer class codes.
category_enc = preprocessing.LabelEncoder()
category_enc.fit(data.Category)

LabelEncoder()

In [19]:
# Training feature matrix: raw X/Y coordinates followed by the time,
# weekday, street and district indicator blocks, joined column-wise.
X_train = np.concatenate(
    [
        data[['X', 'Y']].values,
        append_times(data.Dates),
        append_day_of_week(data.DayOfWeek),
        append_streets(data.Address, streets),
        append_districts(data.PdDistrict),
    ],
    axis=1,
)

In [20]:
# Encode the training categories as integer class codes using the fitted encoder.
y_train = category_enc.transform(data.Category)

In [21]:
# Test feature matrix, assembled with the same blocks and order as X_train.
# NOTE(review): append_districts fits a fresh binarizer on the column it is
# given, so the district columns only line up with X_train if the test frame
# contains the same set of districts - confirm.
X_test = np.concatenate(
    [
        test[['X', 'Y']].values,
        append_times(test.Dates),
        append_day_of_week(test.DayOfWeek),
        append_streets(test.Address, streets),
        append_districts(test.PdDistrict),
    ],
    axis=1,
)

In [22]:
import xgboost as xgb

In [None]:
# Train a gradient-boosted tree classifier on the assembled training features.
gbm = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=300,
    learning_rate=0.05,
).fit(X_train, y_train)

In [24]:
# Display the classifier's parameters.
gbm

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=300, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [25]:
# Predict one class code per test row.
X_pred = gbm.predict(X_test)

In [26]:
# Alias only: despite its name, X_pred holds the predicted class codes
# returned by gbm.predict, not features.
y_pred = X_pred

In [27]:
# Preview the first 10 predicted class codes.
y_pred[0:10]

array([21, 21, 16, 21, 21, 16, 16, 21, 21, 16])

In [28]:
# Persist the fitted model to disk. `sklearn.externals.joblib` was deprecated
# and later removed from scikit-learn, so prefer the standalone joblib package
# and fall back to the vendored copy only on older installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(gbm, 'gbm_1.pkl')

['gbm_1.pkl', 'gbm_1.pkl_01.npy']

In [29]:
# List the category names known to the encoder; position in this list is the integer class code.
list(category_enc.classes_)

['ARSON',
 'ASSAULT',
 'BAD CHECKS',
 'BRIBERY',
 'BURGLARY',
 'DISORDERLY CONDUCT',
 'DRIVING UNDER THE INFLUENCE',
 'DRUG/NARCOTIC',
 'DRUNKENNESS',
 'EMBEZZLEMENT',
 'EXTORTION',
 'FAMILY OFFENSES',
 'FORGERY/COUNTERFEITING',
 'FRAUD',
 'GAMBLING',
 'KIDNAPPING',
 'LARCENY/THEFT',
 'LIQUOR LAWS',
 'LOITERING',
 'MISSING PERSON',
 'NON-CRIMINAL',
 'OTHER OFFENSES',
 'PORNOGRAPHY/OBSCENE MAT',
 'PROSTITUTION',
 'RECOVERED VEHICLE',
 'ROBBERY',
 'RUNAWAY',
 'SECONDARY CODES',
 'SEX OFFENSES FORCIBLE',
 'SEX OFFENSES NON FORCIBLE',
 'STOLEN PROPERTY',
 'SUICIDE',
 'SUSPICIOUS OCC',
 'TREA',
 'TRESPASS',
 'VANDALISM',
 'VEHICLE THEFT',
 'WARRANTS',
 'WEAPON LAWS']

In [39]:
def build_predicts_matrix(pred, labels):
    """Expand predicted class codes into a one-hot float matrix.

    ``pred`` supplies one integer class code per row; ``labels`` is used only
    for its length, which fixes the number of columns. The result has shape
    (len(pred), len(labels)), with 1.0 in the predicted column of each row
    and 0.0 elsewhere.
    """
    one_hot = np.zeros((len(pred), len(labels)))
    # Each `row` is a view into `one_hot`, so assigning through it fills the matrix.
    for row, code in zip(one_hot, pred):
        row[code] = 1
    return one_hot

In [40]:
# One-hot encode the hard predictions: each row gets a single 1.0 in the predicted category's column.
# NOTE(review): if the consumer of this file expects per-class probabilities,
# gbm.predict_proba(X_test) would supply them instead - confirm.
result = build_predicts_matrix(y_pred, list(category_enc.classes_))

In [42]:
# Wrap the prediction matrix in a frame with one column per category name.
df = pd.DataFrame(data=result, columns=category_enc.classes_)

In [44]:
# Write the predictions to CSV. The row index is labelled 'Id' (the name of the
# test frame's index) so the first column has a header instead of a blank one.
# NOTE(review): df uses the default 0..n-1 index; this assumes the test Ids are
# the same consecutive range - confirm.
df.to_csv('xgb_out.csv', index_label='Id')