In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Data Loaded

In [3]:
# Read the training set; the "Dates" column is parsed into datetimes on load.
train = pd.read_csv(
    filepath_or_buffer="train.csv",
    parse_dates=["Dates"],
)

# Report (rows, columns), then preview the first five rows.
print(train.shape)
train.head(n=5)

(878049, 9)


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [5]:
# Read the test set, using its "Id" column as the index and parsing "Dates"
# into datetimes on load.
test = pd.read_csv(
    filepath_or_buffer="test.csv",
    index_col="Id",
    parse_dates=["Dates"],
)

# Report (rows, columns), then preview the first five rows.
print(test.shape)
test.head(n=5)

(884262, 6)


Unnamed: 0_level_0,Dates,DayOfWeek,PdDistrict,Address,X,Y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


## Score

In [9]:
# Columns used as model inputs (presumably longitude/latitude -- confirm
# against the dataset description).
feature_names = [
    "X",
    "Y",
]
feature_names

['X', 'Y']

In [13]:
# Training feature matrix: only the columns listed in feature_names.
X_train = train[feature_names]

# Sanity check: print (rows, columns) and show the first row.
print(X_train.shape)
X_train.head(n=1)

(878049, 2)


Unnamed: 0,X,Y
0,-122.425892,37.774599


In [12]:
# Test feature matrix: the same feature_names columns as the training matrix.
X_test = test[feature_names]

# Sanity check: print (rows, columns) and show the first row.
print(X_test.shape)
X_test.head(n=1)

(884262, 2)


Unnamed: 0_level_0,X,Y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-122.399588,37.735051


In [15]:
# Name of the target column to predict.
label_name = "Category"

# Training labels, taken from the target column of the training frame.
y_train = train[label_name]

# Sanity check: print the shape and show the first label.
print(y_train.shape)
y_train.head(n=1)

(878049,)


0    WARRANTS
Name: Category, dtype: object

## Use RandomForestClassifier

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random seed shared by the estimators defined in this notebook.
seed = 33

# Random forest with library-default hyperparameters; n_jobs=-1 requests all
# available CPU cores.
model = RandomForestClassifier(n_jobs=-1, random_state=seed)
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=33,
            verbose=0, warm_start=False)

* `n_jobs=-1`은 사용 가능한 모든 CPU 코어를 병렬로 활용한다는 의미

In [21]:
%time score = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss").mean()

print("Score = {0:.5f}".format(score))

CPU times: user 2min 9s, sys: 5.71 s, total: 2min 15s
Wall time: 59.3 s
Score = -6.73939


* `neg_log_loss`는 로그 손실(log loss)에 음수를 취한 값이므로 항상 0 이하이며, 0에 가까울수록 좋음

## Use Xgboost

In [34]:
import xgboost as xgb

# Gradient-boosted classifier with 15 boosting rounds, reusing the notebook's
# seed; nthread=-1 is passed so the library picks the thread count.
model = xgb.XGBClassifier(
    nthread=-1,
    seed=seed,
    n_estimators=15,
)
model



XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=15, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=33, silent=True, subsample=1)

In [36]:
%time score = cross_val_score(model, X_train, y_train, \
                              cv=5, scoring="neg_log_loss").mean()

print("Score: {0:.5f}".format(score))

CPU times: user 9min 10s, sys: 3.8 s, total: 9min 14s
Wall time: 12min 49s
Score: -2.81218


## Predict

In [37]:
# Train on the full training set; the call's return value is left as the
# cell's last expression so it is displayed.
model.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=15, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=33, silent=True, subsample=1)

In [38]:
# Per-class probabilities for every test row (one column per class).
predictions = model.predict_proba(X_test)

# Sanity check: print the shape and show the first row of probabilities.
print(predictions.shape)
predictions[:1]

(884262, 39)


array([[ 0.01356191,  0.07389767,  0.01177335,  0.01186914,  0.03413098,
         0.01260796,  0.01293965,  0.03607652,  0.01262432,  0.01223478,
         0.01168966,  0.01196344,  0.01726453,  0.01776261,  0.01164579,
         0.0138286 ,  0.07055465,  0.01240707,  0.01190741,  0.04663624,
         0.0551002 ,  0.10478871,  0.01158336,  0.01182128,  0.01543034,
         0.03157561,  0.01253409,  0.01941617,  0.01395577,  0.01168982,
         0.01397707,  0.0119202 ,  0.03370471,  0.01157401,  0.0153227 ,
         0.04126912,  0.06713752,  0.03883071,  0.02099236]], dtype=float32)

## Submit

In [39]:
# Start from the sample submission so the "Id" index and column layout come
# from the provided template.
submission = pd.read_csv(filepath_or_buffer="sampleSubmission.csv", index_col="Id")

# predictions has one column per entry of model.classes_; transposing lets us
# walk the class names and their probability columns together.
for column, class_probabilities in zip(model.classes_, predictions.T):
    submission[column] = class_probabilities

# Report (rows, columns), then preview the first five rows.
print(submission.shape)
submission.head(n=5)

(884262, 39)


Unnamed: 0_level_0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.013562,0.073898,0.011773,0.011869,0.034131,0.012608,0.01294,0.036077,0.012624,0.012235,...,0.01169,0.013977,0.01192,0.033705,0.011574,0.015323,0.041269,0.067138,0.038831,0.020992
1,0.013898,0.075728,0.012065,0.012163,0.034976,0.01292,0.01326,0.03697,0.012937,0.012538,...,0.011979,0.014323,0.012215,0.034539,0.011861,0.015702,0.042291,0.048611,0.039792,0.021512
2,0.011959,0.051974,0.011695,0.011447,0.064669,0.012965,0.012434,0.021471,0.013255,0.012004,...,0.01141,0.01429,0.011561,0.0274,0.011346,0.015279,0.041543,0.053878,0.024129,0.013872
3,0.012403,0.073928,0.011567,0.011662,0.031971,0.012387,0.012669,0.029338,0.012499,0.011942,...,0.011485,0.01382,0.011712,0.032231,0.011372,0.014205,0.045358,0.075044,0.026464,0.017711
4,0.012403,0.073928,0.011567,0.011662,0.031971,0.012387,0.012669,0.029338,0.012499,0.011942,...,0.011485,0.01382,0.011712,0.032231,0.011372,0.014205,0.045358,0.075044,0.026464,0.017711


In [40]:
# Write the submission (index included) to disk.
# NOTE(review): "xbg" in the filename looks like a typo for "xgb" -- confirm
# before renaming, in case anything downstream expects this exact name.
submission.to_csv(path_or_buf="xbg_baseline_script.csv")