# San Francisco Crime Classification

In [3]:
import numpy as np
import pandas as pd

seed = 37  # random seed; passed to XGBClassifier(seed=...) when the final model is built

## Load Dataset

In [2]:
# Read the training set; the "Dates" column is parsed into datetimes on load.
train = pd.read_csv("data/train.csv", parse_dates=["Dates"])

# Quick sanity check: dimensions, then the first few rows.
print(train.shape)
train.head()

(878049, 9)


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [3]:
# Read the test set; the "Dates" column is parsed into datetimes on load.
test = pd.read_csv("data/test.csv", parse_dates=["Dates"])

# Quick sanity check: dimensions, then the first few rows.
print(test.shape)
test.head()

(884262, 7)


Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [4]:
# train = train.sample(frac=0.1)
# test = test.sample(frac=0.1)

# train = train[~train["Category"].isin(["PORNOGRAPHY/OBSCENE MAT", "TREA"])]

# print(train.shape, test.shape)

## Preprocessing

### Parse Dates

In [5]:
# Split the timestamp into one integer column per component ("Dates-year", ...).
date_parts = ["year", "month", "day", "hour", "minute", "second"]

for part in date_parts:
    train["Dates-" + part] = getattr(train["Dates"].dt, part)

print(train.shape)
train[["Dates"] + ["Dates-" + part for part in date_parts]].head()

(878049, 15)


Unnamed: 0,Dates,Dates-year,Dates-month,Dates-day,Dates-hour,Dates-minute,Dates-second
0,2015-05-13 23:53:00,2015,5,13,23,53,0
1,2015-05-13 23:53:00,2015,5,13,23,53,0
2,2015-05-13 23:33:00,2015,5,13,23,33,0
3,2015-05-13 23:30:00,2015,5,13,23,30,0
4,2015-05-13 23:30:00,2015,5,13,23,30,0


In [6]:
# Split the timestamp into one integer column per component ("Dates-year", ...).
date_parts = ["year", "month", "day", "hour", "minute", "second"]

for part in date_parts:
    test["Dates-" + part] = getattr(test["Dates"].dt, part)

print(test.shape)
test[["Dates"] + ["Dates-" + part for part in date_parts]].head()

(884262, 13)


Unnamed: 0,Dates,Dates-year,Dates-month,Dates-day,Dates-hour,Dates-minute,Dates-second
0,2015-05-10 23:59:00,2015,5,10,23,59,0
1,2015-05-10 23:51:00,2015,5,10,23,51,0
2,2015-05-10 23:50:00,2015,5,10,23,50,0
3,2015-05-10 23:45:00,2015,5,10,23,45,0
4,2015-05-10 23:45:00,2015,5,10,23,45,0


### Encode DayOfWeek

In [7]:
# One-hot encode the day of the week. The dummy frame is named after what it
# holds (mirroring `test_dayofweek`) so it is not confused with the PdDistrict
# dummies, which are stored in `train_pd_district` further down.
train_dayofweek = pd.get_dummies(train["DayOfWeek"], prefix="DayOfWeek")

train = pd.concat([train, train_dayofweek], axis=1)
train[["DayOfWeek"] + list(train_dayofweek.columns)].head()

Unnamed: 0,DayOfWeek,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,Wednesday,0,0,0,0,0,0,1
1,Wednesday,0,0,0,0,0,0,1
2,Wednesday,0,0,0,0,0,0,1
3,Wednesday,0,0,0,0,0,0,1
4,Wednesday,0,0,0,0,0,0,1


In [8]:
# One-hot encode the day of the week and append the indicator columns to `test`.
test_dayofweek = pd.get_dummies(test["DayOfWeek"], prefix="DayOfWeek")
test = pd.concat([test, test_dayofweek], axis=1)

# Preview the source column next to its indicator columns.
test.loc[:, ["DayOfWeek"] + test_dayofweek.columns.tolist()].head()

Unnamed: 0,DayOfWeek,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,Sunday,0,0,0,1,0,0,0
1,Sunday,0,0,0,1,0,0,0
2,Sunday,0,0,0,1,0,0,0
3,Sunday,0,0,0,1,0,0,0
4,Sunday,0,0,0,1,0,0,0


### Encode PdDistrict

In [9]:
# One-hot encode the police district and append the indicator columns to `train`.
train_pd_district = pd.get_dummies(train["PdDistrict"], prefix="PdDistrict")
train = pd.concat([train, train_pd_district], axis=1)

# Preview the source column next to its indicator columns.
train.loc[:, ["PdDistrict"] + train_pd_district.columns.tolist()].head()

Unnamed: 0,PdDistrict,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,NORTHERN,0,0,0,0,1,0,0,0,0,0
1,NORTHERN,0,0,0,0,1,0,0,0,0,0
2,NORTHERN,0,0,0,0,1,0,0,0,0,0
3,NORTHERN,0,0,0,0,1,0,0,0,0,0
4,PARK,0,0,0,0,0,1,0,0,0,0


In [10]:
# One-hot encode the police district and append the indicator columns to `test`.
test_pd_district = pd.get_dummies(test["PdDistrict"], prefix="PdDistrict")
test = pd.concat([test, test_pd_district], axis=1)

# Preview the source column next to its indicator columns.
test.loc[:, ["PdDistrict"] + test_pd_district.columns.tolist()].head()

Unnamed: 0,PdDistrict,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,BAYVIEW,1,0,0,0,0,0,0,0,0,0
1,BAYVIEW,1,0,0,0,0,0,0,0,0,0
2,NORTHERN,0,0,0,0,1,0,0,0,0,0
3,INGLESIDE,0,0,1,0,0,0,0,0,0,0
4,INGLESIDE,0,0,1,0,0,0,0,0,0,0


### Replace Dates-minute with its absolute distance from 30 (so a value of 30 becomes 0)

see this [notebook](http://deepdori.wowinsights.net/notebooks/kaggle/san_francisco_crime_classification/labs/20160810-shaynekang-02-ignore-dates-minute.ipynb)

In [11]:
# Distance of the minute from the half-hour mark: |minute - 30|.
train["Dates-minute_abs"] = (train["Dates-minute"] - 30).abs()

# Five most frequent distances, then the frame dimensions.
print(train["Dates-minute_abs"].value_counts()[:5])
print(train.shape)
train[["Dates", "Dates-minute", "Dates-minute_abs"]].head()

30    268950
0     125173
15     78133
10     50695
20     48257
Name: Dates-minute_abs, dtype: int64
(878049, 33)


Unnamed: 0,Dates,Dates-minute,Dates-minute_abs
0,2015-05-13 23:53:00,53,23
1,2015-05-13 23:53:00,53,23
2,2015-05-13 23:33:00,33,3
3,2015-05-13 23:30:00,30,0
4,2015-05-13 23:30:00,30,0


In [12]:
# Distance of the minute from the half-hour mark: |minute - 30|.
test["Dates-minute_abs"] = (test["Dates-minute"] - 30).abs()

# Five most frequent distances, then the frame dimensions.
print(test["Dates-minute_abs"].value_counts()[:5])
print(test.shape)
test[["Dates", "Dates-minute", "Dates-minute_abs"]].head()

30    271449
0     126650
15     78359
10     51220
20     48807
Name: Dates-minute_abs, dtype: int64
(884262, 31)


Unnamed: 0,Dates,Dates-minute,Dates-minute_abs
0,2015-05-10 23:59:00,59,29
1,2015-05-10 23:51:00,51,21
2,2015-05-10 23:50:00,50,20
3,2015-05-10 23:45:00,45,15
4,2015-05-10 23:45:00,45,15


### Add the 'AddressType' column

In [13]:
# An address containing "Block of" is a block; anything else is treated as a crossroad.
is_block = train["Address"].str.contains("Block of")
train["AddressType"] = is_block.map({True: "Block", False: "CrossRoad"})

# Numeric encoding of the type: Block -> 0.0, CrossRoad -> 1.0.
train["AddressType_encode"] = train["AddressType"].map({"Block": 0.0, "CrossRoad": 1.0})

train[["Address", "AddressType", "AddressType_encode"]].head()

Unnamed: 0,Address,AddressType,AddressType_encode
0,OAK ST / LAGUNA ST,CrossRoad,1.0
1,OAK ST / LAGUNA ST,CrossRoad,1.0
2,VANNESS AV / GREENWICH ST,CrossRoad,1.0
3,1500 Block of LOMBARD ST,Block,0.0
4,100 Block of BRODERICK ST,Block,0.0


In [14]:
# An address containing "Block of" is a block; anything else is treated as a crossroad.
is_block = test["Address"].str.contains("Block of")
test["AddressType"] = is_block.map({True: "Block", False: "CrossRoad"})

# Numeric encoding of the type: Block -> 0.0, CrossRoad -> 1.0.
test["AddressType_encode"] = test["AddressType"].map({"Block": 0.0, "CrossRoad": 1.0})

test[["Address", "AddressType", "AddressType_encode"]].head()

Unnamed: 0,Address,AddressType,AddressType_encode
0,2000 Block of THOMAS AV,Block,0.0
1,3RD ST / REVERE AV,CrossRoad,1.0
2,2000 Block of GOUGH ST,Block,0.0
3,4700 Block of MISSION ST,Block,0.0
4,4700 Block of MISSION ST,Block,0.0


### Clean up Address

In [15]:
# Distinct training addresses that contain a "/" (i.e. street intersections).
has_slash = train["Address"].str.contains("/")
crossroad_list = train.loc[has_slash, "Address"].unique()

print(len(crossroad_list))
crossroad_list[:3]

12278


array(['OAK ST / LAGUNA ST', 'VANNESS AV / GREENWICH ST',
       'AVALON AV / PERU AV'], dtype=object)

In [16]:
address = '2000 Block of GOUGH ST'

def clean_up_duplicated_address(address):
    """Return a canonical spelling of an intersection address.

    The same intersection can be written as "A / B" or "B / A". This
    normalises both to one form by trimming each street name and joining the
    names in ascending string order with " / ".

    Parameters
    ----------
    address : str
        Raw address. Anything that is not a string (e.g. a NaN for a missing
        value) or that contains no "/" is returned unchanged.

    Returns
    -------
    The normalised address, or the input itself when there is nothing to
    normalise.
    """
    if not isinstance(address, str) or "/" not in address:
        return address

    # Sorting every "/"-separated part (rather than unpacking exactly two)
    # also copes with addresses that list more than two streets.
    streets = sorted(part.strip() for part in address.split("/"))

    return " / ".join(streets)

clean_up_duplicated_address(address)

'2000 Block of GOUGH ST'

In [17]:
from tqdm import tqdm

# Register tqdm with pandas so `progress_apply` is available; `desc` labels the progress bar.
tqdm.pandas(desc='cleaning up (train) ...')

# Normalise every training address with clean_up_duplicated_address (defined in an earlier cell).
train["Address"] = train["Address"].progress_apply(clean_up_duplicated_address)

# Number of distinct addresses after normalisation.
print(len(train["Address"].unique()))

print(train.shape)
train.head()

cleaning up (train) ...: 100%|██████████| 878049/878049 [00:00<00:00, 1038572.02it/s]


17812
(878049, 35)


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Dates-year,...,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,Dates-minute_abs,AddressType,AddressType_encode
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",LAGUNA ST / OAK ST,-122.425892,37.774599,2015,...,0,1,0,0,0,0,0,23,CrossRoad,1.0
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",LAGUNA ST / OAK ST,-122.425892,37.774599,2015,...,0,1,0,0,0,0,0,23,CrossRoad,1.0
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",GREENWICH ST / VANNESS AV,-122.424363,37.800414,2015,...,0,1,0,0,0,0,0,3,CrossRoad,1.0
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,...,0,1,0,0,0,0,0,0,Block,0.0
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,...,0,0,1,0,0,0,0,0,Block,0.0


In [18]:
# Re-register tqdm with pandas so the progress bar is labelled for the test pass.
# NOTE: relies on `tqdm` having been imported in an earlier cell.
tqdm.pandas(desc='cleaning up (test) ...')

# Normalise every test address with clean_up_duplicated_address (defined in an earlier cell).
test["Address"] = test["Address"].progress_apply(clean_up_duplicated_address)

# Number of distinct addresses after normalisation.
print(len(test["Address"].unique()))

print(test.shape)
test.head()

cleaning up (test) ...: 100%|██████████| 884262/884262 [00:00<00:00, 1056819.20it/s]


17772
(884262, 33)


Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,Dates-year,Dates-month,Dates-day,...,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,Dates-minute_abs,AddressType,AddressType_encode
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051,2015,5,10,...,0,0,0,0,0,0,0,29,Block,0.0
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432,2015,5,10,...,0,0,0,0,0,0,0,21,CrossRoad,1.0
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212,2015,5,10,...,0,1,0,0,0,0,0,20,Block,0.0
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,2015,5,10,...,0,0,0,0,0,0,0,15,Block,0.0
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,2015,5,10,...,0,0,0,0,0,0,0,15,Block,0.0


### Encode Address

In [19]:
# Occurrence count of each training address.
address_list = train["Address"].value_counts()

# "Major" addresses are those that occur at least 100 times in the training set.
is_major = address_list >= 100
major_address_list = address_list[is_major].index

print(major_address_list.shape)
major_address_list[:3]

(1719,)


Index(['800 Block of BRYANT ST', '800 Block of MARKET ST',
       '2000 Block of MISSION ST'],
      dtype='object')

In [20]:
# Keep major addresses as they are and collapse every other address into "Others".
train["Address_cleanup"] = train["Address"].where(
    train["Address"].isin(major_address_list), "Others"
)

print(len(train["Address_cleanup"].unique()))
print(train.shape)

train[["Address", "Address_cleanup"]].head()

1720
(878049, 36)


Unnamed: 0,Address,Address_cleanup
0,LAGUNA ST / OAK ST,Others
1,LAGUNA ST / OAK ST,Others
2,GREENWICH ST / VANNESS AV,Others
3,1500 Block of LOMBARD ST,1500 Block of LOMBARD ST
4,100 Block of BRODERICK ST,Others


In [None]:
# Keep major addresses (as determined on train) and collapse every other address into "Others".
test["Address_cleanup"] = test["Address"].where(
    test["Address"].isin(major_address_list), "Others"
)

print(len(test["Address_cleanup"].unique()))
print(test.shape)

test[["Address", "Address_cleanup"]].head()

1720
(884262, 34)


Unnamed: 0,Address,Address_cleanup
0,2000 Block of THOMAS AV,Others
1,3RD ST / REVERE AV,3RD ST / REVERE AV
2,2000 Block of GOUGH ST,Others
3,4700 Block of MISSION ST,4700 Block of MISSION ST
4,4700 Block of MISSION ST,4700 Block of MISSION ST


In [None]:
from scipy.sparse import csr_matrix

# One-hot encode Address_cleanup straight into a sparse matrix. Building the
# dense dummy frame first (rows x distinct addresses, float32) needs several
# gigabytes only to be converted to CSR immediately afterwards.
# Column j corresponds to the j-th category, i.e. the distinct values in
# sorted order -- the same column order pd.get_dummies produces.
address_categorical = pd.Categorical(train["Address_cleanup"])
address_codes = address_categorical.codes

# A code of -1 marks a missing value; such rows get no indicator set.
has_address = address_codes >= 0

train_address = csr_matrix(
    (
        np.ones(int(has_address.sum()), dtype="float32"),
        (np.flatnonzero(has_address), address_codes[has_address]),
    ),
    shape=(len(train), len(address_categorical.categories)),
)

train_address

In [None]:
# One-hot encode Address_cleanup straight into a sparse matrix, without
# materialising the dense dummy frame. The categories are taken from the
# training column so that column j means the same address in train and test.
address_categories = pd.Categorical(train["Address_cleanup"]).categories
address_codes = pd.Categorical(test["Address_cleanup"], categories=address_categories).codes

# A code of -1 marks a value that is missing or absent from train; such rows
# get no indicator set.
has_address = address_codes >= 0

# NOTE: relies on `csr_matrix` having been imported in an earlier cell.
test_address = csr_matrix(
    (
        np.ones(int(has_address.sum()), dtype="float32"),
        (np.flatnonzero(has_address), address_codes[has_address]),
    ),
    shape=(len(test), len(address_categories)),
)

test_address

## Score

In [None]:
# `sklearn.cross_validation` was removed from scikit-learn; cross_val_score
# lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score

# Look the one-hot columns up on `train` by prefix, so the feature list is
# defined by the training frame alone instead of mixing train- and
# test-derived dummy frames. Selecting these names from `test` later will fail
# loudly if a column is missing there.
pd_district_columns = [column for column in train.columns if column.startswith("PdDistrict_")]
dayofweek_columns = [column for column in train.columns if column.startswith("DayOfWeek_")]

feature_names = ["X", "Y"]
feature_names = feature_names + ["AddressType_encode"]
feature_names = feature_names + pd_district_columns
feature_names = feature_names + dayofweek_columns
feature_names = feature_names + ["Dates-hour", "Dates-minute_abs"]

# Name of the target column.
label_name = "Category"

In [None]:
# Dense feature columns for training, in the order given by `feature_names`.
X_train = train[feature_names]

print(X_train.shape)
X_train.head()

In [None]:
from scipy.sparse import hstack

# Append the sparse address indicators to the right of the dense features.
# This rebinds X_train, so re-running the cell would stack the address block again.
# NOTE(review): scipy's hstack returns COO format unless `format=` is given --
# confirm the downstream consumers accept that.
X_train = hstack([X_train, train_address])
X_train

In [None]:
# Dense feature columns for the test set, in the same order as for training.
X_test = test[feature_names]

print(X_test.shape)
X_test.head()

In [None]:
from scipy.sparse import hstack

# Append the sparse address indicators to the right of the dense features.
# This rebinds X_test, so re-running the cell would stack the address block again.
# NOTE(review): scipy's hstack returns COO format unless `format=` is given --
# confirm the downstream consumers accept that.
X_test = hstack([X_test, test_address])
X_test

In [None]:
# Target column, selected via `label_name`.
y_train = train[label_name]

print(y_train.shape)
y_train.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the category names with integer codes. `encoder` is kept so the
# codes can be mapped back to names later.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

print(y_train.shape)
y_train[0:10]

## Tune Hyperparameters

### Coarse Search

In [None]:
import os

import xgboost as xgb

# Random search over XGBoost hyperparameters: each trial samples one
# configuration, scores it with 5-fold cross-validation (multi-class log loss)
# and checkpoints all results so far to hyperparameters/coarse.csv.
num_epoch = 100      # number of random configurations to try
n_estimators = 100   # maximum boosting rounds per configuration

# The training matrix and the class count are the same for every trial, so
# build them once instead of on every iteration.
dtrain = xgb.DMatrix(X_train, label=y_train)
num_class = len(np.unique(y_train))

# Dedicated generator seeded from OS entropy. Sampling from it (instead of the
# global np.random state) keeps the trials random even if something reseeds
# the global state between iterations.
# NOTE(review): xgb.cv is believed to reseed np.random for fold shuffling in
# some versions, which is presumably why the global generator used to be
# reseeded on every iteration -- confirm.
rng = np.random.RandomState()

# Make sure the checkpoint directory exists before the first (slow) trial finishes.
os.makedirs("hyperparameters", exist_ok=True)

hyperparameters_list = []

for epoch in range(num_epoch):
    learning_rate = rng.uniform(low=0.1, high=1.0)
    max_depth = rng.randint(low=5, high=100)
    subsample = rng.uniform(low=0.1, high=1.0)
    colsample_bytree = rng.uniform(low=0.1, high=1.0)
    colsample_bylevel = rng.uniform(low=0.1, high=1.0)
    # Regularisation strengths are sampled log-uniformly between 1e-10 and 10.
    reg_alpha = 10 ** rng.uniform(low=-10.0, high=1.0)
    reg_lambda = 10 ** rng.uniform(low=-10.0, high=1.0)
    max_delta_step = rng.uniform(low=0.1, high=10.0)

    params = {
        'booster': 'gbtree',
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'eta': learning_rate,
        'max_depth': max_depth,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'colsample_bylevel': colsample_bylevel,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'max_delta_step': max_delta_step,
        'num_class': num_class,
        'nthread': 8,
        'silent': 1,
    }

    result = xgb.cv(params, dtrain, n_estimators, nfold=5, metrics={'mlogloss'})

    # Best mean validation loss, and the (1-based) boosting round that reached it.
    test_loss = result["test-mlogloss-mean"]
    score = test_loss.min()
    num_best_round = int(test_loss.values.argmin()) + 1

    print("{0:3} num_round = {1}, learning_rate = {2:.6f}, max_depth = {3}, subsample = {4:.6f}, colsample_bytree = {5:.6f}, colsample_bylevel = {6:.6f}, reg_alpha = {7:.10f}, reg_lambda = {8:.10f}, max_delta_step = {9:.6f}, score = {10:.5f}" \
          .format(epoch, num_best_round, learning_rate, max_depth, subsample, colsample_bytree, colsample_bylevel, reg_alpha, reg_lambda, max_delta_step, score))

    hyperparameters_list.append({
        'epoch': epoch,
        'n_estimators': num_best_round,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'colsample_bylevel': colsample_bylevel,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'max_delta_step': max_delta_step,
        'score': score,
    })

    # Checkpoint after every trial (best score first) so an interrupted search
    # keeps the results gathered so far.
    checkpoint = pd.DataFrame(hyperparameters_list)
    checkpoint = checkpoint.sort_values(by="score", ascending=True)

    checkpoint.to_csv("hyperparameters/coarse.csv")

hyperparameters_list = pd.DataFrame(hyperparameters_list)
hyperparameters_list = hyperparameters_list.sort_values(by="score", ascending=True)

print(hyperparameters_list.shape)
hyperparameters_list.head()

In [4]:
import xgboost as xgb

# Fixed hyperparameters for the final classifier.
# NOTE(review): presumably taken from the coarse search results -- confirm.
model_params = dict(
    objective='multi:softprob',
    n_estimators=50,
    learning_rate=0.115519,
    max_depth=17,
    max_delta_step=8.857549,
    subsample=0.899305,
    colsample_bytree=0.634061,
    colsample_bylevel=0.886308,
    reg_alpha=5.559613e-06,
    reg_lambda=8.183245,
    nthread=-1,
    seed=seed,
)

model = xgb.XGBClassifier(**model_params)
model

XGBClassifier(base_score=0.5, colsample_bylevel=0.886308,
       colsample_bytree=0.634061, gamma=0, learning_rate=0.115519,
       max_delta_step=8.857549, max_depth=17, min_child_weight=1,
       missing=None, n_estimators=50, nthread=-1,
       objective='multi:softprob', reg_alpha=5.559613e-06,
       reg_lambda=8.183245, scale_pos_weight=1, seed=37, silent=True,
       subsample=0.899305)

In [None]:
from sklearn.cross_validation import cross_val_score

%time score = cross_val_score(model, X_train, y_train, scoring='neg_log_loss', cv=5).mean()

score = -1.0 * score

print("Score = {0:.5f}".format(score))

## Predict

In [None]:
# Fit the model on the full training matrix; %time reports how long it takes.
%time model.fit(X_train, y_train)

In [None]:
# Per-class probabilities for every test row.
predictions = model.predict_proba(X_test)

print(predictions.shape)
predictions[0:10]

## Submit

In [None]:
# Start from the sample submission so the row index ("Id") matches what is expected.
submission = pd.read_csv("data/sampleSubmission.csv", index_col="Id")

# The model was fit on integer-encoded labels, so model.classes_ holds the
# integer codes rather than the category names. Decode them with the label
# encoder; otherwise the loop below would add new columns named 0, 1, 2, ...
# and leave the named category columns of the sample submission untouched.
category_list = encoder.inverse_transform(model.classes_)

# Column i of `predictions` is the probability of the i-th class.
for i, category in enumerate(category_list):
    submission[category] = predictions[:, i]

print(submission.shape)
submission.head(3)

In [None]:
from datetime import datetime

# The file name combines a timestamp, the cross-validation score and a short
# free-text tag, e.g. "<YYYYmmdd_HHMMSS>_<score>_<description>.csv".
description = "to-the-top-10"
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")

filename = "{time}_{score:.5f}_{description}.csv".format(
    time=current_time,
    score=score,
    description=description,
)
filepath = "submissions/{filename}".format(filename=filename)

submission.to_csv(filepath)