In [1]:
import pandas as pd
import numpy as np

In [2]:
# Fixed random seed so that model training is repeatable between runs.
seed = 21

## Load Dataset

In [3]:
# Read the training CSV; the "Dates" column is parsed into datetimes so the
# `.dt` accessor can be used on it.
train = pd.read_csv("sf-crime/data/train.csv", parse_dates=["Dates"])

print(train.shape)
train.head()

(878049, 9)


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
# Read the test CSV with the same datetime parsing as the training data.
test = pd.read_csv("sf-crime/data/test.csv", parse_dates=["Dates"])

print(test.shape)
test.head()

(884262, 7)


Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


## Preprocessing

In [5]:
# Split the parsed "Dates" timestamp into one integer column per component.
train_date_parts = ["year", "month", "day", "hour", "minute"]
for date_part in train_date_parts:
    train[date_part] = getattr(train["Dates"].dt, date_part)

# #train["dayofweek"] = train["Dates"].dt.dayofweek

# train["Monday"] = train["DayOfWeek"] == "Monday"
# train["Tuesday"] = train["DayOfWeek"] == "Tuesday"
# train["Wednesday"] = train["DayOfWeek"] == "Wednesday"
# train["Thursday"] = train["DayOfWeek"] == "Thursday"
# train["Friday"] = train["DayOfWeek"] == "Friday"
# train["Saturday"] = train["DayOfWeek"] == "Saturday"
# train["Sunday"] = train["DayOfWeek"] == "Sunday"

print(train.shape)
train[train_date_parts].head()

(878049, 14)


Unnamed: 0,year,month,day,hour,minute
0,2015,5,13,23,53
1,2015,5,13,23,53
2,2015,5,13,23,33
3,2015,5,13,23,30
4,2015,5,13,23,30


In [6]:
# Same timestamp decomposition as for the training frame.
for date_part in ["year", "month", "day", "hour", "minute"]:
    test[date_part] = getattr(test["Dates"].dt, date_part)

# test["Monday"] = test["DayOfWeek"] == "Monday"
# test["Tuesday"] = test["DayOfWeek"] == "Tuesday"
# test["Wednesday"] = test["DayOfWeek"] == "Wednesday"
# test["Thursday"] = test["DayOfWeek"] == "Thursday"
# test["Friday"] = test["DayOfWeek"] == "Friday"
# test["Saturday"] = test["DayOfWeek"] == "Saturday"
# test["Sunday"] = test["DayOfWeek"] == "Sunday"

### Encode DayOfWeek

In [7]:
# One-hot encode the weekday name; the new columns are named "DayOfWeek_<name>".
train_dayofweek = pd.get_dummies(train["DayOfWeek"], prefix="DayOfWeek")

# Append the indicator columns next to the existing ones (index-aligned).
#train.merge(train_dayofweek, left_index=True, right_index=True)
train = pd.concat((train, train_dayofweek), axis="columns")
train[["DayOfWeek"] + train_dayofweek.columns.tolist()].head()

Unnamed: 0,DayOfWeek,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,Wednesday,0,0,0,0,0,0,1
1,Wednesday,0,0,0,0,0,0,1
2,Wednesday,0,0,0,0,0,0,1
3,Wednesday,0,0,0,0,0,0,1
4,Wednesday,0,0,0,0,0,0,1


In [8]:
# One-hot encode the weekday name of the test rows ("DayOfWeek_<name>" columns).
test_dayofweek = pd.get_dummies(test["DayOfWeek"], prefix="DayOfWeek")

# Append the indicator columns next to the existing ones (index-aligned).
#test.merge(test_dayofweek, left_index=True, right_index=True)
test = pd.concat((test, test_dayofweek), axis="columns")
test[["DayOfWeek"] + test_dayofweek.columns.tolist()].head()

Unnamed: 0,DayOfWeek,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,Sunday,0,0,0,1,0,0,0
1,Sunday,0,0,0,1,0,0,0
2,Sunday,0,0,0,1,0,0,0
3,Sunday,0,0,0,1,0,0,0
4,Sunday,0,0,0,1,0,0,0


### Encode PdDistrict

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
encoder = LabelEncoder()

# string -> integer code
district = encoder.fit_transform(train["PdDistrict"])

# integer code -> one-hot: row k of the identity matrix is the one-hot vector
# for code k, so indexing the identity by the codes yields one row per sample.
num_districts = len(encoder.classes_)
district_train_hot = np.eye(num_districts)[district]

# encoder.classes_ holds the sorted unique district names, i.e. the column
# labels matching the integer codes.
PdDistrict_train_hot = pd.DataFrame(district_train_hot, columns=encoder.classes_)
train = pd.concat([train, PdDistrict_train_hot], axis=1)
#train.merge(PdDistrict_train_hot, left_index=True, right_index=True)
train[["PdDistrict"] + list(encoder.classes_)].head()

Unnamed: 0,PdDistrict,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,NORTHERN,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,NORTHERN,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,NORTHERN,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,NORTHERN,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,PARK,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [11]:
# Learn the district vocabulary from the TRAINING data and only *transform*
# the test data with it. Fitting a fresh encoder on test would derive the
# one-hot column set and order from whatever districts happen to occur in
# test, which is not guaranteed to line up with the training columns.
# An unseen test district now raises instead of silently shifting columns.
encoder = LabelEncoder()
encoder.fit(train["PdDistrict"])

district = encoder.transform(test["PdDistrict"])  # string -> integer code

num_districts = len(encoder.classes_)
district_test_hot = np.eye(num_districts)[district]  # integer code -> one-hot

PdDistrict_test_hot = pd.DataFrame(district_test_hot, columns=encoder.classes_)
test = pd.concat([test, PdDistrict_test_hot], axis=1)
#test.merge(PdDistrict_test_hot, left_index=True, right_index=True)
test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,year,month,day,...,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051,2015,5,10,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432,2015,5,10,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212,2015,5,10,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,2015,5,10,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,2015,5,10,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# pd_district_list = train["PdDistrict"].unique()

# pd_district_new_column_list = []

# for pd_district in pd_district_list:    
#     pd_district_new_column = "PdDistrict-{0}".format(pd_district)
#     train[pd_district_new_column] = train["PdDistrict"] == pd_district
    
#     pd_district_new_column_list.append(pd_district_new_column)

# print(train.shape)
# train[pd_district_new_column_list].head()

In [13]:
# for pd_district in pd_district_list:    
#     pd_district_new_column = "PdDistrict-{0}".format(pd_district)
#     test[pd_district_new_column] = test["PdDistrict"] == pd_district

# print(test.shape)
# test[pd_district_new_column_list].head()

### Convert minute

In [14]:
# Distance of the reported minute from the half-hour mark (0..30).
train["minute-abs"] = (train["minute"] - 30).abs()
test["minute-abs"] = (test["minute"] - 30).abs()

In [15]:
# Five most frequent "minute-abs" values, then a preview of the new column.
print(train["minute-abs"].value_counts().head(5))
print(train.shape)
train[["Dates", "minute", "minute-abs"]].head()

30    268950
0     125173
15     78133
10     50695
20     48257
Name: minute-abs, dtype: int64
(878049, 32)


Unnamed: 0,Dates,minute,minute-abs
0,2015-05-13 23:53:00,53,23
1,2015-05-13 23:53:00,53,23
2,2015-05-13 23:33:00,33,3
3,2015-05-13 23:30:00,30,0
4,2015-05-13 23:30:00,30,0


### Address-Type

In [16]:
# An address is a "Block" when it contains "Block of"; everything else is
# labelled "CrossRoad". The numeric encoding is 0.0 for Block, 1.0 for CrossRoad.
train_is_block = train["Address"].str.contains("Block of")

train["AddressType"] = train_is_block.map({True: "Block", False: "CrossRoad"})
train["AddressType_encode"] = (~train_is_block).astype("float64")

train[["Address", "AddressType", "AddressType_encode"]].head()

Unnamed: 0,Address,AddressType,AddressType_encode
0,OAK ST / LAGUNA ST,CrossRoad,1.0
1,OAK ST / LAGUNA ST,CrossRoad,1.0
2,VANNESS AV / GREENWICH ST,CrossRoad,1.0
3,1500 Block of LOMBARD ST,Block,0.0
4,100 Block of BRODERICK ST,Block,0.0


In [17]:
# Same Block / CrossRoad labelling as for the training frame
# (0.0 for Block, 1.0 for CrossRoad).
test_is_block = test["Address"].str.contains("Block of")

test["AddressType"] = test_is_block.map({True: "Block", False: "CrossRoad"})
test["AddressType_encode"] = (~test_is_block).astype("float64")

test[["Address", "AddressType", "AddressType_encode"]].head()

Unnamed: 0,Address,AddressType,AddressType_encode
0,2000 Block of THOMAS AV,Block,0.0
1,3RD ST / REVERE AV,CrossRoad,1.0
2,2000 Block of GOUGH ST,Block,0.0
3,4700 Block of MISSION ST,Block,0.0
4,4700 Block of MISSION ST,Block,0.0


In [18]:
# Boolean flags derived from the raw address text.
train["CrossRoad"] = train["Address"].str.contains("/")
test["CrossRoad"] = test["Address"].str.contains("/")

train["Block"] = train["Address"].str.contains(" of ")
test["Block"] = test["Address"].str.contains(" of ")

# Match "AV" only as a whole token. A plain substring search also fires on
# street names that merely contain the letters (e.g. "DAVIS ST").
train["AV"] = train["Address"].str.contains(r"\bAV\b", regex=True)
test["AV"] = test["Address"].str.contains(r"\bAV\b", regex=True)

In [19]:
# Preview the address-derived columns side by side with the raw address.
train[["Address", "AddressType", "AddressType_encode", "CrossRoad", "Block", "AV"]].head()

Unnamed: 0,Address,AddressType,AddressType_encode,CrossRoad,Block,AV
0,OAK ST / LAGUNA ST,CrossRoad,1.0,True,False,False
1,OAK ST / LAGUNA ST,CrossRoad,1.0,True,False,False
2,VANNESS AV / GREENWICH ST,CrossRoad,1.0,True,False,True
3,1500 Block of LOMBARD ST,Block,0.0,False,True,False
4,100 Block of BRODERICK ST,Block,0.0,False,True,False


### Clean up Address

In [20]:
# Distinct training addresses that contain a "/" separator.
train_has_slash = train["Address"].str.contains("/")
crossroad_list = train.loc[train_has_slash, "Address"].unique()

print(len(crossroad_list))
crossroad_list[:3]

12278


array(['OAK ST / LAGUNA ST', 'VANNESS AV / GREENWICH ST',
       'AVALON AV / PERU AV'], dtype=object)

In [21]:
def merge_duplicated_address(address):
    """Return a canonical spelling of an intersection address.

    Addresses without a "/" are returned unchanged. For an address of the
    form "A / B", the street names are stripped of surrounding whitespace and
    joined in ascending string order, so "OAK ST / LAGUNA ST" and
    "LAGUNA ST / OAK ST" both become "LAGUNA ST / OAK ST".

    Addresses with more than one "/" are handled the same way (all parts
    sorted) instead of raising ValueError from tuple unpacking.

    Parameters
    ----------
    address : str
        Raw address text.

    Returns
    -------
    str
        The canonical address.
    """
    if "/" not in address:
        return address
    # Sorting the stripped parts gives the same result as the pairwise
    # comparison for two streets and generalises to three or more.
    streets = sorted(part.strip() for part in address.split("/"))
    return " / ".join(streets)

In [22]:
# Keep an untouched copy of the raw address before "Address" is overwritten.
train["Address(Origin)"] = train["Address"].copy()

In [23]:
from tqdm import tqdm
# Register `progress_apply` on pandas objects, with a label for the progress bar.
tqdm.pandas(desc="cleaning up (train) ...")

# Overwrite "Address" with its canonical form (see merge_duplicated_address).
train["Address"] = train["Address"].progress_apply(merge_duplicated_address)

print(len(train["Address"].unique()))
print(train.shape)
train.head()

cleaning up (train) ...: 100%|██████████| 878049/878049 [00:00<00:00, 913405.96it/s]


17812
(878049, 38)


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,year,...,SOUTHERN,TARAVAL,TENDERLOIN,minute-abs,AddressType,AddressType_encode,CrossRoad,Block,AV,Address(Origin)
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",LAGUNA ST / OAK ST,-122.425892,37.774599,2015,...,0.0,0.0,0.0,23,CrossRoad,1.0,True,False,False,OAK ST / LAGUNA ST
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",LAGUNA ST / OAK ST,-122.425892,37.774599,2015,...,0.0,0.0,0.0,23,CrossRoad,1.0,True,False,False,OAK ST / LAGUNA ST
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",GREENWICH ST / VANNESS AV,-122.424363,37.800414,2015,...,0.0,0.0,0.0,3,CrossRoad,1.0,True,False,True,VANNESS AV / GREENWICH ST
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,...,0.0,0.0,0.0,0,Block,0.0,False,True,False,1500 Block of LOMBARD ST
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,...,0.0,0.0,0.0,0,Block,0.0,False,True,False,100 Block of BRODERICK ST


In [24]:
from tqdm import tqdm
# Register `progress_apply` on pandas objects. The label names the frame that
# is actually being processed in this cell (test, not train).
tqdm.pandas(desc="cleaning up (test) ...")

# Overwrite "Address" with its canonical form (see merge_duplicated_address).
test["Address"] = test["Address"].progress_apply(merge_duplicated_address)

print(len(test["Address"].unique()))
print(test.shape)
test.head()

cleaning up (train) ...: 100%|██████████| 884262/884262 [00:00<00:00, 955587.05it/s]


17772
(884262, 35)


Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,year,month,day,...,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,minute-abs,AddressType,AddressType_encode,CrossRoad,Block,AV
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051,2015,5,10,...,0.0,0.0,0.0,0.0,29,Block,0.0,False,True,True
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432,2015,5,10,...,0.0,0.0,0.0,0.0,21,CrossRoad,1.0,True,False,True
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212,2015,5,10,...,0.0,0.0,0.0,0.0,20,Block,0.0,False,True,False
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,2015,5,10,...,0.0,0.0,0.0,0.0,15,Block,0.0,False,True,False
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,2015,5,10,...,0.0,0.0,0.0,0.0,15,Block,0.0,False,True,False


### Encode Address

In [25]:
# train["Address"] = train["Address(Origin)"].apply(merge_duplicated_address)
# #train["Address"].unique().shape, train["Address(Origin)"].unique().shape
# address_counts = train["Address"].value_counts()
# major_address_list = address_counts[address_counts >= 100].index
# train.loc[~train["Address"].isin(major_address_list), "Address"] = "Others"
# train["Address"].value_counts()
# Address_onehot = pd.get_dummies(train["Address"])

In [26]:
# Occurrence count of every (canonicalised) training address.
address_list = train["Address"].value_counts()

# Addresses that occur at least 100 times in the training data.
major_address_list = address_list.loc[address_list >= 100].index
major_address_list

Index(['800 Block of BRYANT ST', '800 Block of MARKET ST',
       '2000 Block of MISSION ST', '1000 Block of POTRERO AV',
       '16TH ST / MISSION ST', '900 Block of MARKET ST', '0 Block of TURK ST',
       '0 Block of 6TH ST', '300 Block of ELLIS ST', '400 Block of ELLIS ST',
       ...
       '700 Block of WEBSTER ST', 'OAKDALE AV / RANKIN ST',
       '14TH ST / SOUTH VAN NESS AV', 'LANE ST / PALOU AV',
       '1800 Block of SUTTER ST', 'HOWARD ST / THE EMBARCADEROSOUTH ST',
       'BROADWAY ST / POWELL ST', '300 Block of BUCHANAN ST',
       '0 Block of LAGUNA ST', '0 Block of ZOO RD'],
      dtype='object', length=1719)

In [27]:
# Keep addresses listed in major_address_list; collapse all others into "Others".
train_is_major = train["Address"].isin(major_address_list)
train["Address_cleanup"] = train["Address"].where(train_is_major, "Others")

print(len(train["Address_cleanup"].unique()))
print(train.shape)

train[["Address", "Address_cleanup"]].head()

1720
(878049, 39)


Unnamed: 0,Address,Address_cleanup
0,LAGUNA ST / OAK ST,Others
1,LAGUNA ST / OAK ST,Others
2,GREENWICH ST / VANNESS AV,Others
3,1500 Block of LOMBARD ST,1500 Block of LOMBARD ST
4,100 Block of BRODERICK ST,Others


In [28]:
# Apply the training-derived major_address_list to test; every other address
# becomes "Others".
test_is_major = test["Address"].isin(major_address_list)
test["Address_cleanup"] = test["Address"].where(test_is_major, "Others")

print(len(test["Address_cleanup"].unique()))
print(test.shape)

test[["Address", "Address_cleanup"]].head()

1720
(884262, 36)


Unnamed: 0,Address,Address_cleanup
0,2000 Block of THOMAS AV,Others
1,3RD ST / REVERE AV,3RD ST / REVERE AV
2,2000 Block of GOUGH ST,Others
3,4700 Block of MISSION ST,4700 Block of MISSION ST
4,4700 Block of MISSION ST,4700 Block of MISSION ST


In [29]:
from scipy.sparse import csr_matrix

# One-hot encode the cleaned-up address ("Address_<value>" columns) as float32
# and store the result as a CSR sparse matrix.
train_address = csr_matrix(
    pd.get_dummies(train["Address_cleanup"], prefix="Address").astype("float32")
)

train_address

<878049x1720 sparse matrix of type '<class 'numpy.float32'>'
	with 878049 stored elements in Compressed Sparse Row format>

In [30]:
from scipy.sparse import csr_matrix

# One-hot encode the cleaned-up test address as float32 and store it as CSR.
# NOTE(review): the columns come from the values present in test; this only
# lines up with train_address if both frames contain the same set of
# "Address_cleanup" values -- confirm.
test_address = csr_matrix(
    pd.get_dummies(test["Address_cleanup"], prefix="Address").astype("float32")
)

test_address

<884262x1720 sparse matrix of type '<class 'numpy.float32'>'
	with 884262 stored elements in Compressed Sparse Row format>

### X, Y 

In [31]:
def _XY(pddistrict):
    """Overwrite extreme coordinates of one district with the district mean.

    Works on the module-level ``train`` and ``test`` frames in place.

    For each axis ("X", "Y") the replacement value is the mean of that axis
    over the ``train`` rows of ``pddistrict`` whose value is neither the
    column-wide maximum nor the column-wide minimum of ``train``.

    That value is then written to the "_X" / "_Y" column of every row of
    ``pddistrict`` whose raw coordinate equals the column-wide maximum or
    minimum of its own frame (``train`` extremes for train rows, ``test``
    extremes for test rows).

    Parameters
    ----------
    pddistrict : str
        Value of the "PdDistrict" column selecting the rows to process.
    """
    axes = ("X", "Y")

    # District means computed from the training data only, extremes excluded.
    train_member = train["PdDistrict"] == pddistrict
    district_mean = {}
    for axis in axes:
        values = train[axis]
        at_extreme = (values == values.max()) | (values == values.min())
        district_mean[axis] = train.loc[train_member & ~at_extreme, axis].mean()

    # Patch train first, then test; each frame uses its own max/min to decide
    # which rows are extreme.
    for frame in (train, test):
        member = frame["PdDistrict"] == pddistrict
        for axis in axes:
            values = frame[axis]
            at_extreme = (values == values.max()) | (values == values.min())
            frame.loc[member & at_extreme, "_" + axis] = district_mean[axis]

In [32]:
# Start "_X"/"_Y" as copies of the raw coordinates; _XY then patches the
# extreme values district by district.
for frame in (train, test):
    frame["_X"] = frame["X"].copy()
    frame["_Y"] = frame["Y"].copy()

district = train["PdDistrict"].unique()
for pddistrict_name in district:
    _XY(pddistrict_name)

In [33]:
# Preview the training frame including the new "_X" / "_Y" columns.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,year,...,minute-abs,AddressType,AddressType_encode,CrossRoad,Block,AV,Address(Origin),Address_cleanup,_X,_Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",LAGUNA ST / OAK ST,-122.425892,37.774599,2015,...,23,CrossRoad,1.0,True,False,False,OAK ST / LAGUNA ST,Others,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",LAGUNA ST / OAK ST,-122.425892,37.774599,2015,...,23,CrossRoad,1.0,True,False,False,OAK ST / LAGUNA ST,Others,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",GREENWICH ST / VANNESS AV,-122.424363,37.800414,2015,...,3,CrossRoad,1.0,True,False,True,VANNESS AV / GREENWICH ST,Others,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,...,0,Block,0.0,False,True,False,1500 Block of LOMBARD ST,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,...,0,Block,0.0,False,True,False,100 Block of BRODERICK ST,Others,-122.438738,37.771541


## Train

In [34]:
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import cross_val_score

# Dense feature columns fed to the model, in order:
# cleaned coordinates, address-type flag, district one-hots,
# weekday one-hots, and the time-of-day features.
#feature_names = ["X", "Y"]
feature_names = (
    ["_X", "_Y"]
    + ["AddressType_encode"]
    + list(np.unique(test["PdDistrict"]))
    + list(test_dayofweek.columns)
    + ["hour", "minute-abs"]
)

# Name of the target column in the training frame.
label_name = "Category"



In [35]:
# Select the dense feature columns of the training frame.
X_train = train[feature_names]

print(X_train.shape)
X_train.head()

(878049, 22)


Unnamed: 0,_X,_Y,AddressType_encode,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,...,TENDERLOIN,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,hour,minute-abs
0,-122.425892,37.774599,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0,0,0,0,0,0,1,23,23
1,-122.425892,37.774599,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0,0,0,0,0,0,1,23,23
2,-122.424363,37.800414,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0,0,0,0,0,0,1,23,3
3,-122.426995,37.800873,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0,0,0,0,0,0,1,23,0
4,-122.438738,37.771541,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0,0,0,0,0,0,1,23,0


In [36]:
from scipy.sparse import hstack

# Append the sparse address one-hot matrix to the right of the dense features.
X_train = hstack([X_train, train_address])
X_train

<878049x1742 sparse matrix of type '<class 'numpy.float64'>'
	with 6237123 stored elements in COOrdinate format>

In [37]:
# Select the same dense feature columns from the test frame.
X_test = test[feature_names]

print(X_test.shape)
X_test.head()

(884262, 22)


Unnamed: 0,_X,_Y,AddressType_encode,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,...,TENDERLOIN,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,hour,minute-abs
0,-122.399588,37.735051,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,1,0,0,0,23,29
1,-122.391523,37.732432,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,1,0,0,0,23,21
2,-122.426002,37.792212,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0,0,0,1,0,0,0,23,20
3,-122.437394,37.721412,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,1,0,0,0,23,15
4,-122.437394,37.721412,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,1,0,0,0,23,15


In [38]:
from scipy.sparse import hstack

# Append the sparse address one-hot matrix to the right of the dense features.
X_test = hstack([X_test, test_address])
X_test

<884262x1742 sparse matrix of type '<class 'numpy.float64'>'
	with 6279013 stored elements in COOrdinate format>

In [39]:
# Name of the target column in the training frame.
label_name = "Category"

# Target values as strings (category names); encoded to integers afterwards.
y_train = train[label_name]

print(y_train.shape)
y_train.head()

(878049,)


0          WARRANTS
1    OTHER OFFENSES
2    OTHER OFFENSES
3     LARCENY/THEFT
4     LARCENY/THEFT
Name: Category, dtype: object

In [40]:
from sklearn.preprocessing import LabelEncoder

# Map each category name to an integer code; `encoder` keeps the mapping
# (encoder.classes_[code] is the category name).
encoder = LabelEncoder()
encoder.fit(y_train)
y_train = encoder.transform(y_train)

print(y_train.shape)
y_train[:10]

(878049,)


array([37, 21, 21, 16, 16, 16, 36, 36, 16, 16])

## Tune Hyperparameters

### Coarse Search

In [41]:
# import xgboost as xgb

# num_epoch = 100
# n_estimators = 100

# dtrain = xgb.DMatrix(X_train, label=y_train)

# hyperparameters_list = []

# for epoch in range(num_epoch):
#     np.random.seed(None)

#     learning_rate = np.random.uniform(low=0.1, high=1.0)
#     max_depth = np.random.randint(low=5, high=100)
#     subsample = np.random.uniform(low=0.1, high=1.0)
#     colsample_bytree = np.random.uniform(low=0.1, high=1.0)
#     colsample_bylevel = np.random.uniform(low=0.1, high=1.0)
#     reg_alpha = 10 ** np.random.uniform(high=1.0, low=-10.0)
#     reg_lambda = 10 ** np.random.uniform(high=1.0, low=-10.0)
#     max_delta_step = np.random.uniform(low=0.1, high=10.0)
    
#     np.random.seed(None)

#     params = {
#         'booster': 'gbtree',
#         'objective': 'multi:softprob',
#         'eval_metric': 'mlogloss',
#         'eta': learning_rate,
#         'max_depth': max_depth,
#         'subsample': subsample,
#         'colsample_bytree': colsample_bytree,
#         'colsample_bylevel': colsample_bylevel,
#         'reg_alpha': reg_alpha,
#         'reg_lambda': reg_lambda,
#         'max_delta_step': max_delta_step,
#         'num_class': len(np.unique(y_train)),
#         'nthread': 8,
#         'silent': 1,
#     }

#     dtrain = xgb.DMatrix(X_train, label=y_train)
    
#     result = xgb.cv(params, dtrain, n_estimators, nfold=5, metrics={'mlogloss'})

#     score = result["test-mlogloss-mean"].min()
#     num_best_round = result["test-mlogloss-mean"].argmin() + 1

#     np.random.seed(None)

#     print("{0:3} num_round = {1}, learning_rate = {2:.6f}, max_depth = {3}, subsample = {4:.6f}, colsample_bytree = {5:.6f}, colsample_bylevel = {6:.6f}, reg_alpha = {7:.10f}, reg_lambda = {8:.10f}, max_delta_step = {9:.6f}, score = {10:.5f}" \
#           .format(epoch, num_best_round, learning_rate, max_depth, subsample, colsample_bytree, colsample_bylevel, reg_alpha, reg_lambda, max_delta_step, score))
    
#     hyperparameters_list.append({
#         'epoch': epoch,
#         'n_estimators': num_best_round,
#         'learning_rate': learning_rate,
#         'max_depth': max_depth,
#         'subsample': subsample,
#         'colsample_bytree': colsample_bytree,
#         'colsample_bylevel': colsample_bylevel,
#         'reg_alpha': reg_alpha,
#         'reg_lambda': reg_lambda,
#         'max_delta_step': max_delta_step,
#         'score': score,
#     })
    
#     tmp = pd.DataFrame.from_dict(hyperparameters_list)
#     tmp = tmp.sort_values(by="score", ascending=True)
    
#     tmp.to_csv("hyperparameters/coarse.csv")

# hyperparameters_list = pd.DataFrame.from_dict(hyperparameters_list)
# hyperparameters_list = hyperparameters_list.sort_values(by="score", ascending=True)

# print(hyperparameters_list.shape)
# hyperparameters_list.head()

In [42]:
# hyperparameters_list.head()

### Use XGBoost

In [43]:
import xgboost as xgb

# Small baseline classifier: 15 boosting rounds, 4 threads, all other
# settings left at the library defaults.
model = xgb.XGBClassifier(n_estimators=15, nthread=4)
model

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=15, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [44]:
# # 하이퍼 파라미터 튜닝 버젼 - 시간 오래 걸리므로 마지막에
# model = xgb.XGBClassifier(n_estimators=45, max_depth=6, learning_rate=1.0, max_delta_step=1,  nthread=4, seed=37)
# model

In [45]:
import xgboost as xgb

# Hyperparameters for the multi-class (probability output) classifier.
xgb_params = {
    "objective": 'multi:softprob',
    "n_estimators": 50,
    "learning_rate": 0.115519,
    "max_depth": 17,
    "max_delta_step": 8.857549,
    "subsample": 0.899305,
    "colsample_bytree": 0.634061,
    "colsample_bylevel": 0.886308,
    "reg_alpha": 5.559613e-06,
    "reg_lambda": 8.183245,
    "nthread": -1,
    "seed": seed,
}

model = xgb.XGBClassifier(**xgb_params)
model

XGBClassifier(base_score=0.5, colsample_bylevel=0.886308,
       colsample_bytree=0.634061, gamma=0, learning_rate=0.115519,
       max_delta_step=8.857549, max_depth=17, min_child_weight=1,
       missing=None, n_estimators=50, nthread=-1,
       objective='multi:softprob', reg_alpha=5.559613e-06,
       reg_lambda=8.183245, scale_pos_weight=1, seed=21, silent=True,
       subsample=0.899305)

### Score

In [46]:
from sklearn.cross_validation import cross_val_score

%time score = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss").mean()
score = -1.0 * score 
print("Score = {0:.5f}".format(score)) 

KeyboardInterrupt: 

NameError: name 'score' is not defined

## Predict

In [None]:
# Fit the classifier on the full training matrix and report the elapsed time.
%time model.fit(X_train, y_train)

In [None]:
# The submission needs one probability per category for every row, so use
# predict_proba (2-D: samples x classes). predict() would return a 1-D array
# of class labels that cannot be indexed per category column.
predictions = model.predict_proba(X_test)

print(predictions.shape)
predictions

### Submit

In [None]:
# Load the sample submission as a template, indexed by the "Id" column.
submission = pd.read_csv("sf-crime/data/sampleSubmission.csv", index_col="Id")

print(submission.shape)
submission.head()

In [None]:
# The model was trained on integer codes produced by `encoder` (the
# LabelEncoder fitted on the "Category" column), so the category NAMES for
# the submission columns come from encoder.classes_, in code order.
# (`model.classes` does not exist, and the fitted model only knows the codes.)
category_list = encoder.classes_

# `predictions` must be the (n_samples, n_classes) probability matrix;
# column i holds the probability of the category with code i.
for i, category in enumerate(category_list):
    submission[category] = predictions[:, i]

print(submission.shape)
submission.head(3)

In [None]:
from datetime import datetime

# Timestamp used to make every submission file name unique.
current_time = datetime.now()
current_time = current_time.strftime("%Y%m%d_%H%M%S")

description = "to-the-top-10"

# File name pattern: <timestamp>_<cv score>_<description>.csv
# `score` must have been computed by the cross-validation cell.
filename = "{time}_{score:.5f}_{description}.csv".format(time=current_time, score=score, description=description)
# Fixed the misspelled variable ("filapath") that made the to_csv call below
# fail with a NameError.
filepath = "submission/{filename}".format(filename=filename)

submission.to_csv(filepath)

In [None]:
# submission = pd.DataFrame(predictions, 
#              index=submission.index, 
#              columns=submission.columns)

# print(submission.shape)
# submission.head()

In [None]:
# submission.to_csv("sf-crime/data/baseline-script_{0:5f}.csv".format(score))