# Kaggle 案例之“旧金山犯罪分类”

## 1、分析需求

In [1]:
import numpy as np
import pandas as pd
import os

print(os.listdir("../input"))

['.DS_Store', 'sampleSubmission.csv', 'test.csv', 'sampleSubmission.csv.zip', 'train.csv', 'test.csv.zip', 'train.csv.zip']


In [2]:
%%time
train = pd.read_csv('../input/train.csv', parse_dates=['Dates'])
test = pd.read_csv('../input/test.csv', parse_dates=['Dates'], index_col='Id')

CPU times: user 3.75 s, sys: 368 ms, total: 4.11 s
Wall time: 4.18 s


In [3]:
train.shape

(878049, 9)

In [4]:
test.shape

(884262, 6)

In [5]:
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [6]:
test.head()

Unnamed: 0_level_0,Dates,DayOfWeek,PdDistrict,Address,X,Y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
Dates         878049 non-null datetime64[ns]
Category      878049 non-null object
Descript      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
Resolution    878049 non-null object
Address       878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: datetime64[ns](1), float64(2), object(6)
memory usage: 60.3+ MB


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 884262 entries, 0 to 884261
Data columns (total 6 columns):
Dates         884262 non-null datetime64[ns]
DayOfWeek     884262 non-null object
PdDistrict    884262 non-null object
Address       884262 non-null object
X             884262 non-null float64
Y             884262 non-null float64
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 47.2+ MB


### 检查是否有空值

In [9]:
train.isnull().sum()

Dates         0
Category      0
Descript      0
DayOfWeek     0
PdDistrict    0
Resolution    0
Address       0
X             0
Y             0
dtype: int64

### 特征工程

In [10]:
def feature_engineering(data):
    data['Date'] = pd.to_datetime(data['Dates'].dt.date)

    # 距离最早的统计时间，经过了多少天
    data['n_days'] = (
        data['Date'] - data['Date'].min()).apply(lambda x: x.days)
    # 这个月的第几天
    data['Day'] = data['Dates'].dt.day
    # 这个星期的第几天
    data['DayOfWeek'] = data['Dates'].dt.weekday
    data['Month'] = data['Dates'].dt.month
    data['Year'] = data['Dates'].dt.year
    data['Hour'] = data['Dates'].dt.hour
    data['Minute'] = data['Dates'].dt.minute
    # 街区信息是否有 Block
    data['Block'] = data['Address'].str.contains(
        'block', case=False).apply(lambda x: 1 if x == True else 0)
    # 经纬度之差
    data["X_Y"] = data["X"] - data["Y"]
    # 经纬度之和
    data["XY"] = data["X"] + data["Y"]
    data.drop(columns=['Dates', 'Date', 'Address'], inplace=True)
    return data

In [11]:
train = feature_engineering(train)
test = feature_engineering(test)

In [12]:
train.head()

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,X,Y,n_days,Day,Month,Year,Hour,Minute,Block,X_Y,XY
0,WARRANTS,WARRANT ARREST,2,NORTHERN,"ARREST, BOOKED",-122.425892,37.774599,4510,13,5,2015,23,53,0,-160.20049,-84.651293
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,2,NORTHERN,"ARREST, BOOKED",-122.425892,37.774599,4510,13,5,2015,23,53,0,-160.20049,-84.651293
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,2,NORTHERN,"ARREST, BOOKED",-122.424363,37.800414,4510,13,5,2015,23,33,0,-160.224777,-84.623949
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,2,NORTHERN,NONE,-122.426995,37.800873,4510,13,5,2015,23,30,1,-160.227868,-84.626123
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,2,PARK,NONE,-122.438738,37.771541,4510,13,5,2015,23,30,1,-160.210279,-84.667196


In [13]:
test.head()

Unnamed: 0_level_0,DayOfWeek,PdDistrict,X,Y,n_days,Day,Month,Year,Hour,Minute,Block,X_Y,XY
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,6,BAYVIEW,-122.399588,37.735051,4512,10,5,2015,23,59,1,-160.134639,-84.664537
1,6,BAYVIEW,-122.391523,37.732432,4512,10,5,2015,23,51,0,-160.123955,-84.659091
2,6,NORTHERN,-122.426002,37.792212,4512,10,5,2015,23,50,1,-160.218214,-84.63379
3,6,INGLESIDE,-122.437394,37.721412,4512,10,5,2015,23,45,1,-160.158806,-84.715982
4,6,INGLESIDE,-122.437394,37.721412,4512,10,5,2015,23,45,1,-160.158806,-84.715982


In [14]:
train.drop(columns=['Descript', 'Resolution'], inplace=True)

In [15]:
from sklearn.preprocessing import LabelEncoder

le1 = LabelEncoder()
train['PdDistrict'] = le1.fit_transform(train['PdDistrict'])
test['PdDistrict'] = le1.transform(test['PdDistrict'])

In [16]:
train.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,n_days,Day,Month,Year,Hour,Minute,Block,X_Y,XY
0,WARRANTS,2,4,-122.425892,37.774599,4510,13,5,2015,23,53,0,-160.20049,-84.651293
1,OTHER OFFENSES,2,4,-122.425892,37.774599,4510,13,5,2015,23,53,0,-160.20049,-84.651293
2,OTHER OFFENSES,2,4,-122.424363,37.800414,4510,13,5,2015,23,33,0,-160.224777,-84.623949
3,LARCENY/THEFT,2,4,-122.426995,37.800873,4510,13,5,2015,23,30,1,-160.227868,-84.626123
4,LARCENY/THEFT,2,5,-122.438738,37.771541,4510,13,5,2015,23,30,1,-160.210279,-84.667196


In [17]:
test.head()

Unnamed: 0_level_0,DayOfWeek,PdDistrict,X,Y,n_days,Day,Month,Year,Hour,Minute,Block,X_Y,XY
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,6,0,-122.399588,37.735051,4512,10,5,2015,23,59,1,-160.134639,-84.664537
1,6,0,-122.391523,37.732432,4512,10,5,2015,23,51,0,-160.123955,-84.659091
2,6,4,-122.426002,37.792212,4512,10,5,2015,23,50,1,-160.218214,-84.63379
3,6,2,-122.437394,37.721412,4512,10,5,2015,23,45,1,-160.158806,-84.715982
4,6,2,-122.437394,37.721412,4512,10,5,2015,23,45,1,-160.158806,-84.715982


In [18]:
le2 = LabelEncoder()
X = train.drop(columns=['Category'])
y = le2.fit_transform(train['Category'])

In [19]:
train.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,n_days,Day,Month,Year,Hour,Minute,Block,X_Y,XY
0,WARRANTS,2,4,-122.425892,37.774599,4510,13,5,2015,23,53,0,-160.20049,-84.651293
1,OTHER OFFENSES,2,4,-122.425892,37.774599,4510,13,5,2015,23,53,0,-160.20049,-84.651293
2,OTHER OFFENSES,2,4,-122.424363,37.800414,4510,13,5,2015,23,33,0,-160.224777,-84.623949
3,LARCENY/THEFT,2,4,-122.426995,37.800873,4510,13,5,2015,23,30,1,-160.227868,-84.626123
4,LARCENY/THEFT,2,5,-122.438738,37.771541,4510,13,5,2015,23,30,1,-160.210279,-84.667196


In [20]:
X.head()

Unnamed: 0,DayOfWeek,PdDistrict,X,Y,n_days,Day,Month,Year,Hour,Minute,Block,X_Y,XY
0,2,4,-122.425892,37.774599,4510,13,5,2015,23,53,0,-160.20049,-84.651293
1,2,4,-122.425892,37.774599,4510,13,5,2015,23,53,0,-160.20049,-84.651293
2,2,4,-122.424363,37.800414,4510,13,5,2015,23,33,0,-160.224777,-84.623949
3,2,4,-122.426995,37.800873,4510,13,5,2015,23,30,1,-160.227868,-84.626123
4,2,5,-122.438738,37.771541,4510,13,5,2015,23,30,1,-160.210279,-84.667196


注意：在 lightgbm 中，将 `PdDistrict` 设置为类别变量。

In [21]:
import lightgbm as lgb

train_data = lgb.Dataset(
    X, label=y, categorical_feature=[
        'PdDistrict',
    ])

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [22]:
params = {
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'num_class': 39,
    'max_delta_step': 0.9,
    'min_data_in_leaf': 21,
    'learning_rate': 0.4,
    'max_bin': 465,
    'num_leaves': 41,
    'verbose': 1
}

In [23]:
%%time

bst = lgb.train(params, train_data, num_boost_round=120)



CPU times: user 10min 17s, sys: 1min 10s, total: 11min 28s
Wall time: 2min 41s


In [24]:
%%time
y_pred = bst.predict(test)
y_pred

CPU times: user 8min 36s, sys: 2.76 s, total: 8min 39s
Wall time: 1min 13s


In [25]:
columns = le2.inverse_transform(np.linspace(0, 38, 39, dtype='int16'))
columns

array(['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
       'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
       'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
       'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
       'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
       'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS'], dtype=object)

In [26]:
submission = pd.DataFrame(
    y_pred,
    columns=le2.inverse_transform(np.linspace(0, 38, 39, dtype='int16')),
    index=test.index)

In [27]:
submission.to_csv('../output/LGBM_final.csv', index_label='Id')

参考资料：

https://www.kaggle.com/junheo/sf-crime-rate-prediction
