# 旧金山犯罪分类预测

参考资料：带你彻彻底底搞懂朴素贝叶斯公式
https://blog.csdn.net/fisherming/article/details/79509025

## 高斯朴素贝叶斯例子

In [13]:
from sklearn import datasets

# Load the iris dataset bundled with scikit-learn; it serves as the small
# Gaussian naive Bayes example in the next cell.
iris = datasets.load_iris()

### 使用交叉验证的方式，得到准确率，比简单 train test split 更加可靠

In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# Score a Gaussian naive Bayes classifier on iris with 10-fold
# cross-validation and print the mean of the per-fold scores.
gnb = GaussianNB()
iris_X, iris_y = iris.data, iris.target
scores = cross_val_score(gnb, iris_X, iris_y, cv=10)
mean_accuracy = scores.mean()
print("Accuracy:%.3f" % mean_accuracy)

Accuracy:0.953


## 读取数据

+ 这是一个多分类问题。

### parse_dates 属性的使用

In [17]:
import pandas as pd

# Parse the 'Dates' column as datetimes while loading, so the `.dt`
# accessor can be used on it later (e.g. to extract the hour).
date_columns = ['Dates']
train = pd.read_csv('../input/train.csv', parse_dates=date_columns)
test = pd.read_csv('../input/test.csv', parse_dates=date_columns)

In [18]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


<span class="burk">Dates: 日期时间</span>

Category: 犯罪类型，比如 Larceny/盗窃罪 等.

Descript: 对于犯罪更详细的描述

<span class="burk">DayOfWeek: 星期几</span>

PdDistrict: 所属警区

Resolution: 处理结果，比如说『逮捕』『逃了』

<span class="burk">Address: 发生街区位置</span>

X and Y: GPS坐标

In [23]:
# Count the distinct values of the Category column (the crime types).
len(train['Category'].unique())

39

In [29]:
from sklearn import preprocessing

# Encode the Category strings as integer class labels. The fitted encoder is
# kept so integer codes can be mapped back to names through leCrime.classes_.
leCrime = preprocessing.LabelEncoder()
leCrime.fit(train['Category'])
crime = leCrime.transform(train['Category'])

In [35]:
# Inspect the encoded label array.
crime

array([37, 21, 21, ..., 16, 35, 12])

In [41]:
# First ten raw category names, for comparison with the decoded labels below.
train['Category'].iloc[:10]

0          WARRANTS
1    OTHER OFFENSES
2    OTHER OFFENSES
3     LARCENY/THEFT
4     LARCENY/THEFT
5     LARCENY/THEFT
6     VEHICLE THEFT
7     VEHICLE THEFT
8     LARCENY/THEFT
9     LARCENY/THEFT
Name: Category, dtype: object

In [42]:
# Map the first ten integer codes back to category names through the
# encoder's class table, as a sanity check of the encoding.
leCrime.classes_[crime[:10]].tolist()

['WARRANTS',
 'OTHER OFFENSES',
 'OTHER OFFENSES',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'VEHICLE THEFT',
 'VEHICLE THEFT',
 'LARCENY/THEFT',
 'LARCENY/THEFT']

In [44]:
# Preview the DayOfWeek column.
train.DayOfWeek.head()

0    Wednesday
1    Wednesday
2    Wednesday
3    Wednesday
4    Wednesday
Name: DayOfWeek, dtype: object

In [48]:
# One-hot encode the day of the week: one indicator column per weekday name.
days = pd.get_dummies(train['DayOfWeek'])
days.head()

Unnamed: 0,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1


In [50]:
# PdDistrict: the police district the incident belongs to
train.PdDistrict.head()

0    NORTHERN
1    NORTHERN
2    NORTHERN
3    NORTHERN
4        PARK
Name: PdDistrict, dtype: object

In [52]:
# Count the distinct police districts.
len(train.PdDistrict.unique())

10

In [53]:
# One-hot encode the police district: one indicator column per district name.
district = pd.get_dummies(train['PdDistrict'])
district.head()

Unnamed: 0,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0


### 从日期时间类型字段中提取出小时信息

In [63]:
# Extract the hour of day from the parsed Dates timestamps.
hour = train['Dates'].dt.hour
hour.head()

0    23
1    23
2    23
3    23
4    23
Name: Dates, dtype: int64

In [64]:
# Count the distinct hour values present in the data.
len(hour.unique())

24

In [65]:
# Replace the hour Series with its one-hot encoding: one indicator column per
# hour value (the column labels are the integer hours).
hour = pd.get_dummies(hour)
hour.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [68]:
# Combine the features: place the hour, weekday and district indicator
# columns side by side in a single training frame.
feature_blocks = [hour, days, district]
trainData = pd.concat(feature_blocks, axis='columns')
trainData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [70]:
# Append the encoded crime labels as a 'crime' column.
trainData['crime'] = crime

### 下面以同样的方式构造测试数据集

In [74]:
# Build the test-set feature matrix the same way as the training set:
# one-hot encode day of week, police district and hour of day, then
# concatenate the indicator columns side by side.
# Note: this rebinds `days`, `district` and `hour` to the test-set versions.
days = pd.get_dummies(test.DayOfWeek)
district = pd.get_dummies(test.PdDistrict)
hour = test.Dates.dt.hour
hour = pd.get_dummies(hour)
testData = pd.concat([hour, days, district], axis=1)

# The test set is encoded independently of the training set, so a category
# value missing from test.csv would yield a different column set. Align to
# the training feature columns (everything except the 'crime' label), filling
# any indicator that is absent from the test set with 0.
feature_columns = [col for col in trainData.columns if col != 'crime']
testData = testData.reindex(columns=feature_columns, fill_value=0)

# NOTE(review): this displays trainData (which now carries the 'crime'
# column), not the freshly built testData -- confirm whether
# testData.head() was intended.
trainData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,crime
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,37
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,21
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,21
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,16
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,16


In [79]:
# (rows, columns) of the training frame, including the 'crime' label column.
trainData.shape

(878049, 42)

## 建模

### <span class="burk">这里使用伯努利贝叶斯，因为特征的取值只有 1 和 0</span>

In [78]:
# Columns fed to the model: the weekday indicators followed by the police
# district indicators. The hour-of-day indicator columns are not included.
day_columns = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
    'Sunday',
]
district_columns = [
    'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK',
    'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN',
]
features = day_columns + district_columns
len(features)

17

In [82]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. No random_state is passed, so the
# split is not pinned to a fixed seed.
model_inputs = trainData[features]
model_targets = trainData['crime']
X_train, X_test, y_train, y_test = train_test_split(
    model_inputs, model_targets, test_size=0.2)

In [85]:
# (rows, columns) of the held-out feature matrix.
X_test.shape

(175610, 17)

In [86]:
%%time
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli naive Bayes classifier on the training split.
NB = BernoulliNB()
NB.fit(X_train, y_train)

CPU times: user 825 ms, sys: 300 ms, total: 1.12 s
Wall time: 852 ms


In [97]:
%%time
# Per-class probabilities for the held-out rows (used for the log loss).
propa = NB.predict_proba(X_test)
# Predicted class labels for the held-out rows (used for the report).
y_pred = NB.predict(X_test)

CPU times: user 691 ms, sys: 159 ms, total: 850 ms
Wall time: 358 ms


In [91]:
from sklearn.metrics import log_loss

# predict_proba already returns an array, so no numpy conversion is needed
# (numpy is never imported in this notebook). `predicted` is kept as an
# alias in case later cells refer to it.
predicted = propa

# The columns of `predicted` follow NB.classes_. Passing them as `labels`
# keeps log_loss working when a rare class happens to be missing from the
# random y_test split; without it sklearn raises a ValueError in that case.
logLoss = log_loss(y_test, predicted, labels=NB.classes_)
print("朴素贝叶斯的交叉熵损失为：%.6f" % logLoss)

朴素贝叶斯的交叉熵损失为：2.612022


效果其实很差。

In [98]:
from sklearn.metrics import classification_report

# Report per-class precision / recall / F1 on the held-out split.
target_names = leCrime.classes_
# Pass the integer code of every class name explicitly. Otherwise the report
# infers its labels from y_test / y_pred, and a class missing from the random
# split would no longer line up with the full list of target_names.
report_labels = leCrime.transform(target_names)
print(classification_report(
    y_test, y_pred, labels=report_labels, target_names=target_names))

                             precision    recall  f1-score   support

                      ARSON       0.00      0.00      0.00       307
                    ASSAULT       0.00      0.00      0.00     15505
                 BAD CHECKS       0.00      0.00      0.00        67
                    BRIBERY       0.00      0.00      0.00        49
                   BURGLARY       0.00      0.00      0.00      7379
         DISORDERLY CONDUCT       0.00      0.00      0.00       879
DRIVING UNDER THE INFLUENCE       0.00      0.00      0.00       481
              DRUG/NARCOTIC       0.22      0.33      0.26     10740
                DRUNKENNESS       0.00      0.00      0.00       844
               EMBEZZLEMENT       0.00      0.00      0.00       248
                  EXTORTION       0.00      0.00      0.00        59
            FAMILY OFFENSES       0.00      0.00      0.00        94
     FORGERY/COUNTERFEITING       0.00      0.00      0.00      2002
                      FRAUD      

  'precision', 'predicted', average, warn_for)
