***芝加哥犯罪数据集分析***

导入需要的库

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier



导入芝加哥犯罪数据集

In [3]:
data=pd.read_csv('./Chicago_Crimes_2012_to_2017.csv')#导入芝加哥2012-2016年犯罪数据集
df=pd.DataFrame(data)
print(data.dtypes)#列出数据标题及相对应的类型
print(df.head())#列出数据的前几行

Unnamed: 0                int64
ID                        int64
Case Number              object
Date                     object
Block                    object
IUCR                     object
Primary Type             object
Description              object
Location Description     object
Arrest                     bool
Domestic                   bool
Beat                      int64
District                float64
Ward                    float64
Community Area          float64
FBI Code                 object
X Coordinate            float64
Y Coordinate            float64
Year                      int64
Updated On               object
Latitude                float64
Longitude               float64
Location                 object
dtype: object
   Unnamed: 0        ID Case Number                    Date  \
0           3  10508693    HZ250496  05/03/2016 11:40:00 PM   
1          89  10508695    HZ250409  05/03/2016 09:40:00 PM   
2         197  10508697    HZ250503  05/03/2016 11:31:00 PM  

数据清洗，处理丢失和无用的数据

In [4]:
df.dropna(axis=0,how='any',inplace=True)#如果有出现一个缺失的值，将缺失的值所在行进行删除
print(df)

         Unnamed: 0        ID Case Number                    Date  \
0                 3  10508693    HZ250496  05/03/2016 11:40:00 PM   
1                89  10508695    HZ250409  05/03/2016 09:40:00 PM   
2               197  10508697    HZ250503  05/03/2016 11:31:00 PM   
3               673  10508698    HZ250424  05/03/2016 10:10:00 PM   
4               911  10508699    HZ250455  05/03/2016 10:00:00 PM   
...             ...       ...         ...                     ...   
1456709     6250330  10508679    HZ250507  05/03/2016 11:33:00 PM   
1456710     6251089  10508680    HZ250491  05/03/2016 11:30:00 PM   
1456711     6251349  10508681    HZ250479  05/03/2016 12:15:00 AM   
1456712     6253257  10508690    HZ250370  05/03/2016 09:07:00 PM   
1456713     6253474  10508692    HZ250517  05/03/2016 11:38:00 PM   

                        Block  IUCR            Primary Type  \
0          013XX S SAWYER AVE  0486                 BATTERY   
1          061XX S DREXEL AVE  0486          

In [5]:
#由于Location和Latitude、Longitude是重复的，Case Number、Updated On、ID、FBI Code数据是无用的,X坐标和Y坐标可以通过经纬度转换，将其drop掉
dat=df.drop(['Location','Case Number','Updated On','X Coordinate','Y Coordinate','ID','FBI Code'],axis=1)
print(dat)

         Unnamed: 0                    Date                 Block  IUCR  \
0                 3  05/03/2016 11:40:00 PM    013XX S SAWYER AVE  0486   
1                89  05/03/2016 09:40:00 PM    061XX S DREXEL AVE  0486   
2               197  05/03/2016 11:31:00 PM   053XX W CHICAGO AVE  0470   
3               673  05/03/2016 10:10:00 PM     049XX W FULTON ST  0460   
4               911  05/03/2016 10:00:00 PM     003XX N LOTUS AVE  0820   
...             ...                     ...                   ...   ...   
1456709     6250330  05/03/2016 11:33:00 PM       026XX W 23RD PL  0486   
1456710     6251089  05/03/2016 11:30:00 PM   073XX S HARVARD AVE  1310   
1456711     6251349  05/03/2016 12:15:00 AM       024XX W 63RD ST  041A   
1456712     6253257  05/03/2016 09:07:00 PM  082XX S EXCHANGE AVE  0486   
1456713     6253474  05/03/2016 11:38:00 PM       001XX E 75TH ST  5007   

                   Primary Type              Description  \
0                       BATTERY  DOMEST

数据预处理

In [6]:
#对IUCR、Description、Location Description进行label编码
le = LabelEncoder()
le.fit(dat['Primary Type'])
dat['Primary Type']=le.transform(dat['Primary Type'])
y=dat['Primary Type']

le.fit(dat['IUCR'])
dat['IUCR']=le.transform(dat['IUCR'])

le.fit(dat["Description"])
dat['Description']=le.transform(dat['Description'])

le.fit(dat['Location Description'])
dat['Location Description']=le.transform(dat['Location Description'])

print(dat['Primary Type'],dat['IUCR'],dat['Description'],dat['Location Description'])

dat=dat.drop(['Primary Type'],axis=1)#Primary Type作为target，将其drop掉

0           2
1           2
2          27
3           2
4          31
           ..
1456709     2
1456710     6
1456711     2
1456712     2
1456713    24
Name: Primary Type, Length: 1418365, dtype: int32 0           51
1           51
2           42
3           39
4           84
          ... 
1456709     51
1456710    151
1456711     30
1456712     51
1456713    339
Name: IUCR, Length: 1418365, dtype: int32 0          119
1          119
2          264
3          282
4            0
          ... 
1456709    119
1456710    305
1456711     32
1456712    119
1456713    224
Name: Description, Length: 1418365, dtype: int32 0           17
1          109
2          125
3          121
4          109
          ... 
1456709     17
1456710     17
1456711    121
1456712    121
1456713    101
Name: Location Description, Length: 1418365, dtype: int32


In [7]:
#日期转为时间相关的属性
df['Date'] = pd.to_datetime(df['Date'])
dat['Date'] =df['Date'].dt.date
dat['Week']=df['Date'].dt.week
dat['Day'] = df['Date'].dt.day
dat['Year'] = df['Date'].dt.year
dat['Month'] = df['Date'].dt.month
dat['Hour'] = df['Date'].dt.hour
dat=dat.drop(['Date'],axis=1)

In [8]:
#使用正则表达式对Block进行数据转化
regex = r"\d+XX\s(?P<street>.*)"
subst = "\\g<street>"
dat['street'] = dat.Block.str.replace(regex, subst)
le.fit(dat['street'])
dat['street']=le.transform(dat['street'])
dat=dat.drop(['Block'],axis=1)

In [9]:
print(dat)
print(dat.dtypes)

         Unnamed: 0  IUCR  Description  Location Description  Arrest  \
0                 3    51          119                    17    True   
1                89    51          119                   109   False   
2               197    42          264                   125   False   
3               673    39          282                   121   False   
4               911    84            0                   109   False   
...             ...   ...          ...                   ...     ...   
1456709     6250330    51          119                    17    True   
1456710     6251089   151          305                    17    True   
1456711     6251349    30           32                   121   False   
1456712     6253257    51          119                   121   False   
1456713     6253474   339          224                   101    True   

         Domestic  Beat  District  Ward  Community Area  Year   Latitude  \
0            True  1022      10.0  24.0            29.0  20

应用PCA进行降维，用StandardScaler缩放数据

In [10]:
#对经纬度进行PCA降维
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(dat[['Latitude','Longitude']])
dat[['Latitude','Longitude']] = pca.transform(dat[['Latitude','Longitude']])
print(dat[['Latitude','Longitude']])

         Latitude  Longitude
0       -0.032769   0.025198
1        0.077609  -0.044797
2       -0.078869   0.063712
3       -0.067245   0.058389
4       -0.071852   0.069739
...           ...        ...
1456709 -0.013718   0.015680
1456710  0.089002  -0.010136
1456711  0.054452   0.032747
1456712  0.130174  -0.081094
1456713  0.095478  -0.021302

[1418365 rows x 2 columns]


In [11]:
scaler =StandardScaler()
scaler.fit(dat)
X_scaled=scaler.transform(dat)

测试集与训练集、验证集的划分

In [12]:
X_trainval,X_test,y_trainval,y_test=train_test_split(
                                                     X_scaled,y,test_size=0.3,random_state=42)
X_train,X_valid,y_train,y_valid=train_test_split(
                                                 X_trainval,y_trainval,random_state=40)

利用特征工程进行特征的选择

In [12]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
select = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold="median")

In [13]:
select.fit(X_train, y_train)
X_train_l1 = select.transform(X_train)
print("X_train.shape: {}".format(X_train.shape))
print("X_train_l1.shape: {}".format(X_train_l1.shape))
X_test_l1 = select.transform(X_test)

X_train.shape: (744641, 18)
X_train_l1.shape: (744641, 9)


建立模型，选择最好的模型并进行调参

In [14]:
#逻辑回归建立模型
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=10000,multi_class='ovr',solver='lbfgs')
logreg.fit(X_train,y_train)
print("Test score: {:.2f}".format(logreg.score(X_test,y_test)))
score =logreg.fit(X_train_l1, y_train).score(X_test_l1, y_test)
print("Test score: {:.3f}".format(score))

Test score: 0.65
Test score: 0.652


In [15]:
#用决策树建立模型
from sklearn.tree import DecisionTreeClassifier
model1 = DecisionTreeClassifier(max_depth=4,random_state=0)   
model1.fit(X_train_l1,y_train)
print("Test score: {:.2f}".format(model1.score(X_test_l1,y_test)))

Test score: 0.88


In [16]:
#用随机森林建立模型
from sklearn.ensemble import RandomForestClassifier  
model2 = RandomForestClassifier(n_estimators=4,max_features='sqrt')  
model2.fit(X_train_l1,y_train)
print("Test score: {:.2f}".format(model2.score(X_test_l1,y_test)))

Test score: 1.00


In [17]:
#用SVC建立模型
from sklearn.svm import SVC  
model3 = SVC(max_iter=100000,gamma='auto')  #SVM
model3.fit(X_train_l1,y_train)
print("Test score: {:.2f}".format(model3.score(X_test_l1,y_test)))

Test score: 0.98


经过比较，发现决策树和随机森林模型最好，由于随机森林的内推性很好，容易处于过拟合的状态，无法外推，在下面的模型评估中，我们决定采用SVC来进行模型评估

模型评估

In [None]:
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(
   dat,y, random_state=0)
print("Size of training set: {}   size of test set: {}".format(
      X_train.shape[0], X_test.shape[0]))

best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
       
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        
        score = svm.score(X_test, y_test)
        
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

Size of training set: 1063773   size of test set: 354592


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
lr = SVC(gamma='auto').fit(X_train, y_train)
pred = lr.predict(X_test)
print("Accuracy: {:.3f}".format(accuracy_score(y_test, pred)))
print("Confusion matrix:\n{}".format(confusion_matrix(y_test, pred)))

In [None]:
print(classification_report(y_test, pred))