# 引入常用的package

In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np

In [2]:
train_data = pd.read_csv("sf_data/train.csv")
test_data = pd.read_csv('sf_data/test.csv')

## 1 理解数据
    任何算法的第一步，一定是深入理解所得的数据，懂业务更佳。

1. Dates - 时间标签
2. **Category** - 犯罪类型 我们的目标标签
3. Descript - 具体犯罪描述 (only in train.csv)
4. DayOfWeek - 本周的第几天
5. PdDistrict -  警局辖区名称
6. Resolution - 解决方式 (only in train.csv)
7. Address - 发生地址 
8. X - Longitude
9. Y - Latitude

In [3]:
train_data.head(2)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599


In [4]:
test_data.head(2)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432


In [5]:
sample_data = test_data.sample(100) # 随机取100个样本

In [7]:
sample_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 682123 to 541363
Data columns (total 7 columns):
Id            100 non-null int64
Dates         100 non-null object
DayOfWeek     100 non-null object
PdDistrict    100 non-null object
Address       100 non-null object
X             100 non-null float64
Y             100 non-null float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.2+ KB


In [8]:
sample_data.describe()

Unnamed: 0,Id,X,Y
count,100.0,100.0,100.0
mean,471011.65,-122.423016,37.765097
std,244643.991672,0.02502,0.025241
min,566.0,-122.496803,37.708475
25%,258500.75,-122.429721,37.751541
50%,511690.5,-122.416509,37.775421
75%,679522.75,-122.407634,37.784189
max,872944.0,-122.385415,37.804734


### 数据预处理
##### 标准化 模块提供了一个实用类 StandardScaler

In [9]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler() # 引入标准化类

scaler.fit(sample_data[['X','Y']])

scaler.mean_ # 标准化前的均值

scaler.scale_# 标准化前的标准差

scaler.var_ # 标准化前的方差

standard_data = pd.DataFrame(scaler.transform(sample_data[['X','Y']]),columns=['X','Y'])

standard_data.head(3)

# standard_data.head()

standard_data.mean(axis=0) # 归一化后的数据均值

standard_data.std() #归一化后的数据方差

scaler.inverse_transform(standard_data[['X','Y']]) # 恢复数据操作

In [23]:
pd.concat([standard_data,standard_data],axis=1)

Unnamed: 0,X,Y,X.1,Y.1
0,-1.447504,-1.459812,-1.447504,-1.459812
1,0.887749,0.810724,0.887749,0.810724
2,0.433501,0.675958,0.433501,0.675958
3,0.602730,0.165154,0.602730,0.165154
4,0.113202,-0.257951,0.113202,-0.257951
5,0.134869,-0.034887,0.134869,-0.034887
6,0.617907,0.760229,0.617907,0.760229
7,0.733474,0.850405,0.733474,0.850405
8,-0.488946,-0.096616,-0.488946,-0.096616
9,0.259427,-0.412045,0.259427,-0.412045


In [24]:
##### 另一个实用类 MinMaxScaler

minmax = preprocessing.MinMaxScaler() # 引入归一化类

minmax.fit(sample_data[['X','Y']])

minmax_data = pd.DataFrame(minmax.transform(sample_data[['X','Y']]),columns=['X','Y'])

minmax_data.head()

minmax = preprocessing.MinMaxScaler(feature_range=(4,5)) # 引入归一化类

minmax.fit_transform(sample_data[['X','Y']])[0:3]

Unnamed: 0,X,Y
0,0.33893,0.20735
1,0.860838,0.799744
2,0.759318,0.764583
3,0.797139,0.631312
4,0.687734,0.520922


##### 分类特征处理

In [29]:
sample_data.head(4)

sample_data['DayOfWeek'].unique()

le = preprocessing.LabelEncoder() # 引入LabelEncoder

le.fit(sample_data['DayOfWeek'])

le.transform(sample_data['DayOfWeek'])[0:4] # 转为默认的列向量

le.classes_

sample_data.head()

sample_data['DayOfWeek'].unique()

sample_data['Address'].unique()

sample_data['PdDistrict'].unique()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
682123,682123,2005-09-16 19:00:00,Friday,INGLESIDE,100 Block of WESTWOOD DR,-122.45905,37.728435
273725,273725,2011-08-19 14:00:00,Friday,SOUTHERN,100 Block of 3RD ST,-122.400916,37.785457
793775,793775,2004-03-11 21:20:00,Thursday,TENDERLOIN,GOLDEN GATE AV / JONES ST,-122.412224,37.782073
779612,779612,2004-05-19 20:37:00,Wednesday,SOUTHERN,DIVISION ST / POTRERO AV,-122.408011,37.769244


In [34]:
### 有时候one-hot更合理，接下来引入one-hot变量 

enc = preprocessing.OneHotEncoder() # 引入onehot编码类

enc.fit(sample_data[['DayOfWeek','PdDistrict']])

sample_data['PdDistrict'].unique()

enc_array = enc.transform(sample_data[['DayOfWeek','PdDistrict']]).toarray()

enc_array.shape

sample_data.head(2)

enc.inverse_transform(enc_array)[0:4] #还原编码前特征

enc.get_feature_names()

enc_array.shape

enc.get_feature_names()

enc_array[0]

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

#### 空值的填充

In [49]:
sample_data.info()

from sklearn.utils import shuffle
sample_data = sample_data.reset_index()
sample_data.loc[0:10,'X']=np.nan
sample_data.loc[20:30,'DayOfWeek']=np.nan
sample_data = shuffle(sample_data)
sample_data  = sample_data.reset_index()

sample_data

sample_data.info()

from sklearn.impute import SimpleImputer

strategy : string, optional (default=”mean”)
The imputation strategy. ['mean','median','most_frequent','constant']

# 针对数值型
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

imp.fit(sample_data[['X']])

sample_data['X_fill']=imp.transform(sample_data[['X']])

imp.statistics_

imp_freq = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

imp_freq.fit(sample_data[['DayOfWeek']])

sample_data['DayOfWeek_fill']=imp_freq.transform(sample_data[['DayOfWeek']])

imp_freq.statistics_

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 682123 to 541363
Data columns (total 7 columns):
Id            100 non-null int64
Dates         100 non-null object
DayOfWeek     100 non-null object
PdDistrict    100 non-null object
Address       100 non-null object
X             100 non-null float64
Y             100 non-null float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.2+ KB


#### 多项式特征

interaction_only =False

#### X 的特征从 (X_1, X_2) 转换为 (1, X_1, X_2, X_1^2, X_1X_2, X_2^2) 。

interaction_only = True

#### X 的特征从 (X_1, X_2) 转换为 (1, X_1, X_2, X_1X_2) 。



In [60]:
from sklearn.preprocessing import PolynomialFeatures

In [63]:
poly = PolynomialFeatures(degree=2, interaction_only=True) 

In [64]:
poly.fit(sample_data[['X_fill','Y']])

PolynomialFeatures(degree=2, include_bias=True, interaction_only=True)

In [65]:
poly.transform(sample_data[['X_fill','Y']])

array([[ 1.00000000e+00, -1.22430568e+02,  3.77215834e+01,
        -4.61827487e+03],
       [ 1.00000000e+00, -1.22388799e+02,  3.77375756e+01,
        -4.61865655e+03],
       [ 1.00000000e+00, -1.22475647e+02,  3.77285281e+01,
        -4.62082590e+03],
       [ 1.00000000e+00, -1.22423506e+02,  3.77864541e+01,
        -4.62595018e+03],
       [ 1.00000000e+00, -1.22482732e+02,  3.77830359e+01,
        -4.62776946e+03],
       [ 1.00000000e+00, -1.22429645e+02,  3.77722643e+01,
        -4.62444489e+03],
       [ 1.00000000e+00, -1.22496803e+02,  3.77757940e+01,
        -4.62741399e+03],
       [ 1.00000000e+00, -1.22494982e+02,  3.77412458e+01,
        -4.62311324e+03],
       [ 1.00000000e+00, -1.22423506e+02,  3.77547485e+01,
        -4.62206868e+03],
       [ 1.00000000e+00, -1.22408068e+02,  3.77839917e+01,
        -4.62506544e+03],
       [ 1.00000000e+00, -1.22408163e+02,  3.77805345e+01,
        -4.62464583e+03],
       [ 1.00000000e+00, -1.22432176e+02,  3.77659823e+01,
      