# 1. 导入包

In [6]:
# 导入包
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 2. 导入数据集

In [7]:
# 导入数据集
dataset = pd.read_csv('diamonds.csv')
dataset.shape

(53940, 10)

# 3. 数据预处理

## 3.1 处理缺失数据

In [8]:
# 统计缺失数据
dataset.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

没有任何一个字段有缺失值

In [9]:
# 找出业务不合理的数据
xyz_zero_df = dataset.query('x < 0.01 or y < 0.01 or z < 0.01')
xyz_zero_df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
2207,1.0,Premium,G,SI2,59.1,59.0,6.55,6.48,0.0,3142
2314,1.01,Premium,H,I1,58.1,59.0,6.66,6.6,0.0,3167
4791,1.1,Premium,G,SI2,63.0,59.0,6.5,6.47,0.0,3696
5471,1.01,Premium,F,SI2,59.2,58.0,6.5,6.47,0.0,3837
10167,1.5,Good,G,I1,64.0,61.0,7.15,7.04,0.0,4731
11182,1.07,Ideal,F,SI2,61.6,56.0,0.0,6.62,0.0,4954
11963,1.0,Very Good,H,VS2,63.3,53.0,0.0,0.0,0.0,5139
13601,1.15,Ideal,G,VS2,59.2,56.0,6.88,6.83,0.0,5564
15951,1.14,Fair,G,VS1,57.5,67.0,0.0,0.0,0.0,6381
24394,2.18,Premium,H,SI2,59.4,61.0,8.49,8.45,0.0,12631


x < 0.01 或者 y < 0.01 或者 z < 0.01，这些数据不符合业务场景，且数据量比较少，故直接删除。

In [10]:
dataset = dataset.drop(xyz_zero_df.index)
xyz_zero_df = dataset.query('x < 0.01 or y < 0.01 or z < 0.01')
xyz_zero_df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price


## 3.2 处理类别型字段

### 3.2.1 统计类别型字段

In [11]:
# take care of categorical data
cols = dataset.columns
num_cols = dataset._get_numeric_data().columns
cat_cols = set(cols) - set(num_cols)
print('类别型字段是：' + str(cat_cols))

类别型字段是：{'color', 'clarity', 'cut'}


由数据集字段说明可知，cut、clarity、color这3个字段都是类别型字段。对他们做字符编码和独热编码。

### 3.2.2 字符编码和独热编码

In [12]:
dataset = pd.get_dummies(dataset, drop_first=True)
dataset.shape

(53920, 24)

数据集由10列变为24列。

In [13]:
dataset.head()

Unnamed: 0,carat,depth,table,x,y,z,price,cut_Good,cut_Ideal,cut_Premium,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,61.5,55.0,3.95,3.98,2.43,326,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1,0.21,59.8,61.0,3.89,3.84,2.31,326,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0.23,56.9,65.0,4.05,4.07,2.31,327,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.29,62.4,58.0,4.2,4.23,2.63,334,0,0,1,...,0,1,0,0,0,0,0,1,0,0
4,0.31,63.3,58.0,4.34,4.35,2.75,335,1,0,0,...,0,0,1,0,0,1,0,0,0,0


## 3.3 生成自变量和因变量

In [14]:
# 生成自变量和因变量
y = dataset['price'].values
dataset = dataset.drop(['price'], axis = 1)
X = dataset.values

In [15]:
X[:5,:]

array([[ 0.23, 61.5 , 55.  ,  3.95,  3.98,  2.43,  0.  ,  1.  ,  0.  ,
         0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
         1.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.21, 59.8 , 61.  ,  3.89,  3.84,  2.31,  0.  ,  0.  ,  1.  ,
         0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ,
         0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.23, 56.9 , 65.  ,  4.05,  4.07,  2.31,  1.  ,  0.  ,  0.  ,
         0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
         0.  ,  1.  ,  0.  ,  0.  ,  0.  ],
       [ 0.29, 62.4 , 58.  ,  4.2 ,  4.23,  2.63,  0.  ,  0.  ,  1.  ,
         0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  0.  ,
         0.  ,  0.  ,  1.  ,  0.  ,  0.  ],
       [ 0.31, 63.3 , 58.  ,  4.34,  4.35,  2.75,  1.  ,  0.  ,  0.  ,
         0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,
         1.  ,  0.  ,  0.  ,  0.  ,  0.  ]])

In [16]:
y

array([ 326,  326,  327, ..., 2757, 2757, 2757], dtype=int64)

## 3.4 拆分数据集

In [17]:
# 拆分数据集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## 3.5 特征缩放

In [18]:
# 特征缩放
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = np.ravel(sc_y.fit_transform(y_train.reshape(-1, 1)))
y_test = np.ravel(sc_y.transform(y_test.reshape(-1, 1)))

# 4. 构建支持向量机模型

通过k折交叉验证法，从3个模型中选择出了支持向量机模型。mse的平均数是 0.0240，标准差是 0.0015

In [19]:
# 构建支持向量机模型
from sklearn.svm import SVR
svr_regressor = SVR(kernel = 'rbf', C=1.0, verbose = 1)

# 5. 应用网格搜索优化模型

In [20]:
# 应用网格搜索算法找到最好的超参数
from sklearn.model_selection import GridSearchCV

parameters = [{'C': [0.5, 1, 1.5, 2], 'kernel': ['linear', 'rbf']}]

grid_search = GridSearchCV(estimator = svr_regressor,
                           param_grid = parameters,
                           scoring = 'r2',
                           cv = 5,
                           verbose=1)


在4核8GB机器上，大约训练70分钟。

In [22]:
# 开始执行
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[LibSVM]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 92.3min finished


[LibSVM]

GridSearchCV(cv=5, estimator=SVR(verbose=1),
             param_grid=[{'C': [0.5, 1, 1.5, 2], 'kernel': ['linear', 'rbf']}],
             scoring='r2', verbose=1)

In [23]:
best_score = grid_search.best_score_
best_score

0.9766113677511867

最好的参数使得R2达到了0.9766113677511867

In [24]:
best_parameters = grid_search.best_params_
best_parameters

{'C': 2, 'kernel': 'rbf'}

最好的参数组合是：{'C': 2, 'kernel': 'rbf'}

# 6. 比较模型优化前和优化后的性能

## 6.1 优化前

In [25]:
# 优化前
from sklearn.svm import SVR
before_svr_regressor = SVR(kernel = 'rbf', C=1.0, verbose = 1)
before_svr_regressor.fit(X_train, y_train)
before_y_pred = before_svr_regressor.predict(X_test)
from sklearn.metrics import r2_score
before_r2_score = r2_score(y_test, before_y_pred)
before_r2_score

[LibSVM]

0.9776441352140839

## 6.2 优化后

In [26]:
# 优化后
after_svr_regressor = SVR(kernel = 'rbf', C=2.0, verbose = 1)
after_svr_regressor.fit(X_train, y_train)
after_y_pred = after_svr_regressor.predict(X_test)
after_r2_score = r2_score(y_test, after_y_pred)
after_r2_score

[LibSVM]

0.9788539228622857

#### 结论
优化前R2是0.9776441352140839，优化后R2是0.9788539228622857。性能有所提升。