# 1. 导入包

In [1]:
# -*- coding: utf-8 -*-
"""
Python 3.7.7
sklearn 0.23.1
预测黑色星期五的花费
"""

# 导入包
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.svm import SVR

# 2. 导入数据集

In [2]:
# Load the dataset; the path is relative to the current working directory.
csv_path = 'BlackFriday.csv'
data = pd.read_csv(csv_path)

# 3. 数据预处理

## 3.1 检测缺失值

In [3]:
# Count the missing values in every column. The bare name on the last line
# makes the notebook display the resulting Series.
null_df = data.isna().sum()
null_df

User_ID                           0
Product_ID                        0
Gender                            0
Age                               0
Occupation                        0
City_Category                     0
Stay_In_Current_City_Years        0
Marital_Status                    0
Product_Category_1                0
Product_Category_2            15721
Product_Category_3            34817
Purchase                          0
dtype: int64

In [4]:
# Drop the two columns that contain missing values.
data = data.drop(columns=['Product_Category_2', 'Product_Category_3'])

In [5]:
# Count missing values per column again, now that the incomplete columns
# have been removed, and display the result.
null_df = data.isna().sum()
null_df

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Purchase                      0
dtype: int64

## 3.2 删除无用的列

In [6]:
# Drop the identifier columns, which are not used as model features.
data = data.drop(columns=['User_ID', 'Product_ID'])

## 3.3 检查类别型变量

In [7]:
# Inspect each column's dtype to see which ones are categorical (object).
column_dtypes = data.dtypes
print(column_dtypes)

Gender                        object
Age                           object
Occupation                     int64
City_Category                 object
Stay_In_Current_City_Years    object
Marital_Status                 int64
Product_Category_1             int64
Purchase                       int64
dtype: object


In [8]:
# Convert variable types.
#
# 'Stay_In_Current_City_Years' holds strings, with '4+' as the top bucket.
# Map '4+' to '4' so the column stays homogeneous, then cast it to int64.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected Series: that chained in-place
# call emits a FutureWarning on pandas 2.x and leaves `data` unchanged once
# Copy-on-Write is enabled.
data['Stay_In_Current_City_Years'] = (
    data['Stay_In_Current_City_Years'].replace('4+', '4').astype('int64')
)

# These integer-coded columns are categories, not quantities. Casting them to
# object marks them as categorical for the encoding step that follows.
data = data.astype({
    'Product_Category_1': 'object',
    'Occupation': 'object',
    'Marital_Status': 'object',
})

In [9]:
# Inspect the column dtypes again to verify the conversions above.
column_dtypes = data.dtypes
print(column_dtypes)

Gender                        object
Age                           object
Occupation                    object
City_Category                 object
Stay_In_Current_City_Years     int64
Marital_Status                object
Product_Category_1            object
Purchase                       int64
dtype: object


## 3.4 标签编码&独热编码

In [10]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded column so the dummy columns are not redundant.
data = pd.get_dummies(data=data, drop_first=True)

## 3.5 得到自变量和因变量

In [11]:
# Separate the dependent variable (y) from the independent variables (x),
# both as NumPy arrays.
target_col = 'Purchase'
y = data[target_col].values
data = data.drop(columns=[target_col])
x = data.values

## 3.6 拆分训练集和测试集

In [12]:
# Split into training and test sets: 30% of the rows are held out for
# testing, and random_state=1 makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.3, random_state=1)
x_train, x_test, y_train, y_test = split_parts

## 3.7 特征缩放

In [13]:
# Feature scaling.
# The feature scaler is fitted on the training split only and then applied
# unchanged to the test split.
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

# The target gets its own scaler. It is reshaped to a single column for the
# scaler and flattened back to 1-D afterwards. y_test is not scaled here.
sc_y = StandardScaler()
y_train_column = y_train.reshape(-1, 1)
y_train = sc_y.fit_transform(y_train_column).ravel()

# 4. 使用不同的参数构建支持向量机模型

## 4.1 模型1：构建支持向量机模型（kernel=rbf）

In [14]:
# Build support vector regression models with different hyperparameters.
# Model 1: SVR with an RBF kernel.
svr_params = dict(kernel='rbf', gamma='scale', C=1.0, epsilon=0.1, verbose=True)
regressor = SVR(**svr_params)
regressor.fit(x_train, y_train)  # takes roughly 2 minutes to run

[LibSVM]

SVR(verbose=True)

In [15]:
# Predict on the test set.
y_pred = regressor.predict(x_test)
# Map the predictions back to the scale the target had before feature scaling.
# predict() returns a 1-D array, while recent scikit-learn releases require
# 2-D input for inverse_transform and raise ValueError otherwise. Reshape to
# one column and flatten the result so y_pred stays 1-D.
y_pred = sc_y.inverse_transform(y_pred.reshape(-1, 1)).ravel()

In [16]:
# Evaluate model performance with the R^2 score on the test set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R2 Score:", r2)  # 0.612409921134421

R2 Score: 0.612409921134421


## 4.2 模型2：构建支持向量机模型（kernel=poly, degree=2）

In [17]:
# Model 2: SVR with a polynomial kernel of degree 2.
svr_params = dict(kernel='poly', degree=2, gamma='scale', C=1.0, epsilon=0.1, verbose=True)
regressor = SVR(**svr_params)
regressor.fit(x_train, y_train)  # takes roughly 2 minutes to run

[LibSVM]

SVR(degree=2, kernel='poly', verbose=True)

In [18]:
# Predict on the test set.
y_pred = regressor.predict(x_test)
# Map the predictions back to the scale the target had before feature scaling.
# predict() returns a 1-D array, while recent scikit-learn releases require
# 2-D input for inverse_transform and raise ValueError otherwise. Reshape to
# one column and flatten the result so y_pred stays 1-D.
y_pred = sc_y.inverse_transform(y_pred.reshape(-1, 1)).ravel()

In [19]:
# Evaluate model performance with the R^2 score on the test set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R2 Score:", r2)  # 0.6156456438643175

R2 Score: 0.6156456438643175


## 4.3 模型3：构建支持向量机模型（kernel=poly, degree=3）

In [20]:
# Model 3: SVR with a polynomial kernel of degree 3.
svr_params = dict(kernel='poly', degree=3, gamma='scale', C=1.0, epsilon=0.1, verbose=True)
regressor = SVR(**svr_params)
regressor.fit(x_train, y_train)  # takes roughly 2 minutes to run

[LibSVM]

SVR(kernel='poly', verbose=True)

In [21]:
# Predict on the test set.
y_pred = regressor.predict(x_test)
# Map the predictions back to the scale the target had before feature scaling.
# predict() returns a 1-D array, while recent scikit-learn releases require
# 2-D input for inverse_transform and raise ValueError otherwise. Reshape to
# one column and flatten the result so y_pred stays 1-D.
y_pred = sc_y.inverse_transform(y_pred.reshape(-1, 1)).ravel()

In [22]:
# Evaluate model performance with the R^2 score on the test set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R2 Score:", r2)  # 0.5931393882595677

R2 Score: 0.5931393882595677


#### 结论：
由上面3个模型在测试集上的R2得分（rbf核：0.612；二次多项式核：0.616；三次多项式核：0.593）可见，核函数类型及多项式阶数等超参数的选择会影响模型性能；在本例中二次多项式核的得分最高，三次多项式核的得分最低。