# 第三题：支持向量机的分类任务

实验内容：
1. 使用支持向量机完成spambase垃圾邮件分类任务
2. 使用训练集训练模型，计算测试集的精度，查准率，查全率，F1值

核函数 | C | 精度 | 查准率 | 查全率 | F1
- | - | - | - | - | -
rbf | 0.1 | 0.7 | 0.65 | 0.46 | 0.54
rbf | 1 | 0.72 | 0.67 | 0.49 | 0.57
linear | 0.1 | 0.81 | 0.94 | 0.53 | 0.68
linear | 1 | 0.8 | 0.91 | 0.54 | 0.68
sigmoid | 0.1 | 0.63 | 0.52 | 0.53 | 0.52
sigmoid | 1 | 0.63 | 0.52 | 0.54 | 0.53

In [1]:
# Load the spambase data set from its comma-separated file.
import numpy as np

data = np.loadtxt('data/spambase/spambase.data', delimiter=",")
# Columns 0-56 are used as features; column 57 is used as the target
# (presumably the spam / non-spam label -- confirm against the data set docs).
spamx, spamy = data[:, :57], data[:, 57]

In [2]:
# Split the data set: 30% is held out for testing, with a fixed seed so the
# split is reproducible.
from sklearn.model_selection import train_test_split

split = train_test_split(spamx, spamy, test_size=0.3, random_state=32)
trainX, testX, trainY, testY = split
# Echo the four array shapes as the cell result.
trainX.shape, trainY.shape, testX.shape, testY.shape

((3220, 57), (3220,), (1381, 57), (1381,))

**注意：计算线性核的时候，要使用 LinearSVC 这个类，不要使用SVC(kernel = 'linear')。LinearSVC不需要设置kernel参数！**

In [3]:
# 引入模型
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [5]:
# RBF-kernel SVM with C=0.1: fit on the training split, predict the test
# split, then report accuracy, precision, recall and F1 rounded to 2 decimals.
clf_r0 = SVC(kernel='rbf', C=0.1, random_state=32)
prediction = clf_r0.fit(trainX, trainY).predict(testX)
as_spam_r, ps_spam_r, rs_spam_r, f1_spam_r = (
    round(metric(testY, prediction), 2)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print('rbf：C=0.1')
print('精度：', as_spam_r, '查准率：', ps_spam_r, '查全率：', rs_spam_r, 'f1：', f1_spam_r)

rbf：C=0.1
精度： 0.7 查准率： 0.65 查全率： 0.46 f1： 0.54


In [6]:
# RBF-kernel SVM with C=1: fit on the training split, predict the test
# split, then report accuracy, precision, recall and F1 rounded to 2 decimals.
clf_r0 = SVC(kernel='rbf', C=1, random_state=32)
prediction = clf_r0.fit(trainX, trainY).predict(testX)
as_spam_r, ps_spam_r, rs_spam_r, f1_spam_r = (
    round(metric(testY, prediction), 2)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print('rbf：C=1')
print('精度：', as_spam_r, '查准率：', ps_spam_r, '查全率：', rs_spam_r, 'f1：', f1_spam_r)

rbf：C=1
精度： 0.72 查准率： 0.67 查全率： 0.49 f1： 0.57


In [7]:
# Fit a LinearSVC with C=0.1 on the training split.
clf = LinearSVC(C=0.1, random_state=32).fit(trainX, trainY)
# Trailing expression so the notebook still displays the fitted estimator.
clf



LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=32, tol=0.0001,
          verbose=0)

In [8]:
# Predict the test split with the LinearSVC fitted in the previous cell.
prediction = clf.predict(X=testX)

In [9]:
# Score the LinearSVC (C=0.1) predictions against the test targets:
# accuracy, precision, recall and F1, each rounded to 2 decimals.
as_spam_l, ps_spam_l, rs_spam_l, f1_spam_l = (
    round(score(testY, prediction), 2)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
print('线性：C=0.1')
print('精度：', as_spam_l, '查准率：', ps_spam_l, '查全率：', rs_spam_l, 'f1：', f1_spam_l)

线性：C=0.1
精度： 0.81 查准率： 0.94 查全率： 0.53 f1： 0.68


In [10]:
# LinearSVC with C=1: fit on the training split, predict the test split,
# then report accuracy, precision, recall and F1 rounded to 2 decimals.
clf = LinearSVC(C=1, random_state=32)
prediction = clf.fit(trainX, trainY).predict(testX)
as_spam_l, ps_spam_l, rs_spam_l, f1_spam_l = (
    round(score(testY, prediction), 2)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
print('线性：C=1')
print('精度：', as_spam_l, '查准率：', ps_spam_l, '查全率：', rs_spam_l, 'f1：', f1_spam_l)

线性：C=1
精度： 0.8 查准率： 0.91 查全率： 0.54 f1： 0.68




In [11]:
# Sigmoid-kernel SVM with C=0.1: fit on the training split, predict the test
# split, then report accuracy, precision, recall and F1 rounded to 2 decimals.
clf_s0 = SVC(kernel='sigmoid', C=0.1, random_state=32)
prediction = clf_s0.fit(trainX, trainY).predict(testX)
as_spam_s, ps_spam_s, rs_spam_s, f1_spam_s = (
    round(metric(testY, prediction), 2)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print('sigmoid：C=0.1')
print('精度：', as_spam_s, '查准率：', ps_spam_s, '查全率：', rs_spam_s, 'f1：', f1_spam_s)

sigmoid：C=0.1
精度： 0.63 查准率： 0.52 查全率： 0.53 f1： 0.52


In [12]:
# Sigmoid-kernel SVM with C=1: fit on the training split, predict the test
# split, then report accuracy, precision, recall and F1 rounded to 2 decimals.
clf_s1 = SVC(kernel='sigmoid', C=1, random_state=32)
prediction = clf_s1.fit(trainX, trainY).predict(testX)
as_spam_s, ps_spam_s, rs_spam_s, f1_spam_s = (
    round(metric(testY, prediction), 2)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print('sigmoid：C=1')
print('精度：', as_spam_s, '查准率：', ps_spam_s, '查全率：', rs_spam_s, 'f1：', f1_spam_s)

sigmoid：C=1
精度： 0.63 查准率： 0.52 查全率： 0.54 f1： 0.53


# 选做：比较LinearSVC和SVC(kernel = 'linear')的运行时间

In [None]:
# Optional task: compare the fit time of LinearSVC with SVC(kernel='linear')
# at the same C on the same training split. The original cell only fitted the
# SVC and never measured anything, so no comparison was possible.
import time

# Time a LinearSVC fit as the baseline; the fitted model is not kept.
fit_start = time.perf_counter()
LinearSVC(C=1, random_state=32).fit(trainX, trainY)
linear_svc_seconds = time.perf_counter() - fit_start

# Time the linear-kernel SVC fit. The fitted model stays in clf_r0 because
# the following evaluation cell predicts with it.
fit_start = time.perf_counter()
clf_r0 = SVC(kernel='linear', C=1, random_state=32)
clf_r0.fit(trainX, trainY)
svc_linear_seconds = time.perf_counter() - fit_start

print('LinearSVC fit time (s):', round(linear_svc_seconds, 2))
print("SVC(kernel='linear') fit time (s):", round(svc_linear_seconds, 2))

In [None]:
# Evaluate the linear-kernel SVC (C=1) fitted in the previous cell: predict
# the test split and report accuracy, precision, recall and F1 rounded to
# 2 decimals.
prediction = clf_r0.predict(testX)
as_spam_r, ps_spam_r, rs_spam_r, f1_spam_r = (
    round(metric(testY, prediction), 2)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print('linear：C=1')
print('精度：', as_spam_r, '查准率：', ps_spam_r, '查全率：', rs_spam_r, 'f1：', f1_spam_r)