In [6]:
# 分类。单分类（是、否）。

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

cancer = load_breast_cancer()

cancer_data = cancer['data']
cancer_target = cancer['target']
cancer_names = cancer['feature_names']

# 切割数据。把原始数据切割成训练集80%，测试集20%
cancer_data_train, cancer_data_test, cancer_target_train, cancer_target_test = \
train_test_split(cancer_data, cancer_target, test_size=0.2, random_state = 22)

# 数据标准化
stdScaler = StandardScaler().fit(cancer_data_train) #使用训练集生成规则
cancer_trainStd = stdScaler.transform(cancer_data_train) #应用规则到训练集
cancer_testStd = stdScaler.transform(cancer_data_test) #应用规则到测试集

#建立SVM模型:监督学习。需要原始数据（标准化），和真实的标签值
svm = SVC().fit(cancer_trainStd, cancer_target_train)
print('SVM模型为', svm)


SVM模型为 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [7]:
# 预测训练集结果
cancer_target_pred = svm.predict(cancer_trainStd)
print(cancer_target_pred)

[0 0 1 1 0 1 1 0 0 1 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1
 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 1 1 1 0 0 0 0
 1 1 0 1 0 0 1 0 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 0 0 1 1 0 0 1 0 1 1 0 0 1 0 0 1 1 1 0 1 1
 1 0 0 0 1 0 0 1 0 1 0 1 0 1 1 0 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 0 1 1
 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 0 0 0
 0 0 1 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 0 1
 1 1 1 1 0 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 1 1 0 1 1 1 1 0 1 1 1 1
 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 0 1 0 0 0 1
 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 0 1 1 1 1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 1 1 0
 1 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 0 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1
 1 1 1 1 1 0 1 1 1 1 0]


In [8]:
# 训练集的真实结果
print(cancer_target_train)

[0 0 0 1 0 1 1 0 0 1 1 1 1 1 0 0 1 0 0 1 1 0 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1
 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 1 1 1 0 0 0 0
 1 1 0 1 0 0 1 0 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 0 0 1 1 0 0 1 0 1 1 0 0 1 0 0 1 1 1 0 1 1
 1 0 0 0 1 0 0 1 0 1 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 0 1 1
 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 0 1 0 1 1 0 1 1 0 1 0 1 1 1 1 0 0 0
 0 0 1 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 0 1
 1 1 1 1 0 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 1 1 0 1 1 1 1 0 1 1 1 1
 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 0 1 0 0 0 1
 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 0 1 1 1 1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 1 1 0
 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 0 0 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1
 1 1 1 1 1 0 1 1 1 1 0]


In [11]:
true = (cancer_target_pred==cancer_target_train).astype('int')
accuracy = np.sum(true)/len(true)
print(accuracy)

0.989010989010989


In [15]:
# 评价分类模型的指标
#   ++++    ----
#   +++-  | ---+
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
f1_score, cohen_kappa_score

print(accuracy_score(cancer_target_train, cancer_target_pred)) # 准确率
print(precision_score(cancer_target_train, cancer_target_pred)) # 精准率：判断是是的样本中，多少是正确的
print(recall_score(cancer_target_train, cancer_target_pred)) # 召回率，得病的多少被判断正确
print(f1_score(cancer_target_train, cancer_target_pred))
print(cohen_kappa_score(cancer_target_train, cancer_target_pred))

# 应该评估测试集

0.989010989010989
0.986159169550173
0.9965034965034965


In [16]:
# 单分类 0或1    ；  多分类，多个0或1
# 预测鲍鱼的年龄
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report # 多分类的结果评估
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler




In [20]:
abalone = pd.read_csv('./data/abalone.data', sep=',')
print(abalone)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        32
           6       0.00      0.00      0.00        48
           7       0.00      0.00      0.00        84
           8       0.00      0.00      0.00        99
           9       0.29      0.22      0.25       142
          10       0.15      0.76      0.25       139
          11       0.00      0.00      0.00        93
          12       0.00      0.00      0.00        51
          13       0.00      0.00      0.00        31
          14       0.00      0.00      0.00        26
          15       0.00      0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# 把原始数据拆分成数据和标签
abalone_data = abalone.iloc[:,:8]
abalone_target = abalone.iloc[:,8]

#哑变量处理
sex = pd.get_dummies(abalone_data['sex'])
#print(sex)
abalone_data = pd.concat([abalone_data, sex], axis = 1)
abalone_data.drop('sex', axis=1, inplace=True)

#print(abalone_data)

#划分训练集、测试集
abalone_data_train, abalone_data_test, abalone_target_train, abalone_target_test = \
train_test_split(abalone_data, abalone_target, train_size=0.8, random_state=42)



In [None]:
# 预处理，标准化
stdScaler = StandardScaler().fit(abalone_data_train) # 生成规则

abalone_std_train = stdScaler.transform(abalone_data_train) # 应用规则到训练集
abalone_std_test = stdScaler.transform(abalone_data_test) # 应用规则到测试集

# 建模
svm_abalone = SVC().fit(abalone_data_train, abalone_target_train)
print(svm_abalone)

In [None]:
abalone_test_pred = svm_abalone.predict(abalone_std_test)
# print(abalone_test_pred)
print(classification_report(abalone_target_test, abalone_test_pred))

In [None]:
# 随机森林、决策树、