# 模型侧代码demo

## 参考链接：

* [sklearn api](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.neighbors)
* [sklearn linear model](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model)
* [sklearn svm](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm)
* [sklearn decision tree](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree)
* [sklearn ensemble method](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble)
* [xgboost](https://xgboost.readthedocs.io/en/latest/#)
* [lightgbm](https://lightgbm.readthedocs.io/en/latest/index.html)
* [sklearn clustering](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster)

# 有监督
## 导入模块

In [21]:
from sklearn.datasets import load_iris  # 导入数据

from sklearn.model_selection import train_test_split  # 划分训练测试集

from sklearn.preprocessing import StandardScaler  # 正态化

# model
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV  # logistic regression
from sklearn.svm import SVC  # svm
from sklearn.tree import DecisionTreeClassifier  # 决策树
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  # 随机森林 gbdt

# metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, auc
from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix

# hyper_parameter optimizers
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# model validation
from sklearn.model_selection import cross_val_predict, cross_val_score  # cross_val_predict是将k折的预测合在一起，不具备泛化误差参考性

## 导入数据

In [5]:
# Load the iris dataset and pull the feature matrix / label vector off the
# returned Bunch object.
iris = load_iris()
X, y = iris.data, iris.target
X.shape

(150, 4)

## 划分训练、验证、测试集

In [18]:
# One seed shared by both splits so the partition is reproducible.
random_state = 42

# First hold out 20% of the data as the test set, stratified by label.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
    stratify=y,
)

# Then take 20% of the remaining data as the dev (validation) set,
# again stratified by label.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_full_train,
    y_full_train,
    test_size=0.2,
    random_state=random_state,
    stratify=y_full_train,
)

# Report the shape of every split.
for split_name, features, labels in (
    ('train', X_train, y_train),
    ('dev', X_dev, y_dev),
    ('test', X_test, y_test),
):
    print(f'X_{split_name} size {features.shape}, y_{split_name} size {labels.shape}')

X_train size (96, 4), y_train size (96,)
X_dev size (24, 4), y_dev size (24,)
X_test size (30, 4), y_test size (30,)


## 数据标准化

In [29]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to the train, dev and test splits.
scaler = StandardScaler().fit(X_train)

X_train_s, X_dev_s, X_test_s = (
    scaler.transform(split) for split in (X_train, X_dev, X_test)
)

## 构建模型

In [79]:
# Fit each classifier on the standardized training split. Every estimator
# takes the shared `random_state` seed so the whole notebook is reproducible
# from a single value.
lr_fit = LogisticRegression(random_state=random_state).fit(X_train_s, y_train)
# probability=True is needed because predict_proba is called on this model
# when scoring.
svm_fit = SVC(random_state=random_state, probability=True).fit(X_train_s, y_train)
dt_fit = DecisionTreeClassifier(random_state=random_state).fit(X_train_s, y_train)
# Previously hard-coded to random_state=0; now uses the shared seed like the
# other models.
gbdt_fit = GradientBoostingClassifier(random_state=random_state).fit(X_train_s, y_train)

## 各种分数计算

In [80]:
# Map a display name to each fitted classifier. The names are space-padded to
# four characters so the printed rows line up.
model_dict = dict(zip(
    ('lr  ', 'svm ', 'dt  ', 'gbdt'),
    (lr_fit, svm_fit, dt_fit, gbdt_fit),
))

In [81]:
# Print the confusion matrix of every fitted model on the dev split.
for model_name, model in model_dict.items():
    dev_pred = model.predict(X_dev_s)
    dev_matrix = confusion_matrix(y_dev, dev_pred)
    print(f'{model_name}:\n', dev_matrix)

lr  :
 [[8 0 0]
 [0 7 1]
 [0 0 8]]
svm :
 [[8 0 0]
 [0 7 1]
 [0 0 8]]
dt  :
 [[8 0 0]
 [0 7 1]
 [0 0 8]]
gbdt:
 [[8 0 0]
 [0 7 1]
 [0 0 8]]


In [99]:
# Print the dev-split metrics of every fitted model on one line each.
weighted = {'average': 'weighted'}  # class-weighted averaging for the multiclass metrics
for model_name, model in model_dict.items():
    dev_labels = model.predict(X_dev_s)       # predicted target labels
    dev_probs = model.predict_proba(X_dev_s)  # per-class probabilities, used for ROC AUC
    # Insertion order fixes the order of the printed metrics.
    scores = {
        'acc': accuracy_score(y_dev, dev_labels),
        'f1': f1_score(y_dev, dev_labels, **weighted),
        'precision': precision_score(y_dev, dev_labels, **weighted),
        'recall': recall_score(y_dev, dev_labels, **weighted),
        'roc_auc': roc_auc_score(y_dev, dev_probs, multi_class='ovr', **weighted),
    }
    summary = ', '.join(f'{metric}: {score:.5f}' for metric, score in scores.items())
    print(f'{model_name}, {summary}')

lr  , acc: 0.95833, f1: 0.95817, precision: 0.96296, recall: 0.95833, roc_auc: 1.00000
svm , acc: 0.95833, f1: 0.95817, precision: 0.96296, recall: 0.95833, roc_auc: 1.00000
dt  , acc: 0.95833, f1: 0.95817, precision: 0.96296, recall: 0.95833, roc_auc: 0.96875
gbdt, acc: 0.95833, f1: 0.95817, precision: 0.96296, recall: 0.95833, roc_auc: 1.00000
