
# 不平衡樣本測試及評估

https://imbalanced-learn.org/stable/references/generated/imblearn.combine.SMOTEENN.html

In [2]:
# !pip install -U imbalanced-learn
# 20250922 執行過了
%pip install -U imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
#from imblearn import pipeline as pl


In [4]:
# 生成不平衡樣本
RANDOM_STATE = 42
X, y = datasets.make_classification(n_classes=2, class_sep=2,
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=RANDOM_STATE) 
# n_classes=2 代表二分類;class_sep=2 代表類別分離度;weights=[0.1, 0.9] 代表類別不平衡比例; n_informative=10 代表有用特徵數量; n_redundant=1 代表冗餘特徵數量; 
# flip_y=0 代表標籤不會被隨機翻轉; n_features=20 代表總特徵數量; n_clusters_per_class=4 代表每個類別的群集數量; n_samples=5000 代表樣本數量; 
# random_state=RANDOM_STATE 代表隨機種子  
# X 為特徵矩陣, y 為標籤向量
# X.shape  # (5000, 20)
# y.shape  # (5000,)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE) # 切分訓練集與測試集,測試集佔30% 

In [5]:
import numpy as np
print(np.bincount(y)) # 類別0有500個樣本,類別1有4500個樣本

[ 500 4500]


In [6]:
from imblearn.combine import SMOTEENN # 先做過取樣再做欠取樣

sm = SMOTEENN()
newX, newY = sm.fit_resample(X, y) # 重新取樣
print(newY)
print(np.bincount(newY)) # 類別0有4500個樣本,類別1有4375個樣本

[0 0 0 ... 1 1 1]
[4500 4373]


In [7]:
from imblearn.over_sampling import SMOTE # 只做過取樣

sm2 = SMOTE() # 
newX2, newY2 = sm2.fit_resample(X, y) # 
print(newY2)
print(np.bincount(newY2))

[1 0 1 ... 0 0 0]
[4500 4500]


In [8]:
# 採用 SVM + 原始資料
#pipeline = pl.make_pipeline(LinearSVC(random_state=RANDOM_STATE))
svc = LinearSVC(random_state=RANDOM_STATE) # 

# 資料切割
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=RANDOM_STATE)

# 訓練
svc.fit(X_train, y_train) # 

# 評估
y_pred_bal = svc.predict(X_test) # 
print(svc.score(X_test, y_test)) # X_test 測試資料, y_test 預測資料

0.9488


In [9]:
# 採用 SVM-2 + SMOTE
svc2 = LinearSVC(random_state=RANDOM_STATE)

# 資料切割
X_train2, X_test2, y_train2, y_test2 = train_test_split(newX2, newY2,
                                                    random_state=RANDOM_STATE) # 上面SMOTE是newX2,newY2

# 訓練
svc2.fit(X_train2, y_train2)

# 評估
y_pred_bal2 = svc2.predict(X_test2)
print(svc2.score(X_test2, y_test2))

0.9


In [10]:
# 採用 SVM-3 + SMOTEENN
svc3 = LinearSVC(random_state=RANDOM_STATE)

# 資料切割
X_train3, X_test3, y_train3, y_test3 = train_test_split(newX, newY,
                                                    random_state=RANDOM_STATE) # 上面SMOTEENN是newX,newY

# 訓練
svc3.fit(X_train3, y_train3)

# 評估
y_pred_bal3 = svc3.predict(X_test3) # y_pred_bal3 預測資料;svc3.predict(X_test3) 預測測試資料
print(svc3.score(X_test3, y_test3))

0.9094186570527264


In [11]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_bal))

              precision    recall  f1-score   support

           0       0.82      0.61      0.70       123
           1       0.96      0.99      0.97      1127

    accuracy                           0.95      1250
   macro avg       0.89      0.80      0.84      1250
weighted avg       0.95      0.95      0.95      1250



In [12]:

print(classification_report(y_test2, y_pred_bal2))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90      1140
           1       0.90      0.90      0.90      1110

    accuracy                           0.90      2250
   macro avg       0.90      0.90      0.90      2250
weighted avg       0.90      0.90      0.90      2250



In [13]:
print(classification_report(y_test3, y_pred_bal3))

              precision    recall  f1-score   support

           0       0.91      0.92      0.91      1123
           1       0.91      0.90      0.91      1096

    accuracy                           0.91      2219
   macro avg       0.91      0.91      0.91      2219
weighted avg       0.91      0.91      0.91      2219



## imbalanced-learn分類報表

In [14]:
# 如發生錯誤：AttributeError: 'NoneType' object has no attribute 'split'
%pip install -U threadpoolctl
# Kernel restart

Note: you may need to restart the kernel to use updated packages.


In [15]:
from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn import pipeline as pl
from imblearn.metrics import classification_report_imbalanced

# SMOTE
X_res, y_res = SMOTE().fit_resample(X, y)

X_res.shape, y_res.shape

((9000, 20), (9000,))

In [16]:
# 資料切割
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=RANDOM_STATE)

# 訓練
pipeline.fit(X_train, y_train) 

# 評估
y_pred_bal = pipeline.predict(X_test)

# Show the classification report
print(classification_report_imbalanced(y_test, y_pred_bal))

NameError: name 'pipeline' is not defined

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_bal))