In [1]:
import lightgbm as lgb
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import random
from cleanlab.classification import CleanLearning

In [2]:
# データを用意する
iris = datasets.load_iris()                   # scikit-learnのdatasetsを読み込む
X = pd.DataFrame(iris.data[:, [0, 1, 2, 3]])  # 訓練データ
Y = pd.Series(iris.target)                    # 教師データ

In [3]:
# データをホールドアウト法で分割
train_X, test_X, train_Y, test_Y = train_test_split(X, Y,                # 訓練データとテストデータに分割する
                                                    test_size=0.3,       # テストデータの割合
                                                    shuffle=True,        # シャッフルする
                                                    random_state=0)      # 乱数シードを固定する

In [4]:
train_Y = train_Y.values
display(train_Y)
y_values = np.unique(train_Y)
print(y_values)
y_len = train_Y.shape[0]
noise_size = int(y_len/4)
print(noise_size)
noise_index = random.sample(range(y_len), noise_size)
display(noise_index)

array([1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1,
       2, 0, 0, 2, 1, 0, 0, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0, 0,
       2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 2, 1, 2, 1, 0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2, 1,
       1, 0, 1, 2, 2, 0, 1, 1, 1, 1, 0, 0, 0, 2, 1, 2, 0])

[0 1 2]
26


[83,
 47,
 58,
 71,
 32,
 65,
 50,
 2,
 23,
 21,
 48,
 51,
 27,
 17,
 82,
 46,
 16,
 100,
 20,
 43,
 6,
 3,
 89,
 41,
 73,
 34]

In [5]:
train_Y_org = np.copy(train_Y)
for i in noise_index:
    org_y = train_Y[i]
    exclude_values = list(set(y_values) - set([train_Y[i]]))
    train_Y[i] = random.sample(exclude_values, 1)[0]
    tr_y = train_Y[i]
    print(f'{i}: {org_y} -> {tr_y}')

83: 1 -> 2
47: 2 -> 1
58: 2 -> 0
71: 2 -> 1
32: 1 -> 2
65: 0 -> 2
50: 2 -> 0
2: 2 -> 0
23: 0 -> 2
21: 1 -> 2
48: 0 -> 2
51: 0 -> 1
27: 0 -> 2
17: 2 -> 1
82: 1 -> 2
46: 0 -> 1
16: 0 -> 1
100: 0 -> 2
20: 1 -> 2
43: 0 -> 2
6: 2 -> 1
3: 2 -> 0
89: 0 -> 1
41: 2 -> 0
73: 0 -> 2
34: 1 -> 2


In [6]:
accuracy = accuracy_score(train_Y, train_Y_org)
print(f'accuracy: {accuracy}')
cm = confusion_matrix(train_Y_org, train_Y)
print(cm)

accuracy: 0.7523809523809524
[[23  4  7]
 [ 0 26  6]
 [ 5  4 30]]


In [7]:
# データセットを登録
lgb_train = lgb.Dataset(train_X, train_Y)
lgb_test = lgb.Dataset(test_X, test_Y, reference=lgb_train)

In [8]:
# LightGBMのハイパーパラメータを設定
params = {'task': 'train',              # タスクを訓練に設定
            'boosting_type': 'gbdt',      # GBDTを指定
            'objective': 'multiclass',    # 多クラス分類を指定
            'metric': 'multi_logloss',  # 多クラス分類の損失（誤差）
            'num_class': 3,               # クラスの数（irisデータセットが3個のクラスなので）
            'learning_rate': 0.1,         # 学習率
            'num_leaves': 21,             # ノードの数
            'min_data_in_leaf': 3,        # 決定木ノードの最小データ数
            'num_iteration': 100}         # 予測器(決定木)の数:イタレーション

In [9]:
lgb_results = {}                                    # 学習の履歴を入れる入物
model = lgb.train(params=params,                    # ハイパーパラメータをセット
                    train_set=lgb_train,              # 訓練データを訓練用にセット
                    valid_sets=[lgb_train, lgb_test], # 訓練データとテストデータをセット
                    valid_names=['Train', 'Test'],    # データセットの名前をそれぞれ設定
                    num_boost_round=100,              # 計算回数
                    early_stopping_rounds=10,         # アーリーストッピング設定
                    evals_result=lgb_results)         # 履歴を保存する

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85
[LightGBM] [Info] Number of data points in the train set: 105, number of used features: 4
[LightGBM] [Info] Start training from score -1.321756
[LightGBM] [Info] Start training from score -1.127600
[LightGBM] [Info] Start training from score -0.892760
[1]	Train's multi_logloss: 0.963892	Test's multi_logloss: 1.02345
Training until validation scores don't improve for 10 rounds
[2]	Train's multi_logloss: 0.871391	Test's multi_logloss: 0.932534
[3]	Train's multi_logloss: 0.79037	Test's multi_logloss: 0.861272
[4]	Train's multi_logloss: 0.723788	Test's multi_logloss: 0.807579
[5]	Train's multi_logloss: 0.667311	Test's multi_logloss: 0.767386
[6]	Train's multi_logloss: 0.614636	Test's multi_logloss: 0.736436
[7]	Train's multi_logloss: 0.56836	Test's multi_logloss: 0.707959
[8]	Train's multi_logloss: 0.526985	Test's multi_logloss: 0.684076
[9]	Train's multi_logloss: 0.490722	Test's multi_logloss: 0.6688



In [10]:
loss_train = lgb_results['Train']['multi_logloss']  # 訓練誤差
loss_test = lgb_results['Test']['multi_logloss']    # 汎化誤差
best_iteration = model.best_iteration               # 最良の予測器が得られたイタレーション数
print(best_iteration)

14


In [11]:
display(test_X.shape)
y_pred = model.predict(test_X, num_iteration=model.best_iteration)
y_pred = np.argmax(y_pred, axis=1)
display(y_pred.shape)
display(test_Y.shape)
accuracy = accuracy_score(y_pred, test_Y)
print(f'accuracy: {accuracy}')
cm = confusion_matrix(test_Y, y_pred)
print(cm)

(45, 4)

(45,)

(45,)

accuracy: 0.7333333333333333
[[12  1  3]
 [ 0 11  7]
 [ 1  0 10]]


In [12]:
lgb_results = {}                                    # 学習の履歴を入れる入物
# model = lgb.train(params=params,                    # ハイパーパラメータをセット
#                     train_set=lgb_train,              # 訓練データを訓練用にセット
#                     valid_sets=[lgb_train, lgb_test], # 訓練データとテストデータをセット
#                     valid_names=['Train', 'Test'],    # データセットの名前をそれぞれ設定
#                     num_boost_round=100,              # 計算回数
#                     early_stopping_rounds=10,         # アーリーストッピング設定
#                     evals_result=lgb_results)         # 履歴を保存する
clf = lgb.LGBMClassifier(**params)
cl = CleanLearning(clf=clf, verbose=True)
cl.fit(train_X, train_Y)



Computing out of sample predicted probabilities via 5-fold cross validation. May take a while ...








Using predicted probabilities to identify label issues ...
Identified 54 examples with label issues.
Pruning 54 examples with label issues ...
Remaining clean data has 51 examples.
Assigning sample weights for final training based on estimated label quality.
Fitting final model on the clean data ...
Label issues stored in label_issues_df DataFrame accessible via: self.get_label_issues(). Call self.save_space() to delete this potentially large DataFrame attribute.




In [13]:
print(train_Y.shape)
df_issues = cl.get_label_issues()
df_issues['change_flg'] = 0
df_issues.loc[noise_index, 'change_flg'] = 1
df_issues['train_Y'] = train_Y
df_issues['train_Y_org'] = train_Y_org
display(sorted(noise_index))
display(df_issues[(df_issues['is_label_issue'] == True) | (df_issues['change_flg'] == 1)])

(105,)


[2,
 3,
 6,
 16,
 17,
 20,
 21,
 23,
 27,
 32,
 34,
 41,
 43,
 46,
 47,
 48,
 50,
 51,
 58,
 65,
 71,
 73,
 82,
 83,
 89,
 100]

Unnamed: 0,is_label_issue,label_quality,given_label,predicted_label,sample_weight,change_flg,train_Y,train_Y_org
2,True,4e-06,0,2,0.0,1,0,2
3,True,2.5e-05,0,1,0.0,1,0,2
6,True,3.2e-05,1,2,0.0,1,1,2
7,True,0.281418,1,2,0.0,0,1,1
8,True,0.003796,1,2,0.0,0,1,1
10,True,0.031146,2,1,0.0,0,2,2
12,True,0.380185,2,1,0.0,0,2,2
15,True,0.068687,1,2,0.0,0,1,1
16,True,0.005122,1,0,0.0,1,1,0
17,True,1e-06,1,2,0.0,1,1,2


In [14]:
display(test_X.shape)
y_pred = cl.predict(test_X, num_iteration=model.best_iteration)
display(y_pred.shape)
display(test_Y.shape)
accuracy = accuracy_score(y_pred, test_Y)
print(f'accuracy: {accuracy}')
cm = confusion_matrix(test_Y, y_pred)
print(cm)

(45, 4)

(45,)

(45,)

accuracy: 0.8666666666666667
[[16  0  0]
 [ 0 13  5]
 [ 0  1 10]]


In [15]:
display(train_X.shape)
y_pred = cl.predict(train_X, num_iteration=model.best_iteration)
display(y_pred.shape)
display(train_Y.shape)
accuracy = accuracy_score(y_pred, train_Y)
print(f'accuracy: {accuracy}')
cm = confusion_matrix(train_Y, y_pred)
print(cm)

accuracy = accuracy_score(y_pred, train_Y_org)
print(f'accuracy: {accuracy}')
cm = confusion_matrix(train_Y_org, y_pred)
print(cm)

(105, 4)

(105,)

(105,)

accuracy: 0.7333333333333333
[[23  0  5]
 [ 4 23  7]
 [ 7  5 31]]
accuracy: 0.9619047619047619
[[34  0  0]
 [ 0 28  4]
 [ 0  0 39]]
