### confident learningでノイズと判定されたサンプルを疑似ラベリングしてから訓練する

In [1]:
from multiprocessing import cpu_count
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import rcv1
from sklearn.metrics import classification_report

from cleanlab.noise_generation import generate_noise_matrix, generate_noise_matrix_from_trace, generate_noisy_labels
from cleanlab.classification import LearningWithNoisyLabels

# visualize
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%config InlineBackend.figure_formats = {'png', 'retina'}

seed=42

### dataの読み込み と ノイズ付与

In [2]:
# Fetch the RCV1 dataset; the following cell reads `data.data`,
# `data.target` and `data.target_names` from the returned object.
# NOTE(review): newer scikit-learn releases appear to expose this only as
# `sklearn.datasets.fetch_rcv1` -- confirm against the pinned version.
data = rcv1.fetch_rcv1()

In [3]:
# A sample can carry several target columns, so reduce the task to a
# single-label problem over the columns whose name ends with 'CAT'.
mask_col = np.array([name.endswith('CAT') for name in data.target_names])
target_names = data.target_names[mask_col]
# Author's reading of the codes: C -> corporate/industrial, E -> economics,
# G -> government, M -> unclear (possibly "market").
print('target names', target_names)

# Keep only the rows that have exactly one of the selected categories set;
# rows with several (or none) of them are dropped.
cat_hits_per_row = data.target[:, mask_col].toarray().sum(axis=1)
mask_row = cat_hits_per_row == 1

X = data.data[mask_row]
y = data.target[mask_row][:, mask_col]

# Number of samples per given label (raw counts, one entry per category).
py = y.toarray().sum(axis=0).ravel()
print('samples', X.shape[0], 'category value counts', py)

# Convert the one-hot rows into integer class indices.
y = np.array(y.argmax(axis=1)).ravel()

target names ['CCAT' 'ECAT' 'GCAT' 'MCAT']
samples 685071 category value counts [299612  54695 163135 167629]


In [4]:
# Hold out 20% of the samples as a test set; the split is shuffled and
# made reproducible through the shared seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
    shuffle=True,
)

In [10]:
# Build the noise matrix p(given=i | true=j) for the 4 categories.
# NOTE(review): `py` holds raw per-class counts computed on the full dataset;
# cleanlab documents `py` as class prior fractions -- confirm whether it
# should be normalised before being passed here.
noise_matrix = generate_noise_matrix_from_trace(
    4,
    3,
    min_trace_prob=0.6,
    frac_zero_noise_rates=0.5,
    py=py,
    seed=seed,
)
print('p(given=i|true=j) =')
print(noise_matrix)

# Re-seed NumPy's global RNG right before drawing the noisy labels.
np.random.seed(seed)
y_train_corrupted = generate_noisy_labels(y_train, noise_matrix)

# Boolean mask of the training labels that were actually flipped.
y_train_is_error = (y_train_corrupted != y_train)
n = y_train_is_error.sum()
# The percentage is truncated to an integer by int(), not rounded.
print('y_train_copputed contains', n, 'errors. error rate is', int(n / len(y_train) * 100), '%')

p(given=i|true=j) =
[[0.68936167 0.         0.         0.        ]
 [0.2387445  0.85410683 0.21184431 0.05112328]
 [0.         0.14589317 0.78815569 0.28050091]
 [0.07189383 0.         0.         0.66837581]]
y_train_copputed contains 152985 errors. error rate is 27 %


### corrupted labelとされたものに疑似ラベリング

In [11]:
%%time
# corrupted labelの推定
model=LogisticRegression(multi_class='auto',
                       verbose=2,
                       random_state=seed)

clf=LearningWithNoisyLabels(clf=model,
                            seed=seed,
                            n_jobs=cpu_count())
                            
clf.fit(X_train,y_train_corrupted)
print('test accuracy when train without corrupted label:',clf.score(X_test, y_test)) #悪くないけどノイズの影響受けてる



[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]test accuracy when train without corrupted label: 0.9611137466700727
CPU times: user 1h 58min 21s, sys: 17.8 s, total: 1h 58min 39s
Wall time: 16min 39s


In [12]:
# Pseudo-labeling: start from the corrupted labels and overwrite only the
# entries selected by `clf.noise_mask` with the fitted wrapper's predictions.
y_train_pseudo = y_train_corrupted.copy()
X_with_noise = X_train[clf.noise_mask]
y_train_pseudo[clf.noise_mask] = clf.predict(X_with_noise)

### 疑似ラベリングでの性能

In [13]:
# Retrain the plain logistic regression on the pseudo-labelled training set.
# NOTE(review): `model` is the same estimator object that was handed to the
# cleanlab wrapper `clf`, so refitting it here presumably also changes what
# `clf.predict` / `clf.score` return afterwards -- re-run the cells above
# before reusing `clf`.
model.fit(X_train, y_train_pseudo)

# Score the estimator that was just fitted. The original called `clf.score`,
# which only reported this model's accuracy through that shared reference.
print('test accuracy when train with pseudo label:', model.score(X_test, y_test))

[LibLinear]test accuracy when train with pseudo label: 0.9628288873481006


テスト精度はわずかに向上した（0.9611 → 0.9629）。ただし交差検証も統計的検定も行っていないため、この差が本当に意味のある改善かどうかは分からない。