In [33]:
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import mnist
from imblearn.datasets import make_imbalance

from sklearn.metrics import precision_score, recall_score, roc_auc_score

In [2]:
def sigmoid(z):
  return 1 / (1+np.exp(-z)) 

In [3]:
def cost(theta, x, y):
  h = sigmoid(x @ theta)
  m = len(y)
  cost = 1 / m * np.sum( -y * np.log(h) - (1 - y) * np.log(1 - h))
  grad = 1 / m * ((y - h) @ x)
  return cost,grad

In [27]:
def fit(x, y, itr = 200, alpha = 0.25):
  x = np.insert(x, 0, 1, axis = 1)
  thetas=[]
  classes = np.unique(y)
  costs = np.zeros(itr)

  for c in classes:
    bin_y = np.where(y == c, 1, 0)
    theta = np.zeros(x.shape[1])

    for epoch in range(itr):
      costs[epoch], grad = cost(theta, x, bin_y)
      theta += alpha * grad

    thetas.append(theta)

  return thetas, classes, costs

In [41]:
def predict(classes, thetas, x):
  x = np.insert(x, 0, 1, axis = 1)
  preds = [np.argmax([sigmoid])]

In [51]:
def score (classes, theta, x, y):
  return (predict(classes, theta, x) == y).mean()

In [7]:
def other_class(n_classes, current_class):
    if current_class < 0 or current_class >= n_classes:
        error_str = "class_ind must be within the range (0, nb_classes - 1)"
        raise ValueError(error_str)

    other_class_list = list(range(n_classes))
    other_class_list.remove(current_class)
    other_class = np.random.choice(other_class_list)
    return other_class

def create_noise(y_tr, noise_ratio, asym):
  if noise_ratio > 0:
      dataset = 'mnist'
      noisy_y_tr = y_tr
      if asym:
          if dataset == 'mnist':
              # 1 < - 5, 2 -> 4, 3 -> 7, 5 <-> 6, 8 -> 9
              source_class = [5, 2, 3, 5, 6, 8]
              target_class = [1, 4, 7, 6, 5, 9]
          if dataset == 'mnist' :
              for s, t in zip(source_class, target_class):
                  cls_idx = np.where(y_tr == s)[0]
                  n_noisy = int(noise_ratio * cls_idx.shape[0] / 100)
                  noisy_sample_index = np.random.choice(cls_idx, n_noisy, replace=False)
                  noisy_y_tr[noisy_sample_index] = t
      else:
          n_samples = noisy_y_tr.shape[0]
          n_noisy = int(noise_ratio * n_samples / 100)
          class_index = [np.where(y_tr == i)[0] for i in range(10)]
          class_noisy = int(n_noisy / 10)

          noisy_idx = []
          for d in range(10):
              noisy_class_index = np.random.choice(class_index[d], class_noisy, replace=False)
              noisy_idx.extend(noisy_class_index)

          for i in noisy_idx:
              noisy_y_tr[i] = other_class(n_classes=10, current_class=y_tr[i])

      print("Print noisy label generation statistics:")
      count = 0
      for i in range(10):
              n_noisy = np.sum(noisy_y_tr == i)
              print("Noisy class %s, has %s samples." % (i, n_noisy))
              count += n_noisy
      print(count)
      return noisy_y_tr

In [8]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [9]:
print(x_train.shape)
print(x_test.shape)

(60000, 28, 28)
(10000, 28, 28)


In [10]:
x_train=x_train.reshape(x_train.shape[0],-1)

x_test=x_test.reshape(x_test.shape[0],-1)

print(x_train.shape)
print(x_test.shape)

(60000, 784)
(10000, 784)


In [11]:
imbx_tr, imby_tr = make_imbalance(x_train, y_train,
                              sampling_strategy={0: 2550, 1: 6675, 2: 3280, 3: 6100, 4: 3850, 5: 5400, 6: 3065, 7: 5860, 8: 5000, 9: 3654,},
                              random_state=42)

In [12]:
print(imbx_tr.shape)
print(imby_tr.shape)

(45434, 784)
(45434,)


In [13]:
sym_noisy_imbytr = create_noise(imby_tr, noise_ratio = 20, asym = False)

Print noisy label generation statistics:
Noisy class 0, has 2477 samples.
Noisy class 1, has 6714 samples.
Noisy class 2, has 3258 samples.
Noisy class 3, has 6074 samples.
Noisy class 4, has 3862 samples.
Noisy class 5, has 5396 samples.
Noisy class 6, has 3093 samples.
Noisy class 7, has 5835 samples.
Noisy class 8, has 5031 samples.
Noisy class 9, has 3694 samples.
45434


In [14]:
asym_noisy_imbytr = create_noise(imby_tr, noise_ratio = 20, asym = True)

Print noisy label generation statistics:
Noisy class 0, has 2477 samples.
Noisy class 1, has 7793 samples.
Noisy class 2, has 2607 samples.
Noisy class 3, has 4860 samples.
Noisy class 4, has 4513 samples.
Noisy class 5, has 4245 samples.
Noisy class 6, has 3165 samples.
Noisy class 7, has 7049 samples.
Noisy class 8, has 4025 samples.
Noisy class 9, has 4700 samples.
45434


In [15]:
sym_noisy_ytr = create_noise(y_train, noise_ratio = 20, asym = False)

Print noisy label generation statistics:
Noisy class 0, has 5958 samples.
Noisy class 1, has 6722 samples.
Noisy class 2, has 5967 samples.
Noisy class 3, has 6183 samples.
Noisy class 4, has 5851 samples.
Noisy class 5, has 5396 samples.
Noisy class 6, has 5912 samples.
Noisy class 7, has 6234 samples.
Noisy class 8, has 5835 samples.
Noisy class 9, has 5942 samples.
60000


In [16]:
asym_noisy_ytr = create_noise(y_train, noise_ratio = 20, asym = True)

Print noisy label generation statistics:
Noisy class 0, has 5958 samples.
Noisy class 1, has 7801 samples.
Noisy class 2, has 4774 samples.
Noisy class 3, has 4947 samples.
Noisy class 4, has 7044 samples.
Noisy class 5, has 4809 samples.
Noisy class 6, has 5420 samples.
Noisy class 7, has 7470 samples.
Noisy class 8, has 4668 samples.
Noisy class 9, has 7109 samples.
60000


In [17]:
x_train = x_train / 255.0
x_test = x_test / 255.0

imbx_tr = imbx_tr / 255.0

In [28]:
thetas, classes, costs = fit(x_train, y_train)

print(f"Train Accuracy: {score(classes, thetas, x_train, y_train):.3f}")
print(f"Test Accuracy: {score(classes, thetas, x_test, y_test):.3f}")

Train Accuracy: 0.623
Test Accuracy: 0.856


In [52]:
scores,pred = score(classes, thetas, x_test, y_test)
print("Test Accuracy: ",scores)

Test Accuracy:  0.8556


In [57]:
print('Precision: %.3f' % precision_score(y_test, pred, average = 'micro'))

Precision: 0.113


In [29]:
thetas_sym, classes_sym, costs_sym = fit(x_train, sym_noisy_ytr)

print(f"Train Accuracy: {score(classes_sym, thetas_sym, x_train, sym_noisy_ytr):.3f}")
print(f"Test Accuracy: {score(classes_sym, thetas_sym, x_test, y):.3f}")

Train Accuracy: 0.623
Test Accuracy: 0.856


In [30]:
thetas_asym, classes_asym, costs_asym = fit(x_train, asym_noisy_ytr)

print(f"Train Accuracy: {score(classes_asym, thetas_asym, x_train, asym_noisy_ytr):.3f}")
print(f"Test Accuracy: {score(classes_asym, thetas_asym, x_test, y_test):.3f}")

Train Accuracy: 0.623
Test Accuracy: 0.856


In [31]:
thetas_imb, classes_imb, costs_imb = fit(imbx_tr, imby_tr)

print(f"Train Accuracy: {score(classes_imb, thetas_imb, imbx_tr, imby_tr):.3f}")
print(f"Test Accuracy: {score(classes_imb, thetas_imb, x_test, y_test):.3f}")

Train Accuracy: 0.617
Test Accuracy: 0.844


In [32]:
thetas_imbsym, classes_imbsym, costs_imbsym = fit(imbx_tr, sym_noisy_imbytr)

print(f"Train Accuracy: {score(classes_imbsym, thetas_imbsym, imbx_tr, sym_noisy_imbytr):.3f}")
print(f"Test Accuracy: {score(classes_imbsym, thetas_imbsym, x_test, y_test):.3f}")

KeyboardInterrupt: ignored

In [None]:
thetas_imbasym, classes_imbasym, costs_imbasym = fit(imbx_tr, asym_noisy_imbytr)

print(f"Train Accuracy: {score(classes_imbasym, thetas_imbasym, imbx_tr, asym_noisy_imbytr):.3f}")
print(f"Test Accuracy: {score(classes_imbasym, thetas_imbasym, x_test, y_test):.3f}")

In [26]:
0.829 0.840 -> 50 0.1
0.850 0.858 -> 100 0.1
0.871 0.878 -> 100 0.25 original

0.147 0.114 -> 50 0.1
0.867 0.871 -> 100 0.25 imb 
0.843 0.846 -> 100 0.1 


SyntaxError: ignored