In [21]:
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import mnist
from imblearn.datasets import make_imbalance
from tensorflow.keras.utils import to_categorical

from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score

In [22]:
def countlabels(y_train):
  count = 0
  for i in range(10):
          n_noisy = np.sum(y_train == i)
          print("Class %s, has %s samples." % (i, n_noisy))
          count += n_noisy
  print("Total count ",count)

In [23]:
def sigmoid(z):
  return 1 / (1+np.exp(-z)) 

In [24]:
def cost(theta, x, y):
  h = sigmoid(x @ theta)
  m = len(y)
  cost = 1 / m * np.sum( -y * np.log(h) - (1 - y) * np.log(1 - h))
  grad = 1 / m * ((y - h) @ x)
  return cost,grad

In [25]:
def fit(x, y, itr = 100, alpha = 0.25):
  x = np.insert(x, 0, 1, axis = 1)
  thetas=[]
  classes = np.unique(y)
  costs = np.zeros(itr)

  for c in classes:
    bin_y = np.where(y == c, 1, 0)
    theta = np.zeros(x.shape[1])

    for epoch in range(itr):
      costs[epoch], grad = cost(theta, x, bin_y)
      theta += alpha * grad

    thetas.append(theta)

  return thetas, classes, costs

In [26]:
def predict(classes, thetas, x, y):
  x = np.insert(x, 0, 1, axis = 1)
  preds = [np.argmax([sigmoid (xi @ theta) for theta in thetas]) for xi in x]
  y_pred = [classes[p] for p in preds]
  return y_pred
  #'''(y_pred == y).mean(),'''

In [27]:
def score (classes, theta, x, y):
  return (predict(classes, theta, x, y) == y).mean()

In [28]:
def other_class(n_classes, current_class):
    if current_class < 0 or current_class >= n_classes:
        error_str = "class_ind must be within the range (0, nb_classes - 1)"
        raise ValueError(error_str)

    other_class_list = list(range(n_classes))
    other_class_list.remove(current_class)
    other_class = np.random.choice(other_class_list)
    return other_class

def create_noise(y_tr, noise_ratio, asym):
  if noise_ratio > 0:
      dataset = 'mnist'
      noisy_y_tr = np.array(y_tr, copy = True)
      if asym:
          if dataset == 'mnist':
              # 1 < - 5, 2 -> 4, 3 -> 7, 5 <-> 6, 8 -> 9
              source_class = [5, 2, 3, 5, 6, 8]
              target_class = [1, 4, 7, 6, 5, 9]
          if dataset == 'mnist' :
              for s, t in zip(source_class, target_class):
                  cls_idx = np.where(y_tr == s)[0]
                  n_noisy = int(noise_ratio * cls_idx.shape[0] / 100)
                  noisy_sample_index = np.random.choice(cls_idx, n_noisy, replace=False)
                  noisy_y_tr[noisy_sample_index] = t
      else:
          n_samples = noisy_y_tr.shape[0]
          n_noisy = int(noise_ratio * n_samples / 100)
          class_index = [np.where(y_tr == i)[0] for i in range(10)]
          class_noisy = int(n_noisy / 10)

          noisy_idx = []
          for d in range(10):
              noisy_class_index = np.random.choice(class_index[d], class_noisy, replace=False)
              noisy_idx.extend(noisy_class_index)

          for i in noisy_idx:
              noisy_y_tr[i] = other_class(n_classes=10, current_class=y_tr[i])

      print("Print noisy label generation statistics:")
      count = 0
      for i in range(10):
              n_noisy = np.sum(noisy_y_tr == i)
              print("Noisy class %s, has %s samples." % (i, n_noisy))
              count += n_noisy
      print(count)
      
      return noisy_y_tr

In [29]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [30]:
print(x_train.shape)
print(x_test.shape)

(60000, 28, 28)
(10000, 28, 28)


In [31]:
x_train=x_train.reshape(x_train.shape[0],-1)

x_test=x_test.reshape(x_test.shape[0],-1)

print(x_train.shape)
print(x_test.shape)

(60000, 784)
(10000, 784)


In [32]:
imbx_tr, imby_tr = make_imbalance(x_train, y_train,
                              sampling_strategy={0: 2550, 1: 6675, 2: 3280, 3: 6100, 4: 3850, 5: 5400, 6: 3065, 7: 5860, 8: 5000, 9: 3654,},
                              random_state=42)

In [33]:
print(imbx_tr.shape)
print(imby_tr.shape)

(45434, 784)
(45434,)


In [34]:
sym_noisy_imbytr = create_noise(imby_tr, noise_ratio = 20, asym = False)

Print noisy label generation statistics:
Noisy class 0, has 2562 samples.
Noisy class 1, has 6697 samples.
Noisy class 2, has 3256 samples.
Noisy class 3, has 6124 samples.
Noisy class 4, has 3857 samples.
Noisy class 5, has 5431 samples.
Noisy class 6, has 3036 samples.
Noisy class 7, has 5874 samples.
Noisy class 8, has 4980 samples.
Noisy class 9, has 3617 samples.
45434


In [35]:
asym_noisy_imbytr = create_noise(imby_tr, noise_ratio = 20, asym = True)

Print noisy label generation statistics:
Noisy class 0, has 2550 samples.
Noisy class 1, has 7532 samples.
Noisy class 2, has 2624 samples.
Noisy class 3, has 4880 samples.
Noisy class 4, has 4506 samples.
Noisy class 5, has 4076 samples.
Noisy class 6, has 3532 samples.
Noisy class 7, has 7080 samples.
Noisy class 8, has 4000 samples.
Noisy class 9, has 4654 samples.
45434


In [36]:
sym_noisy_ytr = create_noise(y_train, noise_ratio = 20, asym = False)

Print noisy label generation statistics:
Noisy class 0, has 5970 samples.
Noisy class 1, has 6741 samples.
Noisy class 2, has 5971 samples.
Noisy class 3, has 6086 samples.
Noisy class 4, has 5865 samples.
Noisy class 5, has 5442 samples.
Noisy class 6, has 5947 samples.
Noisy class 7, has 6206 samples.
Noisy class 8, has 5833 samples.
Noisy class 9, has 5939 samples.
60000


In [37]:
asym_noisy_ytr = create_noise(y_train, noise_ratio = 20, asym = True)

Print noisy label generation statistics:
Noisy class 0, has 5923 samples.
Noisy class 1, has 7602 samples.
Noisy class 2, has 4767 samples.
Noisy class 3, has 4905 samples.
Noisy class 4, has 7033 samples.
Noisy class 5, has 4660 samples.
Noisy class 6, has 5819 samples.
Noisy class 7, has 7491 samples.
Noisy class 8, has 4681 samples.
Noisy class 9, has 7119 samples.
60000


In [38]:
countlabels(y_train)

Class 0, has 5923 samples.
Class 1, has 6742 samples.
Class 2, has 5958 samples.
Class 3, has 6131 samples.
Class 4, has 5842 samples.
Class 5, has 5421 samples.
Class 6, has 5918 samples.
Class 7, has 6265 samples.
Class 8, has 5851 samples.
Class 9, has 5949 samples.
Total count  60000


In [39]:
x_train = x_train / 255.0
x_test = x_test / 255.0

imbx_tr = imbx_tr / 255.0

In [40]:
thetas, classes, costs = fit(x_train, y_train)


print(f"Train Accuracy: {score(classes, thetas, x_train, y_train):.3f}")

ypred = predict(classes, thetas, x_test, y_test)
cat_ypred =to_categorical(ypred, 10)

print("Accuracy of the balanced dataset is \n", accuracy_score(y_test, ypred))
print("Precision of the balanced dataset is \n", precision_score(y_test, ypred, average = 'micro'))
print("Recall of the balanced dataset is \n", recall_score(y_test, ypred, average = 'micro'))
print("ROC AUC score of the balanced dataset is \n", roc_auc_score(y_test, cat_ypred, multi_class='ovr'))

Train Accuracy: 0.807
Accuracy of the balanced dataset is 
 0.8187
Precision of the balanced dataset is 
 0.8187
Recall of the balanced dataset is 
 0.8187
ROC AUC score of the balanced dataset is 
 0.8965186086286122


In [41]:
thetas_sym, classes_sym, costs_sym = fit(x_train, sym_noisy_ytr)

print(f"Train Accuracy: {score(classes_sym, thetas_sym, x_train, sym_noisy_ytr):.3f}")

ypred = predict(classes_sym, thetas_sym, x_test, y_test)
cat_ypred =to_categorical(ypred, 10)


print("Accuracy of the balanced dataset with symmetric noise is \n", accuracy_score(y_test, ypred))
print("Precision of the balanced dataset with symmetric noise is \n", precision_score(y_test, ypred, average = 'micro'))
print("Recall of the balanced dataset with symmetric noise is \n", recall_score(y_test, ypred, average = 'micro'))
print("ROC AUC score of the balanced dataset with symmetric noise is \n", roc_auc_score(y_test, cat_ypred, multi_class='ovr'))

Train Accuracy: 0.645
Accuracy of the balanced dataset with symmetric noise is 
 0.8111
Precision of the balanced dataset with symmetric noise is 
 0.8111
Recall of the balanced dataset with symmetric noise is 
 0.8111
ROC AUC score of the balanced dataset with symmetric noise is 
 0.8918632663070525


In [42]:
thetas_asym, classes_asym, costs_asym = fit(x_train, asym_noisy_ytr)

print(f"Train Accuracy: {score(classes_asym, thetas_asym, x_train, asym_noisy_ytr):.3f}")

ypred = predict(classes_asym, thetas_asym, x_test, y_test)
cat_ypred =to_categorical(ypred, 10)


print("Accuracy of the balanced dataset with asymmetric noise is \n", accuracy_score(y_test, ypred))
print("Precision of the balanced dataset with asymmetric noise is \n", precision_score(y_test, ypred, average = 'micro'))
print("Recall of the balanced dataset with asymmetric noise is \n", recall_score(y_test, ypred, average = 'micro'))
print("ROC AUC score of the balanced dataset with asymmetric noise is \n", roc_auc_score(y_test, cat_ypred, multi_class='ovr'))


Train Accuracy: 0.691
Accuracy of the balanced dataset with asymmetric noise is 
 0.7545
Precision of the balanced dataset with asymmetric noise is 
 0.7545
Recall of the balanced dataset with asymmetric noise is 
 0.7545
ROC AUC score of the balanced dataset with asymmetric noise is 
 0.8588978227279724


In [43]:
thetas_imb, classes_imb, costs_imb = fit(imbx_tr, imby_tr)

print(f"Train Accuracy: {score(classes_imb, thetas_imb, imbx_tr, imby_tr):.3f}")

ypred = predict(classes_imb, thetas_imb, x_test, y_test)
cat_ypred =to_categorical(ypred, 10)


print("Accuracy of the imbalanced dataset is \n", accuracy_score(y_test, ypred))
print("Precision of the imbalanced dataset is \n", precision_score(y_test, ypred, average = 'micro'))
print("Recall of the imbalanced dataset is \n", recall_score(y_test, ypred, average = 'micro'))
print("ROC AUC score of the imbalanced dataset is \n", roc_auc_score(y_test, cat_ypred, multi_class='ovr'))




Train Accuracy: 0.779
Accuracy of the imbalanced dataset is 
 0.775
Precision of the imbalanced dataset is 
 0.775
Recall of the imbalanced dataset is 
 0.775
ROC AUC score of the imbalanced dataset is 
 0.8730143788822545


In [44]:
thetas_imbsym, classes_imbsym, costs_imbsym = fit(imbx_tr, sym_noisy_imbytr)

print(f"Train Accuracy: {score(classes_imbsym, thetas_imbsym, imbx_tr, sym_noisy_imbytr):.3f}")

ypred = predict(classes_imbsym, thetas_imbsym, x_test, y_test)
cat_ypred =to_categorical(ypred, 10)


print("Accuracy of the imbalanced dataset with symmetric noise is \n", accuracy_score(y_test, ypred))
print("Precision of the imbalanced dataset with symmetric noise is \n", precision_score(y_test, ypred, average = 'micro'))
print("Recall of the imbalanced dataset with symmetric noise is \n", recall_score(y_test, ypred, average = 'micro'))
print("ROC AUC score of the imbalanced dataset with symmetric noise is \n", roc_auc_score(y_test, cat_ypred, multi_class='ovr'))

Train Accuracy: 0.598
Accuracy of the imbalanced dataset with symmetric noise is 
 0.7016
Precision of the imbalanced dataset with symmetric noise is 
 0.7016
Recall of the imbalanced dataset with symmetric noise is 
 0.7016
ROC AUC score of the imbalanced dataset with symmetric noise is 
 0.8317730624830583


In [45]:
thetas_imbasym, classes_imbasym, costs_imbasym = fit(imbx_tr, asym_noisy_imbytr)

print(f"Train Accuracy: {score(classes_imbasym, thetas_imbasym, imbx_tr, asym_noisy_imbytr):.3f}")

ypred = predict(classes_imbasym, thetas_imbasym, x_test, y_test)
cat_ypred =to_categorical(ypred, 10)


print("Accuracy of the imbalanced dataset with asymmetric noise is \n", accuracy_score(y_test, ypred))
print("Precision of the imbalanced dataset with asymmetric noise is \n", precision_score(y_test, ypred, average = 'micro'))
print("Recall of the imbalanced dataset with asymmetric noise is \n", recall_score(y_test, ypred, average = 'micro'))
print("ROC AUC score of the balanced dataset with asymmetric noise is \n", roc_auc_score(y_test, cat_ypred, multi_class='ovr'))

Train Accuracy: 0.662
Accuracy of the imbalanced dataset with asymmetric noise is 
 0.726
Precision of the imbalanced dataset with asymmetric noise is 
 0.726
Recall of the imbalanced dataset with asymmetric noise is 
 0.726
ROC AUC score of the balanced dataset with asymmetric noise is 
 0.8441293864071001
