In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.datasets import fetch_openml


import warnings

# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# The Kaggle "../input/digit-recognizer" CSV listing that used to live here is no
# longer needed: the digits are fetched from OpenML instead, with as_frame=True
# requesting pandas containers for the data and the target.
mnist_dataset = fetch_openml(name='mnist_784', version=1, as_frame=True)

In [None]:
# Experiment switches read by the data-preparation cells further down.
noise_ratio = 10    # percentage of labels to corrupt (divided by 100 inside create_noise)
add_noise = True    # when True the training labels are passed through create_noise
asym_noise = False  # forwarded to create_noise as its `asym` argument
imbalance = False   # when True the training split is resampled with make_imbalance

In [None]:
from sklearn.model_selection import train_test_split

# Build the combined frame with DataFrame.assign instead of np.c_: concatenating
# the numeric pixel columns with the label column through NumPy collapses
# everything into one object-dtype array, which is slow and memory hungry.
# The labels are cast to int so every downstream consumer sees numeric classes
# (presumably OpenML delivers them as a categorical of digit strings - confirm).
mnist_dataframe = mnist_dataset['data'].assign(
    target=mnist_dataset['target'].astype(int)
)
print(mnist_dataframe.shape)

X_tr = mnist_dataframe.iloc[:, :-1]  # iloc ensures X_tr will be a dataframe
y_tr = mnist_dataframe.iloc[:, -1]

# Stratified hold-out of 28,000 samples. The seed makes the hold-out set
# reproducible across kernel restarts (same value as the later validation split).
X_train, X_test, y_train, y_test = train_test_split(
    X_tr, y_tr, test_size=28000, stratify=y_tr, random_state=2019
)
print(type(X_train), type(X_test), type(y_train), type(y_test))

In [None]:
# Directory of the Kaggle competition CSVs. The CSV-based loading was replaced by
# the OpenML split, and none of the visible cells read this path any more.
PATH = '../input/digit-recognizer'

# Switch from pandas objects to plain NumPy arrays for the image pipeline.
train_x, train_y = X_train.values, y_train.values
test_x, Y_Test = X_test.values, y_test.values

print(train_x.shape, train_y.shape, test_x.shape, sep='\n')

In [None]:
# Side length, in pixels, of the square images produced by resize() and declared
# as the spatial input shape of the VGG19 backbone.
IMG_SIZE = 32

In [None]:
# resize
import cv2

def resize(img_array):
    """Rescale flattened 28x28 digit images to IMG_SIZE x IMG_SIZE floats in [0, 1].

    Parameters
    ----------
    img_array : numpy.ndarray
        2-D array with one flattened image per row; every row must hold exactly
        28 * 28 values that fit into uint8.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (n_images, IMG_SIZE, IMG_SIZE) with pixel values
        divided by 255.
    """
    n_images = img_array.shape[0]
    # The per-image result is float32, so allocate the buffer as float32 too; the
    # default float64 buffer stored the very same values in twice the memory.
    resized = np.empty((n_images, IMG_SIZE, IMG_SIZE), dtype=np.float32)

    for i in range(n_images):
        # cv2.resize needs a 2-D image, so restore the square layout first.
        pixels = img_array[i].reshape(28, 28).astype('uint8')
        pixels = cv2.resize(pixels, (IMG_SIZE, IMG_SIZE))
        resized[i] = pixels.astype('float32') / 255

    return resized

# Run both pixel matrices through the same rescaling routine (train first).
train_x_resize, test_x_resize = (resize(pixels) for pixels in (train_x, test_x))

In [None]:
# The VGG19 backbone is declared with a 3-channel input, so the single gray plane
# is copied into three identical channels along a new trailing axis.
train_x_final = np.stack([train_x_resize] * 3, axis=-1)
test_x_final = np.stack([test_x_resize] * 3, axis=-1)
print(train_x_final.shape, test_x_final.shape, sep='\n')

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the labels of both splits over the 10 digit classes.
train_y_final, test_y_final = (
    to_categorical(labels, num_classes=10) for labels in (train_y, Y_Test)
)
print(train_y_final.shape, test_y_final.shape, sep='\n')

In [None]:
# models
# Every Keras symbol is taken from tensorflow.keras. Mixing the standalone
# `keras` package with a tensorflow.keras backbone breaks on installations where
# the two are separate implementations (the Sequential container rejects layers
# that come from the other package).
from tensorflow.keras.models import Sequential
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, Flatten

# ImageNet-pretrained convolutional base without its dense classifier head.
vgg19 = VGG19(weights = 'imagenet',
              include_top = False,
              input_shape=(IMG_SIZE, IMG_SIZE, 3)
              )

# Backbone -> flatten -> 10-way softmax over the digit classes.
model = Sequential()
model.add(vgg19)
model.add(Flatten())
model.add(Dense(10, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

model.summary()

In [None]:
def noise_helper(n_classes, current_class):
    """Draw a random class label that differs from ``current_class``.

    Parameters
    ----------
    n_classes : int
        Number of classes; valid labels are ``0 .. n_classes - 1``.
    current_class : int
        Label that must not be returned.

    Returns
    -------
    One of the remaining labels, picked with ``np.random.choice``.

    Raises
    ------
    ValueError
        If ``current_class`` lies outside ``0 .. n_classes - 1``.
    """
    if current_class >= n_classes or current_class < 0:
        error_str = "class_ind must be within the range (0, nb_classes - 1)"
        raise ValueError(error_str)

    # All labels except the current one are candidates for the flip.
    candidates = [*range(n_classes)]
    candidates.remove(current_class)
    return np.random.choice(candidates)

def create_noise(y_tr, noise_ratio, asym = False):
    """Return a copy of ``y_tr`` in which a share of the labels is corrupted.

    Parameters
    ----------
    y_tr : array-like of int
        Integer class labels in ``0 .. 9``. The input is never modified.
    noise_ratio : int or float
        Percentage of labels to corrupt. Values ``<= 0`` disable the corruption
        and an unchanged copy of the labels is returned.
    asym : bool, default False
        ``False``: symmetric noise - the same number of samples is drawn from
        every class and each is relabelled to a random *other* class.
        ``True``: asymmetric noise - ``noise_ratio`` percent of selected source
        classes are flipped to a fixed target class (5->1, 2->4, 3->7, 5<->6,
        8->9).

    Returns
    -------
    numpy.ndarray
        Label array with the same shape as ``y_tr``.
    """
    n_classes = 10
    y_tr = np.asarray(y_tr)
    noisy_y_tr = np.array(y_tr, copy=True)

    # Nothing to corrupt: hand back the clean copy instead of falling off the end
    # of the function (which used to yield None and crash the caller).
    if noise_ratio <= 0:
        return noisy_y_tr

    if asym:
        # (source, target) pairs: 5 -> 1, 2 -> 4, 3 -> 7, 5 <-> 6, 8 -> 9.
        # Class 5 is listed twice; both draws come from the original labels, so a
        # sample picked by both ends up with the later target.
        flips = [(5, 1), (2, 4), (3, 7), (5, 6), (6, 5), (8, 9)]
        for source, target in flips:
            cls_idx = np.where(y_tr == source)[0]
            n_noisy = min(int(noise_ratio * cls_idx.shape[0] / 100), cls_idx.shape[0])
            if n_noisy == 0:
                # Class absent or too small to lose a sample; skip the draw.
                continue
            noisy_sample_index = np.random.choice(cls_idx, n_noisy, replace=False)
            noisy_y_tr[noisy_sample_index] = target
    else:
        n_samples = noisy_y_tr.shape[0]
        n_noisy = int(noise_ratio * n_samples / 100)
        class_noisy = int(n_noisy / n_classes)  # equal share per class

        noisy_idx = []
        for d in range(n_classes):
            cls_idx = np.where(y_tr == d)[0]
            # Never request more samples than the class holds; sampling without
            # replacement would raise otherwise (e.g. on an imbalanced split).
            take = min(class_noisy, cls_idx.shape[0])
            if take == 0:
                continue
            noisy_idx.extend(np.random.choice(cls_idx, take, replace=False))

        for i in noisy_idx:
            noisy_y_tr[i] = noise_helper(n_classes=n_classes, current_class=y_tr[i])

    print("Print noisy label generation statistics:")
    count = 0
    for i in range(n_classes):
        n_in_class = np.sum(noisy_y_tr == i)
        print("Noisy class %s, has %s samples." % (i, n_in_class))
        count += n_in_class
    print(count)
    return noisy_y_tr

In [None]:
# NOTE(review): add_newdoc is not referenced by any visible cell and comes from a
# private NumPy module - looks like an accidental auto-import; confirm and drop.
from numpy.core.function_base import add_newdoc
from sklearn.model_selection import train_test_split
from imblearn.datasets import make_imbalance

# Carve a 20% validation split out of the training images. This rebinds
# y_train / y_test, which until now held the labels of the first split.
x_train, x_test, y_train, y_test = train_test_split(
    train_x_final,
    train_y_final,
    test_size=0.2,
    random_state=2019,
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep='\n')



In [None]:
# Optionally distort the training split: class imbalance first, label noise second.
# NOTE: this cell rebinds x_train / y_train, so running it twice stacks the effects.

if imbalance:
    # make_imbalance is given one flat feature row per sample, so the images are
    # flattened first. The image shape is read from the data rather than being
    # hard-coded as 3072 / (32, 32, 3), so the cell keeps working when IMG_SIZE
    # changes.
    image_shape = x_train.shape[1:]
    tem_x = x_train.reshape(len(x_train), -1)
    tem_y = np.argmax(y_train, axis=-1)

    print(tem_x.shape, tem_y.shape)
    tem_x, tem_y = make_imbalance(tem_x, tem_y,
                                sampling_strategy={0: 1000, 1: 2000, 2: 2500, 3: 2500, 4: 2000, 5: 1500, 6: 1000, 7: 3400, 8: 2000, 9: 1500,},
                                random_state=42)

    # Restore the image layout and the one-hot labels expected by the model.
    x_train = tem_x.reshape((-1,) + image_shape)
    y_train = to_categorical(tem_y, num_classes=10)

# A non-positive ratio means "no noise"; skipping the call keeps y_train intact
# instead of feeding an empty result into to_categorical.
if add_noise and noise_ratio > 0:
    tem_y = np.argmax(y_train, axis=-1)
    tem_y = create_noise(tem_y, noise_ratio, asym_noise)
    y_train = to_categorical(tem_y, num_classes=10)

In [None]:
# Sanity check: sample counts of images and labels after the optional distortions.
print(*(split.shape for split in (x_train, y_train)))

In [None]:
# callback
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# The model is compiled with metrics=['accuracy'], which tf.keras logs as
# 'accuracy' / 'val_accuracy'. Monitoring the old 'val_acc' key only produced a
# "metric not available" warning and early stopping never triggered.
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=5)
mc = ModelCheckpoint(filepath='mnist-vgg19.h5', verbose=1, monitor='val_accuracy', mode='max')
cb = [es, mc]

In [None]:
# One training epoch on the prepared split, validated on the 20% hold-out and
# with the early-stopping / checkpoint callbacks attached.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=1,
    validation_data=(x_test, y_test),
    callbacks=cb,
)

In [None]:
# Class probabilities for the held-out images of the first split.
preds = model.predict(x=test_x_final, batch_size=128)

In [None]:
# Expect one row per test image and one column per class.
np.shape(preds)

In [None]:
# Collapse probability rows and one-hot rows to plain class indices.
results = preds.argmax(axis=-1)
Y_TEST = test_y_final.argmax(axis=-1)
results.shape
Y_TEST.shape

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score

# Derive the run description from the experiment flags. The messages used to be
# hard-coded as "balanced dataset with symmetric noise" and were wrong as soon as
# `imbalance`, `add_noise` or `asym_noise` was changed.
balance_label = "imbalanced" if imbalance else "balanced"
if add_noise and noise_ratio > 0:
    noise_label = "asymmetric noise" if asym_noise else "symmetric noise"
else:
    noise_label = "no label noise"
run_label = "%s dataset with %s" % (balance_label, noise_label)

print(Y_TEST.shape, results.shape)
print("confusion matrix: \n ", confusion_matrix(Y_TEST, results))
print("Accuracy of the %s is \n" % run_label, accuracy_score(Y_TEST, results))
# average=None reports one precision / recall value per digit class.
print("Precision of the %s is \n" % run_label, precision_score(Y_TEST, results, average = None))
print("Recall of the %s is " % run_label, recall_score(Y_TEST, results, average = None))
# One-vs-rest AUC is computed from the predicted probabilities, not the argmax labels.
print("ROC AUC score of the %s is \n" % run_label, roc_auc_score(Y_TEST, preds, multi_class='ovr'))