# Data Preparation

## Imports

In [None]:
from torchvision.datasets import MNIST
from torch.utils.data import Dataset,DataLoader
from collections import Counter
from operator import itemgetter

import torch
import torchvision
import seaborn as sns
import numpy as np
import random

## Configs

In [None]:
# Batch sizes used by the train/validation loaders and by the test loader.
batch_size_train = 128
batch_size_test = 100
# Seed handed to the random number generators for reproducibility.
random_seed = 12453211
# Probability of replacing a training label through the active noise map: the
# loader keeps the true label only when random.random() > random_threshold.
random_threshold = 0.6 # should be between 0 and 1
# Label-noise mode. The loading code compares against these exact strings, so
# the asymmetric mode must be spelled "assymetric" (sic); "original" adds no noise.
mode = "symmetric" # One of "symmetric", "assymetric" or "original"
# Class-resampling mode; any value other than the two strings leaves the data as loaded.
partiality = "imbalanced" # One of "balanced", "imbalanced" or None

# Settings that reproduce each dataset variant:
# Dataset 1 - Balanced dataset                  - mode = "original"   and partiality = "balanced"
# Dataset 2 - Imbalanced dataset Original MNIST - mode = "original"   and partiality = None
# Dataset 3 - Balanced Symmetric Noise          - mode = "symmetric"  and partiality = "balanced"
# Dataset 4 - Balanced Asymmetric Noise         - mode = "assymetric" and partiality = "balanced"
# Dataset 5 - Imbalanced Symmetric Noise        - mode = "symmetric"  and partiality = "imbalanced"
# Dataset 6 - Imbalanced Asymmetric Noise       - mode = "assymetric" and partiality = "imbalanced"

# Per-class weight used by make_imbalanced: the number of samples drawn for a
# class is int(weight * class_size). Classes 0, 1, 5 and 7 are cut to 30%.
imbalanced_weights = {
    0: 0.3,
    1: 0.3,
    2: 1.0,
    3: 1.0,
    4: 1.0,
    5: 0.3,
    6: 1.0,
    7: 0.3,
    8: 1.0,
    9: 1.0
}

# Symmetric noise: labels are swapped pairwise (1 <-> 9, 2 <-> 7, 5 <-> 8);
# 0, 3, 4 and 6 map to themselves.
symmetric_noise = {
    0: 0,
    1: 9,
    9: 1,
    2: 7,
    7: 2,
    3: 3,
    4: 4,
    5: 8,
    8: 5,
    6: 6
}

# Asymmetric noise: one-directional relabelling 3 -> 4, 4 -> 8, 8 -> 3 and
# 9 -> 0; 0, 1, 2, 5, 6 and 7 map to themselves.
asymmetric_noise = {
    0: 0,
    1: 1,
    2: 2,
    3: 4,
    4: 8,
    5: 5,
    6: 6,
    7: 7,
    8: 3,
    9: 0
}

# Seed every RNG this notebook draws from so that runs are reproducible:
# torch (random_split and DataLoader shuffling), NumPy (the resampling
# helpers) and the stdlib `random` module (the label-noise target_transform,
# which was previously left unseeded).
torch.manual_seed(random_seed)
np.random.seed(random_seed)
random.seed(random_seed)

# Symmetric and Asymmetric Noise

In [None]:
# Build the MNIST train / validation / test datasets and their loaders.

# Identical preprocessing for every split: tensor conversion followed by
# normalisation with mean 0.1307 and standard deviation 0.3081.
_mnist_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,)),
])

# Label-noise map for each supported mode; None means labels stay untouched.
# "assymetric" is the spelling this notebook's config has always used; the
# correctly spelled "asymmetric" is accepted as an alias.
_noise_maps = {
    "symmetric": symmetric_noise,
    "assymetric": asymmetric_noise,
    "asymmetric": asymmetric_noise,
    "original": None,
}

# Fail early with a clear message instead of leaving train_set undefined and
# hitting a NameError further down.
if mode not in _noise_maps:
    raise ValueError(
        f"Unknown mode {mode!r}; expected one of {sorted(_noise_maps)}"
    )

_label_noise = _noise_maps[mode]


def _apply_label_noise(y):
    """Return the noisy label for ``y`` with probability ``random_threshold``.

    The true label is kept only when ``random.random() > random_threshold``;
    otherwise it is replaced by its entry in the active noise map.
    """
    return y if random.random() > random_threshold else _label_noise[y]


# NOTE(review): torchvision applies target_transform on item access, so the
# noise is drawn when a sample is read, and the validation subset split off
# below inherits the same noisy labels as the training subset.
train_set = torchvision.datasets.MNIST(
    '.',
    train=True,
    download=True,
    transform=_mnist_transform,
    target_transform=_apply_label_noise if _label_noise is not None else None,
)

# The test split always keeps its original labels.
test_set = torchvision.datasets.MNIST(
    '.',
    train=False,
    download=True,
    transform=_mnist_transform,
)

# Hold out 10,000 of the 60,000 training images for validation.
train_set, val_set = torch.utils.data.random_split(train_set, [50000, 10000])

train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size_train,
    shuffle=True
)

valid_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=batch_size_train,
    shuffle=True,
)

test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=batch_size_test,
    shuffle=False
)

# Balanced and Imbalanced data

## To numpy data

In [None]:
def data_loader_to_numpy(data_loader):
    """Drain ``data_loader`` and return its contents as two NumPy arrays.

    Args:
        data_loader: Iterable yielding ``(x, y)`` batches whose elements
            expose a ``.numpy()`` method.

    Returns:
        A tuple ``(xs, ys)`` with all batches concatenated along axis 0.
    """
    batches = [(batch_x.numpy(), batch_y.numpy()) for batch_x, batch_y in data_loader]
    features = [pair[0] for pair in batches]
    labels = [pair[1] for pair in batches]
    return np.concatenate(features, axis=0), np.concatenate(labels, axis=0)
    
# Materialise every split as a pair of NumPy arrays (samples, labels).
# The train and validation loaders were created with shuffle=True, so those
# arrays come out in a randomised order; the test loader does not shuffle.
train_x, train_y = data_loader_to_numpy(train_loader)
test_x, test_y = data_loader_to_numpy(test_loader)
valid_x, valid_y = data_loader_to_numpy(valid_loader)

# Sanity check of the train/test array shapes (validation shapes are not printed).
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

## Imbalanced data

In [None]:
def make_imbalanced(ds_x, ds_y, imbalanced_weights=imbalanced_weights):
    """Subsample each class according to its weight and shuffle the result.

    For every class ``c`` in 0-9, ``int(imbalanced_weights[c] * class_size)``
    samples are drawn. The draw is done *without* replacement, so a weight of
    1.0 keeps the class exactly as it is and smaller weights never produce
    duplicates. Replacement is only used when a weight above 1.0 asks for
    more samples than the class holds (oversampling).

    Args:
        ds_x: Sequence of samples.
        ds_y: Sequence of integer labels in 0-9, aligned with ``ds_x``.
        imbalanced_weights: Mapping from class label to sampling weight.

    Returns:
        A pair of tuples ``(samples, labels)`` in shuffled order; two empty
        tuples when nothing is selected.

    Raises:
        KeyError: If a label is outside 0-9 or a class has no weight.
        ValueError: If a weight is negative.
    """
    class_partition = {label: [] for label in range(10)}

    for x, y in zip(ds_x, ds_y):
        class_partition[y].append((x, y))

    imbalanced_train = []

    for label, samples in class_partition.items():
        available = len(samples)
        keep_count = int(imbalanced_weights[label] * available)
        if keep_count < 0:
            raise ValueError(
                f"weight for class {label} must be non-negative, "
                f"got {imbalanced_weights[label]}"
            )
        # keep_count > 0 implies the class is non-empty, so the draw is valid;
        # empty classes are skipped instead of crashing the sampler.
        if keep_count > 0:
            chosen = np.random.choice(
                available, size=keep_count, replace=keep_count > available
            )
            imbalanced_train.extend(samples[j] for j in chosen)
        print(f"class {label}: size={keep_count}")

    if not imbalanced_train:
        return (), ()

    np.random.shuffle(imbalanced_train)
    imbalanced_train_x, imbalanced_train_y = zip(*imbalanced_train)

    return imbalanced_train_x, imbalanced_train_y

In [None]:
def make_balanced(ds_x, ds_y):
    """Down-sample every class to the size of the smallest class.

    Samples are chosen at random without replacement (NumPy global RNG) and
    the survivors keep their original relative order. Data that is already
    balanced is therefore returned with the same contents and order.

    Args:
        ds_x: Array-like of samples.
        ds_y: Array-like of class labels aligned with ``ds_x``.

    Returns:
        A pair of NumPy arrays ``(samples, labels)`` in which every class
        present in ``ds_y`` occurs equally often. Empty input is returned
        unchanged.
    """
    labels = np.asarray(ds_y)
    if labels.size == 0:
        return ds_x, ds_y

    classes, counts = np.unique(labels, return_counts=True)
    min_count = int(counts.min())

    kept = np.concatenate([
        np.random.choice(np.flatnonzero(labels == cls), size=min_count, replace=False)
        for cls in classes
    ])
    # Sort so the kept samples stay in their original relative order.
    kept.sort()

    return np.asarray(ds_x)[kept], labels[kept]

In [None]:
# Resample the training arrays according to `partiality`; any value other
# than "imbalanced" or "balanced" (e.g. None) leaves them unchanged.
_resampler = {"imbalanced": make_imbalanced, "balanced": make_balanced}.get(partiality)
if _resampler is not None:
    train_x, train_y = _resampler(train_x, train_y)

# Distribution Plotter

In [None]:
def distribution_plotter(df):
    """Draw a bar chart of how often each class label occurs in ``df``.

    Args:
        df: Iterable of hashable class labels (for example ``train_y``).

    The x-axis is captioned with the module-level ``mode`` setting.
    """
    # Imported locally because the notebook's import cell never brings
    # ``plt`` into scope, which made the original body raise NameError.
    import matplotlib.pyplot as plt

    data_count = Counter(df)
    classes = sorted(data_count)
    counts = [data_count[cls] for cls in classes]

    # Request one colour per bar; the default "husl" palette has only six.
    palette = sns.color_palette("husl", n_colors=len(classes))
    plt.figure(figsize=(18, 5))
    sns.barplot(x=classes, y=counts, palette=palette)
    plt.xlabel(f"{mode}")

In [None]:
# Plot the class distribution of the training labels as they stand after the
# noise and resampling steps above.
distribution_plotter(train_y)