# Import Modules

## Standard modules

In [None]:
import os
import json
import pickle as pkl

from collections import Counter

## External modules

In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, \
                            recall_score, \
                            f1_score, \
                            roc_auc_score \
            
from tqdm import tqdm, trange
from pylab import rcParams



tqdm.pandas()
%matplotlib inline
warnings.filterwarnings('ignore')
rcParams['figure.figsize'] = 10, 10

## Internal modules

In [None]:
import utils_scripts as utlis

# Constants

In [None]:
# Root directory of the Herbarium 2020 data on the Kaggle input mount.
ABS_PATH = '/kaggle/input/herbarium-2020-fgvc7/nybg2020/'

# One seed reused for NumPy here and for the splits / PyTorch further down.
RANDOM_SEED = 17
np.random.seed(RANDOM_SEED)

# Exploratory Data Analysis

In [None]:
def get_result_df(path, set_value):
    """Load the image metadata of one data split into a DataFrame.

    Reads ``<path>/<set_value>/metadata.json``, builds a frame from its
    ``images`` list and, for the ``train`` split only, joins the
    ``annotations`` list onto it. The ``file_name`` column is rewritten to
    ``<path>/<set_value>/<file_name>``.

    Requires ``tqdm.pandas()`` to have been called beforehand, because the
    path rewrite uses ``Series.progress_apply``.

    Parameters
    ----------
    path : str
        Dataset root directory that contains one sub-directory per split.
    set_value : str
        Name of the split sub-directory; ``'train'`` additionally triggers
        the annotation join.

    Returns
    -------
    pandas.DataFrame
        One row per image (inner-joined with annotations for ``'train'``).
    """
    split_dir = os.path.join(path, set_value)

    # Read from the `path` argument rather than a module-level constant, so
    # the metadata file and the rewritten image paths share the same root.
    with open(os.path.join(split_dir, 'metadata.json'), "r", encoding="ISO-8859-1") as file:
        metadata = json.load(file)

    img_info = pd.DataFrame(metadata['images'])

    if set_value == 'train':
        # NOTE(review): annotations are joined on their own `id` after
        # `image_id` is dropped, which assumes annotation id == image id
        # in this dataset -- confirm against the metadata.
        annotation_info = pd.DataFrame(metadata['annotations']).drop(columns=['image_id'])
        img_info = img_info.merge(annotation_info, on='id')

    img_info['file_name'] = img_info['file_name'].progress_apply(lambda x: os.path.join(split_dir, x))
    return img_info

In [None]:
# Training split: image records joined with their annotations.
metadata_train = get_result_df(ABS_PATH, 'train')

In [None]:
# Submission split: image records only (no annotations are merged for it).
metadata_test = get_result_df(ABS_PATH, 'test')

In [None]:
# Sanity check: do the raw category ids form one contiguous integer range?
classes = sorted(metadata_train['category_id'].unique())

# A contiguous run of len(classes) ids that starts at min(classes) must stop
# at min(classes) + len(classes); anchoring the upper bound to the minimum
# keeps the check valid whether the ids start at 0, 1 or anywhere else.
classes == list(range(min(classes), min(classes) + len(classes)))

In [None]:
# Number of training images per raw category id (displayed as the cell output).
metadata_train['category_id'].value_counts()

## Label preprocessing

In [None]:
# Fit the encoder on the raw category ids; `fit` returns the encoder itself,
# so construction and fitting can be chained.
le_preprocessor = LabelEncoder().fit(metadata_train['category_id'])
# Echo the fitted encoder as the cell output.
le_preprocessor

In [None]:
# Store the encoded label next to the raw category id.
metadata_train['category_id_le_preprocessed'] = le_preprocessor.transform(
    metadata_train['category_id']
)

In [None]:
# Distinct encoded labels in ascending order (also used later for the class count).
classes = sorted(metadata_train['category_id_le_preprocessed'].unique())
# Expect the encoded labels to be a gap-free run ending at len(classes) - 1.
list(range(min(classes), len(classes))) == classes

# Train Test Split

In [None]:
# train_indices, test_indices, _, _ = train_test_split(metadata_train.index, 
#                                                      metadata_train['category_id_le_preprocessed'],
#                                                      train_size=0.75, 
#                                                      random_state=RANDOM_SEED,                                                     
#                                                      shuffle=True, 
#                                                      stratify=metadata_train['category_id_le_preprocessed'])

In [None]:
# Per-class row counts: every non-key column holds the number of non-null
# values for that encoded class.
grouped = (
    metadata_train
    .groupby('category_id_le_preprocessed', as_index=False)
    .count()
)

In [None]:
# Encoded ids of the classes that have fewer than three images.
little_classes = grouped.loc[grouped['id'] < 3, 'category_id_le_preprocessed']

In [None]:
# Display the encoded ids of the classes with fewer than three images.
little_classes

In [None]:
# Random, non-stratified 75/25 split of the row labels. The label series is
# split alongside the index, but its two halves are discarded.
train_indices, test_indices, _, _ = train_test_split(
    metadata_train.index,
    metadata_train['category_id_le_preprocessed'],
    train_size=0.75,
    random_state=RANDOM_SEED,
    shuffle=True,
)

In [None]:
# Training rows. `reset_index()` without `drop` keeps the original row labels
# as an extra column, exactly as the previous in-place reset did.
train_data = metadata_train.loc[train_indices, :].reset_index()

# Last expression of the cell, so the shape is actually displayed.
train_data.shape

In [None]:
# Rows held out from training (re-split into test and validation below).
# `reset_index()` without `drop` keeps the original row labels as a column.
test_data = metadata_train.loc[test_indices, :].reset_index()

# Last expression of the cell, so the shape is actually displayed.
test_data.shape

In [None]:
# Random, non-stratified 80/20 split of the held-out rows into test and
# validation labels; the split halves of the label series are discarded.
test_indices, val_indices, _, _ = train_test_split(
    test_data.index,
    test_data['category_id_le_preprocessed'],
    train_size=0.80,
    random_state=RANDOM_SEED,
    shuffle=True,
)

In [None]:
# Validation rows, taken from the held-out frame; the previous index is kept
# as an extra column by `reset_index()`.
val_data = test_data.loc[val_indices, :].reset_index()

# Last expression of the cell, so the shape is actually displayed.
val_data.shape

In [None]:
# Final test rows. NOTE: this cell overwrites its own input (`test_data`), so
# it must run exactly once and only after the validation cell; re-running it
# would index into the already-reduced frame.
test_data = test_data.loc[test_indices, :].reset_index()

# Last expression of the cell, so the shape is actually displayed.
test_data.shape

## Class weights

In [None]:
# Per-class sample counts of the training split (raw counts, despite the name).
class_counts = Counter(train_data['category_id_le_preprocessed'])

# Index over every encoded class known to the label encoder so that position i
# always refers to class i. The split is not stratified, so a rare class may be
# missing from `train_data`; it gets a count of 0 instead of being dropped,
# which would otherwise shift every later entry.
class_weights = [class_counts[label] for label in range(len(le_preprocessor.classes_))]

# Model Development

In [None]:
import torch

In [None]:
from torch import Tensor
from torch.utils.data import DataLoader
from utils_scripts import Specimen_Dataset, \
                          Data_Pipeline, \
                          Resizer, \
                          Normalizer, \
                          ToTensor, \
                          NN_Model_Trainer

In [None]:
# Preprocessing shared by all datasets: a 256x256 resize, a per-channel
# mean/std normalisation and a tensor conversion step.
# NOTE(review): assumes Data_Pipeline applies the steps in the order they are
# passed -- confirm in utils_scripts.
# NOTE(review): the mean/std values look like the usual ImageNet statistics,
# presumably chosen to match the pretrained ResNet -- confirm.
data_pipe_obj = Data_Pipeline(
    Resizer(output_size=(256,256)),
    Normalizer(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225]),
    ToTensor()
)

In [None]:
# One dataset per split, all sharing the same preprocessing pipeline.
# `metadata_test` is the unlabeled submission split (no annotations merged).
# NOTE(review): the meaning of each `set_value` string is defined inside
# utils_scripts.Specimen_Dataset -- confirm there.
train_dataset = Specimen_Dataset(dataset=train_data, set_value='train', transform=data_pipe_obj)
test_dataset = Specimen_Dataset(dataset=test_data, set_value='test', transform=data_pipe_obj)
val_dataset = Specimen_Dataset(dataset=val_data, set_value='val', transform=data_pipe_obj)
test_subm_dataset = Specimen_Dataset(dataset=metadata_test, set_value='test_submission', transform=data_pipe_obj)

In [None]:
# Report the number of samples in each dataset, one line per dataset.
print('\n'.join(
    f'{label} dataset : {len(dataset)}'
    for label, dataset in (
        ('train', train_dataset),
        ('test', test_dataset),
        ('val', val_dataset),
        ('subm', test_subm_dataset),
    )
))

In [None]:
# Seed PyTorch's random number generators with the notebook-wide seed.
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)

# Mini-batch size used by every DataLoader below.
BATCH_SIZE = 256

In [None]:
# Only the training loader is shuffled. The evaluation loaders (test / val)
# and the submission loader keep dataset order so that evaluation runs are
# repeatable and predictions stay aligned with the dataset rows.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=8, pin_memory=False)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=8, pin_memory=False)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=8, pin_memory=False)
test_subm_dataloader = DataLoader(dataset=test_subm_dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=8, pin_memory=False)

In [None]:
# DataLoaders keyed by the role name under which the trainer receives them.
loaders = dict(
    train=train_dataloader,
    test=test_dataloader,
    val=val_dataloader,
    submission=test_subm_dataloader,
)

## ResNet-18

In [None]:
from collections import namedtuple

In [None]:
from torch.optim import SGD, lr_scheduler
from torch.nn import Linear, CrossEntropyLoss
from torchvision.models import resnet18

In [None]:
# Number of output classes, taken from the fitted label encoder rather than
# from the scratch variable `classes`, which is reassigned by more than one
# exploratory cell above.
NUM_OF_CLASSES = len(le_preprocessor.classes_)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Display which device was selected.
device

In [None]:
# Pretrained ResNet-18 whose final fully connected layer is swapped for a
# freshly initialised one with one output per class.
resnet18_model = resnet18(pretrained=True)
resnet18_model.fc = Linear(
    in_features=resnet18_model.fc.in_features,
    out_features=NUM_OF_CLASSES,
)

In [None]:
# Loss is an instance; the optimizer and LR scheduler are kept as classes
# (not instances) at this point.
loss_func = CrossEntropyLoss()
optimizer_sgd = SGD
exp_lr_scheduler = lr_scheduler.StepLR

# Move the network onto the selected device.
resnet18_model = resnet18_model.to(device)

In [None]:
# Hyper-parameters passed to NN_Model_Trainer.
model_param = {
    'img_size' : '256x256',
    'learning_rate' : 0.001,
    'epochs' : 5,
    'momentum' : 0.9,
    'num_of_classes' : NUM_OF_CLASSES,
    'Retrain_path' : '../input/resnet18-model-state-info-v2/Resnet-18_model_state_info.pth',
    'optimizer' : 'sgd',
    # Reuse the constant the DataLoaders were built with so the recorded
    # batch size cannot drift from the one actually used.
    'batch_size' : BATCH_SIZE,
    'loss_function' : 'cross-entropy'
}

# Run / output settings passed to NN_Model_Trainer.
environment_param = {
    'abs_path' : '/kaggle/working/',
    'title' : 'Resnet-18',
    'version' : 'V2',
    'verbose_mode' : ''
}

In [None]:
# Wire the model, loss, data loaders, label encoder and parameter dicts into
# the project's trainer.
# NOTE(review): `optimizer` and `scheduler` are passed as classes (SGD, StepLR),
# not instances -- presumably the trainer instantiates them from `model_param`;
# verify in utils_scripts.NN_Model_Trainer.
abs_model_helper = NN_Model_Trainer(model = resnet18_model, 
                                    optimizer = optimizer_sgd, 
                                    loss_func = loss_func,
                                    scheduler = exp_lr_scheduler,
                                    label_encoder = le_preprocessor,
                                    loaders = loaders,
                                    model_param = model_param, 
                                    environment_param = environment_param)

# Start training with the configuration assembled above.
abs_model_helper.train_model()