# Import Modules

## Standard modules

In [None]:
import os
import json
import pickle as pkl

from collections import Counter

## External modules

In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, \
                            recall_score, \
                            f1_score, \
                            roc_auc_score \
            
from tqdm import tqdm, trange
from pylab import rcParams



# Register `progress_apply` on pandas objects (get_result_df below relies on it).
tqdm.pandas()
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# NOTE(review): this silences *every* warning, including deprecation notices
# from pandas / torch / sklearn; consider narrowing the filter.
warnings.filterwarnings('ignore')
# Default matplotlib figure size (width, height).
rcParams['figure.figsize'] = 10, 10

## Internal modules

In [None]:
import utils_scripts as utlis

# Constants

In [None]:
# Seed reused for NumPy, the sklearn splits and torch further down.
RANDOM_SEED = 17
np.random.seed(RANDOM_SEED)

# Dataset root; get_result_df reads `<root>/<split>/metadata.json` from it.
ABS_PATH = '/kaggle/input/herbarium-2020-fgvc7/nybg2020/'

# Data EDA

In [None]:
def get_result_df(path, set_value):
    """Load the metadata of one dataset split into a DataFrame.

    Reads ``<path>/<set_value>/metadata.json`` and returns one row per image.
    For the ``'train'`` split the annotation columns are joined onto the
    image rows. ``file_name`` is rewritten to a path below
    ``<path>/<set_value>``.

    Parameters
    ----------
    path : str
        Root folder of the dataset (the folder that contains the split folders).
    set_value : str
        Name of the split folder, e.g. ``'train'`` or ``'test'``.

    Returns
    -------
    pandas.DataFrame
        Image metadata; for ``'train'`` it also carries the annotation columns.
    """
    split_dir = os.path.join(path, set_value)

    # The metadata is read from the `path` argument. The original read it from
    # the global ABS_PATH, which silently ignored whatever the caller passed.
    with open(os.path.join(split_dir, 'metadata.json'), "r", encoding="ISO-8859-1") as file:
        metadata = json.load(file)

    img_info = pd.DataFrame(metadata['images'])

    if set_value == 'train':
        annotation_info = pd.DataFrame(metadata['annotations'])
        # An annotation points at its image through `image_id`; its own `id`
        # is the annotation's identifier. Join on the image reference so rows
        # stay correct even when the two id sequences differ. The resulting
        # columns are the same as before (`id` is the image id).
        annotation_info = (annotation_info
                           .drop(columns=['id'], errors='ignore')
                           .rename(columns={'image_id': 'id'}))
        img_info = img_info.merge(annotation_info, on='id')

    file_names = img_info['file_name']
    # `progress_apply` only exists after `tqdm.pandas()` has been executed in
    # another cell; fall back to a plain `apply` so the function also works
    # on a fresh kernel / outside the notebook.
    apply_with_progress = getattr(file_names, 'progress_apply', file_names.apply)
    img_info['file_name'] = apply_with_progress(lambda name: os.path.join(split_dir, name))
    return img_info

In [None]:
# Image-level metadata for both splits; only the train frame carries labels
# (get_result_df merges the annotations for set_value == 'train').
metadata_train = get_result_df(path=ABS_PATH, set_value='train')
metadata_test = get_result_df(path=ABS_PATH, set_value='test')

In [None]:
# Sanity check: are the raw category ids one contiguous integer range?
classes = sorted(metadata_train['category_id'].unique())
# The range has to start at the smallest id and span exactly len(classes)
# values. The previous upper bound (`len(classes) + 1`) was only correct when
# the smallest id happened to be 1.
classes == list(range(classes[0], classes[0] + len(classes)))

In [None]:
# Number of training images per category id.
metadata_train['category_id'].value_counts()

## Label preprocessing

In [None]:
# le_preprocessor = LabelEncoder()
# le_preprocessor.fit(metadata_train['category_id'])
# metadata_train['category_id_le_preprocessed'] = le_preprocessor.transform(metadata_train['category_id'])

# classes = sorted(list(metadata_train['category_id_le_preprocessed'].unique()))
# classes == list(range(min(classes), len(classes)))

# Train Test Split

In [None]:
# train_indices, test_indices, _, _ = train_test_split(metadata_train.index, 
#                                                      metadata_train['category_id_le_preprocessed'],
#                                                      train_size=0.75, 
#                                                      random_state=RANDOM_SEED,                                                     
#                                                      shuffle=True, 
#                                                      stratify=metadata_train['category_id_le_preprocessed'])

In [None]:
# Drop every category that has fewer than `min_samples` images, then map the
# remaining category ids onto consecutive integer labels.
min_samples = 3
grouped = metadata_train.groupby('category_id', as_index=False).count()
little_classes = grouped.loc[grouped['id'] < min_samples, 'category_id']
print(metadata_train.shape)

# NOTE: despite its name this mask is True for rows of the *rare* categories.
big_classes_cond = metadata_train['category_id'].isin(little_classes.values)
metadata_train = metadata_train[~big_classes_cond].reset_index(drop=True)
print(metadata_train.shape)


le_preprocessor = LabelEncoder()
metadata_train['category_id_le_preprocessed'] = le_preprocessor.fit_transform(metadata_train['category_id'])

In [None]:
# 75% of the rows go to training, the remaining 25% are held out.
# Only the index is split: the selected rows depend solely on the number of
# samples and the seed, so passing the label column along is not needed.
train_indices, test_indices = train_test_split(metadata_train.index,
                                               train_size=0.75,
                                               random_state=RANDOM_SEED,
                                               shuffle=True)

train_data = metadata_train.loc[train_indices]
print(train_data.shape)

# Keep the former index as an `index` column, exactly like the in-place reset did.
train_data = train_data.reset_index()

In [None]:
# Split the held-out rows again: 80% become the test set, 20% the validation set.
#
# `test_indices` (produced by the previous cell) is only read here. The
# original cell re-bound it to the second-level split, so re-running the cell
# on its own selected the wrong rows from `metadata_train`.
holdout_data = metadata_train.loc[test_indices, :]
print(holdout_data.shape)

holdout_data = holdout_data.reset_index()

# Only the index is split; the selection depends on the sample count and seed.
holdout_test_indices, val_indices = train_test_split(holdout_data.index,
                                                     train_size=0.80,
                                                     random_state=RANDOM_SEED,
                                                     shuffle=True)

val_data = holdout_data.loc[val_indices, :]
print(val_data.shape)
val_data = val_data.reset_index()

test_data = holdout_data.loc[holdout_test_indices, :]
print(test_data.shape)
test_data = test_data.reset_index()

## Class weights

In [None]:
# Per-class sample counts of the training set, ordered by encoded label.
label_counts = Counter(train_data['category_id_le_preprocessed'])
class_weights = [label_counts[label] for label in sorted(label_counts)]

# Model Development

In [None]:
import torch

In [None]:
from torch import Tensor
from torch.utils.data import DataLoader
from utils_scripts import Specimen_Dataset, \
                          Data_Pipeline, \
                          Resizer, \
                          Normalizer, \
                          ToTensor \
#                           NN_Model_Helper

In [None]:
# One preprocessing chain shared by every split: resize -> normalise -> tensor.
data_pipe_obj = Data_Pipeline(
    Resizer(output_size=(128, 128)),
    Normalizer(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensor(),
)

train_dataset = Specimen_Dataset(dataset=train_data, set_value='train', transform=data_pipe_obj)
test_dataset = Specimen_Dataset(dataset=test_data, set_value='test', transform=data_pipe_obj)
val_dataset = Specimen_Dataset(dataset=val_data, set_value='val', transform=data_pipe_obj)
test_subm_dataset = Specimen_Dataset(dataset=metadata_test, set_value='test_submission', transform=data_pipe_obj)

print(f'train dataset : {len(train_dataset)}')
print(f'test dataset : {len(test_dataset)}')
print(f'val dataset : {len(val_dataset)}')
print(f'subm dataset : {len(test_subm_dataset)}')

BATCH_SIZE = 128
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)

# Settings every loader has in common; only `shuffle` differs per split.
loader_options = dict(batch_size=BATCH_SIZE, num_workers=8, pin_memory=False)

train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, **loader_options)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=True, **loader_options)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=True, **loader_options)
# The submission loader keeps the original row order.
test_subm_dataloader = DataLoader(dataset=test_subm_dataset, shuffle=False, **loader_options)


# Phase name -> loader, as looked up by NN_Model_Helper.
loaders = dict(
    train=train_dataloader,
    test=test_dataloader,
    val=val_dataloader,
    submission=test_subm_dataloader,
)

## ResNet-50

In [None]:
from collections import namedtuple
from torch.optim import SGD, lr_scheduler, Adam
from torch.nn import Linear, CrossEntropyLoss, AdaptiveAvgPool2d
from torchvision.models import resnet50

In [None]:
# NOTE(review): NN_Model_Helper picks its own device in __init__; this
# variable is not referenced by the other cells shown in this notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Backbone with pretrained weights; NN_Model_Helper.init_model replaces its `fc` head.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of `weights=` — confirm against the installed version.
resnet50_model = resnet50(pretrained=True)

In [None]:
cross_entropy_loss_function = CrossEntropyLoss()
# The optimizer and scheduler are passed as *classes*; NN_Model_Helper.init_model
# instantiates them with `optimizer_params` / `scheduler_params`.
# NOTE(review): `optimizer_sgd` actually holds Adam — the name is a leftover.
optimizer_sgd = Adam
exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau

In [None]:
# Number of distinct encoded labels that survived the rare-class filter.
NUM_OF_CLASSES = metadata_train['category_id_le_preprocessed'].unique().shape[0]
# Textual form of the Resizer output size used for the datasets.
IMG_SIZE = '128x128'
# NOTE(review): BATCH_SIZE and RANDOM_SEED are re-declared here with the same
# values they were given in earlier cells.
BATCH_SIZE = 128
RANDOM_SEED = 17
# NOTE(review): EPOCHS is not referenced by the cells shown here; the training
# call passes its own `epochs` value.
EPOCHS = 12
MIN_SAMPLES = min_samples

In [None]:
# Run configuration consumed by NN_Model_Helper:
#   logs             -> output folder is <abs_path>/<title>/<version>
#   info             -> free-text description; `scheduler` also selects how the
#                       scheduler is stepped in NN_Model_Helper.train
#   model_params     -> `num_of_classes` sizes the new `fc` head
#   optimizer_params -> keyword arguments for the optimizer class
#   scheduler_params -> keyword arguments for the scheduler class
#   common_params    -> only written to the text log
params = {
    'logs' : {
        'abs_path' : '/kaggle/working/',
        'version' : 'V1',
        'title' : 'Resnet-50',
    },
    'info' : {
        'model' : 'Resnet-50',
        'optimizer' : 'Adam',
        'scheduler' : 'ReduceLROnPlateau',
        'loss' : 'Cross_Entropy'
    },
    'model_params' : {
        'is_freeze' : False,
        'pretrained' : True,
        'num_of_classes' : NUM_OF_CLASSES,
    },
    'optimizer_params' : {
        'lr' : 4e-4,
#         'momentum' : 0.9,
        'amsgrad' : False
    },
    'scheduler_params' : {
        'mode' : 'min',
        'factor' : 0.75,
        'patience' : 5,
        'eps' : 1e-6
    },
    'common_params' : {
        'epochs' : 1,
        'start' : 1,
        # Logged values now come from the constants that describe the real
        # pipeline. The literals '256x256' / 256 used before contradicted the
        # 128x128 resize and the batch size of 128 used by the loaders.
        'img_size' : IMG_SIZE,
        'batch_size' : BATCH_SIZE,
        'random_seed' : RANDOM_SEED,
        'min_sample' : MIN_SAMPLES,
        'num_of_workers' : 8
    }
}

In [None]:
# Display the class count that sizes the classifier head.
NUM_OF_CLASSES

In [None]:
import time
import copy

from timeit import default_timer as timer


In [None]:
class NN_Model_Helper:
    """Drive training, evaluation and submission for an image classifier.

    The helper owns the model, builds the optimizer / scheduler from the
    classes it is given, keeps per-epoch logs in DataFrames, stores the
    ``last_model`` / ``best_model`` snapshots and writes all artefacts below
    ``<abs_path>/<title>/<version>`` (read from ``params['logs']``).

    Batches delivered by the loaders are expected to support
    ``batch.get('img')``, ``batch.get('id')`` and ``batch.get('category_id')``.
    """

    def __init__(self,model, 
                 optimizer, 
                 scheduler, 
                 loss_function,
                 label_encoder,
                 loaders:dict, 
                 params:dict):
        """Store the collaborators and prepare the log structures.

        Parameters
        ----------
        model : module exposing an ``fc`` layer (replaced by ``init_model``).
        optimizer : optimizer class, called as ``optimizer(parameters, **optimizer_params)``.
        scheduler : scheduler class, called as ``scheduler(optimizer, **scheduler_params)``.
        loss_function : callable ``(outputs, targets) -> loss tensor``.
        label_encoder : object offering ``inverse_transform`` (used by ``submit``).
        loaders : dict with the keys ``'train'``, ``'val'``, ``'test'``, ``'submission'``.
        params : dict with the sections ``'logs'``, ``'info'``, ``'model_params'``,
            ``'optimizer_params'``, ``'scheduler_params'`` and ``'common_params'``.
        """

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        self.model = model.to(self.device)
        self.loss_function = loss_function
        self.label_encoder = label_encoder
        # The factories are remembered separately so that init_model() can be
        # called more than once without trying to "call" an optimizer instance.
        self._optimizer_factory = optimizer
        self._scheduler_factory = scheduler
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.loaders = loaders
        self.params = params

        self.is_best_model = False
        self.model_state_info = self.init_model_state_info()
        self.train_logs = pd.DataFrame(columns=['epoch', 'loss', 'learning_rate', 'precision', 'recall', 'f1_score'])
        self.val_logs = pd.DataFrame(columns=['epoch', 'loss', 'learning_rate', 'precision', 'recall', 'f1_score'])
        self.results = pd.DataFrame(columns=['phase', 'precision', 'recall', 'f1_score'])
        self.submission = pd.DataFrame(columns=['Id', 'Predicted'])

        self.filelogs_info = self.init_logs()

        self._weights = None
        self._num_of_weights = None

        # Timer reading taken at construction; elapsed time is measured from it.
        self.total_time = timer()
        # Duration (seconds) of the most recently finished epoch, None until then.
        self.epoch_time = None
        # Time budget in seconds (9 hours).
        self.kaggle_limit = 32400

    def init_logs(self,):
        """Create the output folder and return the paths of all artefact files."""
        model_directory = self.init_model_directory()
        
        return {
            'train_logs_pd' : os.path.join(model_directory, 'train_logs.csv'),
            'val_logs_pd' : os.path.join(model_directory, 'val_logs.csv'),
            'results' : os.path.join(model_directory, 'results.csv'),
            'submission_df' : os.path.join(model_directory, 'submission.csv'),
            'txt_logs' : os.path.join(model_directory, 'training_logs.txt'),
            'model_state_info' : os.path.join(model_directory, 'model_state_info.pth')

        }

    def init_model_directory(self):
        """Create ``<abs_path>/<title>/<version>`` if needed and return it."""
        logs = self.params.get('logs')
        model_directory = os.path.join(logs.get('abs_path'), logs.get('title'), logs.get('version'))
        
        os.makedirs(name = model_directory, exist_ok = True)
        return model_directory

    def init_model_state_info(self,):
        """Return the empty snapshot container (``last_model`` / ``best_model``)."""
        return {
            'last_model' : None,
            'best_model' : None,
        }

    @staticmethod
    def _to_numpy(values):
        """Return tensors as NumPy arrays on the CPU; pass anything else through."""
        if torch.is_tensor(values):
            return values.detach().cpu().numpy()
        return values

    @staticmethod
    def _concat_batches(batches):
        """Concatenate per-batch CPU tensors; an empty list yields an empty long tensor."""
        if not batches:
            return torch.empty(0, dtype=torch.long)
        return torch.cat(batches)

    @staticmethod
    def _append_row(frame, row):
        """Return ``frame`` with ``row`` (a dict) appended as a new last row.

        ``DataFrame.append`` no longer exists in pandas 2, so the row is turned
        into a one-line frame and concatenated instead.
        """
        new_row = pd.DataFrame([row])
        if frame.empty:
            # Keep the declared column order; unknown keys go to the end.
            extra = [column for column in new_row.columns if column not in frame.columns]
            return new_row.reindex(columns=list(frame.columns) + extra)
        return pd.concat([frame, new_row], ignore_index=True)

    @property
    def weights(self):
        """Per-parameter info: element count and whether it is trainable."""
        weights = {}
        for name, tensor in self.model.named_parameters():
            weights[name] = {
                'size' : tensor.numel(),
                'requires_grad' : tensor.requires_grad
            }

        self._weights = weights
        
        return self._weights

    @property
    def num_of_weights(self,):
        """Total number of trainable scalar parameters."""
        self._num_of_weights = sum(info['size']
                                   for info in self.weights.values()
                                   if info['requires_grad'])
        return self._num_of_weights

    def load_model(self,model_phase):
        """Restore model and optimizer from a stored snapshot.

        Parameters
        ----------
        model_phase : ``'last_model'`` or ``'best_model'``.

        Returns
        -------
        bool
            True when the snapshot existed and was loaded, False otherwise
            (a missing snapshot used to raise AttributeError).
        """
        snapshot = self.model_state_info.get(model_phase)
        if not snapshot:
            return False

        if snapshot.get('model_state_dict') and snapshot.get('optimizer_state_dict'):
            self.model.load_state_dict(snapshot['model_state_dict'])
            self.optimizer.load_state_dict(snapshot['optimizer_state_dict'])
            self.model = self.model.to(self.device)

            # The flag describes what is currently loaded, so it is also
            # cleared when a non-best snapshot replaces the best one.
            self.is_best_model = (model_phase == 'best_model')
            return True
        
        return False

    def update_model(self, path):
        """Read a checkpoint file and load its ``last_model`` snapshot.

        NOTE(review): ``torch.load`` unpickles the file — only use it on
        checkpoints from a trusted source.
        """
        self.model_state_info = torch.load(path, map_location=self.device)
        self.load_model(model_phase='last_model')

    def assign_model_state_dict(self, model_phase, f1_score, loss):
        """Store deep copies of the current model / optimizer state as a snapshot."""
        keys = self.get_model_save_snippet()
        values = [copy.deepcopy(self.model.state_dict()), copy.deepcopy(self.optimizer.state_dict()), 
                 f1_score, loss, self.weights]
        
        self.model_state_info[model_phase] = dict(zip(keys,values))

    def pd_logs_update(self, phase,**row):
        """Append ``row`` to the train or val log; other phases are ignored."""
        if phase == 'train':
            self.train_logs = self._append_row(self.train_logs, row)
        elif phase == 'val':
            self.val_logs = self._append_row(self.val_logs, row)
    
    def metrics_calculation(self,y_true, y_pred, average='macro'):
        """Return precision, recall and F1 for the given labels / predictions."""
        y_true_cpu = self._to_numpy(y_true)
        y_pred_cpu = self._to_numpy(y_pred)

        return {
            'precision' : precision_score(y_true_cpu,y_pred_cpu,average=average),
            'recall' : recall_score(y_true_cpu,y_pred_cpu,average=average),
            'f1_score' : f1_score(y_true_cpu,y_pred_cpu,average=average)
        }

    def results_update(self,y_true, y_pred, phase, metrics=None):
        """Append the final metrics of ``phase`` to ``self.results``.

        ``metrics`` is computed from the labels when not supplied; a supplied
        dict is copied, so the caller's object is left untouched.
        """
        if metrics is None:
            metrics = self.metrics_calculation(y_true, y_pred)
        
        row = dict(metrics)
        row['phase'] = phase

        self.results = self._append_row(self.results, row)

    def metrics_logs_update(self, y_true, y_pred, epoch, loss, learning_rate, phase, metrics=None):
        """Append one epoch entry (metrics + epoch, loss, learning rate) to the phase log."""
        if metrics is None:
            metrics = self.metrics_calculation(y_true, y_pred)
        
        row = dict(metrics)
        row['epoch'] = epoch
        row['loss'] = loss
        row['learning_rate'] = learning_rate

        self.pd_logs_update(phase, **row)

    def init_model(self,is_freeze=True):
        """Replace the ``fc`` head and (re)build optimizer and scheduler.

        The new head has ``params['model_params']['num_of_classes']`` outputs.
        With ``is_freeze`` every parameter except the head is frozen.
        """
        num_of_classes = self.params.get('model_params').get('num_of_classes')
        self.model.fc = Linear(in_features=self.model.fc.in_features, out_features=num_of_classes)
        self.model = self.model.to(self.device)

        optimizer_factory = getattr(self, '_optimizer_factory', self.optimizer)
        scheduler_factory = getattr(self, '_scheduler_factory', self.scheduler)

        optim_params = self.params.get('optimizer_params')
        self.optimizer = optimizer_factory(self.model.parameters(), **optim_params)
        
        scheduler_params = self.params.get('scheduler_params')
        self.scheduler = scheduler_factory(self.optimizer, **scheduler_params)
        
        if is_freeze:
            self.freeze()

    def freeze(self, is_fc=True):
        """Disable gradients; with ``is_fc`` the ``fc`` head stays trainable."""
        head_names = ('fc.weight', 'fc.bias')
        for name, tensor in self.model.named_parameters():
            if is_fc and name in head_names:
                continue
            tensor.requires_grad = False

    def get_model_save_snippet(self):
        """Return the keys of a stored snapshot, in storage order."""
        return [
            'model_state_dict',
            'optimizer_state_dict',
            'f1_score',
            'loss',
            'weights_info'
        ]

    def print_params(self, params:dict, outputfile, is_weights=False):
        """Write one parameter section to ``outputfile``.

        Values are converted with ``str`` first: the width specifier used for
        the layout raises on ``None`` and renders booleans as 0 / 1 otherwise.
        """
        param_format = '{:15} : {:15}'
        for key, value in (params or {}).items():
            print(param_format.format(str(key), str(value)), file=outputfile)
        
        print(file=outputfile)
        print(100*'*', file=outputfile)
        print(file=outputfile)

        if is_weights:
            print(param_format.format('The num of weights : ', str(self.num_of_weights)), file=outputfile)
            print(file=outputfile)
            print(100*'*', file=outputfile)
            print(file=outputfile)

    def parse_batch(self, batch, phase):
        """Return ``(images, ids)`` for the submission phase, else ``(images, labels)``.

        Both tensors are moved to ``self.device``. The former string comparison
        ``self.device != 'cpu'`` is dropped; ``.to`` on the current device does
        not copy, so it is applied unconditionally.
        """
        second_key = 'id' if phase == 'submission' else 'category_id'
        return batch.get('img').to(self.device), batch.get(second_key).to(self.device)

    def make_checkpoint(self):
        """Persist the snapshots and the three log tables to disk."""
        torch.save(self.model_state_info, self.filelogs_info.get('model_state_info'))
        self.train_logs.to_csv(self.filelogs_info.get('train_logs_pd'))
        self.val_logs.to_csv(self.filelogs_info.get('val_logs_pd'))
        self.results.to_csv(self.filelogs_info.get('results'))

    def check_left_time(self, mode='epoch'):
        """Tell whether the remaining time budget covers the next step.

        ``mode='epoch'`` needs room for one more epoch, any other mode for half
        an epoch. The remaining time is the budget minus the time *elapsed*
        since construction; the old code subtracted the raw timer reading.

        Returns
        -------
        bool
            True when enough time is left.
        """
        ratio = 1 if mode == 'epoch' else 2

        elapsed = timer() - self.total_time
        remaining = self.kaggle_limit - elapsed
        required = (self.epoch_time or 0.0) / ratio

        return remaining >= required

    def train(self,model_freeze,update_path,epochs, is_test=False, start=1):
        """Train, validate after every epoch, then evaluate and write the submission.

        Parameters
        ----------
        model_freeze : freeze everything but the ``fc`` head.
        update_path : checkpoint file to resume from (falsy = start fresh).
        epochs : number of the last epoch to run (inclusive).
        is_test : smoke-test mode — two batches per loop, a single epoch.
        start : number of the first epoch to run.

        Training stops early when the time budget would not cover another
        epoch; the final evaluation is skipped when it would not cover half an
        epoch.
        """
        info = self.params.get('info')
        filelogs_fmt = '{} -> Epoch {} -> Batch_index {} -> Loss {}'

        # The context manager closes (and flushes) the log on every exit path,
        # including the early returns below.
        with open(self.filelogs_info.get('txt_logs'), 'w') as txt_outputfile:

            for section in ('logs', 'info', 'model_params', 'optimizer_params', 'scheduler_params'):
                self.print_params(params=self.params.get(section), outputfile=txt_outputfile)
            self.print_params(params=self.params.get('common_params'), outputfile=txt_outputfile, is_weights=True)

            self.init_model(is_freeze=model_freeze)

            if update_path:
                self.update_model(path=update_path)
            
            if model_freeze:
                self.freeze()

            # NOTE(review): the best score starts from scratch even when a
            # checkpoint was loaded, so the first new epoch always becomes
            # `best_model`.
            best_f1_score = -1.0

            last_epoch = None
            epoch_loss = None
            y_true = y_pred = None
            
            for epoch_index in trange(start, epochs+1, desc='epochs'):
                epoch_started = timer()

                # Validation switches the model to eval mode; switch back.
                self.model.train()
                self.is_best_model = False

                epoch_loss = 0.0
                true_batches, pred_batches = [], []
                batch_index = 0

                for batch_index, batch in enumerate(tqdm(self.loaders.get('train')),1):
                    
                    img, category_id = self.parse_batch(batch, phase='train')

                    self.optimizer.zero_grad()
                    outputs = self.model(img)

                    _, preds = torch.max(outputs, 1)

                    # Collected on the CPU and concatenated once per epoch.
                    true_batches.append(category_id.detach().cpu())
                    pred_batches.append(preds.detach().cpu())

                    loss = self.loss_function(outputs, category_id)
                    loss_item = loss.item()
                    epoch_loss += loss_item
                    
                    loss.backward()
                    self.optimizer.step()

                    print(filelogs_fmt.format(time.ctime() ,epoch_index, batch_index, loss_item), file=txt_outputfile)

                    if is_test and batch_index == 2:
                        break

                epoch_loss = epoch_loss / max(batch_index, 1)
                y_true = self._concat_batches(true_batches)
                y_pred = self._concat_batches(pred_batches)

                metrics_val, avg_val_loss = self.test(phase='val', epoch=epoch_index,loss=epoch_loss, is_test=is_test)
                current_f1_score = metrics_val.get('f1_score')

                self.metrics_logs_update(y_true,
                                         y_pred, 
                                         epoch=epoch_index, 
                                         loss=epoch_loss, 
                                         learning_rate=self.optimizer.param_groups[0]['lr'], 
                                         phase='train')

                if info.get('scheduler') is not None and info.get('scheduler') == 'ReduceLROnPlateau':
                    self.scheduler.step(avg_val_loss)
                else:
                    self.scheduler.step()

                self.assign_model_state_dict(model_phase='last_model', f1_score=current_f1_score, loss=epoch_loss)

                if current_f1_score > best_f1_score:
                    best_f1_score = current_f1_score
                    self.assign_model_state_dict(model_phase='best_model', f1_score=best_f1_score, loss=epoch_loss)

                self.make_checkpoint()

                # Measured for every finished epoch (also in smoke-test mode and
                # when `start` is not 1), so the budget checks always compare
                # against a duration.
                self.epoch_time = timer() - epoch_started
                last_epoch = epoch_index

                if is_test:
                    break

                if not self.check_left_time(mode='epoch'):
                    print('', file=txt_outputfile)
                    print('Kaggle time limit exceeded. Epoch mode', file=txt_outputfile)
                    break

            if last_epoch is None:
                # `start` was beyond `epochs`: nothing was trained, nothing to evaluate.
                print('', file=txt_outputfile)
                print('No epoch was run. Check start and epochs', file=txt_outputfile)
                return

            if not self.check_left_time(mode='test'):
                print('', file=txt_outputfile)
                print('Kaggle time limit exceeded. Test mode', file=txt_outputfile)
                return 
            
            self.results_update(y_true, y_pred, phase='train')
            self.load_model(model_phase='best_model')
            self.test(phase='val', epoch=last_epoch, loss=epoch_loss, is_last=True, is_test=is_test)
            self.test(phase='test', is_test=is_test)

            self.make_checkpoint()
            self.submit(is_test=is_test)

    def test(self, phase, update_path=None, epoch=None, loss=None, is_last=False, is_test=False):
        """Evaluate the model on the loader of ``phase``.

        Parameters
        ----------
        phase : ``'val'`` or ``'test'``.
        update_path : optional checkpoint to load (its best model is used).
        epoch : epoch number written to the validation log.
        loss : accepted for backward compatibility and ignored — the loss
            measured on this loader is what gets logged.
        is_last : final validation run; stored in ``results`` instead of the log.
        is_test : smoke-test mode, stop after two batches.

        Returns
        -------
        tuple
            ``(metrics, avg_loss)`` with the metric dict and the mean batch loss.
        """

        if update_path is not None:
            self.update_model(path=update_path)
            self.load_model(model_phase='best_model')

        needs_best = phase == 'test' or (phase == 'val' and is_last)
        if not self.is_best_model and needs_best:
            self.load_model(model_phase='best_model')

        # Evaluation mode: layers such as BatchNorm / Dropout must neither
        # update their statistics nor behave stochastically here.
        self.model.eval()

        true_batches, pred_batches = [], []
        total_loss = 0.0
        batch_index = 0

        for batch_index, batch in enumerate(tqdm(self.loaders.get(phase)),1):
            img, category_id = self.parse_batch(batch, phase)
            
            with torch.no_grad():
                outputs = self.model(img)
                batch_loss = self.loss_function(outputs, category_id)

            _, preds = torch.max(outputs,1)
            true_batches.append(category_id.detach().cpu())
            pred_batches.append(preds.detach().cpu())

            total_loss += batch_loss.item()

            if is_test and batch_index == 2:
                break

        # True division: floor division truncated the loss to a whole number
        # before it reached the scheduler.
        avg_loss = total_loss / max(batch_index, 1)

        y_true = self._concat_batches(true_batches)
        y_pred = self._concat_batches(pred_batches)
        metrics = self.metrics_calculation(y_true, y_pred)

        if phase == 'val' and not is_last:
            # The averaged validation loss is logged. Before, the `loss`
            # argument was overwritten by the last batch's loss tensor.
            self.metrics_logs_update(y_true, 
                                    y_pred, 
                                    epoch, 
                                    avg_loss, 
                                    learning_rate=self.optimizer.param_groups[0]['lr'], 
                                    phase=phase,
                                    metrics=metrics)

        elif phase == 'val' and is_last:
            self.results_update(y_true, y_pred, phase,metrics)
        
        elif phase == 'test':
            self.results_update(y_true, y_pred, phase,metrics)

        return metrics, avg_loss

    def submit(self, update_path='', is_test=False) -> None:
        """Predict the submission loader and write ``submission.csv``.

        Parameters
        ----------
        update_path : optional checkpoint to load (its best model is used).
        is_test : smoke-test mode, stop after two batches.
        """
        if update_path:
            self.update_model(path=update_path)
            self.load_model(model_phase='best_model')

        if not self.is_best_model:
            self.load_model(model_phase='best_model')

        # Inference must run in evaluation mode (see test()).
        self.model.eval()

        pred_batches, id_batches = [], []

        for batch_index, batch in enumerate(tqdm(self.loaders.get('submission')),1):
            img, img_id = self.parse_batch(batch, phase='submission')
            
            with torch.no_grad():
                outputs = self.model(img)

            _, preds = torch.max(outputs,1)
            pred_batches.append(preds.detach().cpu())
            id_batches.append(img_id.detach().cpu())

            if is_test and batch_index == 2:
                break

        y_pred = self._concat_batches(pred_batches)
        img_ids = self._concat_batches(id_batches)

        # Map the encoded labels back to the original category ids.
        y_pred_postprocessed = self.label_encoder.inverse_transform(y_pred.numpy())

        # Built from scratch so a repeated call cannot clash with an old length.
        self.submission = pd.DataFrame({
            'Id' : img_ids.numpy(),
            'Predicted' : y_pred_postprocessed
        })

        self.submission.to_csv(self.filelogs_info.get('submission_df'),header=True,index=False)
        


        

In [None]:
# Checkpoint to resume from; presumably a `model_state_info.pth` written by an
# earlier run of NN_Model_Helper.make_checkpoint — verify the dataset contents.
resnet50_epoch4_model_state_info = '../input/resnet50-epoch-4/model_state_info.pth'

In [None]:
# Wire everything together. The optimizer and scheduler are handed over as
# classes (see NN_Model_Helper.init_model); `optimizer_sgd` holds Adam.
resnet50_model_helper = NN_Model_Helper(model=resnet50_model, 
                                        optimizer=optimizer_sgd,
                                        scheduler=exp_lr_scheduler,
                                        loss_function=cross_entropy_loss_function,
                                        label_encoder=le_preprocessor,
                                        loaders=loaders,
                                        params=params)

In [None]:
# Resume from the checkpoint and run the epoch range 1..1 (start defaults to 1),
# i.e. a single additional epoch, followed by evaluation and the submission file.
resnet50_model_helper.train(model_freeze=False,
                            update_path=resnet50_epoch4_model_state_info,
                            epochs=1,
                            is_test=False)