# Import Modules

## Standard modules

In [3]:
import os
import json
import pickle as pkl

from collections import Counter

## External modules

In [4]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, \
                            recall_score, \
                            f1_score, \
                            roc_auc_score \
            
from tqdm import tqdm, trange
from pylab import rcParams



tqdm.pandas()
%matplotlib inline
warnings.filterwarnings('ignore')
rcParams['figure.figsize'] = 10, 10

## Internal modules

In [32]:
import utils_scripts as utlis

# Constants

In [5]:
# One seed reused for NumPy here and for the sklearn splits / torch seeding further down.
RANDOM_SEED = 17
np.random.seed(RANDOM_SEED)

# Root folder of the competition data; passed to get_result_df as `path`.
ABS_PATH = '/kaggle/input/herbarium-2020-fgvc7/nybg2020/'

# Exploratory Data Analysis (EDA)

In [6]:
def get_result_df(path, set_value):
    """Load the metadata of one dataset split into a per-image DataFrame.

    Reads ``<path>/<set_value>/metadata.json``, prints the number of entries
    in every top-level section of that file and returns the ``images`` table.
    For the ``'train'`` split the ``annotations`` table is joined in as well,
    so every image row also carries its annotation columns.

    Parameters
    ----------
    path : str
        Root folder that contains the split sub-folders.
    set_value : str
        Name of the split sub-folder, e.g. ``'train'`` or ``'test'``.

    Returns
    -------
    pandas.DataFrame
        One row per image; ``file_name`` is expanded to
        ``os.path.join(path, set_value, file_name)``.
    """
    split_dir = os.path.join(path, set_value)

    # Read from the `path` argument rather than a module-level constant, so the
    # metadata file and the image paths built below always share the same root.
    with open(os.path.join(split_dir, 'metadata.json'), "r", encoding="ISO-8859-1") as file:
        metadata = json.load(file)

    for column_name, column in metadata.items():
        print(f'{column_name} - {len(column)} values')

    img_info = pd.DataFrame(metadata['images'])

    if set_value == 'train':
        # An annotation points at its image through `image_id`; its own `id`
        # only identifies the annotation. Join on the image reference and
        # expose it under the name `id` so the resulting columns are unchanged.
        annotation_info = (
            pd.DataFrame(metadata['annotations'])
            .drop(columns=['id'], errors='ignore')
            .rename(columns={'image_id': 'id'})
        )
        img_info = img_info.merge(annotation_info, on='id')

    # `progress_apply` exists only after `tqdm.pandas()` has been called;
    # fall back to a plain `apply` so the function also works without it.
    file_names = img_info['file_name']
    apply_with_progress = getattr(file_names, 'progress_apply', file_names.apply)
    img_info['file_name'] = apply_with_progress(lambda x: os.path.join(split_dir, x))
    return img_info

In [7]:
# Load the sample submission to see the expected output layout (Id / Predicted columns).
submission_example = pd.read_csv('/kaggle/input/herbarium-2020-fgvc7/sample_submission.csv')
submission_example.tail()

Unnamed: 0,Id,Predicted
138287,138287,0
138288,138288,0
138289,138289,0
138290,138290,0
138291,138291,0


In [8]:
# Per-image table for the training split; the 'train' branch of get_result_df
# also merges the annotation columns (category_id, region_id) into it.
metadata_train = get_result_df(path=ABS_PATH, set_value='train')
metadata_train.head()

annotations - 1030747 values
categories - 32094 values
images - 1030747 values
info - 6 values
licenses - 1 values
regions - 4 values


100%|██████████| 1030747/1030747 [00:04<00:00, 241717.10it/s]


Unnamed: 0,file_name,height,id,license,width,category_id,region_id
0,/kaggle/input/herbarium-2020-fgvc7/nybg2020/tr...,1000,354106,1,661,15672,1
1,/kaggle/input/herbarium-2020-fgvc7/nybg2020/tr...,1000,818566,1,661,11524,1
2,/kaggle/input/herbarium-2020-fgvc7/nybg2020/tr...,1000,750704,1,661,11524,1
3,/kaggle/input/herbarium-2020-fgvc7/nybg2020/tr...,1000,722381,1,661,11467,1
4,/kaggle/input/herbarium-2020-fgvc7/nybg2020/tr...,1000,382783,1,661,15660,1


In [9]:
# Per-image table for the test split; no annotations are merged for it,
# so it carries no category columns.
metadata_test = get_result_df(path=ABS_PATH, set_value='test')
metadata_test.head()

images - 138292 values
info - 6 values
licenses - 1 values


100%|██████████| 138292/138292 [00:00<00:00, 242252.99it/s]


Unnamed: 0,file_name,height,id,license,width
0,/kaggle/input/herbarium-2020-fgvc7/nybg2020/te...,1000,104891,1,661
1,/kaggle/input/herbarium-2020-fgvc7/nybg2020/te...,1000,18029,1,661
2,/kaggle/input/herbarium-2020-fgvc7/nybg2020/te...,1000,35151,1,661
3,/kaggle/input/herbarium-2020-fgvc7/nybg2020/te...,1000,124144,1,682
4,/kaggle/input/herbarium-2020-fgvc7/nybg2020/te...,1000,24649,1,682


In [10]:
# Check whether the raw category ids form one gap-free run of consecutive integers.
# A contiguous run of n ids that starts at min(classes) ends at min(classes) + n - 1,
# so the upper bound of the range has to be derived from the minimum as well;
# `len(classes) + 1` would only be correct if the smallest id happened to be 1.
classes = sorted(metadata_train['category_id'].unique())
classes == list(range(min(classes), min(classes) + len(classes)))

False

In [11]:
# Images per category, most frequent first, to inspect how unbalanced the classes are.
metadata_train['category_id'].value_counts()

23718    1765
163      1195
5340     1090
23713    1077
8437     1024
         ... 
10370       2
7592        2
22922       1
24021       1
23142       1
Name: category_id, Length: 32093, dtype: int64

## Label preprocessing

In [12]:
# Fit a LabelEncoder on the training category ids so they can be mapped to
# consecutive integer labels (verified a few cells below).
le_preprocessor = LabelEncoder()
le_preprocessor.fit(metadata_train['category_id'])

LabelEncoder()

In [13]:
# Store the encoded label next to the raw category id; the raw column is kept untouched.
metadata_train['category_id_le_preprocessed'] = le_preprocessor.transform(metadata_train['category_id'])

In [14]:
# Sanity check: the encoded labels must form a gap-free run of consecutive integers.
# The upper bound is derived from the minimum so the check does not silently rely
# on the smallest label being 0 (for labels starting at 0 the result is unchanged).
classes = sorted(metadata_train['category_id_le_preprocessed'].unique())
classes == list(range(min(classes), min(classes) + len(classes)))

True

# Train Test Split

In [15]:
# First attempt: a stratified 75/25 split. Stratification needs at least two
# samples in every class, which does not hold for this dataset, so the call
# raises ValueError. The error is caught and shown instead of propagating, so
# that "Restart & Run All" can continue to the non-stratified split below.
try:
    train_indices, test_indices, _, _ = train_test_split(
        metadata_train.index,
        metadata_train['category_id_le_preprocessed'],
        train_size=0.75,
        random_state=RANDOM_SEED,
        shuffle=True,
        stratify=metadata_train['category_id_le_preprocessed'],
    )
except ValueError as split_error:
    print(f'ValueError: {split_error}')

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [16]:
# Count the rows of every encoded class; after count() each remaining column
# (e.g. 'id') holds the number of non-null entries for that class.
grouped = metadata_train.groupby('category_id_le_preprocessed', as_index=False).count()

In [17]:
# Encoded ids of the classes that have fewer than 3 rows (using the 'id' count as class size).
little_classes = grouped[grouped['id'] < 3]['category_id_le_preprocessed']

In [18]:
# Show the under-populated classes; the Length in the output is how many there are.
little_classes

2            2
21          21
24          24
28          28
38          38
         ...  
32061    32061
32068    32068
32073    32073
32080    32080
32082    32082
Name: category_id_le_preprocessed, Length: 3729, dtype: int64

In [37]:
# Plain (non-stratified) shuffled 75/25 split of the row labels of metadata_train.
# Only the index is split: the partition depends on the number of rows and the
# seed, not on the label values, so the labels do not need to be passed along.
train_indices, test_indices = train_test_split(
    metadata_train.index,
    train_size=0.75,
    random_state=RANDOM_SEED,
    shuffle=True,
)

In [38]:
# Select the training rows and renumber them from 0; the previous row labels
# are kept as a regular column by reset_index().
train_data = metadata_train.loc[train_indices, :].reset_index()

In [39]:
# Select the held-out rows (everything not used for training) and renumber them
# from 0; the previous row labels are kept as a regular column by reset_index().
test_data = metadata_train.loc[test_indices, :].reset_index()

In [40]:
# Divide the held-out rows once more: 80% stay as the test part, 20% become validation.
# As above, only the index is split because the labels do not influence the partition.
test_indices, val_indices = train_test_split(
    test_data.index,
    train_size=0.80,
    random_state=RANDOM_SEED,
    shuffle=True,
)

In [41]:
# Validation rows are taken from the held-out frame while it still contains all
# held-out rows; the previous row labels are kept as a column by reset_index().
val_data = test_data.loc[val_indices, :].reset_index()

In [42]:
# Shrink test_data to its final test rows. This overwrites the held-out frame,
# so it has to run after val_data has been extracted from it, and only once.
test_data = test_data.loc[test_indices, :].reset_index()

## Class weights

In [43]:
# Number of training samples per encoded class, as a plain list in which entry i
# belongs to class i. The train split is not stratified, so a rare class can be
# missing from train_data altogether; bincount with `minlength` keeps a 0 for such
# a class instead of dropping it, which would shift every later entry to a wrong
# class index.
# NOTE: despite the name these are raw counts, not normalised or inverted weights.
class_weights = np.bincount(
    train_data['category_id_le_preprocessed'].to_numpy(),
    minlength=len(le_preprocessor.classes_),
).tolist()

# Model Development

In [44]:
import torch

In [45]:
from torch import Tensor
from torch.utils.data import DataLoader
from utils_scripts import Specimen_Dataset, \
                          Data_Pipeline, \
                          Resizer, \
                          Normalizer, \
                          ToTensor

In [57]:
# Transform chain handed to every Specimen_Dataset below.
# NOTE(review): presumably Resizer rescales each image to 512x512 and ToTensor
# converts it to a torch tensor, applied in this order — confirm in utils_scripts.
# Normalizer is imported above but is not part of the chain.
data_pipe_obj = Data_Pipeline(
    Resizer(output_size=(512,512)),
    ToTensor()
)

In [58]:
# One dataset per split, each wrapping the DataFrame of its own split and sharing
# the same transform chain.
train_dataset = Specimen_Dataset(dataset=train_data, set_value='train', transform=data_pipe_obj)
test_dataset = Specimen_Dataset(dataset=test_data, set_value='test', transform=data_pipe_obj)
# Validation must use the validation rows; feeding it train_data would make the
# validation metrics a measurement on the training set.
val_dataset = Specimen_Dataset(dataset=val_data, set_value='val', transform=data_pipe_obj)
# The submission set is the unlabeled competition test split loaded into metadata_test.
# NOTE(review): assumes Specimen_Dataset with set_value='test_submission' does not
# require label columns — confirm in utils_scripts.
test_subm_dataset = Specimen_Dataset(dataset=metadata_test, set_value='test_submission', transform=data_pipe_obj)

In [59]:
# Batch size shared by all DataLoaders below.
BATCH_SIZE = 128
# Seed torch's CPU and CUDA generators with the notebook-wide seed.
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)

In [49]:
# Only the training loader shuffles: a fresh sample order per epoch helps SGD.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
# Evaluation and submission loaders iterate in dataset order, so metrics are
# reproducible and predictions can be matched back to their rows by position.
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=BATCH_SIZE)
test_subm_dataloader = DataLoader(dataset=test_subm_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [50]:
# Smoke test: fetch a single batch from the training loader and print the
# shapes of its image and label tensors, then stop.
# Batching stacks the samples, so every image in a batch must have the same size.
for batch_index, batch in enumerate(train_dataloader, 0):
    # Each batch is a mapping with the keys 'img' and 'category_id'.
    images, categories = batch['img'], batch['category_id']

    print(images.shape)
    print(categories.shape)

    # One batch is enough for the check.
    break
    

RuntimeError: stack expects each tensor to be equal size, but got [3, 1000, 676] at entry 0 and [3, 1000, 683] at entry 1

In [51]:
# List the available columns to find the ones describing the image size.
metadata_train.columns

Index(['file_name', 'height', 'id', 'license', 'width', 'category_id',
       'region_id', 'category_id_le_preprocessed'],
      dtype='object')

In [53]:
# How many images exist per recorded height: shows whether image heights are uniform.
metadata_train['height'].value_counts()

1000    1030649
667          57
670           5
665           4
683           3
682           2
675           2
802           1
731           1
742           1
788           1
818           1
720           1
858           1
386           1
928           1
975           1
977           1
721           1
533           1
718           1
700           1
598           1
684           1
673           1
669           1
666           1
662           1
661           1
649           1
615           1
696           1
Name: height, dtype: int64

In [54]:
# How many images exist per recorded width: widths that differ between images
# cannot be stacked into one batch without resizing first.
metadata_train['width'].value_counts()

682    219673
667    212347
676    190476
681     86647
678     76882
        ...  
623         1
618         1
615         1
614         1
595         1
Name: width, Length: 181, dtype: int64