# Import Modules

## Standard modules

In [78]:
import os
import json
import pickle as pkl

from collections import Counter

## External modules

In [79]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Classification metrics. Parenthesised import: no backslash continuations,
# so the statement no longer depends on what follows a trailing `\`.
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from tqdm import tqdm, trange
from pylab import rcParams



# Register tqdm's pandas integration (adds `progress_apply`, used by get_result_df).
tqdm.pandas()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Default size applied to all figures created afterwards.
rcParams['figure.figsize'] = 10, 10

## Internal modules

In [81]:
import utils_scripts as utlis

# Constants

In [82]:
# Single seed shared by NumPy's global RNG and the sklearn splits further down.
RANDOM_SEED = 17
np.random.seed(RANDOM_SEED)

# Dataset root; get_result_df is called with this path and a split name ('train' / 'test').
ABS_PATH = '/kaggle/input/herbarium-2020-fgvc7/nybg2020/'

# Data EDA

In [83]:
def get_result_df(path, set_value):
    """Load the ``metadata.json`` of one dataset split into a DataFrame.

    Parameters
    ----------
    path : str
        Root directory of the dataset; it must contain a ``<set_value>``
        sub-directory holding ``metadata.json``.
    set_value : str
        Name of the split directory, e.g. ``'train'`` or ``'test'``.

    Returns
    -------
    pandas.DataFrame
        One row per image. ``file_name`` is expanded to
        ``os.path.join(path, set_value, <original file_name>)``. For the
        ``'train'`` split the annotation columns (everything in
        ``metadata['annotations']`` except the annotation's own ``id``) are
        joined in, matched on the image id.

    Notes
    -----
    Prints the number of entries of every top-level key of the metadata.
    """
    split_dir = os.path.join(path, set_value)

    # Read the metadata from the directory the caller asked for (`path`),
    # i.e. the same root that is used for the image file names below.
    with open(os.path.join(split_dir, 'metadata.json'), "r", encoding="ISO-8859-1") as file:
        metadata = json.load(file)

    for column_name, column in metadata.items():
        print(f'{column_name} - {len(column)} values')

    img_info = pd.DataFrame(metadata['images'])

    if set_value == 'train':
        # An annotation references its image through `image_id`; its own `id`
        # identifies the annotation. Join on the image reference so labels are
        # attached to the right image even when the two ids differ.
        annotation_info = (
            pd.DataFrame(metadata['annotations'])
            .drop(columns=['id'], errors='ignore')
            .rename(columns={'image_id': 'id'})
        )
        img_info = img_info.merge(annotation_info, on='id')

    # `progress_apply` only exists once `tqdm.pandas()` has been executed;
    # fall back to a plain `apply` so the function also works without it.
    file_names = img_info['file_name']
    apply_fn = getattr(file_names, 'progress_apply', file_names.apply)
    img_info['file_name'] = apply_fn(lambda x: os.path.join(split_dir, x))
    return img_info

In [84]:
# Load the sample submission file and look at its last rows to see the expected columns.
submission_example = pd.read_csv('/kaggle/input/herbarium-2020-fgvc7/sample_submission.csv')
submission_example.tail()

Unnamed: 0,Id,Predicted
138287,138287,0
138288,138288,0
138289,138289,0
138290,138290,0
138291,138291,0


In [109]:
# Training frame: image metadata joined with its annotations (the 'train' branch of get_result_df).
metadata_train = get_result_df(path=ABS_PATH, set_value='train')
metadata_train.head()

annotations - 1030747 values
categories - 32094 values
images - 1030747 values
info - 6 values
licenses - 1 values
regions - 4 values


100%|██████████| 1030747/1030747 [00:04<00:00, 253437.14it/s]


Unnamed: 0,file_name,height,id,license,width,category_id,region_id
0,/kaggle/input/herbarium-2020-fgvc7/nybg2020/tr...,1000,354106,1,661,15672,1
1,/kaggle/input/herbarium-2020-fgvc7/nybg2020/tr...,1000,818566,1,661,11524,1
2,/kaggle/input/herbarium-2020-fgvc7/nybg2020/tr...,1000,750704,1,661,11524,1
3,/kaggle/input/herbarium-2020-fgvc7/nybg2020/tr...,1000,722381,1,661,11467,1
4,/kaggle/input/herbarium-2020-fgvc7/nybg2020/tr...,1000,382783,1,661,15660,1


In [110]:
# Test frame: image metadata only (get_result_df joins annotations for 'train' alone).
metadata_test = get_result_df(path=ABS_PATH, set_value='test')
metadata_test.head()

images - 138292 values
info - 6 values
licenses - 1 values


100%|██████████| 138292/138292 [00:00<00:00, 248866.42it/s]


Unnamed: 0,file_name,height,id,license,width
0,/kaggle/input/herbarium-2020-fgvc7/nybg2020/te...,1000,104891,1,661
1,/kaggle/input/herbarium-2020-fgvc7/nybg2020/te...,1000,18029,1,661
2,/kaggle/input/herbarium-2020-fgvc7/nybg2020/te...,1000,35151,1,661
3,/kaggle/input/herbarium-2020-fgvc7/nybg2020/te...,1000,124144,1,682
4,/kaggle/input/herbarium-2020-fgvc7/nybg2020/te...,1000,24649,1,682


In [93]:
# Check whether the raw category ids form one unbroken integer run
# min, min + 1, ..., min + n - 1. The upper bound is derived from the
# smallest id so the check is meaningful whether ids start at 0 or at 1.
classes = sorted(list(metadata_train['category_id'].unique()))
classes == list(range(min(classes), min(classes) + len(classes)))

False

In [88]:
# Number of training rows per raw category id.
metadata_train['category_id'].value_counts()

23718    1765
163      1195
5340     1090
23713    1077
8437     1024
         ... 
10370       2
7592        2
22922       1
24021       1
23142       1
Name: category_id, Length: 32093, dtype: int64

## Label preprocessing

In [111]:
# Fit a label encoder on the raw category ids of the training frame.
le_preprocessor = LabelEncoder()
le_preprocessor.fit(metadata_train['category_id'])

LabelEncoder()

In [112]:
# Store the encoded label of every training row in a new column.
metadata_train['category_id_le_preprocessed'] = le_preprocessor.transform(metadata_train['category_id'])

In [92]:
# Sanity check on the encoded labels: the sorted unique values are compared with
# range(min, n_classes), which can only match a gap-free run that starts at 0.
classes = sorted(list(metadata_train['category_id_le_preprocessed'].unique()))
classes == list(range(min(classes), len(classes)))

True

## Class weights

In [94]:
# Count how many training rows carry each encoded label.
class_weights = Counter(metadata_train['category_id_le_preprocessed'])

In [95]:
# Per-class sample counts as a plain list, ordered by encoded label.
# The counts are recomputed from the label column instead of from the previous
# value of `class_weights`, so this cell can be re-run safely (a second run
# used to fail because `class_weights` was already a list without `.items()`).
class_weights = [
    count
    for _label, count in sorted(Counter(metadata_train['category_id_le_preprocessed']).items())
]

In [96]:
# Display the per-class sample counts, ordered by encoded label.
# NOTE(review): this renders the whole list; a slice such as `class_weights[:10]` would keep the output short.
class_weights

[6,
 15,
 2,
 3,
 16,
 10,
 4,
 29,
 8,
 36,
 40,
 3,
 21,
 44,
 6,
 13,
 4,
 12,
 27,
 4,
 11,
 2,
 20,
 6,
 2,
 12,
 3,
 4,
 2,
 10,
 13,
 109,
 40,
 59,
 131,
 10,
 81,
 8,
 2,
 4,
 2,
 5,
 6,
 6,
 10,
 4,
 3,
 4,
 2,
 5,
 2,
 2,
 4,
 52,
 4,
 3,
 274,
 176,
 91,
 2,
 32,
 83,
 23,
 96,
 60,
 13,
 4,
 4,
 12,
 4,
 9,
 222,
 27,
 37,
 10,
 9,
 13,
 36,
 4,
 9,
 4,
 7,
 6,
 2,
 4,
 4,
 35,
 12,
 6,
 30,
 6,
 8,
 4,
 67,
 8,
 56,
 4,
 17,
 12,
 8,
 45,
 9,
 8,
 6,
 16,
 5,
 9,
 2,
 12,
 20,
 90,
 32,
 4,
 13,
 20,
 4,
 33,
 8,
 12,
 26,
 16,
 2,
 19,
 5,
 9,
 25,
 24,
 9,
 2,
 22,
 17,
 4,
 30,
 8,
 21,
 8,
 3,
 70,
 41,
 2,
 7,
 12,
 162,
 58,
 37,
 10,
 6,
 131,
 68,
 249,
 67,
 77,
 230,
 495,
 111,
 266,
 142,
 560,
 272,
 430,
 317,
 21,
 32,
 1195,
 4,
 12,
 2,
 67,
 35,
 8,
 32,
 3,
 4,
 2,
 56,
 5,
 560,
 333,
 123,
 72,
 33,
 190,
 6,
 4,
 36,
 60,
 65,
 23,
 4,
 12,
 83,
 32,
 73,
 33,
 28,
 12,
 93,
 9,
 6,
 44,
 36,
 2,
 5,
 3,
 12,
 2,
 2,
 11,
 4,
 8,
 2,
 4,
 10,
 2,
 10

# Train Test Split

In [97]:
# Attempt a stratified 75/25 split of the row index. Stratification needs at
# least two samples in every class; when that does not hold, train_test_split
# raises ValueError. The error is caught and printed so the demonstration stays
# visible without aborting a "Restart & Run All".
try:
    train_indices, test_indices, _, _ = train_test_split(metadata_train.index,
                                                         metadata_train['category_id_le_preprocessed'],
                                                         train_size=0.75,
                                                         random_state=RANDOM_SEED,
                                                         shuffle=True,
                                                         stratify=metadata_train['category_id_le_preprocessed'])
except ValueError as error:
    print(f'{type(error).__name__}: {error}')

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [113]:
# Per encoded label, count the non-null values of every other column
# (so `grouped['id']` is the number of rows with an id in that class).
grouped = metadata_train.groupby('category_id_le_preprocessed', as_index=False).count()

In [114]:
# Encoded labels of the classes that have fewer than 3 rows.
little_classes = grouped[grouped['id'] < 3]['category_id_le_preprocessed']

In [115]:
# Inspect the under-populated classes.
little_classes

2            2
21          21
24          24
28          28
38          38
         ...  
32061    32061
32068    32068
32073    32073
32080    32080
32082    32082
Name: category_id_le_preprocessed, Length: 3729, dtype: int64

In [116]:
# Plain (non-stratified) shuffled 75/25 split of the row index.
# Only the index is passed: the partition is determined by the number of rows
# and the seed, so splitting a label array just to discard it is unnecessary
# (and assigning to `_` would shadow IPython's last-output variable).
train_indices, test_indices = train_test_split(metadata_train.index,
                                               train_size=0.75,
                                               random_state=RANDOM_SEED,
                                               shuffle=True)

In [117]:
# Rows assigned to the training partition.
train_data = metadata_train.loc[train_indices, :]
train_data.shape

(773060, 8)

In [118]:
# Held-out rows (everything not in the training partition); split again into test and validation afterwards.
test_data = metadata_train.loc[test_indices, :]
test_data.shape

(257687, 8)

In [119]:
# Split the held-out rows once more: 80% remain the test set, 20% become validation.
# Only the index is passed; the partition depends on the number of rows and the
# seed alone, so no label array (and no throw-away `_`) is needed.
# NOTE(review): this rebinds `test_indices`, and `test_data` is narrowed by a later
# cell - re-run the split cells in order, starting from the 75/25 split.
test_indices, val_indices = train_test_split(test_data.index,
                                             train_size=0.80,
                                             random_state=RANDOM_SEED,
                                             shuffle=True)

In [120]:
# Validation rows. They are selected from the full training frame (same index
# labels, same columns) rather than from `test_data`, because `test_data` is
# later narrowed to the test rows only; looking the validation labels up in it
# on a re-run would raise a KeyError.
val_data = metadata_train.loc[val_indices, :]
val_data.shape

(51538, 8)

In [None]:
# Keep only the rows that stayed in the test set after the validation rows were split off.
# This rebinds `test_data`, replacing the larger held-out frame it was selected from.
test_data = test_data.loc[test_indices, :]
test_data.shape