# Herb2021 

# Identifying Plant Species

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

import os
import json
import collections

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader

# Loading Data
---
(adapted from [YAROSLAV ISAIENKOV's notebook](https://www.kaggle.com/code/ihelon/herbarium-2021-exploratory-data-analysis))

We can't use a simple `pd.read_json()` here because the top-level arrays in the metadata are not all the same length.

In [2]:
# Locations of the competition's training data, all derived from PATH_BASE.
PATH_BASE = "../input/herbarium-2021-fgvc8/"
PATH_TRAIN = os.path.join(PATH_BASE, "train/")
PATH_TRAIN_META = os.path.join(PATH_TRAIN, "metadata.json")
# NOTE(review): the sample `file_name` values in the metadata already begin with
# "images/", so they look relative to PATH_TRAIN rather than PATH_TRAIN_IMG;
# confirm before joining image paths.
PATH_TRAIN_IMG = os.path.join(PATH_TRAIN, "images/")

# JSON is UTF-8 encoded; pass the encoding explicitly so decoding does not depend
# on the platform's default locale encoding.
with open(PATH_TRAIN_META, encoding="utf-8") as train_md:
    metadata = json.load(train_md)

In [3]:
# List the top-level sections of the loaded metadata dict.
metadata.keys()

dict_keys(['annotations', 'categories', 'images', 'info', 'licenses', 'institutions'])

In [4]:
# Number of records stored under three of the top-level keys.
tuple(len(metadata[section]) for section in ("annotations", "categories", "images"))

(2257759, 64500, 2257759)

Print the first record from several of the top-level keys.

In [5]:
# Show the first record of each section to see which fields it carries.
for section in ("annotations", "images", "categories", "licenses", "institutions"):
    print(metadata[section][0])

{'category_id': 60492, 'id': 1814367, 'image_id': 1814367, 'institution_id': 0}
{'file_name': 'images/604/92/1814367.jpg', 'height': 1000, 'id': 1814367, 'license': 0, 'width': 678}
{'family': 'Orchidaceae', 'order': 'Asparagales', 'name': 'Aa calceata (Rchb.f.) Schltr.', 'id': 0}
{'id': 0, 'name': 'Public Domain Dedication', 'url': 'http://creativecommons.org/publicdomain/zero/1.0/'}
{'id': 0, 'name': 'New York Botanical Garden'}


A few things stand out:
1. `category_id` needs to be inner-joined with the `categories` records
2. we are scored on macro F1 using the `id` and `category_id` columns
3. the columns we need are:
    - id
    - height
    - width
    - path
    - category_name
    - family_name
    - order_name
    - category_id

Join the columns with the same length first.

In [6]:
# Build one row per image by walking the annotation and image records in lockstep.
# zip() silently stops at the shorter input, so verify up front that both lists
# have the same length; otherwise trailing records would be dropped unnoticed.
assert len(metadata["annotations"]) == len(metadata["images"]), (
    "annotations and images must contain the same number of records"
)

ids, categories, paths, heights, widths = [], [], [], [], []

for annotation, image in zip(metadata["annotations"], metadata["images"]):
    # The two lists are paired by position; fail loudly if a pair does not match.
    assert annotation["image_id"] == image["id"], (
        f"annotation for image {annotation['image_id']} is paired with image {image['id']}"
    )
    ids.append(image["id"])
    paths.append(image["file_name"])
    heights.append(image["height"])
    widths.append(image["width"])
    categories.append(annotation["category_id"])

df_meta = pd.DataFrame(
    {
        "id": ids,
        "height": heights,
        "width": widths,
        "path": paths,
        "category_id": categories,
    }
)
df_meta.head()

Unnamed: 0,id,height,width,path,category_id
0,1814367,1000,678,images/604/92/1814367.jpg,60492
1,1308257,1000,666,images/108/24/1308257.jpg,10824
2,1270453,1000,739,images/330/76/1270453.jpg,33076
3,1123834,1000,672,images/247/99/1123834.jpg,24799
4,1042410,1000,675,images/170/18/1042410.jpg,17018


In [7]:
# Lookup tables keyed by category id, one per taxonomy field.
category_records = metadata["categories"]
d_categories = {record["id"]: record["name"] for record in category_records}
d_families = {record["id"]: record["family"] for record in category_records}
d_orders = {record["id"]: record["order"] for record in category_records}

# Translate each row's category_id into its readable labels.
for column, lookup in (
    ("category_name", d_categories),
    ("family_name", d_families),
    ("order_name", d_orders),
):
    df_meta[column] = df_meta["category_id"].map(lookup)

# Move the dependent variable (category_id) to the end of the frame.
feature_columns = [name for name in df_meta.columns if name != "category_id"]
df_meta = df_meta.reindex(columns=feature_columns + ["category_id"])
df_meta.head()

Unnamed: 0,id,height,width,path,category_name,family_name,order_name,category_id
0,1814367,1000,678,images/604/92/1814367.jpg,Thysanocarpus curvipes Hook.,Brassicaceae,Brassicales,60492
1,1308257,1000,666,images/108/24/1308257.jpg,Cassia grandis L.f.,Fabaceae,Fabales,10824
2,1270453,1000,739,images/330/76/1270453.jpg,Leptospermum whitei Cheel,Myrtaceae,Myrtales,33076
3,1123834,1000,672,images/247/99/1123834.jpg,Fallopia scandens (L.) Holub,Polygonaceae,Caryophyllales,24799
4,1042410,1000,675,images/170/18/1042410.jpg,Cyperus dentatus Torr.,Cyperaceae,Poales,17018


Check for any inconsistencies (duplicated rows, null values, etc.).

In [8]:
# Count fully duplicated rows, then missing values per column.
duplicate_rows = df_meta.duplicated().sum()
nulls_per_column = df_meta.isnull().sum()
print(duplicate_rows)
print(nulls_per_column)

0
id               0
height           0
width            0
path             0
category_name    0
family_name      0
order_name       0
category_id      0
dtype: int64


Now that the metadata is cleaned up, we can perform EDA.

# EDA