In [17]:
import pandas as pd
from models import base_res_net, small_res_net, efficient_net, pretrained_mobilenet, big_model
import tensorflow as tf
import tensorflow.compat.v1 as tfc
from sklearn.utils.class_weight import compute_class_weight

from modeling import predict, predict_from_csv
from src import InputPipeline

# Turn on IPython's autoreload extension in mode 2, so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Check GPU for tf

In [18]:
# GPU setup.
# For documentation about using GPUs refer to: https://www.tensorflow.org/install/pip#windows-wsl2

# Drop any Keras state left over from an earlier run of this cell.
tf.keras.backend.clear_session()

# Fail fast when TensorFlow cannot see a GPU.
device_name = tf.test.gpu_device_name()
if not device_name:
  raise SystemError('GPU device not found')

# Close the session created by a previous run of this cell, if there is one.
try:
  sess.close()
except NameError:
  pass  # first run on this kernel: `sess` has not been created yet
except Exception as exc:
  # Best effort only: a stale session that refuses to close must not block the
  # setup, but the failure is reported instead of being silently swallowed.
  print(f"Could not close the previous session: {exc!r}")

tfc.enable_eager_execution()
# Let this process use at most 90% of the GPU memory.
gpu_options = tfc.GPUOptions(per_process_gpu_memory_fraction=0.90)
sess = tfc.InteractiveSession(config=tfc.ConfigProto(gpu_options=gpu_options))

### Optional Stuff
- Here we compute class weights because the data is very imbalanced.

In [19]:
# Training image table; its "label" column is used for the class weights.
train_df = pd.read_csv(
  filepath_or_buffer="../data/train_images_stratified.csv",
)

In [20]:
# Balanced class weights for the imbalanced label distribution.
# compute_class_weight returns one weight per entry of `classes`, in that order.
# Series.unique() alone yields labels in order of first appearance, so the labels
# are sorted first; otherwise the positional keys built below would not line up
# with ascending class order.
balanced_weights = compute_class_weight(
  class_weight="balanced",
  classes=train_df["label"].sort_values().unique(),
  y=train_df["label"],
)
# Keys are 0-based positions in ascending label order.
# NOTE(review): assumes the training pipeline encodes labels the same way — confirm.
class_weights = dict(enumerate(balanced_weights))

## Configure Parameters

In [21]:
# Shape handed to the models; the first two entries are also used as the
# image size of the input pipelines.
INPUT_SHAPE = (220, 220, 3)

# Training hyper-parameters, passed to train_classifier as `configuration`.
CONF = dict(
  learning_rate=1e-5,
  batch_size=48,
  epochs=150,
  loss_function="sparse_categorical_crossentropy",
  metric="sparse_categorical_accuracy",
)

## Make Input Pipelines

In [22]:
# Subspecies pipeline: train and validation sets are read from two CSV files
# that already hold a stratified split.
sub_species_input_pipeline = InputPipeline(
  splits=(0.85, 0.0, 0.15),
  channels=3,
  batch_size=CONF["batch_size"],
  size=INPUT_SHAPE[:2],
)
sub_species_input_pipeline.make_stratified_train_dataset(
  train_ds_path="../data/train_ds_images_stratified.csv",
  val_ds_path="../data/val_ds_images_stratified.csv",
)


# Species pipeline: same settings, but built from an image directory.
# This one does not have a stratified split.
species_input_pipeline = InputPipeline(
  splits=(0.85, 0.0, 0.15),
  channels=3,
  batch_size=CONF["batch_size"],
  size=INPUT_SHAPE[:2],
)
species_input_pipeline.make_train_datasets(
  directory="../data/train_images/species_classify",
)


Datasets populated!
Found 15704 files belonging to 70 classes.
Using 13349 files for training.
Using 2355 files for validation.
Datasets populated!


In [None]:
# Fetch the cached and prefetched (train, validation) pair of each pipeline
# for faster access during training.
(
  sub_species_cached_train,
  sub_species_cached_val,
) = sub_species_input_pipeline.get_cached_train_datasets()
(
  species_cached_train,
  species_cached_val,
) = species_input_pipeline.get_cached_train_datasets()

## Training a Model

In [23]:
from modeling import train_classifier

In [None]:
# Train the subspecies classifier (200 classes) with EfficientNet.
# NOTE(review): this call uses the pipeline's own datasets, while the species
# classifier is trained on the cached ones — confirm this is intended.
train_classifier(
  model=efficient_net,
  model_name="../classifiers/subspecies_effnet_classifier",
  classes_to_classify=200,
  input_shape=INPUT_SHAPE,
  configuration=CONF,
  train_dataset=sub_species_input_pipeline.train_dataset,
  validation_dataset=sub_species_input_pipeline.validation_dataset,
)

In [None]:
# Train the species classifier (70 classes) with EfficientNet on the cached
# species datasets.
train_classifier(
  model=efficient_net,
  model_name="../classifiers/species_efficient_net_classifier",
  classes_to_classify=70,
  input_shape=INPUT_SHAPE,
  configuration=CONF,
  train_dataset=species_cached_train,
  validation_dataset=species_cached_val,
)

## Predict Stuff

In [None]:
from modeling import predict
import pickle

In [None]:
# Load the species/subspecies mapping used by the stacked classifiers.
# The file is only read, so it is opened with "rb"; "rb+" would also demand
# write permission and fail on a read-only file.
# NOTE: pickle.load can execute arbitrary code contained in the file — only
# load a mapping.pickle that was produced by this project.
with open("../mapping.pickle", "rb") as f:
  mapping = pickle.load(f)

In [None]:
# Build the test dataset from the test image directory, then take its cached
# version for prediction.
sub_species_input_pipeline.make_test_dataset(
  directory="../data/test_images/test_images",
)
test_cached = sub_species_input_pipeline.get_cached_test_datasets()

In [None]:
# Predict on the cached test dataset with the subspecies classifier only;
# no species classifier is passed.
predict(
  dataset=test_cached,
  subspecies_classifier="../classifiers/subspecies_effnet_classifier",
  species_classifier=None,
  species_subspecies_dict=mapping,
)

In [None]:
# TF predicts labels from 0-199, however we need 1-200, so every prediction is
# shifted by one.
# The shifted labels are written to a separate file. Overwriting the source in
# place would add another +1 each time this cell is re-run.
adj_df_src = "../data/test_images_sample_1701245740.5383716.csv"
adj_df_dst = "../data/test_images_sample_1701245740.5383716_one_based.csv"
pred_df = pd.read_csv(adj_df_src, index_col=0)
pred_df["label"] += 1
pred_df.to_csv(adj_df_dst)

# New prediction

In [9]:
# Predict straight from the CSV of test image paths.
# The labels written by this call need no +1 adjustment afterwards.
# NOTE(review): `size` receives the full 3-tuple INPUT_SHAPE here, whereas the
# input pipelines are given INPUT_SHAPE[:2] — confirm predict_from_csv expects
# the channel dimension as well.
predict_from_csv(
  dataset="../data/test_images_path.csv",
  path="../data/test_images",
  size=INPUT_SHAPE,
  subspecies_classifier="../classifiers/subspecies_effnet_classifier",
)

predicting: 100%|██████████| 4000/4000 [07:20<00:00,  9.07it/s]


Saving to: ../data/test_images_sample_1701249151.784157.csv


In [10]:
# Re-load the prediction file reported by the run above; the first CSV column
# becomes the index.
df = pd.read_csv(
  filepath_or_buffer="../data/test_images_sample_1701249151.784157.csv",
  index_col=0,
)

In [11]:
# Inspect the loaded predictions; the labels are still floats at this point.
display(df)

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
1,68.0
2,39.0
3,74.0
4,12.0
5,74.0
...,...
3996,94.0
3997,55.0
3998,40.0
3999,66.0


In [13]:
# Replace missing values with a fixed fallback label so the column can be cast
# to int afterwards.
# NOTE(review): 111 looks like an arbitrary placeholder class — confirm the choice.
FALLBACK_LABEL = 111
df = df.fillna(value=FALLBACK_LABEL)

In [14]:
# Cast the float labels to integers with a vectorised astype instead of a
# per-element Python call; "int64" keeps the dtype the same on every platform.
df["label"] = df["label"].astype("int64")

In [15]:
# Check the result: the labels should now be shown as integers.
display(df)

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
1,68
2,39
3,74
4,12
5,74
...,...
3996,94
3997,55
3998,40
3999,66


In [16]:
# Write the cleaned integer labels back to the prediction file they were read from.
df.to_csv(
  path_or_buf="../data/test_images_sample_1701249151.784157.csv",
)