In [1]:
import os
import pathlib
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.utils.class_weight import compute_class_weight
seed = 42

os.chdir("../")

pathlib.Path.cwd()

PosixPath('/home/paolo/git/wild-boar-detection')

## Create train test split

In [2]:
def generate_group(x: str) -> str:
    group = re.sub(pattern="_(frame|image)_.*", repl="", string=x.split("/")[-1][:-4])
    return re.sub(pattern="_[0-9]+$", repl="", string=group)

In [3]:
other_animals = [str(x) for x in pathlib.Path("data/bronze/images/other_animals").rglob("*")]
wild_boar = [str(x) for x in pathlib.Path("data/bronze/images/wild_boar").rglob("*")]


data: pd.DataFrame = pd.concat([
    pd.DataFrame(data={"path": wild_boar, "target": [1]*len(wild_boar)}),
    pd.DataFrame(data={"path": other_animals, "target": [0]*len(other_animals)})
]).reset_index(drop=True)
data

Unnamed: 0,path,target
0,data/bronze/images/wild_boar/ho_incontrato_un_...,1
1,data/bronze/images/wild_boar/wild_boar_gang_ca...,1
2,data/bronze/images/wild_boar/cinghiali_corrono...,1
3,data/bronze/images/wild_boar/wild_boar_gang_ca...,1
4,data/bronze/images/wild_boar/wild_boar_image_2...,1
...,...,...
2789,data/bronze/images/other_animals/cosa_ci_fate_...,0
2790,data/bronze/images/other_animals/il_sonnellino...,0
2791,data/bronze/images/other_animals/otter_at_nigh...,0
2792,data/bronze/images/other_animals/oliver_the_ot...,0


In [4]:
groups = data["path"].apply(lambda x: generate_group(x))
groups

0       ho_incontrato_un_branco_di_cinghiali_di_notte_mp4
1                wild_boar_gang_caught_on_camera_trap_mp4
2       cinghiali_corrono_nel_bosco_come_fantasmi_nell...
3                wild_boar_gang_caught_on_camera_trap_mp4
4                                               wild_boar
                              ...                        
2789    cosa_ci_fate_qui_alle_2_di_notte_____caprioli_...
2790           il_sonnellino_del_cervo__o9rzkiuzy84__webm
2791                    otter_at_night__buvfr_nh33k__webm
2792    oliver_the_otter_walking_by_at_night__qviddl8g...
2793    la_lepre_italica_nel_parco_nazionale_del_cilen...
Name: path, Length: 2794, dtype: object

In [5]:
for i in range(2, 200):
    sgkf = StratifiedGroupKFold(n_splits=i, shuffle=True, random_state=seed)
    
    train_index, valid_index = next(sgkf.split(X=data.path, y=data.target, groups=groups))
    
    train_size = train_index.shape[0] / data.shape[0]
    
    if 0.65 <= train_size <= 0.8:
        print(i, train_size)
        break
        

6 0.658554044380816


In [6]:
data.index

RangeIndex(start=0, stop=2794, step=1)

In [7]:
train_data = data.loc[train_index] #.to_parquet(pathlib.Path("../data/train.parquet"), index=False)
valid_data = data.loc[valid_index] #.to_parquet(pathlib.Path("../data/valid.parquet"), index=False)

In [8]:
class_weight = compute_class_weight(class_weight="balanced", classes=np.unique(train_data.target), y=train_data.target)
class_weight

array([0.81272085, 1.29943503])

In [9]:
train_data["weight"] = class_weight[train_data.target]
valid_data["weight"] = class_weight[valid_data.target]

In [10]:
train_data.to_parquet(pathlib.Path("data/train.parquet"), index=False)
valid_data.to_parquet(pathlib.Path("data/valid.parquet"), index=False)