In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt

In [None]:
# Root folder of the dataset split analysed in this notebook. Every entry found
# directly inside it is listed and then read as a directory of image files.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
dir_path = "/workspaces/gorillatracker/data/splits/ground_truth-cxl-face_images-kfold-openset-seed-42-trainval-80-test-20-k-5"

In [None]:
# Names of the sub-directories of dir_path, one per analysed subset.
# - Plain files are skipped: every entry is later opened with os.listdir, which
#   raises NotADirectoryError for anything that is not a directory.
# - The names are sorted because os.listdir returns them in arbitrary order,
#   which would make the printed stats and the plots change order between runs.
dirs = sorted(
    entry
    for entry in os.listdir(dir_path)
    if os.path.isdir(os.path.join(dir_path, entry))
)

# Filled by the parsing cell: one DataFrame per entry of `dirs`, same order.
df: list[pd.DataFrame] = []

In [None]:
# Create one DataFrame per directory from the image filenames found in it.
def _parse_image_filename(filename):
    """Split one image filename into its metadata fields.

    A filename is accepted when it consists of exactly four '_'-separated
    parts, the first part is exactly four characters long (two characters of
    group followed by two characters of individual id) and the third part is a
    date in ``YYYYMMDD`` form.

    Args:
        filename: Bare filename (no directory component).

    Returns:
        A ``(GROUP, ID, CAM, DATE, ID2)`` tuple, where DATE is a
        ``datetime.date`` and ID2 is the last part reduced to its digits, or
        ``None`` when the filename does not follow the expected layout.
    """
    parts = filename.split('_')
    if len(parts) != 4:
        return None
    id_, camid, date, id2 = parts
    if len(id_) != 4:
        return None
    try:
        date = dt.datetime.strptime(date, '%Y%m%d').date()
    except ValueError:
        # A malformed date is a naming error of this one file; it must not
        # abort the scan of the whole directory.
        return None
    # Keep only the digits of the last part (drops the extension and any letter
    # suffix). NOTE: digits inside a file extension would be kept as well.
    id2 = ''.join(filter(str.isdigit, id2))
    return (id_[:2], id_[2:], camid, date, id2)


# Start from an empty list so that re-running this cell does not append a
# second set of frames behind the first one.
df.clear()
for d in dirs:
    data = []
    wrong_data = 0
    # Extract data from filenames; anything that cannot be parsed is counted
    # instead of being dropped silently.
    for filename in os.listdir(os.path.join(dir_path, d)):
        record = _parse_image_filename(filename)
        if record is None:
            wrong_data += 1
            continue
        data.append(record)

    # Create a DataFrame with one row per correctly named image
    df.append(pd.DataFrame(data, columns=['GROUP', 'ID', 'CAM', 'DATE', 'ID2']))
    print(f"Directory {d} had {wrong_data} wrong named images")

# General statistics

In [None]:
# Print a short textual summary for every directory.
for i, d in enumerate(dirs):
    frame = df[i]
    dates = frame['DATE']

    # Group counts: number of distinct value combinations of the listed columns.
    n_individuals = frame.groupby(['GROUP', 'ID']).ngroups
    n_clips = frame.groupby(['CAM', 'DATE', 'ID2']).ngroups
    n_videos = frame.groupby(['CAM', 'DATE']).ngroups

    # One print call with a newline separator emits the same lines as
    # printing each of them individually.
    print(
        f"{d}:",
        f"\t Images: {frame.shape[0]}",
        f"\t Individuals: {n_individuals}",
        f"\t Groups: {frame['GROUP'].nunique()}",
        f"\t Cameras: {frame['CAM'].nunique()}",
        f"\t Days: {dates.nunique()}",
        f"\t DateRange: {dates.min()} - {dates.max()}",
        f"\t Video_clips: {n_clips}",
        f"\t Videos: {n_videos}",
        sep="\n",
    )

# Videos per ID

In [None]:
# Histogram per directory: how many individuals appear in 1, 2, 3, ... videos,
# where a video is a distinct (CAM, DATE) combination for that individual.
for i, d in enumerate(dirs):
    per_video = df[i].drop_duplicates(subset=['GROUP', 'ID', 'CAM', 'DATE'])
    # size() -> videos per individual; value_counts() -> individuals per video count
    histogram = (
        per_video
        .groupby(['GROUP', 'ID'])
        .size()
        .value_counts()
        .sort_index()
    )

    fig, ax = plt.subplots(figsize=(5, 2))
    ax.bar(histogram.index, histogram.values)
    ax.set_xlabel('Num Videos/ID')
    ax.set_ylabel('Individuals')
    ax.set_title(f'{d}: Number of Videos per ID')
    ax.set_xticks(histogram.index)
    plt.show()
    

In [None]:
# Histogram per directory: how many individuals appear in 1, 2, 3, ... video
# clips, where a clip is a distinct (CAM, DATE, ID2) combination for that individual.
for i, d in enumerate(dirs):
    per_clip = df[i].drop_duplicates(subset=['GROUP', 'ID', 'CAM', 'DATE', 'ID2'])
    # size() -> clips per individual; value_counts() -> individuals per clip count
    histogram = (
        per_clip
        .groupby(['GROUP', 'ID'])
        .size()
        .value_counts()
        .sort_index()
    )

    fig, ax = plt.subplots(figsize=(5, 2))
    ax.bar(histogram.index, histogram.values)
    ax.set_xlabel('Num VideoClips/ID')
    ax.set_ylabel('Individuals')
    ax.set_title(f'{d}: Number of Video Clips per ID')
    ax.set_xticks(histogram.index)
    plt.show()

In [None]:
# Histogram per directory: how many individuals were seen by 1, 2, 3, ...
# distinct cameras.
for i, d in enumerate(dirs):
    per_camera = df[i].drop_duplicates(subset=['GROUP', 'ID', 'CAM'])
    # size() -> cameras per individual; value_counts() -> individuals per camera count
    histogram = (
        per_camera
        .groupby(['GROUP', 'ID'])
        .size()
        .value_counts()
        .sort_index()
    )

    fig, ax = plt.subplots(figsize=(5, 2))
    ax.bar(histogram.index, histogram.values)
    ax.set_xlabel('Num Cam/ID')
    ax.set_ylabel('Individuals')
    ax.set_title(f'{d}: Number of Cameras per ID')
    ax.set_xticks(histogram.index)
    plt.show()

In [None]:
# Bar chart per directory: number of images recorded by each camera.
for i, d in enumerate(dirs):
    # Every row of df[i] stands for one image file, so the rows are counted per
    # camera directly. Dropping duplicates on (GROUP, ID, CAM, DATE, ID2) first
    # would collapse all images of one individual in one clip into a single row
    # and under-report what the title and y-axis call "Images".
    images_per_camera = df[i].groupby('CAM').size()

    plt.figure(figsize=(10, 5))
    plt.bar(images_per_camera.index, images_per_camera.values)
    plt.xlabel('Camera ID')
    plt.ylabel('Images')
    plt.title(f'{d}: Images/Camera')
    plt.show()

In [None]:
# Bar chart per directory: number of distinct individuals seen by each camera.
for i, d in enumerate(dirs):
    # One row per (camera, individual) pair, then count the rows of each camera.
    sightings = df[i][['CAM', 'GROUP', 'ID']].drop_duplicates()
    per_camera = sightings.groupby('CAM').size()

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(per_camera.index, per_camera.values)
    ax.set_xlabel('Camera ID')
    ax.set_ylabel('Individuals')
    ax.set_title(f'{d}: Individuals/Camera')
    plt.show()

In [None]:
# Bar chart per directory: number of distinct individuals in each group.
for i, d in enumerate(dirs):
    # One row per individual, then count the rows of each group.
    members = df[i][['GROUP', 'ID']].drop_duplicates()
    per_group = members.groupby('GROUP').size()

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(per_group.index, per_group.values)
    ax.set_xlabel('Group ID')
    ax.set_ylabel('Individuals')
    ax.set_title(f'{d}: Individuals/Group')
    plt.show()

In [None]:
def categorize_id(id_):
    """Map an individual's numeric id to the name of its category.

    The id is converted with ``int()`` and looked up in fixed ranges:

    * 0      -> 'Silverback'
    * 1-19   -> 'Adult female'
    * 20-39  -> 'Blackback'
    * 40-59  -> 'Adolescent & Juvenil'
    * 60-79  -> 'Infant'
    * other  -> 'Unknown'

    Args:
        id_: The id as a string of digits (or any value ``int()`` accepts).

    Returns:
        The category name. Values that cannot be converted to an integer
        (e.g. an id made of letters, or None) yield 'Unknown' instead of
        raising, so a single badly named file cannot abort the analysis.
    """
    try:
        id_num = int(id_)
    except (TypeError, ValueError, OverflowError):
        return 'Unknown'

    if id_num == 0:
        return 'Silverback'
    if 1 <= id_num <= 19:
        return 'Adult female'
    if 20 <= id_num <= 39:
        return 'Blackback'
    if 40 <= id_num <= 59:
        return 'Adolescent & Juvenil'
    if 60 <= id_num <= 79:
        return 'Infant'
    return 'Unknown'

# Bar chart per directory: number of individuals of each category.
for i, d in enumerate(dirs):
    # One row per individual; copy so that adding a column leaves df[i] untouched.
    individuals = df[i].drop_duplicates(subset=['GROUP', 'ID']).copy()

    # Label every individual with the category derived from its ID
    individuals['Type'] = individuals['ID'].apply(categorize_id)

    # Individuals per category, ordered by category name
    type_distribution = individuals['Type'].value_counts().sort_index()

    # Plot the distribution of types
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(type_distribution.index, type_distribution.values)
    ax.set_xlabel('Type')
    ax.set_ylabel('Number of Individuals')
    ax.set_title(f'{d}: Types of Individuals')
    plt.show()