<a href="https://colab.research.google.com/github/lucarinelli/conditional_text_generation/blob/main/notebooks/COCO_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import utilities

In [None]:
!pip install --quiet transformers datasets tokenizers

In [None]:
repo_dir = "/content/conditional_text_generation"
!rm -r {repo_dir}
!git clone https://github.com/lucarinelli/conditional_text_generation.git {repo_dir}

In [None]:
import sys
import os

# Expose the modules under <repo_dir>/src to the import system, once.
module_path = os.path.abspath(repo_dir + "/src")
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

In [None]:
from captions_dataset import *

# Configuration

In [None]:
data_path = "./data"
# exist_ok=True lets this cell be re-run without an error once the folder exists.
os.makedirs(data_path, exist_ok=True)

# COCO Analysis

In [None]:
def computeAverageOnDataset(dataset, fieldExtractor):
  """Measure the length of one field across every entry of a dataset.

  Args:
    dataset: iterable of entries that also supports len().
    fieldExtractor: callable mapping an entry to a sized value (anything len() accepts).

  Returns:
    A tuple (average, minimum, maximum) of the extracted lengths. The average
    is the sum of the lengths divided by len(dataset).

  Raises:
    ZeroDivisionError: if len(dataset) is 0.
  """
  lengths = [len(fieldExtractor(entry)) for entry in dataset]
  total = sum(lengths)
  # The division is evaluated first, so an empty dataset still fails on it;
  # default=None mirrors the "no entry seen" result for the extremes.
  return total / len(dataset), min(lengths, default=None), max(lengths, default=None)

In [None]:
def logControlCodeAnalysisOfDataset(dataset_type, control_code_type, use_supercategories, use_categories):
  """Load one dataset split and print statistics on the control codes of its entries.

  Args:
    dataset_type: split name, forwarded to load_or_setup_dataset as `split`.
    control_code_type: human-readable label of the configuration; only used in the messages.
    use_supercategories: forwarded to load_or_setup_dataset.
    use_categories: forwarded to load_or_setup_dataset.

  Prints the average, minimum and maximum length of each entry's "categories"
  field, plus the percentage of entries whose length does not exceed the average.
  """
  print("\n\nAnalysing {} dataset".format(dataset_type))
  # Announce the configuration before the load it describes, not after.
  print("Creating dataset using {}".format(control_code_type))
  dataset, _, _ = load_or_setup_dataset(data_path=data_path, split=dataset_type, use_supercategories = use_supercategories, use_categories = use_categories, force_dataset_update = True)
  # Local names chosen so that the builtins min/max are not shadowed.
  average, min_codes, max_codes = computeAverageOnDataset(dataset, lambda e: e["categories"])
  # Count the entries above the average in one pass, without temporary lists.
  above_average = sum(1 for e in dataset if len(e["categories"]) > average)
  percentile = 100 - above_average / len(dataset) * 100

  print("For {} dataset using {} the average number of control codes per caption is {}.\nIt's the {:.0f}th percentile. Minimum is {}. Maximum is {}"
    .format(dataset_type, control_code_type, average, percentile, min_codes, max_codes))

In [None]:
def logControlCodeAnalysis(type, use_supercategories, use_categories):
  """Run the control-code analysis on the "train" split, then on the "val" split.

  `type` is the label of the control-code configuration; it and both flags are
  passed through unchanged to logControlCodeAnalysisOfDataset.
  """
  for split in ("train", "val"):
    logControlCodeAnalysisOfDataset(split, type, use_supercategories, use_categories)

In [None]:
# Flags passed by keyword so the configuration is readable at the call site.
logControlCodeAnalysis("supercategories only", use_supercategories=True, use_categories=False)

In [None]:
# Flags passed by keyword so the configuration is readable at the call site.
logControlCodeAnalysis("categories only", use_supercategories=False, use_categories=True)

In [None]:
# Flags passed by keyword so the configuration is readable at the call site.
logControlCodeAnalysis("categories and supercategories", use_supercategories=True, use_categories=True)

In [None]:
def compute_average_length_of_dataset(ds_type):
  """Print caption-length statistics for one dataset split.

  Reports the average, minimum and maximum caption length, first in characters
  and then in whitespace-separated words, followed by the number of entries.

  Args:
    ds_type: split name passed to load_or_setup_dataset.
  """
  print("\n\nAnalysing {} dataset.".format(ds_type))
  # NOTE(review): the positional flags presumably map to use_supercategories=True,
  # use_categories=False, force_dataset_update=True; confirm against captions_dataset.
  # Only the dataset itself is needed here, so the other two results are discarded.
  ds, _, _ = load_or_setup_dataset(data_path, ds_type, True, False, True)
  averageChar, minC, maxC = computeAverageOnDataset(ds, lambda e: e["caption"])
  print("Average length of captions is {} chars. Min {} and max {}".format(averageChar, minC, maxC))
  # str.split() with no argument splits on any run of whitespace.
  averageWords, minW, maxW = computeAverageOnDataset(ds, lambda x: x["caption"].split())
  print("Average length of captions is {} words. Min {} and max {}".format(averageWords, minW, maxW))
  print("Dataset of type {} has {} entries".format(ds_type,len(ds)))


In [None]:
# Caption-length statistics for the training split, then the validation split.
for split in ("train", "val"):
  compute_average_length_of_dataset(split)