In [3]:
import csv
from pathlib import Path
from helpers.get_cut_paths import get_cut_paths


def load_samples_csv(path: Path) -> list[Path]:
    with (path).open() as samples_f:
        samples = map(Path, list(csv.reader(samples_f))[0])
    return list(samples)

path = Path("../data/cuts/")

samples = {
    'train_samples': load_samples_csv(path / "train.csv"),
    'test_samples': load_samples_csv(path / "test.csv"),
    'res_train_samples': load_samples_csv(path / "train_res.csv"),
    'res_test_samples': load_samples_csv(path / "test_res.csv"),
}
samples['all'] = samples['train_samples'] + samples['test_samples']

ModuleNotFoundError: No module named 'helpers'

In [11]:
import json
from typing import TypedDict, TypeVar, Iterable, Optional
from collections import Counter

T = TypeVar("T")

class CutData(TypedDict):
    '''type of the cuts json data file'''
    label: str
    start: float
    end: float
    video: str
    playlist: str

class SetStatistics(TypedDict):
    '''type of the cuts json data file'''
    labels: list[list[str]]
    unique_labels: set[str]
    vocab: set[str]
    singletons: list[str]
    labels_wo_sing: list[list[str]]


def get_words_for_set(paths: list[Path], inv_chars: list[str]) -> list[list[str]]:
    '''Returns list of words for each cut'''
    words: list[list[str]] = []
    for cut_path in paths:
        with cut_path.open() as datafile:
            data: CutData = json.load(datafile)
        words.append([w for w in clean_word(data['label'], inv_chars, ' ').lower().split(' ')])
    return words

def clean_word(word: str, chars: list[str], rep: str = '') -> str:
    for c in chars:
        word = word.replace(c,rep)
    return word

def flatten(list: Iterable[Iterable[T]]) -> list[T]:
    return [item for sublist in list for item in sublist]

def get_statistics(paths: list[Path], inv_chars: list[str], log_playlist: Optional[str] = None) -> SetStatistics:
    if log_playlist is not None:
        print(f"Processing {log_playlist}")
    labels = get_words_for_set(paths, inv_chars)
    unique_labels = set(map(' '.join, labels))
    vocab = set(flatten(labels))
    words_freq = Counter(flatten(labels))
    singletons = [word for word, freq in words_freq.items() if freq == 1]
    labels_w_sing = [label for label in labels if any(((word in singletons) for word in label))]
    return {
        'labels': labels,
        'unique_labels': unique_labels,
        'vocab': vocab,
        'singletons': singletons,
        'labels_wo_sing': labels_w_sing
    }

inv_chars = ['\n', ',', '.', '"', '-', '?', '!', '¿', '¡', '_']

statistics = {k: get_statistics(v, inv_chars, log_playlist=k) for k,v in samples.items()}

Processing train_samples
Processing test_samples
Processing res_train_samples
Processing res_test_samples
Processing all


In [12]:
import pandas as pd
from typing import Any


def format_stats(set_name: str, stats: SetStatistics) -> list[Any]:
    return [
        set_name,
        len(stats['labels']),
        len(stats['unique_labels']),
        100 * len(stats['unique_labels']) / len(stats['labels']),
        len(stats['vocab']),
        len(stats['singletons']),
        100 * len(stats['singletons']) / len(stats['vocab']),
        100 * len(stats['labels_wo_sing']) / len(stats['labels'])
    ]

pd.DataFrame([format_stats(name, stats) for name, stats in statistics.items()],
    columns=[
        "set",
        "sentences",
        "unique_sentences",
        "unique_sentences_perc",
        "vocab_size",
        "singletons",
        "singletons_perc",
        "labels_w_sing_perc"
    ])


Unnamed: 0,set,sentences,unique_sentences,unique_sentences_perc,vocab_size,singletons,singletons_perc,labels_w_sing_perc
0,train_samples,11931,11494,96.337273,12702,6560,51.64541,38.957338
1,test_samples,2949,2915,98.847067,5702,3502,61.417047,65.140726
2,res_train_samples,10970,10628,96.882407,11978,6234,52.045417,39.93619
3,res_test_samples,2716,2692,99.116348,5350,3302,61.719626,65.868925
4,all,14880,14254,95.793011,14239,7150,50.2142,34.966398


In [13]:
def out_of_vocabulary(labels: Iterable[list[str]], vocab: Iterable[str]) -> list[list[str]]:
    return [label for label in labels if all(((word in vocab) for word in label))]

oov = out_of_vocabulary(statistics['test_samples']['labels'], statistics['train_samples']['vocab'])
res_oov = out_of_vocabulary(statistics['res_test_samples']['labels'], statistics['res_train_samples']['vocab'])

print(100 * len(oov) / len(statistics['test_samples']['labels']))
print(100 * len(res_oov) / len(statistics['res_test_samples']['labels']))


60.664632078670735
59.86745213549337
