In [47]:
import csv, sys
from pathlib import Path

sys.path.append("/mnt/data/datasets/LSA-T")
from type_hints import CutData
from helpers.utils import flatten


def load_samples_csv(path: Path) -> list[Path]:
    with (path).open() as samples_f:
        samples = map(Path, list(csv.reader(samples_f))[0])
    return list(samples)

path = Path("../data/cuts/")
samples = {
    samples_set.name[:-4]: load_samples_csv(path / samples_set.name) for samples_set in path.glob("*.csv")
}
samples['all'] = samples['train_min_freq_5_threshold_05'] + samples['test_min_freq_5_threshold_05']

In [48]:
import json
from typing import TypedDict, Iterable, Optional
from collections import Counter


class SetStatistics(TypedDict):
    '''type of the cuts json data file'''
    labels: list[list[str]]
    unique_labels: set[str]
    vocab: set[str]
    singletons: list[str]
    labels_wo_sing: list[list[str]]


def get_words_for_set(paths: list[Path], inv_chars: list[str]) -> list[list[str]]:
    '''Returns list of words for each cut'''
    words: list[list[str]] = []
    for cut_path in paths:
        with cut_path.open() as datafile:
            data: CutData = json.load(datafile)
        words.append([w for w in clean_word(data['label'], inv_chars, ' ').lower().split(' ')])
    return words

def clean_word(word: str, chars: list[str], rep: str = '') -> str:
    for c in chars:
        word = word.replace(c,rep)
    return word

def get_statistics(paths: list[Path], inv_chars: list[str], log_playlist: Optional[str] = None) -> SetStatistics:
    if log_playlist is not None:
        print(f"Processing {log_playlist}")
    labels = get_words_for_set(paths, inv_chars)
    unique_labels = set(map(' '.join, labels))
    vocab = set(flatten(labels))
    words_freq = Counter(flatten(labels))
    singletons = [word for word, freq in words_freq.items() if freq == 1]
    labels_w_sing = [label for label in labels if any(((word in singletons) for word in label))]
    return {
        'labels': labels,
        'unique_labels': unique_labels,
        'vocab': vocab,
        'singletons': singletons,
        'labels_wo_sing': labels_w_sing
    }

inv_chars = ['\n', ',', '.', '"', '-', '?', '!', '¿', '¡', '_']

statistics = {k: get_statistics(v, inv_chars, log_playlist=k) for k,v in samples.items()}

Processing test_min_freq_2_threshold_05
Processing test_min_freq_1_threshold_05
Processing train_min_freq_2_threshold_05
Processing test_min_freq_5_threshold_05
Processing train_min_freq_1_threshold_05
Processing train_min_freq_5_threshold_05
Processing all


In [49]:
import pandas as pd
from typing import Any


def format_stats(set_name: str, stats: SetStatistics) -> list[Any]:
    return [
        set_name,
        len(stats['labels']),
        len(stats['unique_labels']),
        100 * len(stats['unique_labels']) / len(stats['labels']),
        len(stats['vocab']),
        len(stats['singletons']),
        100 * len(stats['singletons']) / len(stats['vocab']),
        100 * len(stats['labels_wo_sing']) / len(stats['labels'])
    ]

pd.DataFrame([format_stats(name, stats) for name, stats in statistics.items()],
    columns=[
        "set",
        "sentences",
        "unique_sentences",
        "unique_sentences_perc",
        "vocab_size",
        "singletons",
        "singletons_perc",
        "labels_w_sing_perc"
    ])


Unnamed: 0,set,sentences,unique_sentences,unique_sentences_perc,vocab_size,singletons,singletons_perc,labels_w_sing_perc
0,test_min_freq_2_threshold_05,1776,1750,98.536036,3318,1807,54.460518,60.135135
1,test_min_freq_1_threshold_05,2735,2708,99.012797,5546,3433,61.900469,67.97075
2,train_min_freq_2_threshold_05,7226,6853,94.838085,6287,2024,32.193415,22.695821
3,test_min_freq_5_threshold_05,910,895,98.351648,1579,771,48.828372,54.285714
4,train_min_freq_1_threshold_05,11065,10693,96.638048,12385,6442,52.014534,40.976051
5,train_min_freq_5_threshold_05,3767,3495,92.7794,2694,625,23.199703,14.361561
6,all,4677,4299,91.917896,2826,517,18.294409,9.856746


In [5]:
def out_of_vocabulary(labels: Iterable[list[str]], vocab: Iterable[str]) -> list[list[str]]:
    return [label for label in labels if all(((word in vocab) for word in label))]

oov = out_of_vocabulary(statistics['test_samples']['labels'], statistics['train_samples']['vocab'])
res_oov = out_of_vocabulary(statistics['res_test_samples']['labels'], statistics['res_train_samples']['vocab'])

print(100 * len(oov) / len(statistics['test_samples']['labels']))
print(100 * len(res_oov) / len(statistics['res_test_samples']['labels']))


60.664632078670735
59.86745213549337
