In [1]:
import json
import os


def get_folder_size_gb(folder_path: str) -> float:
    return sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _, filenames in os.walk(folder_path)
        for filename in filenames
    ) / (1024**3)


datasets = json.load(open("datasets.json", "r"))
for dataset in datasets:
    if "size" not in dataset:
        print(f"Processing {dataset['id']}")
        dataset["size"] = get_folder_size_gb(dataset["path"])
        print(f"Size: {dataset['size']} GB")
json.dump(datasets, open("datasets.json", "w"), indent=4)

In [2]:
import pandas as pd


df = pd.DataFrame(datasets)
df["size"] = df["size"].round(2)
df

Unnamed: 0,id,path,size
0,greek_sl_dataset,/mnt/disk3Tb/slt-datasets/GSL,5.42
1,indian_sl_dataset,/mnt/disk3Tb/slt-datasets/ISL,422.51
2,LSFB-CONT,/mnt/disk3Tb/slt-datasets/LSFB-CONT,28.05
3,How2Sign,/mnt/disk3Tb/slt-datasets/How2Sign,69.47
4,lsat,/mnt/disk3Tb/slt-datasets/lsat,45.59
5,phoenix14t,/mnt/disk3Tb/slt-datasets/RWTH_PHOENIX_2014T,5.75
6,Content4All-SWISSTXT-NEWS,/mnt/disk3Tb/slt-datasets/Content4All/ANNOTATE...,27.72
7,Content4All-SWISSTXT-WEATHER,/mnt/disk3Tb/slt-datasets/Content4All/ANNOTATE...,2.74
8,Content4All-VRT-NEWS,/mnt/disk3Tb/slt-datasets/Content4All/ANNOTATE...,17.33
9,WMT-SLT23 (part 1),/mnt/disk3Tb/slt-datasets/WMT-SLT23,160.37


In [3]:
df["size"].sum()

1088.46

In [4]:
from SLTDataset import SLTDataset

In [5]:
from collections import Counter


def text_analysis(text_series: pd.Series, freq_lt_thresholds: list[int] = [10]):
    sentence_count = text_series.value_counts()
    words = [
        word
        for sentence in map(lambda t: str(t).split(), text_series.to_list())
        for word in sentence
    ]
    word_counts = Counter(words)

    output = {
        "num_samples": len(text_series),
        "unique_sentence_count": len(sentence_count[sentence_count == 1]),
        "unique_sentence_percentage": round(
            100 * text_series.nunique() / len(text_series), 2
        ),
        "vocabulary_size": len(word_counts),
        "singleton_count": len(
            [word for word, count in word_counts.items() if count == 1]
        ),
        "singleton_percentage": round(
            100
            * len([word for word, count in word_counts.items() if count == 1])
            / len(word_counts),
            2,
        ),
    }
    for threshold in freq_lt_thresholds:
        output[f"word_count_lt_{threshold}"] = len(
            [word for word, count in word_counts.items() if count <= threshold]
        )
        output[f"word_percentage_lt_{threshold}"] = round(
            100
            * len([word for word, count in word_counts.items() if count <= threshold])
            / len(word_counts),
            2,
        )
    return output

In [6]:
texts_analysis = []
indices = []

Loading Greek Sign Language Dataset

In [7]:
DATA_DIR = "/mnt/disk3Tb/slt-datasets/GSL"

gsl_dataset = SLTDataset(data_dir=DATA_DIR, input_mode="pose", output_mode="text")
texts_analysis.append(text_analysis(gsl_dataset.annotations["text"]))
indices.append(gsl_dataset.metadata["id"])

Loaded metadata: {
    "name": "The Greek Sign Language (GSL) Dataset",
    "id": "greek_sl_dataset",
    "url": "https://vcl.iti.gr/dataset/gsl/",
    "download_link": "https://drive.google.com/drive/folders/18ruYi9MULMm1KQtUgdIhN0m-XilRhncg",
    "mirror_link": "https://drive.google.com/drive/folders/1EAVE5dxQIKGAL2yB0alvgTlv2t3JKW5v?usp=sharing",
    "input_language": "Greek Sign Language (GSL)",
    "output_language": "greek",
    "input_types": [
        "video",
        "pose"
    ],
    "output_types": [
        "text",
        "gloss"
    ]
}
Loaded annotations at /mnt/disk3Tb/slt-datasets/GSL/annotations.csv


Validating files:   0%|          | 0/10290 [00:00<?, ?it/s]

Validating files: 100%|██████████| 10290/10290 [00:00<00:00, 263301.86it/s]

Dataset loaded correctly





In [8]:
DATA_DIR = "/mnt/disk3Tb/slt-datasets/ISL"

isl_dataset = SLTDataset(data_dir=DATA_DIR, input_mode="pose", output_mode="text")
texts_analysis.append(text_analysis(isl_dataset.annotations["text"]))
indices.append(isl_dataset.metadata["id"])

Loaded metadata: {
    "name": "ISLTranslate",
    "id": "indian_sl_dataset",
    "url": "https://github.com/exploration-lab/isltranslate?tab=readme-ov-file",
    "download_link": "https://1drv.ms/f/s!AuBOJ2hW9GimgblntP72D_agQdokdQ?e=ZbeA6y",
    "mirror_link": "",
    "input_language": "Indian Sign Language (ISL)",
    "output_language": "english",
    "input_types": [
        "video",
        "pose"
    ],
    "output_types": [
        "text"
    ]
}
Loaded annotations at /mnt/disk3Tb/slt-datasets/ISL/annotations.csv


Validating files: 100%|██████████| 125856/125856 [00:00<00:00, 273270.39it/s]





In [9]:
DATA_DIR = "/mnt/disk3Tb/slt-datasets/RWTH_PHOENIX_2014T"

rwth_dataset = SLTDataset(data_dir=DATA_DIR, input_mode="pose", output_mode="text")
texts_analysis.append(text_analysis(rwth_dataset.annotations["text"]))
indices.append(rwth_dataset.metadata["id"])

Loaded metadata: {
    "name": "RWTH-PHOENIX-Weather 2014 T: Parallel Corpus of Sign Language Video, Gloss and Translation",
    "id": "rwth_phoenix_weather_2014_t",
    "url": "https://www-i6.informatik.rwth-aachen.de/~koller/RWTH-PHOENIX-2014-T/",
    "download_link": "https://www-i6.informatik.rwth-aachen.de/ftp/pub/rwth-phoenix/2016/phoenix-2014-T.v3.tar.gz",
    "mirror_link": "",
    "input_language": "German Sign Language (GSL)",
    "output_language": "german",
    "input_types": [
        "video",
        "pose"
    ],
    "output_types": [
        "text",
        "gloss"
    ]
}
Loaded annotations at /mnt/disk3Tb/slt-datasets/RWTH_PHOENIX_2014T/annotations.csv


Validating files:   0%|          | 0/8257 [00:00<?, ?it/s]

Validating files: 100%|██████████| 8257/8257 [00:00<00:00, 256342.38it/s]

Dataset loaded correctly





In [10]:
DATA_DIR = "/mnt/disk3Tb/slt-datasets/lsat"

lsat_dataset = SLTDataset(data_dir=DATA_DIR, input_mode="pose", output_mode="text")
texts_analysis.append(text_analysis(lsat_dataset.annotations["text"]))
indices.append(lsat_dataset.metadata["id"])

Loaded metadata: {
    "name": "LSA-T",
    "id": "lsat",
    "url": "https://midusi.github.io/LSA-T/",
    "download_link": "https://midusi.github.io/LSA-T/",
    "mirror_link": "",
    "input_language": "Argentinian Sign Language (LSA)",
    "output_language": "spanish",
    "input_types": [
        "video",
        "pose"
    ],
    "output_types": [
        "text"
    ]
}
Loaded annotations at /mnt/disk3Tb/slt-datasets/lsat/annotations.csv


Validating files: 100%|██████████| 8459/8459 [00:00<00:00, 275503.51it/s]







In [11]:
pd.DataFrame(texts_analysis, index=indices)

Unnamed: 0,num_samples,unique_sentence_count,unique_sentence_percentage,vocabulary_size,singleton_count,singleton_percentage,word_count_lt_10,word_percentage_lt_10
greek_sl_dataset,10290,55,3.18,479,22,4.59,33,6.89
indian_sl_dataset,125856,119008,96.0,75155,37456,49.84,65391,87.01
rwth_phoenix_weather_2014_t,8257,7900,96.35,3000,1110,37.0,2176,72.53
lsat,8459,7727,94.81,20479,12002,58.61,19156,93.54
