The DGS data that I obtained comes with .eaf files that have annotations that I plan to use to extract key
parts of the videos with relevant words. My first objective is to determine the relevant tiers and then to get
word counts to determine which words would be best for use in a small DGS translation prototype.

The archive can be found at:

https://www.sign-lang.uni-hamburg.de/dgs-korpus/

In [7]:
import pandas as pd
import pympi
import os
import re
import json
from moviepy import VideoFileClip

"""
loads in the .eaf files and prints the tier names.
"""

directory = '../data/raw/dialogues'

# os.listdir returns entries in arbitrary order; sorting keeps `files`
# (and anything that indexes into it, such as files[0]) reproducible
# between runs and machines.
files = sorted(
    f for f in os.listdir(directory)
    if f.endswith('.eaf') and os.path.isfile(os.path.join(directory, f))
)

# Print the tier names of every annotation file so they can be compared.
for file in files:
    eaf = pympi.Elan.Eaf(os.path.join(directory, file))
    tier_names = eaf.get_tier_names()
    print(tier_names)

dict_keys(['Deutsche_Übersetzung_A', 'Translation_into_English_A', 'Lexem_Gebärde_r_A', 'Lexeme_Sign_r_A', 'Gebärde_r_A', 'Sign_r_A', 'Lexem_Gebärde_l_A', 'Lexeme_Sign_l_A', 'Gebärde_l_A', 'Sign_l_A', 'Mundbild_Mundgestik_A', 'Deutsche_Übersetzung_B', 'Translation_into_English_B', 'Lexem_Gebärde_r_B', 'Lexeme_Sign_r_B', 'Gebärde_r_B', 'Sign_r_B', 'Lexem_Gebärde_l_B', 'Lexeme_Sign_l_B', 'Gebärde_l_B', 'Sign_l_B', 'Mundbild_Mundgestik_B', 'Moderator'])
dict_keys(['Deutsche_Übersetzung_A', 'Translation_into_English_A', 'Lexem_Gebärde_r_A', 'Lexeme_Sign_r_A', 'Gebärde_r_A', 'Sign_r_A', 'Lexem_Gebärde_l_A', 'Lexeme_Sign_l_A', 'Gebärde_l_A', 'Sign_l_A', 'Mundbild_Mundgestik_A', 'Deutsche_Übersetzung_B', 'Translation_into_English_B', 'Lexem_Gebärde_r_B', 'Lexeme_Sign_r_B', 'Gebärde_r_B', 'Sign_r_B', 'Lexem_Gebärde_l_B', 'Lexeme_Sign_l_B', 'Gebärde_l_B', 'Sign_l_B', 'Mundbild_Mundgestik_B', 'Moderator', 'Translation_into_English_Mod'])
dict_keys(['Deutsche_Übersetzung_A', 'Translation_into_Eng

Based on the documentation and looking at these print outs, it seems Lexem_Gebärde and Gebärde are probably
the most relevant tiers. A/B refers to the person doing the signing (2 person dialogues) and l and r
refer to the hand used for the gesture.

In [8]:
"""
prints annotations of each identified tier.
"""

# Parse the first annotation file once; all four tiers are read from the
# same Eaf object instead of re-loading the file halfway through.
eaf = pympi.Elan.Eaf(os.path.join(directory, files[0]))

lexeme_tiers = ['Lexem_Gebärde_r_A', 'Lexem_Gebärde_l_A', 'Lexem_Gebärde_r_B', 'Lexem_Gebärde_l_B']

for position, tier_name in enumerate(lexeme_tiers):
    try:
        annotations = eaf.get_annotation_data_for_tier(tier_name)
    except KeyError:
        # A tier may be absent from a given file; report it instead of
        # aborting the whole cell.
        print(f"\n{tier_name}: Key Not Found.")
    else:
        # Every block after the first is preceded by a blank line.
        separator = "\n" if position else ""
        print(f"{separator}Annotations from {tier_name}:\n{annotations}")

Annotations from Lexem_Gebärde_r_A:
[(500, 660, 'ICH1'), (780, 1020, 'GEWESEN1'), (1060, 1860, 'VERKEHR1B*'), (2020, 2540, '$INDEX1'), (2660, 2780, '$NUM-EINS-BIS-ZEHN1A:1d'), (2900, 4140, 'FAHREN-LENKRAD1^*'), (4220, 4340, 'ICH1'), (4500, 4700, 'ÜBER1'), (4700, 4820, '$NUM-EINS-BIS-ZEHN1A:1d'), (4940, 5340, 'SEHEN1'), (5580, 5740, 'ICH1'), (5900, 6300, '$PROD*'), (6620, 6740, '$INDEX1'), (6900, 7260, 'PRÜFUNG1'), (7380, 7500, '$INDEX1'), (7660, 8180, 'FAHRER1'), (8220, 8340, 'PRÜFUNG1'), (8900, 9060, 'ICH1*'), (9220, 11300, '$PROD*'), (11780, 11900, 'MANN1'), (12420, 12580, 'SAGEN1'), (12820, 13140, 'TACHOMETER1*'), (13140, 13380, '$ORAL^'), (13380, 13500, 'ICH1'), (13580, 14060, 'GRUND4A'), (14140, 14260, 'NICHT3A*'), (14340, 14420, 'VERGESSEN1'), (14580, 14660, '$NUM-EINS-BIS-ZEHN1A:1d'), (14780, 14900, 'TAG1B'), (14940, 15220, 'VOR-ZEIT1D'), (15300, 15420, 'ICH1'), (15540, 15660, 'SCHON1A'), (15740, 15820, 'ICH1'), (15900, 16740, 'BESOFFEN3A*'), (16860, 17420, 'NACHT2'), (17460, 18

In [9]:
# Print the annotations of each Gebärde tier of the currently loaded `eaf`,
# reporting tiers that the file does not contain.
gebaerde_tiers = ['Gebärde_r_A', 'Gebärde_l_A', 'Gebärde_r_B', 'Gebärde_l_B']

for position, tier_name in enumerate(gebaerde_tiers):
    try:
        # Only the lookup can raise KeyError, so only the lookup is guarded.
        annotations = eaf.get_annotation_data_for_tier(tier_name)
    except KeyError:
        print(f"\n{tier_name}: Key Not Found.")
    else:
        # The first successful block has no leading blank line; later ones do.
        separator = "\n" if position else ""
        print(f"{separator}Annotations from {tier_name}:\n{annotations}")

Annotations from Gebärde_r_A:
[(500, 660, 'ICH1^', 'ICH1'), (780, 1020, 'GEWESEN1^', 'GEWESEN1'), (1060, 1860, 'VERKEHR1B^*', 'VERKEHR1B*'), (2020, 2540, '$INDEX1^', '$INDEX1'), (2660, 2780, '$NUM-EINS-BIS-ZEHN1A^*', '$NUM-EINS-BIS-ZEHN1A:1d'), (2900, 4140, 'FAHREN-LENKRAD1^*', 'FAHREN-LENKRAD1^*'), (4220, 4340, 'ICH1^', 'ICH1'), (4500, 4700, 'ÜBER1^', 'ÜBER1'), (4700, 4820, '$NUM-EINS-BIS-ZEHN1A^*', '$NUM-EINS-BIS-ZEHN1A:1d'), (4940, 5340, 'SEHEN1^', 'SEHEN1'), (5580, 5740, 'ICH1^', 'ICH1'), (5900, 6300, '$PROD*', '$PROD*'), (6620, 6740, '$INDEX1^', '$INDEX1'), (6900, 7260, 'BEVORZUGEN1^', 'PRÜFUNG1'), (7380, 7500, '$INDEX1^', '$INDEX1'), (7660, 8180, 'STEUERN-LENKRAD1^', 'FAHRER1'), (8220, 8340, 'BEVORZUGEN1^', 'PRÜFUNG1'), (8900, 9060, 'ICH1^*', 'ICH1*'), (9220, 11300, '$PROD*', '$PROD*'), (11780, 11900, 'MANN1^', 'MANN1'), (12420, 12580, 'SAGEN1^', 'SAGEN1'), (12820, 13140, 'SKALA1^*', 'TACHOMETER1*'), (13140, 13380, '$ORAL^', '$ORAL^'), (13380, 13500, 'ICH1^', 'ICH1'), (13580, 140

It looks like either one of these pairs of tiers could be used, so I want to check which pair of tier names
is present in all the annotations that I currently have.

In [10]:
# Groups of tiers whose presence across all annotation files is checked,
# in the order the results are printed.
tier_groups = [
    ['Gebärde_r_A', 'Gebärde_l_A', 'Gebärde_r_B', 'Gebärde_l_B'],
    ['Lexem_Gebärde_r_A', 'Lexem_Gebärde_l_A', 'Lexem_Gebärde_r_B', 'Lexem_Gebärde_l_B'],
    ['Gebärde_r_A', 'Gebärde_r_B'],
    ['Lexem_Gebärde_r_A', 'Lexem_Gebärde_r_B'],
    ['Gebärde_l_A', 'Gebärde_l_B'],
    ['Lexem_Gebärde_l_A', 'Lexem_Gebärde_l_B'],
]

# Parse every .eaf file exactly once and remember its tier names; the
# checks below only need the names, not the parsed documents.
tier_names_per_file = [
    set(pympi.Elan.Eaf(os.path.join(directory, file)).get_tier_names())
    for file in files
]

for tier_checks in tier_groups:
    # One boolean per file: does the file contain every tier of the group?
    tiers_present = [
        all(x in names for x in tier_checks)
        for names in tier_names_per_file
    ]
    print(f"{tier_checks} present in all files: {all(tiers_present)}")

['Gebärde_r_A', 'Gebärde_l_A', 'Gebärde_r_B', 'Gebärde_l_B'] present in all files: False
['Lexem_Gebärde_r_A', 'Lexem_Gebärde_l_A', 'Lexem_Gebärde_r_B', 'Lexem_Gebärde_l_B'] present in all files: False
['Gebärde_r_A', 'Gebärde_r_B'] present in all files: True
['Lexem_Gebärde_r_A', 'Lexem_Gebärde_r_B'] present in all files: True
['Gebärde_l_A', 'Gebärde_l_B'] present in all files: False
['Lexem_Gebärde_l_A', 'Lexem_Gebärde_l_B'] present in all files: False


The only tiers present in all .eaf files are the right-hand tiers, which makes sense
as the right hand is the dominant hand.

Next, I remove the endings of the lexemes in order to make sure that I can get an accurate
lexeme count for each of the signs.

In [11]:
def extract_base_lexeme(text):
    """
    Return the leading run of letters of an upper-cased gloss.

    The gloss is upper-cased and everything from the first character that
    is not A-Z or an umlaut onwards (digits, '^', '*', ':', '-') is dropped,
    e.g. 'VERKEHR1B*' -> 'VERKEHR'. Note that a hyphenated gloss is cut at
    the hyphen ('FAHREN-LENKRAD1^*' -> 'FAHREN').

    If the upper-cased text does not start with such a letter, the input
    is returned unchanged.
    """
    leading_letters = re.compile(r'^([A-ZÄÖÜß]+)').match(text.upper())
    if leading_letters is None:
        return text
    return leading_letters.group(1)

In [12]:
# Count how often each base lexeme occurs across all lexeme tiers of all files.
tier_names = ['Lexem_Gebärde_r_A', 'Lexem_Gebärde_r_B', 'Lexem_Gebärde_l_A', 'Lexem_Gebärde_l_B']
word_counts = {}

for eaf_name in files:
    eaf = pympi.Elan.Eaf(os.path.join(directory, eaf_name))
    tiers_in_file = eaf.get_tier_names()

    # Tiers that a file does not contain are skipped.
    for tier_name in (t for t in tier_names if t in tiers_in_file):
        for annotation in eaf.get_annotation_data_for_tier(tier_name):
            gloss = annotation[2]

            # Glosses starting with '$' are excluded from the count.
            if not gloss.startswith('$'):
                lexeme = extract_base_lexeme(gloss)
                word_counts[lexeme] = word_counts.get(lexeme, 0) + 1


In [13]:
# Turn the lexeme -> count mapping into a table ordered from the most to the
# least frequent lexeme and show the top 20 rows.
lexeme_count_pairs = list(word_counts.items())
counts = pd.DataFrame(lexeme_count_pairs, columns=['Lexeme', 'Count'])
counts = counts.sort_values(by=['Count'], ascending=False)
print(counts.head(20))

       Lexeme  Count
0         ICH   1595
12      NICHT    204
77       TAUB    168
199       GUT    151
53        WIE    143
183      AUCH    130
163      ALLE    119
82     WISSEN    119
100        DU    113
204    FERTIG    112
113      NEIN    105
9       SAGEN    102
5       SEHEN    102
299  GEBÄRDEN    102
137      MUSS    101
66     STIMMT     97
61        WAS     96
168    KOMMEN     92
54   ARBEITEN     90
58       VIEL     90


There seems to be sufficient data to train the model on a few key words of choice.
I will take the top 10 words for use in training. I am saving it as a list that
I will convert to a set when searching for start and stop times so that I can
get an O(1) lookup time.

In [14]:
# Keep the ten most frequent lexemes (counts is already sorted descending).
words = counts['Lexeme'].head(10).tolist()

print(words)

# Persist the selection as a JSON list.
with open('../data/word_set.json', 'w') as f:
    f.write(json.dumps(words))


['ICH', 'NICHT', 'TAUB', 'GUT', 'WIE', 'AUCH', 'ALLE', 'WISSEN', 'DU', 'FERTIG']


The next step is to extract all the start and stop times for the gestures and the gloss for
the gestures so that I can trim clips and create the dataset for model training.

In [15]:
# Collect (video filename, start, end, lexeme) for every annotation whose
# base lexeme is one of the selected words, then store the table as CSV.
word_set = set(words)
tier_names = ['Lexem_Gebärde_r_A', 'Lexem_Gebärde_r_B', 'Lexem_Gebärde_l_A', 'Lexem_Gebärde_l_B']
extracted_values = []

directory = '../data/raw/dialogues'
files = [
    entry for entry in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, entry)) and entry.endswith('.eaf')
]

for file in files:
    eaf = pympi.Elan.Eaf(os.path.join(directory, file))
    stem = os.path.splitext(file)[0]
    tiers_in_file = eaf.get_tier_names()

    for tier_name in tier_names:
        if tier_name not in tiers_in_file:
            continue

        # Tiers ending in 'A' map to the *_1a1.mp4 video, all others to *_1b1.mp4.
        suffix = '_1a1.mp4' if tier_name.endswith('A') else '_1b1.mp4'
        video_name = stem + suffix

        # The 'ICH' counter restarts for every tier of every file.
        ich_seen = 0

        for annotation in eaf.get_annotation_data_for_tier(tier_name):
            start, end, gloss = annotation[:3]

            if gloss.startswith('$'):
                continue

            lexeme = extract_base_lexeme(gloss)
            if lexeme not in word_set:
                continue

            if lexeme == 'ICH':
                ich_seen += 1
                # Only the first five 'ICH' annotations of a tier are kept.
                if ich_seen > 5:
                    continue

            extracted_values.append((video_name, start, end, lexeme))

dataset = pd.DataFrame(extracted_values, columns=['filename', 'start', 'end', 'lexeme'])
dataset.to_csv('../data/start_end_times.csv', index=False)

I want to check the value counts for these items to make sure there is some balance.
In the above code I cap the number of 'ICH' clips at five per tier per file, because
there are far more 'ICH' annotations than any other lexeme and such a strong imbalance could cause
overgeneralization. Below I simply check the counts to confirm that they are more balanced.

In [16]:
# Number of extracted clips per lexeme. Note that this rebinds `counts`,
# which until now held the lexeme-frequency DataFrame.
counts = dataset.value_counts(subset = 'lexeme')

print(counts)

lexeme
NICHT     204
TAUB      168
ICH       164
GUT       151
WIE       143
AUCH      130
ALLE      119
WISSEN    119
DU        113
FERTIG    112
Name: count, dtype: int64


Now that I have generated a list of all the filenames, start and end times, and the lexemes of interest,
I can begin the cutting part of the preprocessing. For this, I use 'moviepy'.

In [18]:
# Cut one padded clip per row of the start/end table and write it to
# output_dir. Clips whose output file already exists are skipped, so the
# cell can be re-run to resume an interrupted job.
df = pd.read_csv('../data/start_end_times.csv')

video_dir = '../data/raw/dialogues'
output_dir = '../data/clips'

# Padding in seconds added before and after each annotated sign.
buffer = 0.2

os.makedirs(output_dir, exist_ok=True)

for row in df.itertuples(index=False):
    input_file = os.path.join(video_dir, row.filename)
    lexeme = row.lexeme

    # The table's times are divided by 1000 to get seconds for moviepy.
    # Clamp the padded start at 0: a negative start would be interpreted
    # by moviepy as an offset from the END of the video.
    start = max(0.0, (row.start / 1000) - buffer)
    end = (row.end / 1000) + buffer

    base_name = os.path.splitext(row.filename)[0]
    output_filename = f"{base_name}_{start:.3f}_{lexeme}.mp4"
    output_path = os.path.join(output_dir, output_filename)

    if os.path.exists(output_path):
        continue

    try:
        with VideoFileClip(input_file) as video:
            # Do not let the trailing padding run past the end of the video.
            if video.duration is not None:
                end = min(end, video.duration)

            clip = video.subclipped(start, end)
            clip.write_videofile(output_path, codec='libx264', audio_codec='aac', logger=None)
    except Exception as e:
        # Best effort: report the failure and carry on with the next clip.
        print(f"Error cutting {input_file} at {start}-{end}: {e}")

        # Remove a partially written file so that the exists-check above
        # does not silently skip this clip on the next run.
        if os.path.exists(output_path):
            os.remove(output_path)