# Building Vocabulary

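This notebook defines the target vocabulary: it selects candidate words from the available sign language datasets and builds the combined metadata file for the chosen words.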

In [1]:
import os
import pandas as pd

# Change working directory to the project root directory (the parent of the
# directory that is current when this cell first runs). The root is computed
# only once per kernel session, so re-running this cell does not climb one
# more level up the directory tree each time.
if "PROJECT_ROOT" not in globals():
    current_dir = os.getcwd()
    PROJECT_ROOT = os.path.abspath(os.path.join(current_dir, os.pardir))
os.chdir(PROJECT_ROOT)
os.getcwd()


'/Users/gustavosantos/code/omdena/SaoPauloBrazilChapter_BrazilianSignLanguage'

In [2]:
def text_normalization(text):
    """Turn a label into a lower-case, underscore-separated key.

    The text is lower-cased and stripped of surrounding whitespace; then every
    newline, carriage return, tab, hyphen and space becomes an underscore.
    Consecutive separators are not collapsed: each one yields its own
    underscore.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    cleaned = text.lower().strip()
    # Same two-step replacement as before: separators -> space -> underscore.
    for separator in ("\n", "\r", "\t", "-"):
        cleaned = cleaned.replace(separator, " ")
    return cleaned.replace(" ", "_")

In [3]:
# Read the selected video variants and keep a single copy of every row.
raw_variants = pd.read_json("data/raw/selected_video_variants.json")
vocab_df = raw_variants.drop_duplicates()

# Sanity check: number of duplicated rows still present after de-duplication.
vocab_df.duplicated().sum()

0

In [4]:
# Per word: number of distinct datasets ("nunique") and number of rows
# ("count"), ordered from the largest counts to the smallest.
dataset_stats = vocab_df.groupby("word")["dataset"].agg(["nunique", "count"])
count_data = dataset_stats.sort_values(["count", "nunique"], ascending=False)

# Report how many words found in 3 datasets have 3, 4, 5 and 6 options.
in_three_datasets = count_data["nunique"] == 3
for i in range(3, 7):
    print(f"Words with {i} options and 3 datasets:")
    has_i_options = count_data["count"] == i
    display(count_data[in_three_datasets & has_i_options].shape[0])

Words with 3 options and 3 datasets:


28

Words with 4 options and 3 datasets:


55

Words with 5 options and 3 datasets:


216

Words with 6 options and 3 datasets:


21

In [5]:
# Candidate words: found in exactly 3 distinct datasets with a count of at
# least 5, returned as an index sorted by word.
is_candidate = (count_data["nunique"] == 3) & (count_data["count"] >= 5)
candidates = count_data[is_candidate].sort_index().index

In [6]:
# Save the candidate words, one per line. The encoding is set explicitly
# because the vocabulary contains accented words (e.g. "zíper") and the
# default encoding of open() depends on the platform locale.
with open("data/interim/word_candidates.txt", "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in candidates)

In [7]:
# Load the best-candidates file, one entry per line with surrounding
# whitespace removed. The encoding is set explicitly because the default
# encoding of open() depends on the platform locale and the vocabulary
# contains accented words.
with open("data/interim/best_candidates.txt", "r", encoding="utf-8") as f:
    best_candidates = [line.strip() for line in f]

In [8]:
# Keep only the text that precedes the first ":" of every entry.
best_candidates = [entry.partition(":")[0] for entry in best_candidates]
print(f"Number of best candidates: {len(best_candidates)}")

Number of best candidates: 66


In [9]:
BASE_DIR = os.path.join(os.getcwd(), "data", "raw", "")

# Set paths to the raw data files (each one ends with a path separator)
ne_path = os.path.join(BASE_DIR, "INES", "")
sb_path = os.path.join(BASE_DIR, "SignBank", "")
uf_path = os.path.join(BASE_DIR, "UFV", "")
vl_path = os.path.join(BASE_DIR, "V-Librasil", "")


def load_metadata(dataset_path, dataset_name):
    """Read ``metadata.csv`` from a dataset directory.

    Args:
        dataset_path: Directory expected to contain ``metadata.csv``.
        dataset_name: Name used in the message printed when the file is
            missing.

    Returns:
        The metadata as a DataFrame, or ``None`` when the file does not exist
        (a "<dataset_name> metadata not found" message is printed in that
        case).
    """
    try:
        return pd.read_csv(os.path.join(dataset_path, "metadata.csv"))
    except FileNotFoundError:
        print(f"{dataset_name} metadata not found")
        return None


# Every variable is always (re)assigned: a missing dataset yields None instead
# of leaving the name undefined or pointing at data from a previous run.
ne_raw_df = load_metadata(ne_path, "INES")
sb_raw_df = load_metadata(sb_path, "SignBank")
uf_raw_df = load_metadata(uf_path, "UFV")
vl_raw_df = load_metadata(vl_path, "V-Librasil")

UFV metadata not found


In [10]:
# INES: keep the rows flagged in the ``file_exists`` column, drop exact
# duplicate rows, normalize the labels and extract the first run of digits of
# ``scraped_label`` into ``label_number`` (0 when there are no digits).
# Each frame is built as a single chain ending in ``assign``, which returns a
# new DataFrame, so no column is ever assigned on a slice of the raw data.
ne_df = (
    ne_raw_df[ne_raw_df["file_exists"]]
    .drop_duplicates(keep="first")
    .assign(
        label=lambda df: df["label"].apply(text_normalization),
        label_number=lambda df: (
            df["scraped_label"]
            .str.extract(r"(\d+)", expand=False)
            .astype(float)  # NaN for labels without digits
            .fillna(0)
            .astype(int)
        ),
    )
)

# SignBank: drop exact duplicate rows and normalize the labels.
sb_df = sb_raw_df.drop_duplicates(keep="first").assign(
    label=lambda df: df["label"].apply(text_normalization)
)

# V-Librasil: drop exact duplicate rows and normalize the labels.
vl_df = vl_raw_df.drop_duplicates(keep="first").assign(
    label=lambda df: df["label"].apply(text_normalization)
)

In [11]:
# Preview the vocabulary table (pandas truncates the rendering of long frames).
vocab_df

Unnamed: 0,dataset,word,chosen_video
0,INES,abacaxi,0
1,SIGNBANK,abacaxi,1
2,INES,abanar,0
3,SIGNBANK,abanar,1
4,INES,abandonar,1
...,...,...,...
2063,SIGNBANK,zíper,1
2064,INES,zíper,0
2065,V-LIBRASIL,zíper,2
2066,V-LIBRASIL,zíper,1


In [12]:
# Short code stored in ``source`` for each value of the ``dataset`` column.
source_map = {"INES": "ne", "SIGNBANK": "sb", "UFV": "uf", "V-LIBRASIL": "vl"}

# Append the ``source`` code and normalize the words.
vocab_df = vocab_df.assign(
    source=vocab_df["dataset"].map(source_map),
    word=vocab_df["word"].apply(text_normalization),
)
vocab_df.head()

Unnamed: 0,dataset,word,chosen_video,source
0,INES,abacaxi,0,ne
1,SIGNBANK,abacaxi,1,sb
2,INES,abanar,0,ne
3,SIGNBANK,abanar,1,sb
4,INES,abandonar,1,ne


In [13]:
def select_sign_columns(df, id_column):
    """Select the columns shared by all datasets, exposing ``id_column`` as ``sign_id``.

    Args:
        df: Dataset metadata containing the ``label``, ``data_source`` and
            ``video_url`` columns plus ``id_column``.
        id_column: Name of the column whose values become ``sign_id``.

    Returns:
        A new DataFrame with the columns ``label``, ``data_source``,
        ``video_url`` and ``sign_id``, in that order.
    """
    shared_columns = ["label", "data_source", "video_url"]
    # Select first, then rename: ``sign_id`` ends up as the last column and the
    # result is a fresh frame, so nothing is modified in place on a slice.
    return df[shared_columns + [id_column]].rename(columns={id_column: "sign_id"})


# The column used as ``sign_id`` differs per dataset.
selected_ne_df = select_sign_columns(ne_df, "label_number")
selected_sb_df = select_sign_columns(sb_df, "sign_variant")
selected_vl_df = select_sign_columns(vl_df, "signer_number")

In [14]:
# Stack the three per-dataset selections row-wise into a single table
# (the original row indexes are kept).
per_dataset_frames = [selected_ne_df, selected_sb_df, selected_vl_df]
combined_df = pd.concat(per_dataset_frames, axis=0)
combined_df.head()

Unnamed: 0,label,data_source,video_url,sign_id
0,a,ne,https://www.ines.gov.br/dicionario-de-libras/p...,0
1,abacate,ne,https://www.ines.gov.br/dicionario-de-libras/p...,0
2,abacaxi,ne,https://www.ines.gov.br/dicionario-de-libras/p...,0
3,abafar,ne,https://www.ines.gov.br/dicionario-de-libras/p...,0
4,abaixo,ne,https://www.ines.gov.br/dicionario-de-libras/p...,0


In [15]:
# Inner join between the combined metadata and the vocabulary: a row is kept
# only when its label, data source and sign id match a word, source and
# chosen video of the vocabulary. The result is ordered by the join keys and
# re-indexed from 0.
ordering_keys = ["label", "data_source", "sign_id"]
matched = pd.merge(
    combined_df,
    vocab_df,
    how="inner",
    left_on=ordering_keys,
    right_on=["word", "source", "chosen_video"],
)
full_data = matched.sort_values(ordering_keys).reset_index(drop=True)
display(full_data.head())
print(full_data.shape)

Unnamed: 0,label,data_source,video_url,sign_id,dataset,word,chosen_video,source
0,abacaxi,ne,https://www.ines.gov.br/dicionario-de-libras/p...,0,INES,abacaxi,0,ne
1,abacaxi,sb,https://videos.nals.cce.ufsc.br/SignBank/Vídeo...,1,SIGNBANK,abacaxi,1,sb
2,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,1,V-LIBRASIL,abacaxi,1,vl
3,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,2,V-LIBRASIL,abacaxi,2,vl
4,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,3,V-LIBRASIL,abacaxi,3,vl


(2051, 8)


In [16]:
# Read the best-candidates file and keep, for every line, the text before the
# first ":". The encoding is set explicitly because the default encoding of
# open() depends on the platform locale and the vocabulary contains accented
# words.
with open("data/interim/best_candidates.txt", "r", encoding="utf-8") as f:
    selected_labels = [line.strip().split(":")[0] for line in f]
len(selected_labels)

66

In [17]:
# The labels in ``full_data`` went through ``text_normalization``, so the
# selected labels get the same treatment before matching; otherwise entries
# containing upper-case letters, spaces or hyphens could never match.
# The normalization is idempotent, so already-normalized entries are unchanged.
normalized_labels = {text_normalization(label) for label in selected_labels}

# Keep the selected rows and drop the vocabulary columns, which duplicate
# ``label``, ``data_source`` and ``sign_id`` after the inner merge.
selected_data = full_data[full_data["label"].isin(normalized_labels)].drop(
    columns=["dataset", "word", "source", "chosen_video"]
)
display(selected_data.head())
print(selected_data.shape)

Unnamed: 0,label,data_source,video_url,sign_id
0,abacaxi,ne,https://www.ines.gov.br/dicionario-de-libras/p...,0
1,abacaxi,sb,https://videos.nals.cce.ufsc.br/SignBank/Vídeo...,1
2,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,1
3,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,2
4,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,3


(337, 4)


In [18]:
# Make sure the output folder exists first: to_csv does not create missing
# directories and would fail on a fresh checkout.
os.makedirs("data/raw/combined", exist_ok=True)
selected_data.to_csv("data/raw/combined/metadata_combined.csv", index=False)