In [None]:
import os
import glob
import pandas as pd

We clone the repository containing the data here:

In [None]:
# Clone the dataset repository; git creates a `dialect_mt` directory in the
# current working directory. Later cells read from /content/dialect_mt/, so
# this presumably runs with /content as the working directory — TODO confirm
# when running outside Colab.
!git clone https://github.com/mahfuzibnalam/dialect_mt.git

Cloning into 'dialect_mt'...
remote: Enumerating objects: 2047, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 2047 (delta 4), reused 25 (delta 3), pack-reused 2018 (from 1)[K
Receiving objects: 100% (2047/2047), 83.43 MiB | 12.82 MiB/s, done.
Resolving deltas: 100% (1081/1081), done.
Updating files: 100% (1298/1298), done.


We then load the variety-specific data:

In [None]:
# Language groups available in the cloned repository; each one has its own
# sub-directory holding one text file per variety.
langs = ["Italian", "Swiss_German", "Basque", "Arabic", "Bengali", "Kurdish"]

# Map every language group to its data directory (trailing slash included,
# because later cells append a glob pattern directly to this string).
lang_dir_paths = {name: "/content/dialect_mt/" + name + "/" for name in langs}

In [None]:
def load_parallel_files(paths):
    """Load line-aligned text files into a single DataFrame, one column per file.

    Every file is read as UTF-8 and split into lines, and trailing empty lines
    are dropped. Each column is named after the file's base name without its
    extension; if two paths share that name, the later one overwrites the
    earlier one. Files shorter than the longest file are padded at the end
    with empty strings so that all columns have the same length.

    Args:
        paths: Iterable of paths to the text files to load.

    Returns:
        pandas.DataFrame with one column per file and one row per line index.
        An empty DataFrame is returned when ``paths`` yields no files.
    """
    def read_clean(path):
        """Return the lines of ``path`` with trailing empty lines removed."""
        with open(path, encoding="utf-8") as f:
            lines = f.read().splitlines()
        while lines and lines[-1] == "":
            lines.pop()
        return lines

    data = {}
    for p in paths:
        name = os.path.splitext(os.path.basename(p))[0]
        data[name] = read_clean(p)

    # max() raises ValueError on an empty sequence, so handle "no files"
    # explicitly instead of crashing with an unrelated-looking error.
    if not data:
        return pd.DataFrame()

    max_len = max(len(lines) for lines in data.values())
    # Right-pad shorter files with empty strings so every column has max_len rows.
    padded = {
        name: lines + [""] * (max_len - len(lines))
        for name, lines in data.items()
    }
    return pd.DataFrame(padded)

In [None]:
# Build one DataFrame per language group, keyed by the language name.
lang_dfs = {}
for lang, lang_dir in lang_dir_paths.items():
  # glob gives no ordering guarantee (it depends on the file system), so sort
  # the matches to keep the column order reproducible across runs and machines.
  # The "*" pattern can also match sub-directories, which cannot be opened as
  # text files, so only regular files are kept.
  all_lang_files = sorted(
      path for path in glob.glob(lang_dir + "*") if os.path.isfile(path)
  )
  lang_df = load_parallel_files(all_lang_files)
  lang_dfs[lang] = lang_df

Each group of language varieties is now its own pandas DataFrame with one column per variety (e.g. the Italian varieties shown here); the "Standard" column holds the standard variety:

In [None]:
# Preview the first rows of the Italian frame (one column per variety file).
lang_dfs['Italian'].head()

Unnamed: 0,Bitti,Villafranca_Padovana,Palmanova,San_Valentino,Padova5,San_Marco_in_Lamis2,Nimis,Forlì,Aldeno2,Palermo6,...,Comano,Alassio,Zianigo6,Cosenza,Taglio_di_Po2,Padola,Villacidro,Villa_di_Tirano,San_Cesario_di_Lecce,Cardito3
0,,,I comercianti i ga aumanta(do) i presi,,,,,I comercent i ha aumenté i prez,I comercianti i ha aumentà i prezi.,,...,I buteghee i a cressut i prezi,,,,,,,,,
1,,,I putei i se ga visto (i se ze visti) tuti nel...,,,,,I ragaz i s'è vest tot in t'e spec,I putéi i sa vardadi tuti al spègio.,,...,I tuson i s'a vist tucc ar specc,,,,,,,,,
2,,,I presi i ze cresui tanto,,,,,I prez i è carsù tent,I prezi i è alle stelle,,...,I prezi i è nai su da tantu,,,,,,,,,
3,,,I ga dovuo parlarme in segreto,,,,,I m'ha duvù parlé ad nascost,I ma parlà de scondom,,...,I gn'à vut da paralmm in segrett,,,,,,,,,
4,,,I crede de poderlo misurar ben,,,,,I crid ad putel misurè ben,I crede de poder misurarlo bem.,,...,I cred da podé misurall ben,,,,,,,,,


In [None]:
# Preview the first rows of the "Standard" column of the Italian frame.
lang_dfs['Italian']['Standard'].head()

Unnamed: 0,Standard
0,I commercianti hanno aumentato i prezzi
1,I ragazzi si sono visti tutti allo specchio
2,I prezzi sono cresciuti tanto
3,Mi hanno dovuto parlare in segreto
4,Credono di poterlo misurare bene


Number of varieties (i.e. columns, one per variety file) in each language group:

In [None]:
# Report the number of columns (one per variety file) of every language frame.
# Each entry pairs the label to print with the key used in `lang_dfs`.
column_count_labels = (
    ("Italian: ", "Italian"),
    ("Swiss German: ", "Swiss_German"),
    ("Basque: ", "Basque"),
    ("Arabic: ", "Arabic"),
    ("Bengali: ", "Bengali"),
    ("Kurdish: ", "Kurdish"),
)
for label, lang_key in column_count_labels:
    # shape[1] is the column count, the same number as len(df.columns.values).
    print(label, lang_dfs[lang_key].shape[1])

Italian:  440
Swiss German:  736
Basque:  41
Arabic:  26
Bengali:  7
Kurdish:  5


Load the mBERT tokenizer:

In [None]:
from transformers import BertTokenizer

In [None]:
# Load the pretrained tokenizer of the 'bert-base-multilingual-cased' (mBERT)
# checkpoint; the tokenizer files are downloaded if they are not available locally.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Pairwise comparison:

In [None]:
# Tokenize all sentences of the Italian "Standard" column in one call and keep
# only the `input_ids` field of the tokenizer output.
standard_italian_tokens = tokenizer(list(lang_dfs['Italian']['Standard'])).input_ids
# etc.