In [1]:
!mkdir -p data
print('Downloading data ...')
!wget https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.en-fa.tsv.gz -O data/WikiMatrix.en-fa.tsv.gz
print('---------------------')
print('Unzipping file ...')
!gunzip -k -f data/WikiMatrix.en-fa.tsv.gz
print('---------------------')
print('Peek into the file')
!head -10 data/WikiMatrix.en-fa.tsv
print('---------------------')

Downloading data ...
--2023-11-06 04:25:57--  https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.en-fa.tsv.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.173.166.48, 18.173.166.51, 18.173.166.74, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.173.166.48|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 139938612 (133M) [application/gzip]
Saving to: ‘data/WikiMatrix.en-fa.tsv.gz’


2023-11-06 04:26:02 (33.9 MB/s) - ‘data/WikiMatrix.en-fa.tsv.gz’ saved [139938612/139938612]

---------------------
Unzipping file ...
---------------------
Peek into the file
1.2771435232553505	That purpose will be fulfilled with the advent of the Lord of the Age (may God hasten his advent).	این امر خداوندگار شیوا تا زمان چیتراواهانا جاری است.
1.2557609887189667	Eat Within Your Means.	Eat Within Your Means (به انگلیسی).
1.2397336646789108	It will destroy everything at the bidding of its Lord."	وهمهٔ شکستگی‌ها که کشیدی به سوی خدای خو

In [2]:
print('File length ...')
!wc -l data/WikiMatrix.en-fa.tsv
print('---------------------')

File length ...
1128507 data/WikiMatrix.en-fa.tsv
---------------------


**LASER CONFIDENCE**

LASER is a multi-lingual neural sentence embedding model that is often used for cross-lingual sentence/document retrieval. Similarities in the embedding space are often used as proxies for cross-lingual similarities

In [3]:
from tqdm import tqdm
import numpy as np

def num_lines_in_file(fname):
    """
    Returns the number of lines in a file.
    """
    with open(fname, 'r') as f:
        for i, _ in enumerate(f):
            pass
    return i + 1

def filter_tsv_with_conf(
    input_file, output_file_lang_1, output_file_lang_2,
    confidence_threshold=None, confidence_column=None
):
    """
    Filters a tsv file that has confidence scores associated with each parallel example.

    For example:

    1.23 \t This is a sentence in lang1 \t This is a sentence in lang2
    """
    print()
    print('====================================')
    print('======= TSV Conf Filtering =========')
    print('====================================')
    print()
    num_lines = num_lines_in_file(input_file)
    scores = []
    num_output_lines = 0
    lang_1_col = 0
    lang_2_col = 1
    with open(input_file, 'r') as f, \
        open(output_file_lang_1, 'w') as f_out_1, \
        open(output_file_lang_2, 'w') as f_out_2:
        for line in tqdm(f, total=num_lines, desc=f"Filtering file by confidence {confidence_threshold}"):
            if line.strip() == '':
                continue
            line = line.strip().split('\t')
            if len(line) < 2:
                continue
            if confidence_threshold is not None and float(line[confidence_column]) < confidence_threshold:
                continue
            else:
                if confidence_threshold is not None:
                    scores.append(float(line[confidence_column]))
                    if confidence_column == 0:
                        lang_1_col, lang_2_col = 1, 2
                    elif confidence_column == 2:
                        lang_1_col, lang_2_col = 0, 1
                    elif confidence_column == 1:
                        lang_1_col, lang_2_col = 0, 2
                    else:
                        raise ValueError(f"Invalid Column for confidence {confidence_column}")
                f_out_1.write(line[lang_1_col] + '\n')
                f_out_2.write(line[lang_2_col] + '\n')
                num_output_lines += 1

    if confidence_threshold is not None:
        print(f'Confidence score average  : {np.mean(scores)}')
        print(f'Confidence score variance : {np.var(scores)}')
        print(f'Kept {num_output_lines} out of {num_lines} after conversion ({(num_output_lines / num_lines) * 100}%)')
        print('====================================')

filter_tsv_with_conf(
    'data/WikiMatrix.en-fa.tsv',
    'data/WikiMatrix.en-fa.en',
    'data/WikiMatrix.en-fa.fa',
    confidence_threshold=1.04, confidence_column=0
)





Filtering file by confidence 1.04: 100%|██████████| 1128507/1128507 [00:05<00:00, 191646.49it/s]


Confidence score average  : 1.0574949045215685
Confidence score variance : 0.0002476336996916896
Kept 303805 out of 1128507 after conversion (26.920967260282836%)


**Language ID filtering**

Noisy parallel corpora often contain sentences that are not in the intended language. A classifier that determines the language in which a sentence is written can be used to filter out sentences that aren't in the appropriate language

In [4]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199771 sha256=274c296e50738dc00c7e0f96398a2b0a3456abb63188a71ae60fbdf43bbda03b
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [5]:
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O data/lid.176.bin
print()
print('====================================')
print('====== Language ID Filtering =======')
print('====================================')
print()


!wget https://raw.github.com/NVIDIA/NeMo/main/scripts/neural_machine_translation/filter_langs_nmt.py \
    -O filter_langs_nmt.py

!python filter_langs_nmt.py \
    --input-src data/WikiMatrix.en-fa.en  \
    --input-tgt data/WikiMatrix.en-fa.fa \
    --output-src data/WikiMatrix.en-fa.langidfilter.en  \
    --output-tgt data/WikiMatrix.en-fa.langidfilter.fa  \
    --source-lang en \
    --target-lang fa \
    --removed-src data/WikiMatrix.en-fa.langidfilter.removed.en  \
    --removed-tgt data/WikiMatrix.en-fa.langidfilter.removed.fa  \
    --fasttext-model data/lid.176.bin

print()
print('-----------------------------------------')
print('Number of removed sentences:')
print('-----------------------------------------')
print()
!wc -l data/WikiMatrix.en-fa.langidfilter.removed.fa

print()
print('-----------------------------------------')
print('Examples of removed sentences')
print('-----------------------------------------')
print()

!paste -d "\t" \
    data/WikiMatrix.en-fa.langidfilter.removed.en \
    data/WikiMatrix.en-fa.langidfilter.removed.fa \
    | head -10
print('-----------------------------------------')

--2023-11-06 04:39:59--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.173.166.48, 18.173.166.31, 18.173.166.74, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.173.166.48|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
Saving to: ‘data/lid.176.bin’


2023-11-06 04:40:00 (110 MB/s) - ‘data/lid.176.bin’ saved [131266198/131266198]



--2023-11-06 04:40:00--  https://raw.github.com/NVIDIA/NeMo/main/scripts/neural_machine_translation/filter_langs_nmt.py
Resolving raw.github.com (raw.github.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.github.com (raw.github.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/neural_machine_translation/filter_langs_nmt.py [follow

**Length and Ratio Filtering**

This step filters out sentences based on their lengths and the ratio between source and target lengths. If (a) src_len / tgt_len or tgt_len / src_len exceed 1.3 or (b) source or target sequence lengths are less than 1 or greater than 250, the sentence pair will be removed

In [7]:
!git clone https://github.com/moses-smt/mosesdecoder data/mosesdecoder
!cd data/mosesdecoder
!git checkout RELEASE-4.0 && cd ../..
!perl data/mosesdecoder/scripts/training/clean-corpus-n.perl -ratio 1.3 \
    data/WikiMatrix.en-fa.langidfilter \
    en fa \
    data/WikiMatrix.en-fa.langidfilter.lengthratio \
    1 250

fatal: destination path 'data/mosesdecoder' already exists and is not an empty directory.
fatal: not a git repository (or any of the parent directories): .git
clean-corpus.perl: processing data/WikiMatrix.en-fa.langidfilter.en & .fa to data/WikiMatrix.en-fa.langidfilter.lengthratio, cutoff 1-250, ratio 1.3
..........(100000)..........(200000).........
Input sentences: 297450  Output sentences:  220337


**Normalize Punctuation**

Punctuation can vary across languages and even between ascii and unicode variants of the same punctuation marker. For example, across languages. For example, in German, quotes are often written as „ and “ while in English we typically just use ". This step normalizes such punctuation differences to use the same character everywhere.

In [9]:
!pip install https://github.com/kpu/kenlm/archive/master.zip

Collecting https://github.com/kpu/kenlm/archive/master.zip
  Downloading https://github.com/kpu/kenlm/archive/master.zip
[2K     [32m\[0m [32m553.6 kB[0m [31m7.3 MB/s[0m [33m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: kenlm
  Building wheel for kenlm (pyproject.toml) ... [?25l[?25hdone
  Created wheel for kenlm: filename=kenlm-0.2.0-cp310-cp310-linux_x86_64.whl size=3184347 sha256=434a3ee2beeb7b5ae8c5382c0e8a1d008df144fbd14020e17649fe52ef81ce9a
  Stored in directory: /tmp/pip-ephem-wheel-cache-dvz8cemz/wheels/a5/73/ee/670fbd0cee8f6f0b21d10987cb042291e662e26e1a07026462
Successfully built kenlm
Installing collected packages: kenlm
Successfully installed kenlm-0.2.0


In [11]:
!pip install wget
!apt-get install libboost-all-dev
!apt-get install gawk

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=a4c131622c71a24d29c37be43348d73bb6dcc1680ceef7a40b640a3404d5ff2f
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libboost-all-dev is already the newest version (1.74.0.3ubuntu7).
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Suggested packages:
  gawk-doc
The following NEW packages will be installed:
  gawk
0 upgraded, 1 newly installed, 0 to remove 

In [12]:
!apt-get update
!apt-get install -y libsndfile1 ffmpeg
!git clone https://github.com/NVIDIA/NeMo

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.39)] [Connected to cloud.r-                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
0% [2 InRelease 12.7 kB/119 kB 11%] [Connecting to security.ubuntu.com (185.125.190.39)] [Connected                                                                                                     Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [2 InRelease 21.4 kB/119 kB 18%] [Waiting for headers] [Waiting for headers] [Connecting to ppa.l                                                                                                    Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRel

In [13]:
cd NeMo

/content/NeMo


In [14]:
!bash ./reinstall.sh

Collecting pip
  Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.1
Uninstalling stuff
[0mInstalling nemo
Obtaining file:///content/NeMo
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting huggingface-hub
  Downloading huggingface_hub-0.18.0-py3-none-any.whl.metadata (13 kB)
Collecting onnx>=1.7.0
  Downloading onnx-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting ruamel.yaml
  Downloading ruamel.yaml

In [16]:
cd ..

/content


**Sacremoses**

In [17]:
print('Normalizing English ...')
!sacremoses -j 4 normalize \
    < data/WikiMatrix.en-fa.langidfilter.lengthratio.en > \
    data/WikiMatrix.en-fa.langidfilter.lengthratio.sacremoses.norm.en

print('Normalizing Farsi ...')
!sacremoses -j 4 normalize \
    < data/WikiMatrix.en-fa.langidfilter.lengthratio.fa > \
    data/WikiMatrix.en-fa.langidfilter.lengthratio.sacremoses.norm.fa

Normalizing English ...
100% 220337/220337 [00:26<00:00, 8421.19it/s] 
Normalizing Farsi ...
100% 220337/220337 [00:29<00:00, 7377.01it/s]


**Moses**

In [18]:
print('Normalizing English ...')
!perl data/mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l en \
    < data/WikiMatrix.en-fa.langidfilter.lengthratio.en > \
    data/WikiMatrix.en-fa.langidfilter.lengthratio.moses.norm.en

print('Normalizing Farsi ...')
!perl data/mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l fa \
    < data/WikiMatrix.en-fa.langidfilter.lengthratio.fa > \
    data/WikiMatrix.en-fa.langidfilter.lengthratio.moses.norm.fa

Normalizing English ...
Normalizing Farsi ...


**Tokenize**

"This is a sentence." will be tokenized as ["This, is, a, sentence."].

However, we'd typically like punctuation to be separate tokens for example,

"This is a sentence." will be tokenized my moses or sacremoses as [", This, is, a, sentence, ., "].

**Sacremoses**

In [19]:
print('Tokenizing English ...')
!sacremoses -j 4 -l en tokenize -x \
    < data/WikiMatrix.en-fa.langidfilter.lengthratio.sacremoses.norm.en > \
    data/WikiMatrix.en-fa.langidfilter.lengthratio.sacremoses.norm.tok.en

print('Tokenizing Farsi ...')
!sacremoses -j 4 -l fa tokenize -x \
    < data/WikiMatrix.en-fa.langidfilter.lengthratio.sacremoses.norm.fa > \
    data/WikiMatrix.en-fa.langidfilter.lengthratio.sacremoses.norm.tok.fa

Tokenizing English ...
100% 220337/220337 [00:49<00:00, 4473.27it/s]
Tokenizing Farsi ...
100% 220337/220337 [00:33<00:00, 6573.21it/s]


**Moses**

In [20]:
print('Tokenizing English ...')
!perl data/mosesdecoder/scripts/tokenizer/tokenizer.perl -l en -no-escape -threads 4 \
    < data/WikiMatrix.en-fa.langidfilter.lengthratio.moses.norm.en > \
    data/WikiMatrix.en-fa.langidfilter.lengthratio.moses.norm.tok.en

print('Tokenizing Farsi ...')
!perl data/mosesdecoder/scripts/tokenizer/tokenizer.perl -l fa -no-escape -threads 4 \
    < data/WikiMatrix.en-fa.langidfilter.lengthratio.moses.norm.fa > \
    data/WikiMatrix.en-fa.langidfilter.lengthratio.moses.norm.tok.fa

Tokenizing English ...
Tokenizer Version 1.1
Language: en
Number of threads: 4
Tokenizing Farsi ...
Tokenizer Version 1.1
Language: fa
Number of threads: 4


In [21]:
print()
print('-----------------------------------------')
print('Tokenized Farsi Sentences ...')
print('-----------------------------------------')
print()

!head -10 data/WikiMatrix.en-fa.langidfilter.lengthratio.moses.norm.tok.fa

print()
print('-----------------------------------------')
print('Tokenized English Sentences ...')
print('-----------------------------------------')
print()

!head -10 data/WikiMatrix.en-fa.langidfilter.lengthratio.moses.norm.tok.en


-----------------------------------------
Tokenized Farsi Sentences ...
-----------------------------------------

وهمهٔ شکستگی ‌ ها که کشیدی به سوی خدای خواهدت برد .
این آیه در رابطه با قوم بنی اسرائیل است .
بخوان به نام پروردگارت که آفرید ؛ که انسان را از خون بسته ( علق ) آفرید .
پس هنگامی که برای او آشکار شد ، گفت : " می ‌ دانم که خداوند بر هر چیزی تواناست " .
در ربّنای شجریان ، هر آیه از ابتدای ربّنا تا آخر آیه خوانده می ‌ شود .
او با نام رب ‌ السیف و القلم ( پروردگار شمشیر و قلم ) نیز شناخته می ‌ شود .
بخوان به نام پروردگارت که آفرید .
ابراهیم در نجف عالم می شوند ولی علی اکبر به سار بر می گردند .
دلایل روشنی از طرف پروردگار برای شما آمده .
پس این امور هم یقینی هستند .

-----------------------------------------
Tokenized English Sentences ...
-----------------------------------------

It will destroy everything at the bidding of its Lord . "
And this is true of the people of Israel . "
Recite in the name of your Lord who created - Created man from a clinging substance .
When this 

**Deduplicate**

 This step removes duplicate translation pairs from the corpus

In [22]:
import xxhash

def dedup_file(input_file_lang_1, input_file_lang_2, output_file_lang_1, output_file_lang_2):
    print()
    print('====================================')
    print('========== De-duplicate ============')
    print('====================================')
    print()
    num_lines = num_lines_in_file(input_file_lang_1)
    hashes = set()
    num_output_lines = 0
    with open(input_file_lang_1, 'r') as f_lang1, \
        open(input_file_lang_2, 'r')  as f_lang2, \
        open(output_file_lang_1, 'w') as f_out_lang1, \
        open(output_file_lang_2, 'w') as f_out_lang2:
        for line_1, line_2 in tqdm(zip(f_lang1, f_lang2), total=num_lines, desc=f"Deduplicating files"):
            parallel_hash = xxhash.xxh64((line_1.strip() + '\t' + line_2.strip()).encode('utf-8')).hexdigest()
            if parallel_hash not in hashes:
                hashes.add(parallel_hash)
                f_out_lang1.write(line_1.strip() + '\n')
                f_out_lang2.write(line_2.strip() + '\n')
                num_output_lines += 1

    print(f"Kept {num_output_lines} out of {num_lines} after deduplication")

dedup_file(
    'data/WikiMatrix.en-fa.langidfilter.lengthratio.moses.norm.tok.en',
    'data/WikiMatrix.en-fa.langidfilter.lengthratio.moses.norm.tok.fa',
    'data/WikiMatrix.en-fa.langidfilter.lengthratio.moses.norm.tok.dedup.en',
    'data/WikiMatrix.en-fa.langidfilter.lengthratio.moses.norm.tok.dedup.fa'
)





Deduplicating files: 100%|██████████| 220337/220337 [00:01<00:00, 154208.67it/s]


Kept 220337 out of 220337 after deduplication


In [24]:
import shutil

folder_path = '/content/data'
output_zip_path = '/content/data.zip'

shutil.make_archive(output_zip_path.split('.zip')[0], 'zip', folder_path)

'/content/data.zip'

In [25]:
!cp '/content/data.zip' '/content/drive/MyDrive/Colab Notebooks'