This notebook downloads the parallel Arabic–English corpora, preprocesses them, and creates the train, validation and test datasets.

In [5]:
!pip install langid



In [6]:
from functools import reduce
from bs4 import BeautifulSoup
import os
import numpy as np
import langid
import warnings
warnings.filterwarnings('ignore')

#### Download data from all corpora used

In [7]:
os.makedirs("../data", exist_ok=True)

In [8]:
!wget -nc https://object.pouta.csc.fi/OPUS-CCMatrix/v1/moses/ar-en.txt.zip -P ../data

--2022-05-07 15:48:48--  https://object.pouta.csc.fi/OPUS-CCMatrix/v1/moses/ar-en.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5012858187 (4,7G) [application/zip]
Saving to: ‘../data/ar-en.txt.zip’


2022-05-07 15:51:39 (28,0 MB/s) - ‘../data/ar-en.txt.zip’ saved [5012858187/5012858187]



In [9]:
!unzip -o ../data/ar-en.txt.zip -d ../data

Archive:  ../data/ar-en.txt.zip
  inflating: ../data/README          
  inflating: ../data/LICENSE         
  inflating: ../data/CCMatrix.ar-en.ar  
  inflating: ../data/CCMatrix.ar-en.en  
  inflating: ../data/CCMatrix.ar-en.scores  


In [10]:
!rm -rf ../data/ar-en.txt.zip

In [11]:
!wget -nc https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/ar-en.txt.zip -P ../data

--2022-05-07 15:53:03--  https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/ar-en.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1014474587 (967M) [application/zip]
Saving to: ‘../data/ar-en.txt.zip’


2022-05-07 15:53:45 (22,9 MB/s) - ‘../data/ar-en.txt.zip’ saved [1014474587/1014474587]



In [12]:
!unzip -o ../data/ar-en.txt.zip -d ../data

Archive:  ../data/ar-en.txt.zip
  inflating: ../data/OpenSubtitles.ar-en.ar  
  inflating: ../data/OpenSubtitles.ar-en.en  
  inflating: ../data/OpenSubtitles.ar-en.ids  
  inflating: ../data/README          


In [13]:
!rm -rf ../data/ar-en.txt.zip

In [14]:
!wget -nc https://object.pouta.csc.fi/OPUS-News-Commentary/v16/moses/ar-en.txt.zip -P ../data

--2022-05-07 15:54:03--  https://object.pouta.csc.fi/OPUS-News-Commentary/v16/moses/ar-en.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25988755 (25M) [application/zip]
Saving to: ‘../data/ar-en.txt.zip’


2022-05-07 15:54:06 (12,2 MB/s) - ‘../data/ar-en.txt.zip’ saved [25988755/25988755]



In [15]:
!unzip -o ../data/ar-en.txt.zip -d ../data

Archive:  ../data/ar-en.txt.zip
  inflating: ../data/README          
  inflating: ../data/LICENSE         
  inflating: ../data/News-Commentary.ar-en.ar  
  inflating: ../data/News-Commentary.ar-en.en  
  inflating: ../data/News-Commentary.ar-en.xml  


In [16]:
!rm -rf ../data/ar-en.txt.zip

In [17]:
!wget -nc https://object.pouta.csc.fi/OPUS-TED2013/v1.1/moses/ar-en.txt.zip -P ../data

--2022-05-07 15:54:07--  https://object.pouta.csc.fi/OPUS-TED2013/v1.1/moses/ar-en.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12065234 (12M) [application/zip]
Saving to: ‘../data/ar-en.txt.zip’


2022-05-07 15:54:08 (12,6 MB/s) - ‘../data/ar-en.txt.zip’ saved [12065234/12065234]



In [18]:
!unzip -o ../data/ar-en.txt.zip -d ../data

Archive:  ../data/ar-en.txt.zip
  inflating: ../data/TED2013.ar-en.ar  
  inflating: ../data/TED2013.ar-en.en  
  inflating: ../data/TED2013.ar-en.ids  
  inflating: ../data/README          


In [19]:
!rm -rf ../data/ar-en.txt.zip

#### Helper functions to preprocess data

In [21]:
# Set how many sentence pairs you want from each dataset.
# Each value is passed to process_data as `num_expected_sen`, i.e. processing
# of a dataset stops once this many valid pairs have been written.
OPEN_SUBTITLES_SEN = 2000000
CCMATRIX_SEN = 1000000
NEWS_COMMENTARY_SEN = np.inf # all sentences from this dataset
TED_SEN = np.inf # all sentences from this dataset
# Word-count bounds used by filter_wrong_len: sentences with fewer than
# MIN_SEN_LEN or more than MAX_SEN_LEN words are discarded.
MIN_SEN_LEN = 3
MAX_SEN_LEN = 100

In [22]:
def remove_extra_whitespaces(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so re-joining yields the normalised text.
    words = text.split()
    return " ".join(words)

In [23]:
remove_extra_whitespaces("Hello1      hi2                      hi3\n")

'Hello1 hi2 hi3'

In [24]:
def remove_html_tags(text):
    """Return the textual content of `text` with HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    text fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

In [25]:
remove_html_tags("<br>Hello</br><p>Hi</p>")

'Hello Hi'

In [26]:
def filter_wrong_lang(text, expected_lang):
    """Keep `text` only if langid classifies it as `expected_lang`.

    Returns `text` unchanged when the first element of `langid.classify(text)`
    equals the lower-cased `expected_lang`; otherwise returns an empty string.
    """
    language_matches = langid.classify(text)[0] == expected_lang.lower()
    return text if language_matches else ''

In [27]:
def filter_not_ar(text):
    """Return `text` if it is detected as Arabic ('ar'), else an empty string."""
    return filter_wrong_lang(text, expected_lang='ar')

In [28]:
def filter_not_en(text):
    """Return `text` if it is detected as English ('en'), else an empty string."""
    return filter_wrong_lang(text, expected_lang='en')

In [29]:
filter_wrong_lang('Język polski nie jest wykorzystywanym językiem w tym projekcie', 'en')

''

In [30]:
filter_wrong_lang('Although english is indeed used', 'en')

'Although english is indeed used'

In [31]:
def filter_wrong_len(text):
    """Discard sentences whose word count is outside the configured bounds.

    Returns `text` unchanged when it has between MIN_SEN_LEN and MAX_SEN_LEN
    words (both inclusive); otherwise returns an empty string.
    """
    # split() without an argument splits on whitespace runs, so doubled spaces
    # (or leading/trailing ones) do not produce empty "words" that would
    # inflate the count the way split(' ') does.
    word_count = len(text.split())
    if word_count < MIN_SEN_LEN or word_count > MAX_SEN_LEN:
        return ''
    return text

In [32]:
filter_wrong_len('Short sentence.')

''

In [33]:
filter_wrong_len('Not that short sentence anymore.')

'Not that short sentence anymore.'

In [34]:
# Order matters: each function receives the output of the previous one.
# HTML is stripped first because get_text(separator=" ") can leave doubled
# spaces where tags were; whitespace is normalised afterwards so the language
# and length filters (and the written output) see clean text.
PREPROCESSING_FUNCTIONS_EN = [remove_html_tags, remove_extra_whitespaces, filter_not_en, filter_wrong_len]
PREPROCESSING_FUNCTIONS_AR = [remove_html_tags, remove_extra_whitespaces, filter_not_ar, filter_wrong_len]

In [35]:
def process_data(src_dir, ds_name, dst_dir, num_expected_sen, processing_fun_en, processing_fun_ar):
    """Preprocess one parallel corpus and write the surviving sentence pairs.

    Reads `<src_dir>/<ds_name>.ar-en.en` and `<src_dir>/<ds_name>.ar-en.ar`
    line by line in lockstep, runs each line through its preprocessing
    pipeline, and writes the pair to `<dst_dir>/<ds_name>_processed_en.txt`
    and `<dst_dir>/<ds_name>_processed_ar.txt` only when BOTH processed lines
    are non-empty, so the two output files stay aligned.

    Args:
        src_dir: Directory containing the raw corpus files.
        ds_name: Corpus name used as the file-name prefix.
        dst_dir: Output directory; created if it does not exist.
        num_expected_sen: Maximum number of pairs to write (may be infinite).
        processing_fun_en: Functions applied in order to each English line;
            a pipeline returning an empty string rejects the pair.
        processing_fun_ar: Same, for each Arabic line.
    """
    def run_pipeline(line, functions):
        # Feed the line through every function in order.
        return reduce(lambda acc, fun: fun(acc), functions, line.rstrip())

    en_src_path = os.path.join(src_dir, f"{ds_name}.ar-en.en")
    ar_src_path = os.path.join(src_dir, f"{ds_name}.ar-en.ar")
    en_dst_path = os.path.join(dst_dir, f"{ds_name}_processed_en.txt")
    ar_dst_path = os.path.join(dst_dir, f"{ds_name}_processed_ar.txt")
    os.makedirs(dst_dir, exist_ok=True)
    valid_lines = 0

    # Explicit UTF-8: the platform default encoding is locale dependent and
    # may not be able to decode/encode the Arabic text.
    with open(en_src_path, "r", encoding="utf-8") as src_file_en, \
            open(ar_src_path, "r", encoding="utf-8") as src_file_ar, \
            open(en_dst_path, "w", encoding="utf-8") as dst_file_en, \
            open(ar_dst_path, "w", encoding="utf-8") as dst_file_ar:
        for en_line, ar_line in zip(src_file_en, src_file_ar):
            # Checked before processing so a limit of 0 writes nothing.
            if valid_lines >= num_expected_sen:
                break

            processed_en_line = run_pipeline(en_line, processing_fun_en)
            if not processed_en_line:
                continue
            # The Arabic side is only processed when the English side passed.
            processed_ar_line = run_pipeline(ar_line, processing_fun_ar)
            if not processed_ar_line:
                continue

            dst_file_en.write(processed_en_line + "\n")
            dst_file_ar.write(processed_ar_line + "\n")
            valid_lines += 1


In [36]:
# Preprocess OpenSubtitles, keeping at most OPEN_SUBTITLES_SEN valid pairs.
process_data(
    src_dir=os.path.join("..", "data"),
    ds_name="OpenSubtitles",
    dst_dir=os.path.join("..", "processed_data"),
    num_expected_sen=OPEN_SUBTITLES_SEN,
    processing_fun_en=PREPROCESSING_FUNCTIONS_EN,
    processing_fun_ar=PREPROCESSING_FUNCTIONS_AR
)

In [37]:
# Preprocess CCMatrix, keeping at most CCMATRIX_SEN valid pairs.
process_data(
    src_dir=os.path.join("..", "data"),
    ds_name="CCMatrix",
    dst_dir=os.path.join("..", "processed_data"),
    num_expected_sen=CCMATRIX_SEN,
    processing_fun_en=PREPROCESSING_FUNCTIONS_EN,
    processing_fun_ar=PREPROCESSING_FUNCTIONS_AR
)

In [38]:
# Preprocess News-Commentary; NEWS_COMMENTARY_SEN is np.inf, so every valid
# pair in the corpus is kept.
process_data(
    src_dir=os.path.join("..", "data"),
    ds_name="News-Commentary",
    dst_dir=os.path.join("..", "processed_data"),
    num_expected_sen=NEWS_COMMENTARY_SEN,
    processing_fun_en=PREPROCESSING_FUNCTIONS_EN,
    processing_fun_ar=PREPROCESSING_FUNCTIONS_AR
)

In [39]:
# Preprocess TED2013; TED_SEN is np.inf, so every valid pair is kept.
# The processed TED2013 files are later copied to test_en.txt / test_ar.txt.
process_data(
    src_dir=os.path.join("..", "data"),
    ds_name="TED2013",
    dst_dir=os.path.join("..", "processed_data"),
    num_expected_sen=TED_SEN,
    processing_fun_en=PREPROCESSING_FUNCTIONS_EN,
    processing_fun_ar=PREPROCESSING_FUNCTIONS_AR
)

#### Create train, validation and test set

Concatenate datasets

In [40]:
!cat ../processed_data/OpenSubtitles_processed_en.txt ../processed_data/CCMatrix_processed_en.txt ../processed_data/News-Commentary_processed_en.txt > ../processed_data/processed_en.txt

In [41]:
!cat ../processed_data/OpenSubtitles_processed_ar.txt ../processed_data/CCMatrix_processed_ar.txt ../processed_data/News-Commentary_processed_ar.txt > ../processed_data/processed_ar.txt

In [42]:
!cat ../processed_data/TED2013_processed_en.txt> ../processed_data/test_en.txt

In [43]:
!cat ../processed_data/TED2013_processed_ar.txt> ../processed_data/test_ar.txt

In [44]:
# Hold out a random 10% of the concatenated corpus as the validation set;
# everything else becomes the train set.
#
# The number of pairs is counted from the file itself. Summing the *_SEN
# limits does not work: NEWS_COMMENTARY_SEN is np.inf (so the sum cannot be
# used as an array length), and a corpus may yield fewer valid pairs than
# its limit.
with open('../processed_data/processed_en.txt', 'r', encoding='utf-8') as en_file:
    total_pairs = sum(1 for _ in en_file)

lines_num = np.arange(0, total_pairs)
split_rng = np.random.default_rng(42)  # fixed seed so the split is reproducible
val_nums = split_rng.choice(lines_num, size=int(0.1*len(lines_num)), replace=False)

# Boolean lookup table: constant-time membership test per line instead of
# scanning the whole val_nums array for every line.
is_val_line = np.zeros(total_pairs, dtype=bool)
is_val_line[val_nums] = True

with open('../processed_data/processed_en.txt', 'r', encoding='utf-8') as en_file, \
        open('../processed_data/processed_ar.txt', 'r', encoding='utf-8') as ar_file, \
        open('../processed_data/train_en.txt', 'w', encoding='utf-8') as train_en_file, \
        open('../processed_data/train_ar.txt', 'w', encoding='utf-8') as train_ar_file, \
        open('../processed_data/val_en.txt', 'w', encoding='utf-8') as val_en_file, \
        open('../processed_data/val_ar.txt', 'w', encoding='utf-8') as val_ar_file:
    for i, (en_line, ar_line) in enumerate(zip(en_file, ar_file)):
        # Both sides of a pair always go to the same split to stay aligned.
        if is_val_line[i]:
            val_en_file.write(en_line)
            val_ar_file.write(ar_line)
        else:
            train_en_file.write(en_line)
            train_ar_file.write(ar_line)
                