In [1]:
import string
from functools import reduce
from bs4 import BeautifulSoup
import unidecode
import contractions
import os

#### Download data

In [2]:
# Create the download directory; exist_ok=True keeps this cell from failing
# when ./data already exists (e.g. on a re-run).
os.makedirs("./data", exist_ok=True)

In [9]:
# Fetch the CCMatrix Arabic-English archive from OPUS into ./data.
# -nc skips the download when ./data/ar-en.txt.zip already exists; -P sets the target directory.
!wget -nc https://object.pouta.csc.fi/OPUS-CCMatrix/v1/moses/ar-en.txt.zip -P ./data

--2022-04-05 21:36:59--  https://object.pouta.csc.fi/OPUS-CCMatrix/v1/moses/ar-en.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5012858187 (4,7G) [application/zip]
Saving to: ‘./data/ar-en.txt.zip’


2022-04-05 21:39:27 (32,3 MB/s) - ‘./data/ar-en.txt.zip’ saved [5012858187/5012858187]



In [13]:
# Extract the archive into ./data; -o overwrites existing files without prompting.
!unzip -o ./data/ar-en.txt.zip -d ./data

Archive:  ./data/ar-en.txt.zip
  inflating: ./data/README           
  inflating: ./data/LICENSE          
  inflating: ./data/CCMatrix.ar-en.ar  
  inflating: ./data/CCMatrix.ar-en.en  
  inflating: ./data/CCMatrix.ar-en.scores  


In [14]:
# Delete the archive once extracted. The OpenSubtitles download further down is saved
# under the same file name, and `wget -nc` would skip it if this file were still present.
!rm -rf ./data/ar-en.txt.zip

In [15]:
# Fetch the OpenSubtitles v2018 Arabic-English archive from OPUS into ./data.
# It is saved as ./data/ar-en.txt.zip; with -nc the download is skipped if a file
# of that name already exists.
!wget -nc https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/ar-en.txt.zip -P ./data

--2022-04-05 21:44:11--  https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/ar-en.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1014474587 (967M) [application/zip]
Saving to: ‘./data/ar-en.txt.zip’


2022-04-05 21:44:41 (32,5 MB/s) - ‘./data/ar-en.txt.zip’ saved [1014474587/1014474587]



In [16]:
# Extract the archive into ./data; -o overwrites existing files without prompting.
# NOTE(review): this archive also contains a README, which replaces the ./data/README
# extracted from the CCMatrix archive.
!unzip -o ./data/ar-en.txt.zip -d ./data

Archive:  ./data/ar-en.txt.zip
  inflating: ./data/OpenSubtitles.ar-en.ar  
  inflating: ./data/OpenSubtitles.ar-en.en  
  inflating: ./data/OpenSubtitles.ar-en.ids  
  inflating: ./data/README           


In [17]:
# Delete the archive once its contents have been extracted.
!rm -rf ./data/ar-en.txt.zip

In [18]:
def remove_punctuations(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed.

    Only the ASCII punctuation set is covered; non-ASCII marks such as the
    Arabic comma are left untouched.
    """
    deletion_table = str.maketrans("", "", string.punctuation)
    return text.translate(deletion_table)

In [19]:
# Sanity check: brackets, dots, "?", "~", "," and "!" are all stripped.
remove_punctuations("H()e?ll..o~,!")

'Hello'

In [20]:
def remove_extra_whitespaces(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Leading and trailing whitespace (including newlines) is dropped.
    """
    tokens = text.split()
    return " ".join(tokens)

In [21]:
# Sanity check: runs of spaces collapse to one and the trailing newline is dropped.
remove_extra_whitespaces("Hello1      hi2                      hi3\n")

'Hello1 hi2 hi3'

In [22]:
def remove_html_tags(text):
    """Strip HTML markup from ``text`` and return the remaining text.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    text fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

In [23]:
# Sanity check: tags are removed and the text fragments are joined with a space.
remove_html_tags("<br>Hello</br><p>Hi</p>")

'Hello Hi'

In [24]:
def remove_accented_chars(text):
    """Transliterate ``text`` with unidecode, e.g. "à" -> "a" and "â" -> "a".

    NOTE(review): unidecode presumably transliterates every non-ASCII
    character, so this would also romanise Arabic text — confirm before
    adding it to the preprocessing pipeline.
    """
    transliterated = unidecode.unidecode(text)
    return transliterated

In [25]:
# Sanity check: accented Latin letters are replaced by their unaccented forms.
remove_accented_chars("à partir de l'âge")

"a partir de l'age"

In [26]:
def expand_contractions(text):
    """Replace contractions in ``text`` with their long form, e.g. "doesn't" -> "does not"."""
    expanded = contractions.fix(text)
    return expanded

In [27]:
# Sanity check: "doesn't" is expanded to "does not".
expand_contractions("My baby doesn't want to eat vegetables")

'My baby does not want to eat vegetables'

In [28]:
# Steps applied to every line, in order. The order matters:
#   1. HTML tags are stripped first; removing punctuation beforehand would delete
#      "<", ">" and "/" and leave the tag names embedded in the text.
#   2. Punctuation is removed from the remaining plain text.
#   3. Whitespace is normalised last, because both earlier steps can leave
#      extra or doubled spaces behind.
PREPROCESSING_FUNCTIONS = [remove_html_tags, remove_punctuations, remove_extra_whitespaces]

In [29]:
# Number of leading sentence pairs to keep from each corpus.
OPEN_SUBTITLES_SEN = 1_000_000  # OpenSubtitles
CCMATRIX_SEN = 2_000_000  # CCMatrix

In [40]:
def process_data(src_path, dst_path, num_lines, processing_fun):
    """Preprocess the first ``num_lines`` lines of a text file into a new file.

    Each line is lower-cased, stripped of trailing whitespace, passed through
    every callable in ``processing_fun`` (left to right) and written to
    ``dst_path`` followed by a newline.

    Args:
        src_path: Path of the text file to read (decoded as UTF-8).
        dst_path: Path of the file to write (encoded as UTF-8). Missing parent
            directories are created.
        num_lines: Maximum number of lines to process. Zero or a negative
            value produces an empty output file.
        processing_fun: Iterable of ``str -> str`` callables.
    """
    # Materialise once so a one-shot iterator is not exhausted after the first line.
    steps = tuple(processing_fun)

    # Read and write as UTF-8 explicitly so the Arabic text does not depend on
    # the platform's default encoding.
    with open(src_path, "r", encoding="utf-8") as src_file:
        dst_dir = os.path.dirname(dst_path)
        # os.makedirs("") raises, so only create a directory when dst_path has one.
        if dst_dir:
            os.makedirs(dst_dir, exist_ok=True)

        with open(dst_path, "w", encoding="utf-8") as dst_file:
            for i, line in enumerate(src_file):
                # Check the limit before writing so num_lines <= 0 writes nothing.
                if i >= num_lines:
                    break
                processed_line = reduce(lambda acc, step: step(acc), steps, line.lower().rstrip())
                dst_file.write(processed_line + "\n")

In [41]:
# English side of OpenSubtitles: keep the first OPEN_SUBTITLES_SEN lines.
process_data(
    "./data/OpenSubtitles.ar-en.en",
    "./processed_data/OpenSubtitles_processed_en.txt",
    OPEN_SUBTITLES_SEN,
    PREPROCESSING_FUNCTIONS,
)



In [43]:
# Arabic side of OpenSubtitles: keep the first OPEN_SUBTITLES_SEN lines.
process_data(
    "./data/OpenSubtitles.ar-en.ar",
    "./processed_data/OpenSubtitles_processed_ar.txt",
    OPEN_SUBTITLES_SEN,
    PREPROCESSING_FUNCTIONS,
)

In [44]:
# English side of CCMatrix: keep the first CCMATRIX_SEN lines.
process_data(
    "./data/CCMatrix.ar-en.en",
    "./processed_data/CCMatrix_processed_en.txt",
    CCMATRIX_SEN,
    PREPROCESSING_FUNCTIONS,
)

In [45]:
# Arabic side of CCMatrix. It must use the same line limit as the English side
# (CCMATRIX_SEN); otherwise the two output files have different lengths and the
# parallel corpus is no longer complete.
process_data(
    "./data/CCMatrix.ar-en.ar",
    "./processed_data/CCMatrix_processed_ar.txt",
    CCMATRIX_SEN,
    PREPROCESSING_FUNCTIONS,
)