1. PDFs in Croatian and English are read and converted to text using the pdfplumber library.

In [None]:
import pdfplumber

def read_croatian_pdfs():
    """Convert the Croatian manuals ``Manuals/{i}_C.pdf`` (i = 1..14) to text.

    For every manual that can be read, the text of all its pages is
    concatenated in page order and written as UTF-8 to
    ``Manuals/txt/{i}_C.txt``.

    A manual that is missing or unreadable is reported on stdout and
    skipped; no text file is written for it, so a failed manual can never
    receive the text of the previously processed one.
    """
    for i in range(1, 15):
        try:
            with pdfplumber.open(f'Manuals/{i}_C.pdf') as pdf:
                # Guard against extract_text() giving None for a page
                # without extractable text, which would break the join.
                text = ''.join(page.extract_text() or '' for page in pdf.pages)
        except FileNotFoundError:
            print(f"couldn't find a {i}_C.pdf file")
            continue
        except Exception as exc:
            # Best-effort batch job: report the damaged file and keep going
            # with the remaining manuals instead of aborting the whole run.
            print(f"couldn't read {i}_C.pdf: {exc}")
            continue

        with open(f"Manuals/txt/{i}_C.txt", 'w', encoding='utf-8') as txt_file:
            txt_file.write(text)

2. The text is cleaned of undesirable user-manual "spam".

In [1]:
import re

def cluster_text_by_punctuation(input_file_path, output_file_path):
    """Strip non-sentence noise from a text file and write the result.

    The input is read as UTF-8, cleaned with the regex passes below (applied
    in this exact order) and written as UTF-8 to ``output_file_path``.

    Args:
        input_file_path: Path of the raw text file to clean.
        output_file_path: Path the cleaned text is written to (overwritten).
    """
    with open(input_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # 1. Keep only '.', '!', '?' and letters; everything else (digits, other
    #    punctuation, newlines) becomes a space. The Croatian letters
    #    č ć ž đ š are whitelisted in BOTH cases, including uppercase Đ.
    cleaned_text = re.sub(r'[^.!?a-zčćžđšA-ZČĆŽĐŠ]', ' ', text)
    # 2. Runs of two or more dots (e.g. table-of-contents leaders) -> space.
    cleaned_text = re.sub(r'\.\.+', ' ', cleaned_text)
    # 3. Collapse runs of whitespace into a single space.
    cleaned_text = re.sub(r'\s\s+', ' ', cleaned_text)
    # 4. Spaced-out dot sequences such as ". . . " -> space.
    cleaned_text = re.sub(r'\.\s(\.\s)+', ' ', cleaned_text)
    # 5. Remove the whitespace character directly before a period.
    cleaned_text = re.sub(r'\s\.', '.', cleaned_text)

    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)

3. The sentences in Croatian and English are aligned by cosine similarity between sentence vectors (each Croatian sentence is machine-translated to English first).

In [None]:
from googletrans import Translator

def find_best_matches(sentences_cr, sentences_en, nlp2):
    """Pair Croatian sentences with English sentences by vector similarity.

    Each Croatian sentence is translated to English, turned into a vector
    via ``nlp2(...).vector`` and compared against a window of English
    sentences; the first English sentence whose cosine similarity exceeds
    0.95 is taken as its match.

    Only the first ``min(len(sentences_cr), len(sentences_en))`` Croatian
    sentences are examined. For Croatian index ``i`` the window starts at
    ``i`` minus the number of earlier sentences that completed without a
    match, and stops before ``i + start + 1``; when that stop exceeds the
    examined count, the window runs to the end of ``sentences_en`` instead.

    Args:
        sentences_cr: Sequence of Croatian sentences.
        sentences_en: Sequence of English sentences.
        nlp2: Callable returning an object with a ``.vector`` attribute that
            supports ``reshape(1, -1)``.

    Returns:
        A pair ``(best_match_index, cr_index)`` of parallel lists: matched
        English indices and the Croatian indices they belong to.

    Note:
        Any exception raised while handling one sentence is printed and that
        sentence is skipped without counting as a miss.
        ``cosine_similarity`` is not imported in the visible cell, so it is
        presumably brought into scope elsewhere - verify.
    """
    translator = Translator()
    english_hits = []
    croatian_hits = []
    examined = min(len(sentences_cr), len(sentences_en))
    misses = 0  # sentences that finished their window without a match

    for i in range(examined):
        try:
            translated = translator.translate(sentences_cr[i], dest='en').text
            cr_vec = nlp2(translated).vector.reshape(1, -1)

            window_start = i - misses
            window_stop = i + window_start + 1
            if window_stop > examined:
                window_stop = len(sentences_en)

            for en_pos in range(window_start, window_stop):
                en_vec = nlp2(sentences_en[en_pos]).vector.reshape(1, -1)
                if cosine_similarity(cr_vec, en_vec)[0, 0] > 0.95:
                    print(f"got match in {i} sentence")
                    english_hits.append(en_pos)
                    croatian_hits.append(i)
                    break
            else:
                # Window exhausted without a hit: shift later windows back.
                misses += 1
        except Exception as e:
            print(e)
    return english_hits, croatian_hits

4. The aligned sentences are used to create a parallel corpus.

In [None]:
import ast
from nltk.translate import AlignedSent, Alignment
import pickle

# Matched sentences of all manuals, Croatian and English kept in separate lists.
cr_sentences = []
en_sentences = []

for i in range(1, 8):
    # Read both match files of manual i before parsing either of them.
    with open(f'Manuals/txt/{i}_cMatches', 'r', encoding='utf-8') as cr_file:
        cr_text = cr_file.read()
    with open(f'Manuals/txt/{i}_eMatches', 'r', encoding='utf-8') as en_file:
        en_text = en_file.read()

    # The files hold Python literals; literal_eval parses them without
    # executing arbitrary code.
    croatian_out = ast.literal_eval(cr_text)
    english_out = ast.literal_eval(en_text)

    cr_sentences.extend(croatian_out)
    en_sentences.extend(english_out)

# Pair the two lists position by position (zip stops at the shorter list) and
# give every pair a diagonal alignment (0, 0), (1, 1), ... up to the shorter
# of the two lengths. English is passed to AlignedSent first, Croatian second.
# NOTE(review): len() counts characters if the entries are plain strings
# rather than token lists - confirm what the match files contain.
aligned_sentences = [
    AlignedSent(
        en_sent,
        cr_sent,
        Alignment([(k, k) for k in range(min(len(cr_sent), len(en_sent)))]),
    )
    for cr_sent, en_sent in zip(cr_sentences, en_sentences)
]

corpus = aligned_sentences

# Persist the corpus in the working directory.
with open("corpus.pkl", "wb") as pickle_file:
    pickle.dump(corpus, pickle_file)

<font size="4"></font>