In [135]:
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from io import StringIO
import csv

def read_pdf(pdf_file, pages):
    """Extract the text of the selected pages of a PDF.

    Args:
        pdf_file: File object for the PDF (the caller in this file opens it
            in binary mode).
        pages: Page numbers forwarded to ``PDFPage.get_pages`` to choose which
            pages are processed.

    Returns:
        The text of all processed pages as a single string.
    """
    rsrcmgr = PDFResourceManager()
    # The ``with`` block and the ``finally`` clause make sure the text buffer
    # and the converter are released even when a page fails to process.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(pdf_file, pages, caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed on the way out.
            return retstr.getvalue()
        finally:
            device.close()

# Page numbers to extract: 32 up to and including 261.
# NOTE(review): whether these count from zero or one is decided by
# PDFPage.get_pages — confirm against the pdfminer documentation.
pages = {*range(32, 262)}

# The PDF is opened in binary mode and handed to read_pdf as a file object.
with open('texts/GLOSSIKA-ENIT-F1-GMS.pdf', mode='rb') as pdf_stream:
    text = read_pdf(pdf_stream, pages)

# Walk the extracted text line by line and pair each English sentence
# ("EN " line) with its Italian translation ("IT " line).  A pair is stored
# only when the following "IPA " line is reached; unlabelled lines are treated
# as wrapped continuations of the sentence currently being collected.
streams = text.split("\n")
english = None
italian = None
results = []
for stream in streams:
    # Extracted text may contain blank lines, padding or page-break form
    # feeds.  Normalise every line first so that this noise neither hides an
    # "EN "/"IT " label nor pads the collected sentences with stray spaces.
    stream = stream.strip()
    if not stream:
        continue
    if stream.startswith("EN "):
        english = stream[3:].lstrip()
        # A new English sentence starts a new entry: drop any translation
        # left over from an entry that never reached its "IPA " line, so it
        # cannot be attached to this sentence.
        italian = None
    elif stream.startswith("IT "):
        if english and not italian:
            italian = stream[3:].lstrip()
        elif english and italian:
            # A second "IT " line inside the same entry is kept as a
            # continuation, label included.
            # NOTE(review): confirm keeping the "IT " prefix here is intended.
            italian += ' ' + stream
    elif stream.startswith("IPA "):
        # The IPA line closes the entry; only complete pairs are stored.
        if english and italian:
            translations = english, italian
            results.append(translations)
            english = None
            italian = None
    else:
        # Unlabelled line: continuation of whichever sentence is in progress.
        if english and not italian:
            english += ' ' + stream
        elif english and italian:
            italian += ' ' + stream

# Write the collected sentence pairs as rows numbered from 1, below a header.
# UTF-8 is requested explicitly so accented characters are encoded the same
# way on every platform instead of depending on the locale's default encoding.
with open('english_italian_sentences.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["Sentence number", "English sentence", "Italian Sentence"])
    for i, result in enumerate(results, 1):
        writer.writerow([i, *result])