In [1]:
%load_ext autoreload
%autoreload 2

import pprint
import pandas as pd
import yaml
import os
import logging

import sys 
sys.path.append("../src")

from llama_index import SimpleDirectoryReader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from utils import join_csv_files, split_text_into_chunks

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('../config/config.yaml', 'r') as file:
    config = yaml.safe_load(file)

config

{'meps_list': {'output_dir': 'meps-text-mining/data/country_meps'},
 'meps_speeches': {'europal_website': 'https://www.europarl.europa.eu',
  'href_root': 'https://www.europarl.europa.eu/meps/en/',
  'output_dir': 'meps-text-mining/data/meps_plenary_speeches',
  'logs_dir': 'meps-text-mining/logs',
  'logs_filename': 'scraping_speeches.txt'},
 'meps_speeches_dataframe': {'output_dir': 'meps-text-mining/data/meps_speeches_dataframe',
  'filename': 'meps_speeches.csv'},
 'meps_speeches_en_translation': {'output_dir': 'meps-text-mining/data/meps_speeches_en_translations',
  'filename': 'speeches_translations.txt',
  'logs_filename': 'translating_speeches.txt'}}

### Load all MEPS speeches available



In [3]:
import re
from typing import List

class HuggingfaceTranslator():

    def __init__(self, model:str):
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model)  

    @staticmethod
    def split_text_into_chunks(text: str, max_tokens: int) -> List[str]:
        """
        Split a given text into chunks of manageable size based on the maximum number of tokens.
        Splitting at punctuation and concatenating sentences if their total length is less than max tokens.

        Parameters:
        - text (str): The input text to be split.
        - max_tokens (int): The maximum number of tokens allowed per chunk.

        Returns:
        - List[str]: A list of text chunks.
        """
        # Split the text into sentences using regex
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            sentence_tokens = len(sentence.split())
            if len(current_chunk.split()) + sentence_tokens <= max_tokens:
                current_chunk += " " + sentence if current_chunk else sentence
            else:
                chunks.append(current_chunk.strip())
                current_chunk = sentence

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks
    

    def translate(self, sample_text):
    
        batch = self.tokenizer([sample_text], return_tensors = "pt")
        
        generated_ids = self.model.generate(**batch)
        tr = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        return tr
            

In [4]:
def translator(content_to_translate_df, outpath, translator):
    """Translation function
    
    Arguments:
        content_to_translate_df: pandas dataframe
        outpath: str
            location where translations file is store
        translator: class instance
    """


    for idx in content_to_translate_df.itertuples():
        
        url = idx.Url
        content_to_translate = idx.Content

        if isinstance(idx.Content, str):

            try:
                current_df = pd.read_csv(outpath, sep = '|', names= ['Url', 'Content'])
            except:
                print('No file created yet')
                pass

            if url in current_df['Url'].tolist():
                logging.info(f"""{url} already_translated""")
            else:
                print(f"""translating {url}""")
                logging.info(f"""translating {url}""")
            
                txt = translator.split_text_into_chunks(content_to_translate, max_tokens=150)

                try:
                    translated_text = ""

                    for t in txt:
                        trsl = translator.translate(t)
                        translated_text += trsl[0]
                except:
                    logging.info(f"""Could not traslate {url}""")
                    continue

                with open(outpath, 'a') as file:
                    # Write the tab-separated content to the file
                    file.write('\n' + '|'.join([url, translated_text]))
        
        else:
            logging.info(f"For {url} I couldn't find a content to translate")


In [5]:
# Load speeches dataframe
current_directory = os.path.dirname(os.path.abspath(os.getcwd()))
file_name = os.path.join(config['meps_speeches_dataframe']['output_dir'], config['meps_speeches_dataframe']['filename'])
speeches_df = pd.read_csv(os.path.join(os.path.dirname(current_directory), file_name))
speeches_df.head()

# Url must be unique as we are will be using it a key
assert speeches_df['Url'].nunique() == len(speeches_df)

In [6]:
# Configure logging 

logs_dir = os.path.join(os.path.dirname(current_directory), config['meps_speeches']['logs_dir'])
logs_filename = config['meps_speeches_en_translation']['logs_filename']

logging.basicConfig(
    filename=os.path.join(logs_dir, logs_filename),
    level=logging.INFO, 
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
    )

In [7]:
# Speeches to translate
speeches = speeches_df[speeches_df['Language'] == 'IT']

out_path = os.path.join(os.path.dirname(current_directory), config['meps_speeches_en_translation']['output_dir'])
file_path = os.path.join(out_path, config['meps_speeches_en_translation']['filename'])

# Now reduce the speeches to the one that have not been translated yet
translations = pd.read_csv(file_path, sep = '|', names = ['Url', 'Translation'])

speeches_to_translate = speeches[~speeches['Url'].isin(translations['Url'])]



In [10]:
# Inizialize Translator Class
model = "Helsinki-NLP/opus-mt-it-en"
translator_class = HuggingfaceTranslator(model)



In [11]:
# Translate Content passing the translator_class to the translator function
translator(speeches_to_translate, file_path, translator_class)

translating https://www.europarl.europa.eu/doceo/document/CRE-9-2022-05-04-INT-3-065-0000_IT.html


Token indices sequence length is longer than the specified maximum sequence length for this model (4342 > 512). Running this sequence through the model will result in indexing errors


translating https://www.europarl.europa.eu/doceo/document/CRE-9-2023-12-12-INT-2-207-0000_IT.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2024-01-17-INT-3-487-0000_IT.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2024-02-07-INT-3-390-0000_IT.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2024-01-15-INT-1-086-0000_IT.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2024-01-18-INT-4-099-0000_IT.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2024-01-16-INT-2-167-0000_IT.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2023-12-14-INT-4-040-0000_IT.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2024-02-06-INT-2-332-0000_IT.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2024-01-16-INT-2-445-0000_IT.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2024-02-08-INT-4-036-0000_IT.html
translating https://