In [105]:
%load_ext autoreload
%autoreload 2

import pprint
import pandas as pd
import yaml
import os
import logging

import sys 
sys.path.append("../src")

from llama_index import SimpleDirectoryReader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from utils import join_csv_files, split_text_into_chunks

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [102]:
# Read the project configuration from the YAML file located one directory
# above the notebook's working directory. `safe_load` builds plain Python
# objects only (dicts, lists, strings, ...).
with open('../config/config.yaml', 'r') as file:
    config = yaml.safe_load(file)

# Last expression of the cell: displays the parsed configuration.
config

{'meps_speeches': {'europal_website': 'https://www.europarl.europa.eu',
  'href_root': 'https://www.europarl.europa.eu/meps/en/',
  'output_dir': 'data/meps_plenary_speeches',
  'logs_dir': 'logs',
  'logs_filename': 'scraping_speeches.txt'},
 'meps_speeches_dataframe': {'output_dir': 'data/meps_speeches_dataframe',
  'filename': 'meps_speeches.csv'},
 'meps_speeches_en_translation': {'output_dir': 'data/meps_speeches_en_translations',
  'filename': 'speeches_translations.txt',
  'logs_filename': 'translating_speeches.txt'}}

### Load all available MEPs' speeches



In [150]:
import re
from typing import List


# Model identifier (presumably Italian -> English, judging by the name).
# NOTE(review): this global is reassigned to the fr-en model further down,
# before any HuggingfaceTranslator is instantiated in this notebook.
model = "Helsinki-NLP/opus-mt-it-en"

class HuggingfaceTranslator():
    """Wrapper around a Hugging Face seq2seq checkpoint used for translation.

    The tokenizer and the model are both loaded with ``from_pretrained`` from
    the same model identifier. The class offers a helper to cut long text into
    sentence-aligned chunks and a method to translate a single piece of text.
    """

    def __init__(self, model:str):
        """Load the tokenizer and the model.

        Parameters:
        - model (str): Identifier handed to ``AutoTokenizer.from_pretrained``
          and ``AutoModelForSeq2SeqLM.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model)

    @staticmethod
    def split_text_into_chunks(text: str, max_tokens: int) -> List[str]:
        """
        Split a given text into chunks of manageable size based on the maximum number of tokens.
        Splitting at punctuation and concatenating sentences if their total length is less than max tokens.

        "Tokens" here are whitespace-separated words, not tokenizer tokens.
        A single sentence longer than ``max_tokens`` is kept whole as its own
        chunk (it is never cut mid-sentence). Empty chunks are never returned.

        Parameters:
        - text (str): The input text to be split.
        - max_tokens (int): The maximum number of tokens allowed per chunk.

        Returns:
        - List[str]: A list of non-empty text chunks.
        """
        # Split on whitespace that follows '.' or '?', except after patterns
        # such as "e.g." or a capitalised two-letter abbreviation ("Mr.").
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

        chunks = []
        current_chunk = ""
        # Running word count of current_chunk, so it is not re-split each time.
        current_len = 0

        for sentence in sentences:
            sentence_tokens = len(sentence.split())
            if current_len + sentence_tokens <= max_tokens:
                current_chunk += " " + sentence if current_chunk else sentence
                current_len += sentence_tokens
            else:
                # Flush what has been accumulated so far. When the very first
                # sentence is already too long there is nothing to flush, and
                # an empty chunk must not be emitted.
                flushed = current_chunk.strip()
                if flushed:
                    chunks.append(flushed)
                current_chunk = sentence
                current_len = sentence_tokens

        # Whitespace-only leftovers are dropped as well.
        tail = current_chunk.strip()
        if tail:
            chunks.append(tail)

        return chunks


    def translate(self, sample_text):
        """Translate one piece of text with the loaded model.

        The text is tokenized as a batch of one, without asking the tokenizer
        to truncate, so callers should pass suitably short text (see
        ``split_text_into_chunks``).

        Parameters:
        - sample_text (str): Text to translate.

        Returns:
        - list: Decoded generations with special tokens removed; the
          translation of ``sample_text`` is the first element.
        """
        batch = self.tokenizer([sample_text], return_tensors = "pt")

        generated_ids = self.model.generate(**batch)
        tr = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        return tr
            



In [180]:
def translator(content_to_translate_df, outpath, translator):
    """Translate each row's content and append the result to a '|'-separated file.

    Rows whose ``Url`` is already present in the output file are skipped, so
    the function can be re-run to resume an interrupted job. Each new
    translation is appended as ``\\n<Url>|<translated text>``.

    Arguments:
        content_to_translate_df: pandas dataframe
            Must expose ``Url`` and ``Content`` columns. Rows whose
            ``Content`` is not a string are logged and skipped.
        outpath: str
            location where the translations file is stored; it is created on
            the first successful translation if it does not exist yet
        translator: class instance
            Object providing ``split_text_into_chunks(text, max_tokens=...)``
            and ``translate(text)``, the latter returning a sequence whose
            first item is the translated string.
    """
    # Read the already-translated URLs once instead of re-parsing the whole
    # file for every row. Only the first '|'-separated field is needed, so a
    # '|' inside a translation cannot confuse the lookup.
    translated_urls = set()
    try:
        with open(outpath, 'r', encoding='utf-8') as existing:
            for line in existing:
                if line.strip():
                    translated_urls.add(line.rstrip('\n').split('|', 1)[0])
    except FileNotFoundError:
        # First run: nothing translated yet; the set simply stays empty.
        print('No file created yet')

    for row in content_to_translate_df.itertuples():

        url = row.Url
        content_to_translate = row.Content

        if not isinstance(content_to_translate, str):
            logging.info(f"For {url} I couldn't find a content to translate")
            continue

        if url in translated_urls:
            logging.info(f"""{url} already_translated""")
            continue

        print(f"""translating {url}""")
        logging.info(f"""translating {url}""")

        txt = translator.split_text_into_chunks(content_to_translate, max_tokens=150)

        try:
            # Chunks were cut at whitespace, so put a space back between them.
            translated_text = " ".join(translator.translate(t)[0] for t in txt)
        except Exception:
            # Record the failure with its traceback and carry on with the next
            # row. KeyboardInterrupt is deliberately not caught so a long run
            # can still be stopped.
            logging.exception(f"""Could not traslate {url}""")
            continue

        with open(outpath, 'a', encoding='utf-8') as file:
            # Write the pipe-separated record to the file
            file.write('\n' + '|'.join([url, translated_text]))

        translated_urls.add(url)


In [37]:
# Load speeches dataframe
# base_directory is the parent of the current working directory; the paths in
# the config are joined onto it.
base_directory = os.path.dirname(os.path.abspath(os.getcwd()))
file_name = os.path.join(config['meps_speeches_dataframe']['output_dir'], config['meps_speeches_dataframe']['filename'])
speeches_df = pd.read_csv(os.path.join(base_directory, file_name))

# Url must be unique because it is used as the key that identifies a speech
# (nunique ignores missing values, so a missing Url fails this check too).
assert speeches_df['Url'].nunique() == len(speeches_df), "speeches_df['Url'] contains duplicate or missing values"

# Preview goes last so the notebook actually renders it as the cell output.
speeches_df.head()

In [106]:
# Configure logging

logs_dir = os.path.join(base_directory, config['meps_speeches']['logs_dir'])
logs_filename = config['meps_speeches_en_translation']['logs_filename']

# basicConfig opens the log file immediately, which fails if the directory is
# missing (e.g. on a fresh checkout), so make sure it exists first.
os.makedirs(logs_dir, exist_ok=True)

# force=True replaces any handlers already attached to the root logger, so
# re-running this cell takes effect.
logging.basicConfig(
    filename=os.path.join(logs_dir, logs_filename),
    level=logging.INFO, 
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
    )

In [175]:
# Keep only the speeches whose Language column is 'FR'.
speeches = speeches_df[speeches_df['Language'] == 'FR']

out_path = os.path.join(base_directory, config['meps_speeches_en_translation']['output_dir'])
# The translations file is opened in append mode later on; that only works if
# its directory already exists, so create it up front.
os.makedirs(out_path, exist_ok=True)
file_path = os.path.join(out_path, config['meps_speeches_en_translation']['filename'])

# Model matching the language selected above (fr -> en, judging by the name).
model = "Helsinki-NLP/opus-mt-fr-en"
translator_class = HuggingfaceTranslator(model)




In [181]:
# Translate the selected speeches; results are appended to file_path and one
# "translating <url>" line is printed per speech being translated.
translator(speeches, file_path, translator_class)

translating https://www.europarl.europa.eu/doceo/document/CRE-9-2023-11-20-INT-1-171-0000_FR.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2023-07-13-INT-4-025-0000_FR.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2023-06-14-INT-3-188-0000_FR.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2023-05-09-INT-2-429-0000_FR.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2023-03-29-INT-1-165-0000_FR.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2023-03-13-INT-1-108-0000_FR.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2023-03-13-INT-1-133-0000_FR.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2023-01-17-INT-2-115-0000_FR.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2022-12-12-INT-1-058-0000_FR.html
translating https://www.europarl.europa.eu/doceo/document/CRE-9-2022-12-12-INT-1-060-0000_FR.html
translating https://