In [34]:
%load_ext autoreload
%autoreload 2

import pprint
import pandas as pd
import yaml
import os

import sys 
sys.path.append("../src")

from llama_index import SimpleDirectoryReader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from utils import join_csv_files, split_text_into_chunks

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
from dotenv import load_dotenv, find_dotenv
import openai

# Locate the nearest .env file and load its entries into the process
# environment (the boolean result is discarded), then pass the OpenAI key
# on to the client library. The key is None when the variable is not set.
_ = load_dotenv(find_dotenv())
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [36]:
# Parse the project-level YAML configuration into a plain dict; the path is
# relative to the notebook's working directory.
with open('../config/config.yaml') as config_file:
    config = yaml.safe_load(config_file)

# Get Data

In [37]:
# Config sections describing where the speeches and their translations live
speeches_cfg = config['meps_speeches_dataframe']
translations_cfg = config['meps_speeches_en_translation']

# NOTE: despite its name this is the PARENT of the working directory; the data
# files below are resolved one level further up (the grandparent of the cwd).
current_directory = os.path.dirname(os.path.abspath(os.getcwd()))
data_root = os.path.dirname(current_directory)

# Speeches Paths (relative to `data_root`)
input_path = os.path.join(speeches_cfg['output_dir'], speeches_cfg['filename'])
input_path_tr = os.path.join(translations_cfg['output_dir'], translations_cfg['filename'])
input_meps_list = config['meps_list']['output_dir']

# Speeches Dataframe
speeches = pd.read_csv(os.path.join(data_root, input_path))

# Translations are pipe-separated; column names are supplied explicitly here
translations = pd.read_csv(
    os.path.join(data_root, input_path_tr),
    sep='|',
    names=['Url', 'Translation'],
)

# Left join on Url: every speech is kept, with Translation missing where no
# translation row matches.
df = speeches.merge(translations, on='Url', how='left')

In [8]:
# Column names in capital letters.
# `rename` returns a new frame and upper-casing already upper-cased names is a
# no-op, so this cell can be re-run safely. (Renaming in place AFTER reading
# the mixed-case `Language`/`Content` attributes made a second run fail with
# AttributeError.)
df = df.rename(columns=str.upper)

# If language is English replace column TRANSLATION with the actual CONTENT.
# English speeches need no translation, so their original text is used; every
# other row keeps its existing TRANSLATION value (which may be missing).
# Vectorized replacement for the former row-by-row `itertuples` loop.
df['TRANSLATION'] = df['CONTENT'].where(df['LANGUAGE'] == 'EN', df['TRANSLATION'])

df.head(2)

Unnamed: 0,MP,DATE,LANGUAGE,TITLE,URL,CONTENT,COUNTRY,POLITICALGROUP,ID,NATIONALPOLITICALGROUP,TRANSLATION
0,Daniel CASPARY,2023-11-21,DE,EU/New Zealand Free Trade Agreement (debate),https://www.europarl.europa.eu/doceo/document/...,"Daniel Caspary, Berichterstatter. – Herr Pr...",Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands,
1,Daniel CASPARY,2023-11-21,DE,EU/New Zealand Free Trade Agreement (debate),https://www.europarl.europa.eu/doceo/document/...,"Daniel Caspary (PPE), Frage nach dem Verfah...",Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands,


# RAG with LlamaIndex

In [1]:
#df[~df['TRANSLATION'].isnull()]['LANGUAGE'].value_counts()

In [9]:
# Restrict the RAG corpus to rows whose COUNTRY is Italy
tt = df.loc[df['COUNTRY'] == 'Italy']

# Concatenate every non-missing English text into a single string, with a
# blank line between consecutive speeches.
g = "\n\n".join(tt['TRANSLATION'].dropna())

In [10]:
from llama_index import Document
from llama_index import VectorStoreIndex
from llama_index import ServiceContext
from llama_index.llms import OpenAI

# Answer generation uses gpt-3.5-turbo at low temperature; embeddings come
# from the "local:BAAI/bge-small-en-v1.5" embedding model.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
)

# The whole corpus string `g` is wrapped as one Document and indexed.
document = Document(text=g)
index = VectorStoreIndex.from_documents(
    [document],
    service_context=service_context,
)

In [11]:
# Value passed as `similarity_top_k` when building the query engine
SIMILARITY_TOP_K = 10

query_engine = index.as_query_engine(similarity_top_k=SIMILARITY_TOP_K)

In [38]:
# Free-text question sent to the query engine; edit the text between the
# triple quotes to ask something else.
YOUR_QUERY = """

    Which MEP made an emotional speech? What was the topic?

"""

response = query_engine.query(YOUR_QUERY)

# pprint wraps the long answer string across several lines for readability
answer_text = str(response)
pprint.pprint(answer_text)

('Pina Picierno, a Member of the European Parliament, made an emotional '
 'speech. The topic of her speech was about the urgency of addressing violence '
 'against women and the need for immediate action.')


In [39]:
# Text of each source node attached to the response, for checking the answer
# against the underlying speeches.
[node.text for node in response.source_nodes]

["Salvatore De Meo (PPE). - (IT) Mr President, ladies and gentlemen, with the proposal for the revision of the European Treaties, today we are starting to write a new important page in the history of Europe and of our institution.European integration is a step-by-step process which has been stalled for too long now, since the Treaty of Lisbon of 2007 which needs to be relaunched today with strength and conviction. Our proposal for a revision offers many ideas for a debate on the future of our Europe, starting with the need to strengthen our democracy through, for example, the recognition of Parliament's legislative initiative or the role of co-legislator on the budget, but we must also look at the role of the Union on the international stage and increase our credibility as a world player, through, for example, a European Defence Community. This report is only the beginning of a long and complex path that we hope the Member States will have the courage to undertake, putting aside prejud