In [14]:
%load_ext autoreload
%autoreload 2

import pprint
import pandas as pd
import yaml
import os

import sys 
sys.path.append("../src")

from llama_index import SimpleDirectoryReader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from utils import join_csv_files, split_text_into_chunks

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from dotenv import load_dotenv, find_dotenv
import openai

# Locate a .env file and load the variables it defines into the environment
_ = load_dotenv(find_dotenv())

# Hand the OpenAI key to the client (None when the variable is not set)
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [10]:
# Read the project configuration. The encoding is given explicitly so the
# YAML file is decoded the same way whatever the platform's default locale is.
with open('../config/config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Get Data

In [6]:
# Parent of the current working directory; the config paths are joined onto it
base_directory = os.path.dirname(os.path.abspath(os.getcwd()))

# Relative locations of the input files, as given in the config
_speeches_cfg = config['meps_speeches_dataframe']
_translation_cfg = config['meps_speeches_en_translation']
input_path = os.path.join(_speeches_cfg['output_dir'], _speeches_cfg['filename'])
input_path_tr = os.path.join(_translation_cfg['output_dir'], _translation_cfg['filename'])
input_meps_list = config['meps_list']['output_dir']

# Read the speeches, and the pipe-separated translations file whose column
# names are supplied explicitly here
speeches = pd.read_csv(os.path.join(base_directory, input_path))
translations = pd.read_csv(
    os.path.join(base_directory, input_path_tr),
    sep='|',
    names=['Url', 'Translation'],
)

# Left join on Url: every speech row is kept, with a translation attached
# wherever one exists
df = speeches.merge(translations, on='Url', how='left')

In [7]:
# Upper-case the column names first. Doing the rename before the substitution
# (and referring only to the upper-case names below) makes this cell safe to
# re-run: a second execution no longer looks for the pre-rename column names.
df.columns = [x.upper() for x in df.columns]

# Speeches delivered in English need no translation: take CONTENT for rows
# whose LANGUAGE is 'EN' and keep the existing TRANSLATION for all other rows.
# Series.where keeps CONTENT where the condition holds and falls back to
# TRANSLATION elsewhere, replacing the previous row-by-row Python loop.
df['TRANSLATION'] = df['CONTENT'].where(df['LANGUAGE'] == 'EN', df['TRANSLATION'])

df.head()

Unnamed: 0,MP,DATE,LANGUAGE,TITLE,URL,CONTENT,COUNTRY,POLITICALGROUP,ID,NATIONALPOLITICALGROUP,TRANSLATION
0,Daniel CASPARY,2023-11-21,DE,EU/New Zealand Free Trade Agreement (debate),https://www.europarl.europa.eu/doceo/document/...,"Daniel Caspary, Berichterstatter. – Herr Pr...",Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands,
1,Daniel CASPARY,2023-11-21,DE,EU/New Zealand Free Trade Agreement (debate),https://www.europarl.europa.eu/doceo/document/...,"Daniel Caspary (PPE), Frage nach dem Verfah...",Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands,
2,Daniel CASPARY,2023-11-21,DE,EU/New Zealand Free Trade Agreement (debate),https://www.europarl.europa.eu/doceo/document/...,"Daniel Caspary, Berichterstatter. – Frau Pr...",Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands,
3,Daniel CASPARY,2023-11-08,DE,Urgent need for immediate measures against the...,https://www.europarl.europa.eu/doceo/document/...,"Daniel Caspary, im Namen der PPE-Fraktion. ...",Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands,
4,Daniel CASPARY,2023-10-18,DE,The despicable terrorist attacks by Hamas agai...,https://www.europarl.europa.eu/doceo/document/...,Daniel Caspary (PPE). – Herr Präsident! De...,Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands,


# RAG with LlamaIndex

In [1]:
#df[~df['TRANSLATION'].isnull()]['LANGUAGE'].value_counts()

In [8]:
# Restrict the RAG corpus to rows whose COUNTRY is Italy and use their
# English text (rows with no TRANSLATION value are skipped)
tt = df.loc[df['COUNTRY'] == 'Italy']

# Concatenate the remaining texts into one string, separated by blank lines
g = "\n\n".join(tt['TRANSLATION'].dropna())

In [11]:
from llama_index import Document, ServiceContext, VectorStoreIndex
from llama_index.llms import OpenAI

# The whole corpus string is wrapped in a single Document
document = Document(text=g)

# LLM used by the service context; a low temperature is requested
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Pair the LLM with the embedding model named below
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
)

# Build the vector index from the single document
index = VectorStoreIndex.from_documents(
    [document],
    service_context=service_context,
)

In [12]:
# Query engine over the index, configured with similarity_top_k=10
query_engine = index.as_query_engine(
    similarity_top_k=10,
)

In [18]:
# Natural-language question put to the query engine
YOUR_QUERY = """

    Find Meps who ask for a curbing on immigration and summarize their position
"""

response = query_engine.query(YOUR_QUERY)

# Pretty-print the string form of the response
pprint.pprint(str(response))

('Susanna Ceccardi (ID) and Vincenzo Sofo (ECR) are MEPs who emphasize the '
 'need to curb immigration by addressing irregular migration, promoting '
 "orderly entry, and protecting Europe's borders. They prioritize preventing "
 'illegal immigration, saving lives in the Mediterranean, and enabling member '
 'states to regulate entry into their territories.')


In [19]:
#chat_engine = index.as_chat_engine()
#response = chat_engine.chat(YOUR_QUERY)