In [72]:
%load_ext autoreload
%autoreload 2

import pprint
import pandas as pd
import yaml
import os
import logging

import sys 
sys.path.append("../src")

from llama_index import SimpleDirectoryReader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from utils import join_csv_files, split_text_into_chunks

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [146]:
!pip install update llama_index

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting update
  Downloading update-0.0.1-py2.py3-none-any.whl (2.9 kB)
Collecting style==1.1.0 (from update)
  Downloading style-1.1.0-py2.py3-none-any.whl (6.4 kB)
Installing collected packages: style, update
Successfully installed style-1.1.0 update-0.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [79]:
# Parse the project configuration (paths of the scraped / translated data).
config_path = '../config/config.yaml'
with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)

# Echo the parsed settings as the cell output.
config

{'meps_list': {'output_dir': 'data/country_meps'},
 'meps_speeches': {'europal_website': 'https://www.europarl.europa.eu',
  'href_root': 'https://www.europarl.europa.eu/meps/en/',
  'output_dir': 'data/meps_plenary_speeches',
  'logs_dir': 'logs',
  'logs_filename': 'scraping_speeches.txt'},
 'meps_speeches_dataframe': {'output_dir': 'data/meps_speeches_dataframe',
  'filename': 'meps_speeches.csv'},
 'meps_speeches_en_translation': {'output_dir': 'data/meps_speeches_en_translations',
  'filename': 'speeches_translations.txt',
  'logs_filename': 'translating_speeches.txt'}}

### Get Data


In [91]:
# Parent of the current working directory; every data path below is resolved
# against it.
base_directory = os.path.dirname(os.path.abspath(os.getcwd()))

# Relative locations of the inputs, as declared in the config file.
speeches_cfg = config['meps_speeches_dataframe']
translations_cfg = config['meps_speeches_en_translation']
input_path = os.path.join(speeches_cfg['output_dir'], speeches_cfg['filename'])
input_path_tr = os.path.join(translations_cfg['output_dir'], translations_cfg['filename'])
input_meps_list = config['meps_list']['output_dir']

# Raw inputs: speeches, pipe-separated (Url | Translation) rows without a
# header line, and the MEP list assembled by `join_csv_files`.
speeches = pd.read_csv(os.path.join(base_directory, input_path))
translations = pd.read_csv(
    os.path.join(base_directory, input_path_tr),
    sep='|',
    names=['Url', 'Translation'],
)
list_of_meps = join_csv_files(os.path.join(base_directory, input_meps_list))

# Left-join translations on the speech URL, then MEP metadata on the MEP name
# (the MEP list calls that column `fullName`).
meps_metadata = list_of_meps.rename(columns={'fullName': 'MP'})
df = (
    speeches
    .merge(translations, on='Url', how='left')
    .merge(meps_metadata, on='MP', how='left')
)

In [95]:
# Speeches delivered in English need no translation: use their original
# Content as the Translation and keep the existing Translation for every other
# language. `Series.where` keeps Content where the condition holds and falls
# back to Translation elsewhere, replacing the former row-by-row loop.
df['Translation'] = df['Content'].where(df['Language'] == 'EN', df['Translation'])

In [195]:
# Normalise every column label to upper case; cells below must therefore use
# the upper-case labels.
df.columns = df.columns.map(str.upper)

# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,MP,DATE,LANGUAGE,TITLE,URL,CONTENT,TRANSLATION,COUNTRY,POLITICALGROUP,ID,NATIONALPOLITICALGROUP
0,Daniel CASPARY,2023-11-21,DE,EU/New Zealand Free Trade Agreement (debate),https://www.europarl.europa.eu/doceo/document/...,"Daniel Caspary, Berichterstatter. – Herr Pr...",,Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands
1,Daniel CASPARY,2023-11-21,DE,EU/New Zealand Free Trade Agreement (debate),https://www.europarl.europa.eu/doceo/document/...,"Daniel Caspary (PPE), Frage nach dem Verfah...",,Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands
2,Daniel CASPARY,2023-11-21,DE,EU/New Zealand Free Trade Agreement (debate),https://www.europarl.europa.eu/doceo/document/...,"Daniel Caspary, Berichterstatter. – Frau Pr...",,Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands
3,Daniel CASPARY,2023-11-08,DE,Urgent need for immediate measures against the...,https://www.europarl.europa.eu/doceo/document/...,"Daniel Caspary, im Namen der PPE-Fraktion. ...",,Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands
4,Daniel CASPARY,2023-10-18,DE,The despicable terrorist attacks by Hamas agai...,https://www.europarl.europa.eu/doceo/document/...,Daniel Caspary (PPE). – Herr Präsident! De...,,Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands


# RAG with LlamaIndex

In [102]:
# The column labels were upper-cased in the cell above, so the frame has to be
# addressed with 'COUNTRY' / 'TRANSLATION'; the lower/mixed-case labels raise a
# KeyError when the notebook is run top to bottom.
tt = df[df['COUNTRY'] == 'Italy']

# One text blob with all available (non-null) English texts of Italian MEPs,
# separated by blank lines.
g = "\n\n".join(tt['TRANSLATION'].dropna().tolist())


In [103]:
from llama_index import Document

# Wrap the concatenated speech text `g` in a single llama_index Document.
document = Document(text=g)

In [None]:
# NOTE(review): scratch cell with no execution count — it constructs a Document
# without arguments and nothing below refers to the result; consider deleting it.
Document()

In [126]:
from llama_index import VectorStoreIndex
from llama_index import ServiceContext
from llama_index.llms import OpenAI

# Chat model handed to the service context, with a low sampling temperature.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Bundle the LLM with the BAAI/bge-small-en-v1.5 embedding model (the `local:`
# prefix is presumably what makes llama_index run it locally — TODO confirm).
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model="local:BAAI/bge-small-en-v1.5"
)

# Index the single concatenated document. The former `top_k = 5` argument was
# dropped: it is not an option of `from_documents` and had no effect; the
# number of retrieved nodes is chosen when the query engine is created
# (`similarity_top_k`).
index = VectorStoreIndex.from_documents(
    [document],
    service_context=service_context,
)

In [148]:
# Query engine over `index`; `similarity_top_k=5` is passed to the engine —
# presumably the number of nodes retrieved per query (TODO confirm in the
# llama_index docs).
query_engine = index.as_query_engine(similarity_top_k=5)

In [140]:
# NOTE(review): exploratory introspection — shows the last 20 attribute names
# of the query engine. No later cell uses the result; consider removing.
dir(query_engine)[-20:]

['_get_prompt_modules',
 '_get_prompts',
 '_node_postprocessors',
 '_query',
 '_response_synthesizer',
 '_retriever',
 '_update_prompts',
 '_validate_prompts',
 'aquery',
 'aretrieve',
 'asynthesize',
 'callback_manager',
 'from_args',
 'get_prompts',
 'query',
 'retrieve',
 'retriever',
 'synthesize',
 'update_prompts',
 'with_retriever']

In [181]:


# Ask the RAG pipeline a question and print the synthesized answer text.
question = "What was the opinion of Meps on Tunisia"
response = query_engine.query(question)

print(response)

MEPs expressed their deep concern about the situation in Tunisia, highlighting the suspension of democracy, the persecution of political leaders and activists, and the economic and social challenges faced by the country. They stressed the urgent need for the restoration of democracy and the establishment of the Constitutional Court. Additionally, they emphasized the importance of the European Union providing support to Tunisia, while ensuring that this support is contingent upon the return of democracy.


In [182]:
import pprint 

# Pretty-print the whole Response object (answer text plus its source nodes).
pprint.pprint(response)

Response(response='MEPs expressed their deep concern about the situation in '
                  'Tunisia, highlighting the suspension of democracy, the '
                  'persecution of political leaders and activists, and the '
                  'economic and social challenges faced by the country. They '
                  'stressed the urgent need for the restoration of democracy '
                  'and the establishment of the Constitutional Court. '
                  'Additionally, they emphasized the importance of the '
                  'European Union providing support to Tunisia, while ensuring '
                  'that this support is contingent upon the return of '
                  'democracy.',
         source_nodes=[NodeWithScore(node=TextNode(id_='317b6e34-226a-40a3-9fbd-64aa8f26a4a3', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='3ffe0fe5-ac1b-4892-ad

In [161]:
# Show the text of the first source node attached to the response.
pprint.pprint(response.source_nodes[0].text)

('Susanna Ceccardi (ID). - (IT) Madam President, ladies and gentlemen, I have '
 'listened to many fine words, but I would like to tell you a story. In Pisa, '
 'in the 2000s - in Pisa and Tuscany she ruled the left for many years - with '
 'the funds of the European Community for about one billion of the old lire, '
 '500 000 EUR, houses were built to overcome a Roma camp that had hundreds and '
 'hundreds of inhabitants. Well, what happened? That these villas were built, '
 'among other things very beautiful, completely disassembled to sell the '
 'pieces from the same inhabitants, from the same occupants of the villas, and '
 'a few years ago in that village arrested an entire family that had forced a '
 '15-year-old to a forced marriage and had practically kept it secreted with '
 'violence and unspeakable abuse. Here is that project, which then gave life '
 'to an incredible illegality, was financed with the money of the European '
 "Community.But I ask myself and ask you: don't y

# Sentence Window

In [175]:
from llama_index import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import load_index_from_storage
import os


def build_sentence_window_index(
    document,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index",
    window_size=3,
):
    """Build a sentence-window vector index, or reload it from ``save_dir``.

    Args:
        document: The single document to index (wrapped in a list for
            ``VectorStoreIndex.from_documents``).
        llm: LLM passed to ``ServiceContext.from_defaults``.
        embed_model: Embedding model passed to ``ServiceContext.from_defaults``.
        save_dir: Directory used as on-disk cache. If it already exists the
            index is loaded from it and ``document`` is NOT re-indexed;
            otherwise the index is built and persisted there.
        window_size: Forwarded to ``SentenceWindowNodeParser.from_defaults``.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        The freshly built or reloaded index.
    """
    # The parser stores its context under the "window" metadata key and the
    # original sentence under "original_text".
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=node_parser,
    )

    if os.path.exists(save_dir):
        # Cache hit: reuse whatever was persisted earlier. Delete `save_dir`
        # to force a rebuild after changing the document or the window size.
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=sentence_context,
        )

    sentence_index = VectorStoreIndex.from_documents(
        [document], service_context=sentence_context
    )
    sentence_index.storage_context.persist(persist_dir=save_dir)
    return sentence_index


def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,
    rerank_top_n=2,
):
    """Create a query engine for a sentence-window index.

    Two node post-processors are attached, in this order: a
    ``MetadataReplacementPostProcessor`` targeting the "window" metadata key,
    then a ``SentenceTransformerRerank`` using "BAAI/bge-reranker-base".

    Args:
        sentence_index: Index providing ``as_query_engine``.
        similarity_top_k: Passed through to ``as_query_engine``.
        rerank_top_n: ``top_n`` given to the re-ranker.

    Returns:
        The query engine returned by ``sentence_index.as_query_engine``.
    """
    node_postprocessors = [
        MetadataReplacementPostProcessor(target_metadata_key="window"),
        SentenceTransformerRerank(top_n=rerank_top_n, model="BAAI/bge-reranker-base"),
    ]
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=node_postprocessors,
    )

In [177]:
# Build the sentence-window index for `document`. The function loads the index
# from `save_dir` instead of rebuilding it when that directory already exists.
sentence_index = build_sentence_window_index(
    document,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index"
)

KeyboardInterrupt: 

In [None]:
# Query engine over the sentence-window index, with the function's defaults
# (similarity_top_k=6, rerank_top_n=2).
sentence_window_engine = get_sentence_window_query_engine(sentence_index)

In [None]:
#!pip install python-dotenv


import os
from dotenv import load_dotenv, find_dotenv

import numpy as np
from trulens_eval import (
    Feedback,
    TruLlama,
    OpenAI
)

from trulens_eval.feedback import Groundedness
import nest_asyncio

# Apply the nest_asyncio patch — presumably needed because the notebook kernel
# already runs an event loop (TODO confirm).
nest_asyncio.apply()


def get_openai_api_key():
    """Return the value of ``OPENAI_API_KEY`` after loading the .env file.

    The file located by ``find_dotenv()`` is passed to ``load_dotenv`` first;
    the result is ``None`` when the variable is not set.
    """
    dotenv_path = find_dotenv()
    load_dotenv(dotenv_path)
    return os.environ.get("OPENAI_API_KEY")


def get_hf_api_key():
    """Return the value of ``HUGGINGFACE_API_KEY`` after loading the .env file.

    The file located by ``find_dotenv()`` is passed to ``load_dotenv`` first;
    the result is ``None`` when the variable is not set.
    """
    dotenv_path = find_dotenv()
    load_dotenv(dotenv_path)
    return os.environ.get("HUGGINGFACE_API_KEY")

# Feedback provider used by every feedback function below.
# NOTE(review): `OpenAI` here is the class imported from trulens_eval in this
# cell; that import rebinds the `OpenAI` name an earlier cell imported from
# llama_index.llms.
openai = OpenAI()

# "Answer Relevance": relevance_with_cot_reasons applied to the app's input
# and output.
qa_relevance = (
    Feedback(openai.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input_output()
)

# "Context Relevance": relevance_with_cot_reasons applied to the input and the
# text of the selected source nodes, aggregated with np.mean.
qs_relevance = (
    Feedback(openai.relevance_with_cot_reasons, name = "Context Relevance")
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

#grounded = Groundedness(groundedness_provider=openai, summarize_provider=openai)
grounded = Groundedness(groundedness_provider=openai)

# "Groundedness": groundedness_measure_with_cot_reasons applied to the source
# node text and the output, aggregated with grounded_statements_aggregator.
groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
        .on(TruLlama.select_source_nodes().node.text)
        .on_output()
        .aggregate(grounded.grounded_statements_aggregator)
)

# Module-level list read by get_prebuilt_trulens_recorder.
feedbacks = [qa_relevance, qs_relevance, groundedness]

def get_trulens_recorder(query_engine, feedbacks, app_id):
    """Wrap ``query_engine`` in a ``TruLlama`` recorder.

    Args:
        query_engine: Engine handed to ``TruLlama`` as its first argument.
        feedbacks: Feedback functions passed as ``feedbacks``.
        app_id: Identifier passed as ``app_id``.

    Returns:
        The ``TruLlama`` instance.
    """
    return TruLlama(query_engine, app_id=app_id, feedbacks=feedbacks)

def get_prebuilt_trulens_recorder(query_engine, app_id):
    """Wrap ``query_engine`` in a ``TruLlama`` recorder using the module-level
    ``feedbacks`` list.

    Args:
        query_engine: Engine handed to ``TruLlama`` as its first argument.
        app_id: Identifier passed as ``app_id``.

    Returns:
        The ``TruLlama`` instance.
    """
    recorder_options = {"app_id": app_id, "feedbacks": feedbacks}
    return TruLlama(query_engine, **recorder_options)

from llama_index import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import load_index_from_storage
import os


# NOTE(review): this function is also defined in the "Sentence Window" section
# of this notebook; whichever cell runs last wins. Keep a single copy.
def build_sentence_window_index(
    document,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index",
    window_size=3,
):
    """Build a sentence-window vector index, or reload it from ``save_dir``.

    Args:
        document: The single document to index (wrapped in a list for
            ``VectorStoreIndex.from_documents``).
        llm: LLM passed to ``ServiceContext.from_defaults``.
        embed_model: Embedding model passed to ``ServiceContext.from_defaults``.
        save_dir: Directory used as on-disk cache. If it already exists the
            index is loaded from it and ``document`` is NOT re-indexed;
            otherwise the index is built and persisted there.
        window_size: Forwarded to ``SentenceWindowNodeParser.from_defaults``.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        The freshly built or reloaded index.
    """
    # The parser stores its context under the "window" metadata key and the
    # original sentence under "original_text".
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=node_parser,
    )

    if os.path.exists(save_dir):
        # Cache hit: reuse whatever was persisted earlier. Delete `save_dir`
        # to force a rebuild after changing the document or the window size.
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=sentence_context,
        )

    sentence_index = VectorStoreIndex.from_documents(
        [document], service_context=sentence_context
    )
    sentence_index.storage_context.persist(persist_dir=save_dir)
    return sentence_index


# NOTE(review): this function is also defined in the "Sentence Window" section
# of this notebook; whichever cell runs last wins. Keep a single copy.
def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,
    rerank_top_n=2,
):
    """Create a query engine for a sentence-window index.

    Two node post-processors are attached, in this order: a
    ``MetadataReplacementPostProcessor`` targeting the "window" metadata key,
    then a ``SentenceTransformerRerank`` using "BAAI/bge-reranker-base".

    Args:
        sentence_index: Index providing ``as_query_engine``.
        similarity_top_k: Passed through to ``as_query_engine``.
        rerank_top_n: ``top_n`` given to the re-ranker.

    Returns:
        The query engine returned by ``sentence_index.as_query_engine``.
    """
    node_postprocessors = [
        MetadataReplacementPostProcessor(target_metadata_key="window"),
        SentenceTransformerRerank(top_n=rerank_top_n, model="BAAI/bge-reranker-base"),
    ]
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=node_postprocessors,
    )


from llama_index.node_parser import HierarchicalNodeParser

from llama_index.node_parser import get_leaf_nodes
from llama_index import StorageContext
from llama_index.retrievers import AutoMergingRetriever
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.query_engine import RetrieverQueryEngine


def build_automerging_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="merging_index",
    chunk_sizes=None,
):
    """Build an auto-merging (hierarchical) vector index, or reload it.

    Args:
        documents: Documents to split with ``HierarchicalNodeParser``.
        llm: LLM passed to ``ServiceContext.from_defaults``.
        embed_model: Embedding model passed to ``ServiceContext.from_defaults``.
        save_dir: Directory used as on-disk cache. If it already exists the
            index is loaded from it and ``documents`` / ``chunk_sizes`` are
            not used; otherwise the index is built and persisted there.
        chunk_sizes: Chunk sizes for the hierarchical parser; a falsy value
            selects ``[2048, 512, 128]``.

    Returns:
        The freshly built or reloaded index.
    """
    merging_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
    )

    if os.path.exists(save_dir):
        # Cache hit: the persisted index is reused as-is, so the documents are
        # no longer parsed needlessly before loading.
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=merging_context,
        )

    chunk_sizes = chunk_sizes or [2048, 512, 128]
    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
    nodes = node_parser.get_nodes_from_documents(documents)
    leaf_nodes = get_leaf_nodes(nodes)

    # All nodes go into the docstore, while only the leaf nodes are handed to
    # the vector index.
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    automerging_index = VectorStoreIndex(
        leaf_nodes, storage_context=storage_context, service_context=merging_context
    )
    automerging_index.storage_context.persist(persist_dir=save_dir)
    return automerging_index


def get_automerging_query_engine(
    automerging_index,
    similarity_top_k=12,
    rerank_top_n=2,
):
    """Create a query engine on top of an auto-merging index.

    The index's own retriever (``similarity_top_k`` nodes) is wrapped in a
    verbose ``AutoMergingRetriever`` that is given the index's storage
    context. A ``SentenceTransformerRerank`` ("BAAI/bge-reranker-base",
    ``top_n=rerank_top_n``) is attached as the only node post-processor.

    Returns:
        The engine built by ``RetrieverQueryEngine.from_args``.
    """
    merging_retriever = AutoMergingRetriever(
        automerging_index.as_retriever(similarity_top_k=similarity_top_k),
        automerging_index.storage_context,
        verbose=True,
    )
    reranker = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )
    return RetrieverQueryEngine.from_args(
        merging_retriever, node_postprocessors=[reranker]
    )
