# Part 1: Pipeline to generate LLM summaries

In [24]:
#import libraries
import os
import dotenv
from datasets import Dataset 

dotenv.load_dotenv();

### Define the functions for the LLM summarisation pipeline

In [2]:
def attend_prompt(
    text,
    questions,
    model="gpt-3.5-turbo",
    temperature=0.3,
    max_tokens=1000,
    llm_client=None,
):
    """
    Answers a list of questions about one context string with a chat-completion LLM.

    Every question is sent to the model in its own request, together with the full
    context string. The answers are collected in the same order as the questions.

    Parameters:
    - text (str): The context or background information the model should base its answers on.
    - questions (list of str): The questions for which answers have to be generated.
    - model (str): Name of the chat model to query. Defaults to "gpt-3.5-turbo".
    - temperature (float): Sampling temperature passed to the API. Defaults to 0.3.
    - max_tokens (int): Maximum number of tokens generated per answer. Defaults to 1000.
    - llm_client: Optional object exposing `chat.completions.create(...)`. When None (default),
      the notebook-level `client` is used, so existing two-argument calls keep working.

    Returns:
    - list of str: One answer per question, in the order the questions were given.
    """
    responses = []
    for question in questions:
        # Resolved inside the loop so that an empty question list needs no client at all.
        active_client = client if llm_client is None else llm_client

        # The context and the question are sent together as a single user message.
        combined_content = f"context: {text}\n\nUser: {question}"

        chatgpt_output = active_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": combined_content}],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        gpt_output = chatgpt_output.to_dict()
        # Only the text of the first returned choice is kept.
        responses.append(gpt_output['choices'][0]['message']['content'])

    return responses

In [3]:
def summary_prompt(
    instruction,
    info_to_summarise,
    model="gpt-3.5-turbo",
    temperature=0.3,
    max_tokens=1000,
    llm_client=None,
):
    """
    Prompts the LLM to create a summary of previously extracted key information.

    Parameters:
    - instruction (str): The instruction that prompts the LLM to summarise the information.
    - info_to_summarise (list[list[str]]): A nested list where each inner list contains the key
      information extracted from one document in the multi-document corpus. It is inserted into
      the prompt through an f-string, i.e. as its Python string representation.
    - model (str): Name of the chat model to query. Defaults to "gpt-3.5-turbo".
    - temperature (float): Sampling temperature passed to the API. Defaults to 0.3.
    - max_tokens (int): Maximum number of tokens in the summary. Defaults to 1000.
    - llm_client: Optional object exposing `chat.completions.create(...)`. When None (default),
      the notebook-level `client` is used, so existing two-argument calls keep working.

    Returns:
    - str: The summary generated by the LLM based on the key information.
    """
    # Fall back to the notebook-level client when no explicit client is supplied.
    active_client = client if llm_client is None else llm_client

    # The information and the instruction are sent together as a single user message.
    combined_content = f"Information: {info_to_summarise}\n\nUser: {instruction}"

    chatgpt_output = active_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": combined_content}],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    gpt_output = chatgpt_output.to_dict()
    # Only the text of the first returned choice is used as the summary.
    return gpt_output['choices'][0]['message']['content']

In [4]:
def retrieve_info(data, column_name, prompts):
    """
    Runs attend_prompt on every document stored in one column of the dataset.

    Parameters:
    data (DataFrame): The dataframe containing the documents in your text corpus.
    column_name (str): The name of the column where the documents are stored.
    prompts (list(str)): The questions attend_prompt has to answer for each document.

    Returns:
    list (list[list(str)]): One inner list per document, holding the responses to every
    question in `prompts` for that document.
    """
    documents = data[column_name]

    # One attend_prompt call per document, keeping the order of the column.
    return [attend_prompt(document, prompts) for document in documents]

## Example usage

### Step 0: create your dataset

In [5]:
"""
To illustrate how this pipeline can be used a dummy dataset is downloaded from S3.
This dummy dataset acts as a multi-document corpus, which needs to be summarised.
"""

from discovery_utils.utils import s3
# Create the S3 client that is handed to the download call below.
client_s = s3.s3_client()

# Download the dummy speeches CSV from the raw-data bucket.
# download_as="dataframe" presumably yields a pandas DataFrame (later cells read
# df_speeches['speech']) -- TODO confirm against the discovery_utils helper.
# NOTE(review): _download_obj is underscore-prefixed, i.e. private by convention;
# confirm whether a public download helper should be used instead.
df_speeches = s3._download_obj(
    s3_client=client_s,
    bucket=s3.BUCKET_NAME_RAW,
    path_from="data/policy_scanning_data/dummy_speeches.csv",
    download_as="dataframe"
)

In [6]:
# The documents to summarise are stored in the 'speech' column of the DataFrame df_speeches.
# Print every document in full to inspect the corpus before running the pipeline.
for item in df_speeches['speech']:
    print(item)

my hon. friend makes an important point. we want to go with the grain of human nature, which means that, when it is time to replace a gas boiler, the heat pump is a competitive option in terms of price. that is why we think the cost of heat pumps can reduce by 25% to 50% by 2025. we have our £450 million boiler upgrade scheme to provide capital grants of up to £6,000, and that is in addition to the zero per cent rate of vat on installation. This speech was held in 2022 by Greh Hands. Party: None
i welcome this debate and congratulate the hon. member for brighton, pavilion (caroline lucas) not only on securing it but on all the work she has done over many years to bring environmental issues to the fore in this house. i also thank my right hon. friend the member for newcastle upon tyne east (mr brown) for drawing attention to the fact that, on may day 2019, this house became the first parliament in the world to declare a climate emergency, which i am pleased to say many local authorities

### Step 1: Define the prompts

In [7]:
# Define the questions that attend_prompt answers for every document in your corpus.
# The order of the questions determines the order of the answers in each inner list
# returned by retrieve_info.
prompts = ['what is the context in which heat pumps are mentioned given the speech?', 
            'What is the name of the speaker?', 
            'In what year was the speech given?',
            'To which party does the speaker belong?']

### Step 2: Run the attend_prompt for every item in your dataframe

In [13]:
# Create the OpenAI client that attend_prompt and summary_prompt use by default
# (both read the notebook-level name `client`).
# The openai module is imported through langfuse.openai -- presumably so that the
# calls are traced in Langfuse; TODO confirm.
from langfuse.openai import openai
# The API key is read from the OPENAI_API_KEY environment variable; a missing
# variable raises KeyError here.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [15]:
# Apply attend_prompt to every document in the 'speech' column.
# key_info is a nested list: one inner list per document, one answer per question in `prompts`.
key_info = retrieve_info(df_speeches, 'speech', prompts)

In [26]:
# Display the extracted answers (one inner list per document).
key_info

[['In the speech, heat pumps are mentioned in the context of promoting their use as a competitive option for replacing gas boilers. The speaker emphasizes the importance of aligning with human nature and making heat pumps more affordable in order to encourage their adoption. The government has implemented a boiler upgrade scheme and zero per cent VAT rate on installation to support the transition to heat pumps.',
  "The speaker's name is Greh Hands.",
  'The speech was given in 2022.',
  'The speaker, Greh Hands, does not belong to any specific party as mentioned in the context provided.'],
 ['The mention of heat pumps in the speech is in the context of discussing local initiatives and actions taken by Islington Council to address environmental issues and promote sustainability. Heat pumps are highlighted as a specific example of a technology being utilized to meet the passive house standard and contribute to the green agenda. The speech emphasizes the importance of local authorities h

### Step 3: Run the summary prompt


In [16]:
# Pass the combined key information from all documents to summary_prompt,
# together with the instruction to summarise it.
summary_key_info = summary_prompt("summarize this information and make sure to include everything that is important",key_info)

In [17]:
# Display the summary text returned by summary_prompt.
summary_key_info

"The information provided includes speeches by different speakers discussing the promotion and adoption of heat pumps as a sustainable heating option to reduce carbon emissions and combat climate change. The speakers emphasize the importance of making heat pumps competitive in terms of price, with initiatives such as the boiler upgrade scheme and capital grants to incentivize their transition. Heat pumps are seen as practical and efficient ways to reduce carbon emissions and promote renewable energy sources in local communities. The government's heat and buildings strategy aims to bring parity with gas boilers by 2030, transitioning towards technologies like air source heat pumps to support the net zero strategy. However, challenges and limitations exist in relying solely on heat pumps for off-gas grid homes, particularly in rural and coastal areas. Funding is needed to replace outdated heating systems in historic churches with ground-source heat pumps. The speakers mentioned include G

# Part 2: Implementation of RAGAS for evaluating the attend_prompt and summary_prompt

## RAGAS evaluation for the attend_prompt

In [18]:
#If available, provide the ground truth for your question(s): one entry per document, in the same order as the documents in df_speeches.
ground_truth_= ['the heat pump is a competitive option in terms of price. that is why we think the cost of heat pumps can reduce by 25% to 50% by 2025. we have our £450 million boiler upgrade scheme to provide capital grants of up to £6,000, and that is in addition to the zero per cent rate of vat on installation. This speech was held in 2022 by Greg Hands. Party: None',
                      'i was at an excellent meeting on monday morning organised by islington council to launch its brilliant green agenda. it will mean better insulation in homes; transport initiatives; using waste heat from an underground station as part of a district heating scheme; using waste heat from a stepped down transformer owned by the national grid to heat a school and neighbouring properties; and installing a heat pump in a community centre to meet the passive house standard. there will be no hiding place, however rich we might be. This speech was held in 2021 by Jeremy Corbyn. Party: None',
                      'the hon. gentleman is right to say that we need to be moving towards technologies such as air source heat pumps. that is why our heat and buildings strategy sets out a plan to bring parity with gas boilers by 2030. that is precisely what we want to see, because those costs need to come down, and that is what we are enabling through our net zero strategy. This speech was held in 2022 by Simon Clarke. Party: None',
                      'if that was to be replaced with a ground-source heat pump, that would cost in excess of £750,000. what can my hon. friend suggest to help the church? This speech was held in 2022 by Nicholas Fletcher. Party: None',
                      'we have 1.7 million homes in this country that are currently off the gas grid, most of which use kerosene at the moment. under the current government plan, which is born out of a strategy that dates all the way back to 2017—several governments ago—the intention is that all those 1.7 million homes would be banned from having a replacement boiler after 2026 and told that, instead, they must have, effectively, either an air source heat pump or a ground source heat pump. as the right hon. member for leeds central (hilary benn) said, there is a role for those heat pumps, but they are not for every home. in particular, in rural and especially coastal areas, air source heat pumps can be prone to rusting and decay. it is also the case that they need a lot of insulation to make them work, and, in some older homes, high levels of insulation mean less ventilation, which can lead to problems with damp, mould and all of the health problems that go with that. perhaps, more important than anything, the capital cost of these air source or ground source heat pumps for a single property is around four times that of a conventional boiler. This speech was held in 2023 by George Eustice. Party: None']

In [19]:
#Define a function you can use to create the datasets RAGAS needs to calculate the metrics scores. 
def create_dataset(prompts, llm_responses, context, ground_truth):
    """
    Creates one single-row dataset per document to evaluate the accuracy of the attend_prompt.

    Parameters:
    - prompts (list of str): The list of questions asked within the attend_prompt.
    - llm_responses (list[list(str)]): A nested list where every inner list contains the
      LLM responses to each question asked within the attend_prompt for one document.
    - context (iterable of str): The documents that provided the information to the LLM,
      in the same order as `llm_responses` (e.g. a list or a DataFrame column).
    - ground_truth (iterable): The ground truth for each document, in the same order as
      `llm_responses`. Each entry is converted to a string.

    Returns:
    - list_datasets: A list of `Dataset` objects where each object contains a single data sample
      with the questions, generated answers, context, and ground truth for one specific document.

    Raises:
    - IndexError: If `context` or `ground_truth` has fewer entries than `llm_responses`.
    """
    # Materialise both inputs so they are addressed by position. The notebook passes a
    # DataFrame column as `context`; indexing a pandas Series with an integer is a label
    # lookup, which fails or selects the wrong row when the index is not 0..n-1.
    context_by_position = list(context)
    ground_truth_by_position = list(ground_truth)

    list_datasets = []
    for position, answers in enumerate(llm_responses):
        # Every value is wrapped in a list because each dataset holds exactly one row.
        data_sample = {
            'question': [prompts],
            'answer': [answers],
            'contexts': [[f"{context_by_position[position]}"]],
            'ground_truth': [[f"{ground_truth_by_position[position]}"]],
        }
        list_datasets.append(Dataset.from_dict(data_sample))

    return list_datasets

In [20]:
# Create one data sample per document for evaluating the accuracy of your attend_prompt.
# The documents, the LLM answers in key_info and the ground truth entries must be in the same order.
datasets_list = create_dataset(prompts, key_info, df_speeches['speech'], ground_truth_)

In [25]:
# Set up the Langfuse environment.
# Shell command: print the Langfuse host configured in the environment as a sanity check.
!echo $LANGFUSE_HOST

from langfuse import Langfuse

# No arguments are passed; presumably host and credentials are read from environment
# variables (loaded earlier via dotenv) -- TODO confirm.
langfuse = Langfuse()

https://langfuse.dap-tools.uk


In [26]:
# Import the metrics from RAGAS that you think are relevant for evaluation.
from ragas.metrics import faithfulness, answer_relevancy, context_precision

# Notebook-level list of metrics: init_ragas_metrics is called with it and
# score_with_ragas iterates over it.
metrics = [faithfulness, answer_relevancy, context_precision]

In [27]:
#Define a helper that attaches the LLM and embedding models to each RAGAS metric and initialises it
from ragas.run_config import RunConfig
from ragas.metrics.base import MetricWithLLM, MetricWithEmbeddings

def init_ragas_metrics(metrics, llm, embedding):
    """
    Attaches the given models to each metric and initialises the metric.

    Parameters:
    - metrics (list): The RAGAS metric objects to prepare.
    - llm: The LLM assigned to every metric that is a `MetricWithLLM`.
    - embedding: The embedding model assigned to every metric that is a `MetricWithEmbeddings`.

    Returns:
    - None. The metric objects are modified in place.
    """
    for ragas_metric in metrics:
        # A metric can need an LLM, embeddings, both, or neither.
        if isinstance(ragas_metric, MetricWithLLM):
            ragas_metric.llm = llm
        if isinstance(ragas_metric, MetricWithEmbeddings):
            ragas_metric.embeddings = embedding
        # Each metric is initialised with its own default run configuration.
        ragas_metric.init(RunConfig())

In [28]:
# Run this cell to create the LLM and embedding models and attach them to the RAGAS metrics.
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

# wrappers
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Both models are created without arguments, so the library defaults apply.
llm = ChatOpenAI()
emb = OpenAIEmbeddings()

# Wrap the LangChain models for RAGAS and hand them to every metric in `metrics`.
init_ragas_metrics(
    metrics,
    llm=LangchainLLMWrapper(llm),
    embedding=LangchainEmbeddingsWrapper(emb),
)

In [29]:
#Define the function that calculates the RAGAS metrics that you have imported.
async def score_with_ragas(question, speech, answer):
    """
    Scores one question/answer pair with every metric in the notebook-level `metrics` list.

    The metrics are awaited one after another, and a progress line is printed before each one.

    Parameters:
    - question: The question that was asked.
    - speech: The context(s) the answer was based on; passed to the metrics as "contexts".
    - answer: The LLM response to evaluate.

    Returns:
    - dict: Maps each metric's name to the score returned by its `ascore` coroutine.
    """
    results = {}
    for metric in metrics:
        print(f"calculating {metric.name}")
        # A fresh row is built for every metric.
        sample_row = {"question": question, "contexts": speech, "answer": answer}
        results[metric.name] = await metric.ascore(row=sample_row)
    return results

# Present the Data to the RAGAS function to calculate metric scores

In [30]:
"""
Present your data to RAGAS and calculate the RAGAS scores
Note: This cell calculates the RAGAS score for one question asked in the attend_prompt. Hence, to evaluate the LLM responses for every question
within the attend_prompt you need to loop over the list that stores the questions.
"""
# Each entry of datasets_list is a single-row Dataset for one document.
for item in datasets_list:
    
    # The only row of the dataset, as a dict of column values.
    information = item[0]
    
    # Only the first question and its answer are evaluated here.
    question = information['question'][0]
    llm_response = information['answer'][0]
    # The contexts value is a list holding the document text.
    speech = information['contexts']
   
    # Top-level await works because the notebook runs inside an event loop.
    # Note: ground_truth is not forwarded, so context_precision is scored without it.
    ragas_scores = await score_with_ragas(question, speech, llm_response)
    print(ragas_scores)


calculating faithfulness
calculating answer_relevancy


Using 'context_precision' without ground truth will be soon depreciated. Use 'context_utilization' instead


calculating context_precision
{'faithfulness': 0.3333333333333333, 'answer_relevancy': 0.9802587929652528, 'context_precision': 0.9999999999}
calculating faithfulness
calculating answer_relevancy


Using 'context_precision' without ground truth will be soon depreciated. Use 'context_utilization' instead


calculating context_precision
{'faithfulness': 0.6666666666666666, 'answer_relevancy': 0.9428754566110783, 'context_precision': 0.9999999999}
calculating faithfulness
calculating answer_relevancy


Using 'context_precision' without ground truth will be soon depreciated. Use 'context_utilization' instead


calculating context_precision
{'faithfulness': 0.8333333333333334, 'answer_relevancy': 0.906619101794445, 'context_precision': 0.9999999999}
calculating faithfulness
calculating answer_relevancy


Using 'context_precision' without ground truth will be soon depreciated. Use 'context_utilization' instead


calculating context_precision
{'faithfulness': 1.0, 'answer_relevancy': 0.9802587929652528, 'context_precision': 0.9999999999}
calculating faithfulness
calculating answer_relevancy


Using 'context_precision' without ground truth will be soon depreciated. Use 'context_utilization' instead


calculating context_precision
{'faithfulness': 0.6666666666666666, 'answer_relevancy': 0.8441707874644511, 'context_precision': 0.9999999999}


# RAGAS Summarisation Score

In [31]:
import tiktoken
# Load the GPT-3.5 tokenizer
# NOTE(review): `encoding` is not used by any of the cells shown here -- presumably meant
# for checking token counts; confirm it is still needed.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [32]:
# Flatten the nested key information into one string; every answer is followed by a single space.
key_info_string = "".join(
    answer + " "
    for document_answers in key_info
    for answer in document_answers
)

In [33]:
# Calculate the RAGAS summarisation score for the generated summary.
import nest_asyncio
from datasets import Dataset 
from ragas.metrics import summarization_score
from ragas import evaluate

# Allow nested event loops so that evaluate() can run inside the notebook's running loop.
nest_asyncio.apply()

# A single sample: the flattened key information is the context, the LLM summary is evaluated against it.
data_samples = {
    'contexts' : [[key_info_string]],
    'summary': [summary_key_info]
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[summarization_score])
# Show the per-sample result as a DataFrame.
score.to_pandas()

Evaluating: 100%|██████████| 1/1 [00:08<00:00,  8.69s/it]


Unnamed: 0,contexts,summary,summary_score
0,[The context in which heat pumps are mentioned...,The information provided includes speeches by ...,0.619294
