<a href="https://colab.research.google.com/github/nifemi-alonge/cp30_llm_app/blob/main/CP30_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clean Power 2030 LLM

In [None]:
!pip install langchain_community
!pip install pypdf
!pip install --upgrade --quiet  langchain langchain-huggingface sentence_transformers huggingface_hub
!pip install chromadb

Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain_community)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 k

In [None]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
import requests
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from huggingface_hub import InferenceClient



In [None]:
# Read the Hugging Face token from the environment rather than pasting it into
# the notebook, so the secret is never saved or committed with the file.
# Falls back to an empty string (the previous placeholder value) when unset.
hugging_face_api_token = (
    os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    or os.environ.get("HF_TOKEN", "")
)

## Load Doc

In [None]:
# Point LangChain's web loader at the Clean Power 2030 main report on GOV.UK
loader = WebBaseLoader(
    "https://www.gov.uk/government/publications/clean-power-2030-action-plan/clean-power-2030-action-plan-a-new-era-of-clean-electricity-main-report"
)

In [None]:
# Run the loader; `docs` holds whatever documents it returns for the URL
docs = loader.load()
# Display the first loaded document as the cell output
docs[0]



In [None]:
# Show the metadata attached to the first document (source URL, title, language)
print(docs[0].metadata)

{'source': 'https://www.gov.uk/government/publications/clean-power-2030-action-plan/clean-power-2030-action-plan-a-new-era-of-clean-electricity-main-report', 'title': '\n      Clean Power 2030 Action Plan: A new era of clean electricity – main report - GOV.UK\n  ', 'language': 'en'}


In [None]:
# Print the full text of the first document (not truncated)
print(docs[0].page_content)

In [None]:
# document = docs[0].page_content

# text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
#     encoding_name="cl100k_base", chunk_size=100, chunk_overlap=0
# )
# texts = text_splitter.split_text(document)

## Split Text

In [None]:
# Text of the loaded report page
article_text = docs[0].page_content

# Character-based recursive splitter: 105-character chunks with a 20-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=105, chunk_overlap=20, length_function=len)

# Wrap each chunk in a document object and report how many were produced
texts = text_splitter.create_documents([article_text])
len(texts)

3984

In [None]:
# Spot-check the split: text of the first chunk and of chunk 400
print(texts[0].page_content)
print(texts[400].page_content)

# Printing the chunk object itself (rather than .page_content) shows its repr
print(texts[0])

Clean Power 2030 Action Plan: A new era of clean electricity – main report - GOV.UK
to put an end to interminable delays in the system that mean it can take over a decade to develop and
page_content='Clean Power 2030 Action Plan: A new era of clean electricity – main report - GOV.UK'


## Create embeddings
- Using Hugging Face rather than OpenAI (paid)

In [None]:
# Embedding model used for both the text chunks and the user's question
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [None]:
# print(texts[0].page_content)
# embedding = embeddings.embed_query(texts[0].page_content)
# len(embedding)

In [None]:
# test
# embeddings.embed_documents([texts[0].page_content])['data'][0]['embedding']

In [None]:
# Collect the raw text of every chunk and put it in a one-column DataFrame
text_chunks = [chunk.page_content for chunk in texts]

df = pd.DataFrame({'text_chunks': text_chunks})

In [None]:
# Preview the first rows of the chunk table
df.head()

Unnamed: 0,text_chunks
0,Clean Power 2030 Action Plan: A new era of cle...
1,Cookies on GOV.UK
2,We use some essential cookies to make this web...
3,We’d like to set additional cookies to underst...
4,and improve government services.


In [None]:
# Replace newlines with spaces, then get the HF embedding for the given text
def get_embedding(text):
    """Return the Hugging Face embedding vector for a single piece of text.

    Newline characters are replaced with spaces before embedding.

    Args:
        text: The string to embed.

    Returns:
        The first (and only) vector returned by ``embeddings.embed_documents``
        for the cleaned text.
    """
    # Must be "\n" (newline), not the letter "n": replacing "n" turned e.g.
    # "Clean" into "Clea " and corrupted every text before it was embedded.
    text = text.replace("\n", " ")
    return embeddings.embed_documents([text])[0]

In [None]:
# Slow cell: about 12 minutes to run
# Apply get_embedding to each row of df['text_chunks'] and store the
# resulting vector in a new 'hf_embedding' column
df['hf_embedding'] = df['text_chunks'].apply(get_embedding)

In [None]:
# Display the chunk table, now including the embedding column
df

Unnamed: 0,text_chunks,hf_embedding
0,Clean Power 2030 Action Plan: A new era of cle...,"[-0.030115395784378052, -0.04793635383248329, ..."
1,Cookies on GOV.UK,"[0.017904821783304214, 0.014801948331296444, 0..."
2,We use some essential cookies to make this web...,"[0.0728289857506752, -0.0019853487610816956, -..."
3,We’d like to set additional cookies to underst...,"[0.014657448045909405, 0.0035390127450227737, ..."
4,and improve government services.,"[-0.008386549539864063, 0.08302715420722961, -..."
...,...,...
3979,Get involved\n\n\n\n\n\n\n\nSupport links\n\n\...,"[0.04266664385795593, -0.018685631453990936, -..."
3980,Contact\n\n\nTerms and conditions\n\n\nRhestr ...,"[0.004223769996315241, -0.06463567167520523, 0..."
3981,All content is available under the Open Govern...,"[0.019884690642356873, -0.00431450642645359, 0..."
3982,where otherwise stated,"[0.03201941400766373, -0.0055270507000386715, ..."


In [None]:
# get embedding for user question
users_question = "What is Clean Power 2030?"

question_embedding = get_embedding(text=users_question)

# Stack the per-chunk vectors into one 2-D array (one row per chunk) so the
# cosine similarity against the question is computed in a single vectorised
# step instead of a Python loop that recomputed the question norm on every row.
chunk_matrix = np.asarray(df["hf_embedding"].tolist(), dtype=float)
question_vector = np.asarray(question_embedding, dtype=float)

# cosine similarity = dot(A, B) / (|A| * |B|), evaluated for every chunk at once
cos_sim = (chunk_matrix @ question_vector) / (
    norm(chunk_matrix, axis=1) * norm(question_vector)
)

# store the scores and order the chunks from most to least similar
df["cos_sim"] = cos_sim
df = df.sort_values(by=["cos_sim"], ascending=False)
df

Unnamed: 0,text_chunks,hf_embedding,cos_sim
3475,The Clean Power 2030 Unit itself is underpinne...,"[0.01806826703250408, -0.013461786322295666, -...",0.799672
427,Impact of Clean Power 2030,"[-0.022222846746444702, 0.045123178511857986, ...",0.794216
3414,for Clean Power 2030.,"[0.0010775267146527767, 0.009139172732830048, ...",0.782303
2644,for Clean Power 2030.,"[0.0010775267146527767, 0.009139172732830048, ...",0.782303
98,Foreword from the Head of Clean Power 2030,"[-0.0007598813390359282, 0.05102364718914032, ...",0.777990
...,...,...,...
8,Accept additional cookies\nReject additional c...,"[0.002953567774966359, -0.013488790020346642, ...",0.004432
3970,Please fill in this survey (opens in a new tab...,"[0.011830737814307213, 0.04532671719789505, 0....",0.003822
1343,are fairly treated and the natural world is pr...,"[-0.004892837721854448, 0.08042200654745102, -...",-0.001005
1019,driving new investment and industry into local...,"[-0.0018168751848861575, 0.060745496302843094,...",-0.001485


## Send Question to LLM with context above

In [None]:
# authorise Hugging Face Client Interface using API key
# !hf auth login

In [None]:
# # example
# client = InferenceClient()

# completion = client.chat.completions.create(
#     model="deepseek-ai/DeepSeek-V3-0324",
#     messages=[
#         {
#             "role": "user",
#             "content": "What is clean power 2030 in the UK?"
#         }
#     ],
# )

# completion.choices[0].message['content']

In [None]:
# question = "What is Clean Power 2030 in the UK?"

# template = """Question: {question}

# Answer: Let's think step by step."""

# prompt = PromptTemplate.from_template(template)

# repo_id = "mistralai/Mistral-Nemo-Base-2407"

# llm = HuggingFaceEndpoint(
#     repo_id=repo_id,
#     temperature=0.7,
#     do_sample=False,
#     huggingfacehub_api_token=hugging_face_api_token,
#     provider="auto",
# )
# llm_chain = prompt | llm
# print(llm_chain.invoke({"question": question}))

In [None]:
# like tutorial

In [None]:
# Question sent to the LLM in the cells below
users_question = "What is the offshore wind target for the UK?"

In [None]:
# Hugging Face model served through the inference endpoint
repo_id = "mistralai/Mistral-Nemo-Base-2407"

llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    temperature=1,
    huggingfacehub_api_token=hugging_face_api_token,
    provider="auto",
)

# Baseline answer with no retrieved context, to compare before and after.
# `invoke` is the supported entry point; calling the LLM object directly
# (`llm(...)`) is deprecated in LangChain.
print(llm.invoke(users_question))

In [None]:
# Number of most-similar chunks to pass to the LLM as context
top_k = 50

# Rank the chunks against the *current* question. The existing order of `df`
# reflects whichever question was scored last (an earlier, different one), so
# slicing df[0:50] directly would supply context for the wrong question.
question_vector = np.asarray(get_embedding(users_question), dtype=float)
chunk_matrix = np.asarray(df["hf_embedding"].tolist(), dtype=float)
similarity = (chunk_matrix @ question_vector) / (
    norm(chunk_matrix, axis=1) * norm(question_vector)
)
top_chunks = (
    df.assign(cos_sim=similarity)
    .sort_values(by="cos_sim", ascending=False)
    .head(top_k)["text_chunks"]
)

# define the context for the prompt by joining the most relevant text chunks
context = " ".join(top_chunks)

# define the prompt template
template = """
You are a chat bot who loves to help people! Given the following context sections, answer the
question using only the given context. If you are unsure and the answer is not
explicitly writting in the documentation, say "Sorry, I don't know how to help with that."
Give a short response.

Context sections:
{context}

Question:
{users_question}

Answer:
"""

prompt = PromptTemplate(template=template, input_variables=["context", "users_question"])

# fill the prompt template
prompt_text = prompt.format(context = context, users_question = users_question)

# ask the LLM; `invoke` replaces the deprecated direct call `llm(...)`
llm.invoke(prompt_text)

## Speed up above with vector store

In [None]:
# using docs loaded in above
text = docs[0].page_content
# Collapse every run of whitespace (including newlines) into a single space.
# Deleting newlines outright glued together words that were separated only by
# a line break (e.g. the end of one line and the start of the next).
text = " ".join(text.split())

In [None]:
# Open 'cp30_output.txt' in write mode (UTF-8) and bind the file object to `file`
with open('cp30_output.txt', 'w', encoding='utf-8') as file:
    # Write the flattened report text to the file
    file.write(text)

In [None]:
# Read the saved report text back from disk
with open('./cp30_output.txt', encoding='utf-8') as f:
    text = f.read()

# Splitter for this section: 500-character chunks with a 100-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100, length_function=len)

# Split the text and wrap each chunk in a document object
texts = text_splitter.create_documents([text])

In [None]:
# define the embeddings model (same model name as the earlier embeddings cell;
# this rebinds `embeddings`)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# use the text chunks and the embeddings model to fill our vector store
db = Chroma.from_documents(texts, embeddings)

In [None]:
users_question = "What is the target for demand side response in 2030?"

# use our vector store to find similar text chunks
results = db.similarity_search(
    query=users_question,
    k=5
)

# Join the text of the retrieved chunks. Formatting the raw `results` list
# into the prompt would insert the repr of each Document object (class name,
# metadata, escaped quotes) rather than clean text.
context = "\n\n".join(doc.page_content for doc in results)

# define the prompt template
template = """
You are a chat bot who loves to help people! Given the following context sections, answer the
question using only the given context. If you are unsure and the answer is not
explicitly writting in the documentation, say "Sorry, I don't know how to help with that."

Context sections:
{context}

Question:
{users_question}

Answer:
"""

prompt = PromptTemplate(template=template, input_variables=["context", "users_question"])

# fill the prompt template
prompt_text = prompt.format(context = context, users_question = users_question)

# ask the defined LLM; `invoke` replaces the deprecated direct call `llm(...)`
llm.invoke(prompt_text)

Source:
- Main Code guide: https://towardsdatascience.com/all-you-need-to-know-to-build-your-first-llm-app-eb982c78ffac/#4cff
- Source document (used as retrieval context, not for model training): https://www.gov.uk/government/publications/clean-power-2030-action-plan
- Langchain Docs: https://python.langchain.com/docs/introduction/
- LangChain Hugging Face Docs: https://python.langchain.com/docs/integrations/text_embedding/huggingfacehub/ , https://python.langchain.com/docs/integrations/llms/huggingface_endpoint/