<a href="https://colab.research.google.com/github/nifemi-alonge/cp30_llm_app/blob/main/CP30_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clean Power 2030 LLM

In [None]:
!pip install langchain_community
!pip install pypdf
!pip install --upgrade --quiet  langchain langchain-huggingface sentence_transformers

In [39]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
import requests
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

## Load Doc

In [3]:
# use langchain to load data
loader = WebBaseLoader("https://www.gov.uk/government/publications/clean-power-2030-action-plan/clean-power-2030-action-plan-a-new-era-of-clean-electricity-main-report")

In [None]:
docs = loader.load()
docs[0]

In [5]:
print(docs[0].metadata)

{'source': 'https://www.gov.uk/government/publications/clean-power-2030-action-plan/clean-power-2030-action-plan-a-new-era-of-clean-electricity-main-report', 'title': '\n      Clean Power 2030 Action Plan: A new era of clean electricity – main report - GOV.UK\n  ', 'language': 'en'}


In [None]:
print(docs[0].page_content)

In [None]:
# document = docs[0].page_content

# text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
#     encoding_name="cl100k_base", chunk_size=100, chunk_overlap=0
# )
# texts = text_splitter.split_text(document)

## Split Text

In [6]:
article_text = docs[0].page_content

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 105,
    chunk_overlap  = 20,
    length_function = len,
)

texts = text_splitter.create_documents([article_text])
len(texts)

3984

In [7]:
print(texts[0].page_content)
print(texts[400].page_content)

print(texts[0])

Clean Power 2030 Action Plan: A new era of clean electricity – main report - GOV.UK
to put an end to interminable delays in the system that mean it can take over a decade to develop and
page_content='Clean Power 2030 Action Plan: A new era of clean electricity – main report - GOV.UK'


## Create embeddings
- Using Hugging Face rather than OpenAI (paid)

In [63]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [55]:
print(texts[0].page_content)
embedding = embeddings.embed_query(texts[0].page_content)
len(embedding)

Clean Power 2030 Action Plan: A new era of clean electricity – main report - GOV.UK


768

In [None]:
# test
embeddings.embed_documents([texts[0].page_content])['data'][0]['embedding']

In [40]:
# get page content for all chunks
# append to list
# convert to df
text_chunks=[]

for text in texts:
    text_chunks.append(text.page_content)

df = pd.DataFrame({'text_chunks': text_chunks})

In [42]:
df.head()

Unnamed: 0,text_chunks
0,Clean Power 2030 Action Plan: A new era of cle...
1,Cookies on GOV.UK
2,We use some essential cookies to make this web...
3,We’d like to set additional cookies to underst...
4,and improve government services.


In [67]:
# function to remove n
# get HF embeddings for inputted text
def get_embedding(text):
   text = text.replace("n", " ")
   return embeddings.embed_documents([text])[0]

In [68]:
# 12 mins to run
# create new column in df with embedding
# pply get_embedding function to each row of df[text chunks]
df['hf_embedding'] = df.text_chunks.apply(lambda x: get_embedding(x))

In [69]:
df

Unnamed: 0,text_chunks,hf_embedding
0,Clean Power 2030 Action Plan: A new era of cle...,"[-0.030115395784378052, -0.04793635383248329, ..."
1,Cookies on GOV.UK,"[0.017904821783304214, 0.014801948331296444, 0..."
2,We use some essential cookies to make this web...,"[0.0728289857506752, -0.0019853487610816956, -..."
3,We’d like to set additional cookies to underst...,"[0.014657448045909405, 0.0035390127450227737, ..."
4,and improve government services.,"[-0.008386549539864063, 0.08302715420722961, -..."
...,...,...
3979,Get involved\n\n\n\n\n\n\n\nSupport links\n\n\...,"[0.04266664385795593, -0.018685631453990936, -..."
3980,Contact\n\n\nTerms and conditions\n\n\nRhestr ...,"[0.004223769996315241, -0.06463567167520523, 0..."
3981,All content is available under the Open Govern...,"[0.019884690642356873, -0.00431450642645359, 0..."
3982,where otherwise stated,"[0.03201941400766373, -0.0055270507000386715, ..."


In [71]:
# get embedding for user question
users_question = "What is Clean Power 2030?"

question_embedding = get_embedding(text=users_question)

# create a list to store the calculated cosine similarity
cos_sim = []

for index, row in df.iterrows():
   A = row.hf_embedding
   B = question_embedding

   # calculate the cosine similarity
   cosine = np.dot(A,B)/(norm(A)*norm(B))

   cos_sim.append(cosine)

df["cos_sim"] = cos_sim
df.sort_values(by=["cos_sim"], ascending=False)

Unnamed: 0,text_chunks,hf_embedding,cos_sim
2991,support scheme for LDES:,"[-0.007200377527624369, 0.005594766233116388, ...",0.680683
3925,LDES in Table 1. ↩,"[-0.025736693292856216, -0.03171781450510025, ...",0.627822
3023,Innovation can make an important contribution ...,"[-0.005140246357768774, -0.04928424954414368, ...",0.511000
2999,NESO have agreed to provide advice on the rang...,"[0.040656134486198425, -0.038679443299770355, ...",0.503868
2977,well as providing electricity during protracte...,"[-0.053964968770742416, 0.01499880850315094, -...",0.491703
...,...,...,...
2450,"Currently, there is 4.5 GW of battery storage ...","[0.008823340758681297, -0.06266471743583679, 0...",-0.012697
2708,is 12-14GW. This shows there is an additional ...,"[0.01988491415977478, -0.05874556675553322, 0....",-0.016174
540,in a typical weather year:,"[-0.033082135021686554, 0.005593315698206425, ...",-0.020624
2847,billion tonnes of theoretical CO2 storage capa...,"[-0.015259237959980965, -0.039938587695360184,...",-0.036148


Source:
- Main Code guide: https://towardsdatascience.com/all-you-need-to-know-to-build-your-first-llm-app-eb982c78ffac/#4cff
- Doc for training: https://www.gov.uk/government/publications/clean-power-2030-action-plan
- Langchain Docs: https://python.langchain.com/docs/introduction/
- LangChain Hugging Face Docs: https://python.langchain.com/docs/integrations/text_embedding/huggingfacehub/