# Notebook for going through langchain tutorial

## Import and Loading Necessary Stuff

In [6]:
from dotenv import load_dotenv
import os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

import requests
from bs4 import BeautifulSoup

load_dotenv()

True

## Test getting response

In [7]:
llm = ChatOpenAI()

In [49]:
llm.invoke("Who is Kuma's female friend in One Piece?")

AIMessage(content="Kuma's female friend in One Piece is Nico Robin.")

## Scraping from One Piece Fandom

In [9]:
import requests
#Function for scraping one page, take in the url and also the interesting section
response = requests.get('https://onepiece.fandom.com/wiki/Reverse_Mountain_Arc')

page_soup = BeautifulSoup(response.text, 'html.parser')

In [10]:
def extract_text_from_element(element, prefix = ''):
    """Recursively extracts text from paragraphs and unordered lists."""
    text_content = ''
    if element.name == 'p':
        text_content += prefix + element.get_text().strip() + '\n\n'
    elif element.name == 'ul':
        for item in element.find_all('li', recursive=False):  # Direct children only to handle nesting
            
            item_text = ''.join(str(child) for child in item.children if child.name != 'ul')
            item_soup = BeautifulSoup(item_text, 'html.parser')
            text_content += prefix + '- ' + item_soup.get_text().strip() + '\n'
            # Check for nested <ul> within this <li>
            nested_ul = item.find('ul')
            if nested_ul:
                text_content += extract_text_from_element(nested_ul, prefix = '\t'+prefix)
    return text_content

In [113]:
import re
def scrape_page(url, topic = ['Summary', 'Story Impact']):
    """Scrapes paragraphs and nested unordered lists under a specific header and saves to a text file."""
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        page_title = soup.find('h1', {'class': "page-header__title"})
        if page_title:
            page_title = page_title.get_text().strip()
            
        headers = soup.find_all('h2')  # Adjust this to target the specific header you're interested in
        
        if headers:
            titles = [header.get_text().strip().replace('[', '').replace(']', '') for header in headers]
            content = '' #f'{page_title}\n\n\n\n'
            if ('Long Summary' in topic) and ('Long Summary' not in titles):
                if 'Summary' in titles:
                    topic = ['Summary'] + topic
                elif 'Short Summary' in titles:
                    topic = ['Short Summary'] + topic
                    
            for j, header in enumerate(headers):
                title = header.get_text().strip().replace('[', '').replace(']', '')
                # print(title)
                next_header = headers[j+1] if j+1 < len(headers) else None
                if title in topic:
                    content += title+'\n\n\n'
                    # Iterate over siblings of the header that are either <p> or <ul>, including nested <ul>
                    for sibling in header.find_next_siblings():
                        if sibling == next_header:
                            break
                        content += extract_text_from_element(sibling)

                
            # Save the content to a file
            # with open(file_name, 'w', encoding='utf-8') as file:
            #     file.write(content)
            content = re.sub(r'\[.*?\]', '', content) 
            return page_title, content
        else:
            return 'Header not found'
    else:
        return 'Failed to retrieve the linked page'

In [16]:
print(scrape_page('https://onepiece.fandom.com/wiki/Ace%27s_Great_Blackbeard_Search', topic=['Long Summary'])[1])

Long Summary


On an island somewhere in the Grand Line, Ace runs away from a restaurant after eating without paying, with the chef he ran from giving chase. When he reaches the dock, he stops to ask people for information on Blackbeard. However, his searching only leads him to find a man known as Dr. Black Beard. Not realizing that he has the wrong person, Ace kicks Dr. Black Beard, which causes the angry villagers to literally kick him into the river. Being a Devil Fruit user, Ace nearly drowns, but after going downstream, he is rescued by a young girl named Moda. She then takes the unconscious pirate to her house on her family's dairy farm, where he eventually wakes up. Ace and Moda officially meet, and Moda asks him to deliver a letter for her. In exchange for saving his life, Ace agrees to deliver a letter.

Ace goes to the G-2 Marine base, where the letter is supposed to be delivered. He swiftly defeats a Marine soldier and takes his uniform as a disguise to infiltrate the base. 

In [114]:
url = 'https://onepiece.fandom.com/wiki/Chapters_and_Volumes'
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

In [115]:
table = soup.find('table', {'class': 'wikitable'})

scraped_contents = []

if table:
    #Iterate through each row
    for i, row in enumerate(table.find_all('tr')):
        if i in [0,1, 8, 9]:
            continue
            
        for cell in row.find_all(['th', 'td']):
            cell_text = cell.get_text().strip()
            print(cell_text)
            a_tag = cell.find('a')
            if a_tag and a_tag.has_attr('href'):
                href = a_tag['href']
                full_url = requests.compat.urljoin(response.url, href)  # Create full URL if necessary
                print(f'Link: {full_url}')
                scraped_content = scrape_page(full_url)
                scraped_content = (scraped_content[0], f'{cell_text}\n\n\n'+scraped_content[1])
                scraped_contents.append(scraped_content)
                # Now scrape the linked page for its title or other information
        print('--- End of Row ---')

#Next, find all the mini-series
h4s = soup.find_all('h4')

other_topics = ['Short-Term Focused Cover Page Serials']
for h4 in h4s:
    #Iterate through to find mini series
    title = h4.get_text().strip().replace('[', '').replace(']', '')
    if title in other_topics:
        siblings = h4.find_next_siblings()

        for sibling in siblings:
            if sibling.name == 'ol':
                ls = sibling.find_all('li')
                for l in ls:
                    a_tag = l.find('a')
                    cell_text = l.get_text().strip()
                    if a_tag and a_tag.has_attr('href'):
                        href = a_tag['href']
                        full_url = requests.compat.urljoin(response.url, href)  # Create full URL if necessary
                        print(f'Link: {full_url}')
                        scraped_content = scrape_page(full_url, topic = ['Long Summary', 'Story Impact'])
                        scraped_content = (scraped_content[0], f'{cell_text}\n\n\n'+scraped_content[1])
                        scraped_contents.append(scraped_content)
                break   

Romance Dawn Arc(Chapters 1 to 7, Volume 1)
Link: https://onepiece.fandom.com/wiki/Romance_Dawn_Arc
Reverse Mountain Arc(Chapters 101 to 105, Volume 12)
Link: https://onepiece.fandom.com/wiki/Reverse_Mountain_Arc
Jaya Arc(Chapters 218 to 236, Volumes 24 and 25)
Link: https://onepiece.fandom.com/wiki/Jaya_Arc
Long Ring Long Land Arc(Chapters 303 to 321, Volumes 32 to 34)
Link: https://onepiece.fandom.com/wiki/Long_Ring_Long_Land_Arc
Thriller Bark Arc(Chapters 442 to 489, Volumes 46 to 50)
Link: https://onepiece.fandom.com/wiki/Thriller_Bark_Arc
Sabaody Archipelago Arc(Chapters 490 to 513, Volumes 50 to 53)
Link: https://onepiece.fandom.com/wiki/Sabaody_Archipelago_Arc
--- End of Row ---
Orange Town Arc(Chapters 8 to 21, Volumes 1 to 3)
Link: https://onepiece.fandom.com/wiki/Orange_Town_Arc
Whisky Peak Arc(Chapters 106 to 114, Volumes 12 and 13)
Link: https://onepiece.fandom.com/wiki/Whisky_Peak_Arc
Skypiea Arc(Chapters 237 to 302, Volumes 26 to 32)
Link: https://onepiece.fandom.com/wiki

  item_soup = BeautifulSoup(item_text, 'html.parser')



Impel Down Arc(Chapters 525 to 549, Volumes 54 to 56)
Link: https://onepiece.fandom.com/wiki/Impel_Down_Arc
--- End of Row ---
Baratie Arc(Chapters 42 to 68, Volumes 5 to 8)
Link: https://onepiece.fandom.com/wiki/Baratie_Arc
Drum Island Arc(Chapters 130 to 154, Volumes 15 to 17)
Link: https://onepiece.fandom.com/wiki/Drum_Island_Arc

Post-Enies Lobby Arc(Chapters 431 to 441, Volumes 45 and 46)
Link: https://onepiece.fandom.com/wiki/Post-Enies_Lobby_Arc

Marineford Arc(Chapters 550 to 580, Volumes 56 to 59)
Link: https://onepiece.fandom.com/wiki/Marineford_Arc
--- End of Row ---
Arlong Park Arc(Chapters 69 to 95, Volumes 8 to 11)
Link: https://onepiece.fandom.com/wiki/Arlong_Park_Arc
Arabasta Arc(Chapters 155 to 217, Volumes 17 to 24)
Link: https://onepiece.fandom.com/wiki/Arabasta_Arc



Post-War Arc(Chapters 581 to 597, Volumes 59 to 61)
Link: https://onepiece.fandom.com/wiki/Post-War_Arc
--- End of Row ---
Loguetown Arc(Chapters 96 to 100, Volumes 11 and 12)
Link: https://onepiece.f

In [None]:
separation, from the decks of the word, hatchan, gedatsu, miss goldenweek

In [116]:
save_directory ='database'

for title, content in scraped_contents:

    with open(f'{save_directory}/{title}.txt', 'w') as f:
        f.write(content)

## Create Vector Store with the scraped database

In [117]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

loader = DirectoryLoader('database/', glob="*.txt", loader_cls=TextLoader, show_progress=True)
docs = loader.load()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49/49 [00:00<00:00, 439.00it/s]


In [118]:
#Add Meta Data
from tqdm import tqdm
for doc in tqdm(docs):
    doc.metadata["title"] = re.split("/|.txt", doc.metadata["source"])[1]
    doc.metadata["Chapters"] = re.search("Chapters \d+ to (?:\d+|Current)", doc.page_content)[0]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49/49 [00:00<00:00, 171984.01it/s]


## Store in Vector DataBase

In [20]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [39]:
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/e5-small-v2", 
    model_kwargs={"device":"cpu"},
    encode_kwargs={"normalize_embeddings":True, "batch_size":128,}
)

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [119]:
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.retrievers import ParentDocumentRetriever

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1024, chunk_overlap = 0)

split_documents = text_splitter.split_documents(docs)
# db = FAISS.from_documents(split_documents, embeddings)

In [120]:
split_documents[0].metadata

{'source': "database/Ace's Great Blackbeard Search.txt",
 'title': "Ace's Great Blackbeard Search",
 'Chapters': 'Chapters 272 to 305'}

## Save Chunks

In [121]:
os.makedirs('chunks', exist_ok = True)
all_titles = set([doc.metadata['title'] for doc in split_documents])
title_count = {t: 0 for t in all_titles}

import json

for doc in tqdm(split_documents):
    page_content = doc.page_content
    metadata = doc.metadata

    title = metadata['title']
    filename = f'{title}-{title_count[title]}.json'

    with open(f'chunks/{filename}', 'w') as f:
        json.dump({'content': page_content, 'metadata': metadata}, f)

    title_count[title] += 1

    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2283/2283 [00:04<00:00, 483.20it/s]


In [122]:
title_count

{'Baratie Arc': 12,
 "Enel's Great Space Operations": 6,
 'Fish-Man Island Arc': 256,
 'Punk Hazard Arc': 155,
 'Return to Sabaody Arc': 33,
 'Marineford Arc': 117,
 'Egghead Arc': 228,
 "Ace's Great Blackbeard Search": 4,
 'Romance Dawn Arc': 25,
 'Water 7 Arc': 48,
 "Caribou's Kehihihihi in the New World": 12,
 'Levely Arc': 18,
 "Jango's Dance Paradise": 4,
 'Impel Down Arc': 118,
 'Post-War Arc': 88,
 'Jaya Arc': 16,
 'Syrup Village Arc': 13,
 'Amazon Lily Arc': 49,
 'Reverse Mountain Arc': 8,
 'Whole Cake Island Arc': 92,
 'From the Decks of the World': 7,
 'Solo Journey of Jinbe, Knight of the Sea': 5,
 "Germa 66's Ahh... An Emotionless Excursion": 7,
 'Thriller Bark Arc': 74,
 "Wapol's Omnivorous Hurrah": 3,
 'Drum Island Arc': 11,
 "Hatchan's Sea-Floor Stroll": 1,
 'Dressrosa Arc': 145,
 'From the Decks of the World: The 500,000,000 Man Arc': 5,
 'Skypiea Arc': 38,
 'Little Garden Arc': 12,
 "Gedatsu's Accidental Blue-Sea Life": 1,
 'Enies Lobby Arc': 58,
 'Post-Enies Lobby Arc

In [102]:
#Try searching with vector
query = "What was the name finishing move Luffy used against Kaido?"
embed_query = embeddings.embed_query(query)
similar_docs = db.similarity_search_by_vector(embed_query, 20)

print(similar_docs[0])

page_content="Just as the Emperor thought he had dealt with all his immediate enemies, Luffy rose again while revealing the epiphany he gained from taking Kaidou's last attack, that Haoshoku Haki can also be infused into attacks. Kaidou laughed and then attacked, while stating that only the mightiest can use such a technique. However, Luffy instantly countered, using Haoshoku-infused emissive Haki attacks, chaining a kick which knocked away the kanabo, a punch to Kaidou's gut followed by an uppercut, which floored the Emperor. Luffy then thanked Zoro and Law, telling them to go down, while he himself would take on Kaidou alone. The two clashed, both using Haoshoku Haki in their attacks. Unfortunately, due to Luffy's inexperience in using Haoshoku Haki in this way, he soon lost and was sent plummeting into the sea, with Kaidou commenting that he got carried away and should have cut of his head, since without it he couldn't break the morale of the rebels." metadata={'source': 'database/W

In [85]:
from langchain.chains.combine_documents import create_stuff_documents_chain

prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}""")

document_chain = create_stuff_documents_chain(llm, prompt)

In [96]:
from langchain.chains import create_retrieval_chain

retriever = db.as_retriever(search_kwargs={"k": 5}, search_type='mmr')
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [104]:
response = retrieval_chain.invoke({"input": "What was the name finishing move Luffy used against Kaido?"})
print(response["answer"])

The finishing move Luffy used against Kaidou was Gomu Gomu no Kong Gun.


## History-aware chatbots

In [57]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

# First we need a prompt that we can pass into an LLM to generate this search query

prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
    ("user", '''Given the above conversation, generate a search query to look up in order to get information relevant to the conversation''')
])
retriever_chain = create_history_aware_retriever(llm, retriever, prompt)

In [74]:
from langchain_core.messages import HumanMessage, AIMessage

# chat_history = []
response = retriever_chain.invoke({
    "chat_history": chat_history,
    "input": "Name every current Straw Hat Pirates as of your latest knowledge"
})

In [63]:
prompt = ChatPromptTemplate.from_messages([
    ("system", '''Answer the user's questions based on the below context:\n\n{context}'''),
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
])
document_chain = create_stuff_documents_chain(llm, prompt)

retrieval_chain = create_retrieval_chain(retriever_chain, document_chain)

In [81]:
# chat_history = [HumanMessage(content="What is One Piece? and What is the latest arc of it?"), AIMessage(content="One Piece is a manga and anime series created by Eiichiro Oda. The latest arc of One Piece is the Egghead Arc, which follows the aftermath of the Levely and the Raid on Onigashima, leading to major shifts across the world that could potentially lead to a global war.")]
response = retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": "How did Luffy beat Kaido?",
    # "input": ""
})

print(response['answer'])
chat_history.append(AIMessage(content = response['answer']))

In the context provided, there is no information about Luffy defeating Kaido. The details shared do mention the potential danger Luffy poses to the World Government if he were to succeed in defeating Kaido, but there is no specific account of Luffy beating Kaido.


In [78]:
llm.invoke('How did Luffy beat Kaido?')

AIMessage(content='As of now in the One Piece manga and anime, Luffy has not yet defeated Kaido. The battle between Luffy and Kaido is still ongoing, and the outcome has not been revealed yet.')