# Notebook for going through langchain tutorial

## Import and Loading Necessary Stuff

In [1]:
from dotenv import load_dotenv
import os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

load_dotenv()

True

## Test getting response

In [2]:
llm = ChatOpenAI()

In [276]:
llm.invoke("Name every arc in One Piece in order")

AIMessage(content='1. Romance Dawn\n2. Orange Town\n3. Syrup Village\n4. Baratie\n5. Arlong Park\n6. Loguetown\n7. Reverse Mountain\n8. Whiskey Peak\n9. Little Garden\n10. Drum Island\n11. Alabasta\n12. Jaya\n13. Skypiea\n14. Long Ring Long Land\n15. Water 7\n16. Enies Lobby\n17. Post-Enies Lobby\n18. Thriller Bark\n19. Sabaody Archipelago\n20. Amazon Lily\n21. Impel Down\n22. Marineford\n23. Post-War\n24. Return to Sabaody\n25. Fishman Island\n26. Punk Hazard\n27. Dressrosa\n28. Zou\n29. Whole Cake Island\n30. Reverie\n31. Wano Country')

## Scraping from One Piece Fandom

In [12]:
#Function for scraping one page, take in the url and also the interesting section
response = requests.get('https://onepiece.fandom.com/wiki/Reverse_Mountain_Arc')

page_soup = BeautifulSoup(response.text, 'html.parser')

In [81]:
def extract_text_from_element(element, prefix = ''):
    """Recursively extracts text from paragraphs and unordered lists."""
    text_content = ''
    if element.name == 'p':
        text_content += prefix + element.get_text().strip() + '\n\n'
    elif element.name == 'ul':
        for item in element.find_all('li', recursive=False):  # Direct children only to handle nesting
            
            item_text = ''.join(str(child) for child in item.children if child.name != 'ul')
            item_soup = BeautifulSoup(item_text, 'html.parser')
            text_content += prefix + '- ' + item_soup.get_text().strip() + '\n'
            # Check for nested <ul> within this <li>
            nested_ul = item.find('ul')
            if nested_ul:
                text_content += extract_text_from_element(nested_ul, prefix = '\t'+prefix)
    return text_content

In [161]:
import re
def scrape_page(url, topic = ['Summary', 'Story Impact']):
    """Scrapes paragraphs and nested unordered lists under a specific header and saves to a text file."""
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        page_title = soup.find('h1', {'class': "page-header__title"})
        if page_title:
            page_title = page_title.get_text().strip()
            
        headers = soup.find_all('h2')  # Adjust this to target the specific header you're interested in
        
        if headers:
            content = '' #f'{page_title}\n\n\n\n'
            for j, header in enumerate(headers):
                title = header.get_text().strip().replace('[', '').replace(']', '')
                next_header = headers[j+1] if j+1 < len(headers) else None
                if title in topic:
                    content += title+'\n\n\n'
                    # Iterate over siblings of the header that are either <p> or <ul>, including nested <ul>
                    for sibling in header.find_next_siblings():
                        if sibling == next_header:
                            break
                        content += extract_text_from_element(sibling)

                
            # Save the content to a file
            # with open(file_name, 'w', encoding='utf-8') as file:
            #     file.write(content)
            content = re.sub(r'\[.*?\]', '', content) 
            return page_title, content
        else:
            return 'Header not found'
    else:
        return 'Failed to retrieve the linked page'

In [162]:
print(scrape_page('https://onepiece.fandom.com/wiki/Reverse_Mountain_Arc')[1])

Summary


As the Straw Hats sail through a storm, Nami realizes that they will be going up Reverse Mountain to access the Grand Line. The crew expresses disbelief at the idea, with Zoro wondering why they can't just go south. Suddenly, the storm stops, and Nami reveals that they are in the Calm Belt, one of two strips of ocean surrounding the Grand Line, where there are no winds or currents. Nami urges the crew to row back to the storm, but the Sea Kings infesting the water then rise up. The Straw Hats struggle to get away, and once they escape, they turn their attention to successfully riding the dangerous current up Reverse Mountain. Their rudder soon breaks, but Luffy shields the Going Merry from hitting the mountain as the Straw Hats successfully sail down into the Grand Line.

When they come down, they immediately run into a giant whale named Laboon, who is blocking their path. The Merry ends up running into the whale, which breaks its masthead. Luffy punches Laboon in the eye in 

In [164]:
import requests
from bs4 import BeautifulSoup

url = 'https://onepiece.fandom.com/wiki/Chapters_and_Volumes'
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

In [174]:
table = soup.find('table', {'class': 'wikitable'})

scraped_contents = []

if table:
    #Iterate through each row
    for i, row in enumerate(table.find_all('tr')):
        if i in [0,1, 8, 9]:
            continue
            
        for cell in row.find_all(['th', 'td']):
            cell_text = cell.get_text().strip()
            print(cell_text)
            a_tag = cell.find('a')
            if a_tag and a_tag.has_attr('href'):
                href = a_tag['href']
                full_url = requests.compat.urljoin(response.url, href)  # Create full URL if necessary
                print(f'Link: {full_url}')
                scraped_content = scrape_page(full_url)
                scraped_content = (scraped_content[0], f'{cell_text}\n\n\n'+scraped_content[1])
                scraped_contents.append(scraped_content)
                # Now scrape the linked page for its title or other information
        print('--- End of Row ---')

#Next, find all the mini-series
h4s = soup.find_all('h4')

other_topics = ['Short-Term Focused Cover Page Serials']
for h4 in h4s:
    #Iterate through to find mini series
    title = h4.get_text().strip().replace('[', '').replace(']', '')
    if title in other_topics:
        siblings = h4.find_next_siblings()

        for sibling in siblings:
            if sibling.name == 'ol':
                ls = sibling.find_all('li')
                for l in ls:
                    a_tag = l.find('a')
                    cell_text = l.get_text().strip()
                    if a_tag and a_tag.has_attr('href'):
                        href = a_tag['href']
                        full_url = requests.compat.urljoin(response.url, href)  # Create full URL if necessary
                        print(f'Link: {full_url}')
                        scraped_content = scrape_page(full_url)
                        scraped_content = (scraped_content[0], f'{cell_text}\n\n\n'+scraped_content[1])
                        scraped_contents.append(scraped_content)
                break   

Romance Dawn Arc(Chapters 1 to 7, Volume 1)
Link: https://onepiece.fandom.com/wiki/Romance_Dawn_Arc
Reverse Mountain Arc(Chapters 101 to 105, Volume 12)
Link: https://onepiece.fandom.com/wiki/Reverse_Mountain_Arc
Jaya Arc(Chapters 218 to 236, Volumes 24 and 25)
Link: https://onepiece.fandom.com/wiki/Jaya_Arc
Long Ring Long Land Arc(Chapters 303 to 321, Volumes 32 to 34)
Link: https://onepiece.fandom.com/wiki/Long_Ring_Long_Land_Arc
Thriller Bark Arc(Chapters 442 to 489, Volumes 46 to 50)
Link: https://onepiece.fandom.com/wiki/Thriller_Bark_Arc
Sabaody Archipelago Arc(Chapters 490 to 513, Volumes 50 to 53)
Link: https://onepiece.fandom.com/wiki/Sabaody_Archipelago_Arc
--- End of Row ---
Orange Town Arc(Chapters 8 to 21, Volumes 1 to 3)
Link: https://onepiece.fandom.com/wiki/Orange_Town_Arc
Whisky Peak Arc(Chapters 106 to 114, Volumes 12 and 13)
Link: https://onepiece.fandom.com/wiki/Whisky_Peak_Arc
Skypiea Arc(Chapters 237 to 302, Volumes 26 to 32)
Link: https://onepiece.fandom.com/wiki

  item_soup = BeautifulSoup(item_text, 'html.parser')



Impel Down Arc(Chapters 525 to 549, Volumes 54 to 56)
Link: https://onepiece.fandom.com/wiki/Impel_Down_Arc
--- End of Row ---
Baratie Arc(Chapters 42 to 68, Volumes 5 to 8)
Link: https://onepiece.fandom.com/wiki/Baratie_Arc
Drum Island Arc(Chapters 130 to 154, Volumes 15 to 17)
Link: https://onepiece.fandom.com/wiki/Drum_Island_Arc

Post-Enies Lobby Arc(Chapters 431 to 441, Volumes 45 and 46)
Link: https://onepiece.fandom.com/wiki/Post-Enies_Lobby_Arc

Marineford Arc(Chapters 550 to 580, Volumes 56 to 59)
Link: https://onepiece.fandom.com/wiki/Marineford_Arc
--- End of Row ---
Arlong Park Arc(Chapters 69 to 95, Volumes 8 to 11)
Link: https://onepiece.fandom.com/wiki/Arlong_Park_Arc
Arabasta Arc(Chapters 155 to 217, Volumes 17 to 24)
Link: https://onepiece.fandom.com/wiki/Arabasta_Arc



Post-War Arc(Chapters 581 to 597, Volumes 59 to 61)
Link: https://onepiece.fandom.com/wiki/Post-War_Arc
--- End of Row ---
Loguetown Arc(Chapters 96 to 100, Volumes 11 and 12)
Link: https://onepiece.f

In [175]:
save_directory ='database'

for title, content in scraped_contents:

    with open(f'{save_directory}/{title}.txt', 'w') as f:
        f.write(content)

## Create Vector Store with the scraped database

In [176]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

loader = DirectoryLoader('database/', glob="*.txt", loader_cls=TextLoader, show_progress=True)
docs = loader.load()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 410.65it/s]


## Store in Vector DataBase

In [236]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [237]:
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.retrievers import ParentDocumentRetriever

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 4096, chunk_overlap = 0)

split_documents = text_splitter.split_documents(docs)
db = FAISS.from_documents(split_documents, embeddings)

In [238]:
#Try searching with vector
query = "What is Ryokugyu's power?"
embed_query = embeddings.embed_query(query)
similar_docs = db.similarity_search_by_vector(embed_query, 20)

print(similar_docs[0])

page_content="Ryokugyu landed in Udon, where he used his Mori Mori no Mi ability to cover the entire region in vegetation and attacked the Prison Mine where all the Beasts Pirates were being held and sucked out their vitality using roots he made with his ability. After calling in a battleship to Wano and commanding his subordinates to not mention this to Sakazuki, Ryokugyu then started moving onto the Flower Capital to take out Luffy, in order to gain Sakazuki's approval. However, the Red Scabbards (sans Kin'emon and Kikunojo) and Shinobu sensed him and recognized him as a Marine and an enemy of Luffy's, stood in his way. Ryokugyu then attacked them with Kinniku Mori Mori, all the while ranting how, since the country was unaffiliated to the World Government, no one on the island had any human rights and that he would have no problem slaughtering the civilians' if it meant getting to Luffy, claiming genuine support for the World Government's discriminatory politics. At this point, Yamat

In [239]:
from langchain.chains.combine_documents import create_stuff_documents_chain

prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}""")

document_chain = create_stuff_documents_chain(llm, prompt)

In [240]:
from langchain.chains import create_retrieval_chain

retriever = vector.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [205]:
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.vectorstores import Chroma

vectorstore = Chroma(
    collection_name="full_documents", embedding_function=OpenAIEmbeddings()
)

store = InMemoryStore()
parent_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=text_splitter,
)

parent_retriever.add_documents(docs, ids = None)

In [208]:
query = "What is the power of admiral Ryokugyu?"

sub_docs = vectorstore.similarity_search(query)

retrieved_docs = parent_retriever.get_relevant_documents(query)

In [210]:
parent_retrieval_chain = create_retrieval_chain(parent_retriever, document_chain)

In [241]:
response = retrieval_chain.invoke({"input": "What does Admiral Ryokugyu look like?"})
print(response["answer"])

Based on the provided context, Admiral Ryokugyu is described as having the ability to cover regions in vegetation and suck out vitality using roots. Additionally, he is mentioned using an ability called Kinniku Mori Mori. While his physical appearance is not described in detail, it can be inferred that he has the ability to manipulate vegetation and roots, and likely has a powerful and intimidating presence.


## History-aware chatbots

In [256]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

# First we need a prompt that we can pass into an LLM to generate this search query

prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
    ("user", '''Given the above conversation, 
                First determine the arc in which the last conversation is most related to, then generate a search query to look up in order to get information relevant to the arc determined.
                Sometimes, there maybe no arc to based on since the conversation is generic. In that case, just generate a search query to look up in order to get information most relevant to the chat history''')
])
retriever_chain = create_history_aware_retriever(llm, retriever, prompt)

In [257]:
from langchain_core.messages import HumanMessage, AIMessage

chat_history = [HumanMessage(content="What is One Piece? and What is the latest arc of it?"), AIMessage(content="One Piece is a manga and anime series created by Eiichiro Oda. The latest arc of One Piece is the Egghead Arc, which follows the aftermath of the Levely and the Raid on Onigashima, leading to major shifts across the world that could potentially lead to a global war.")]
response = retriever_chain.invoke({
    "chat_history": chat_history,
    "input": "Who are the main characters introduced in this arc?"
})
response

[Document(page_content="- Information about the Wano Country is revealed. As stated by Brook, the Wano Country is a closed-door kingdom who are not affiliated with the World Government, nor do they allow the Marines to go near them. Two samurais, Kin'emon and Momonosuke are introduced and become traveling companions of the Straw Hats to help save Kin'emon's ally at Dressrosa. Also, the status of the Wano Country as well as their connection to Kaidou is briefly foreshadowed in this arc which would lead to the events of the Wano Country Arc.\n\t- After introducing Momonosuke as Ki'nemon's son in this arc, the Zou Arc would later clarify Kin'emon's true relationship with Momonosuke as his retainer and not his father under the orders to have Momonosuke pretend to be his son as to not attract Kaidou or his crew.", metadata={'source': 'database/Punk Hazard Arc.txt'}),
 Document(page_content="- Monkey D. Luffy begins his journey as a pirate, setting into motion the main plot of the series.\n\

In [264]:
prompt = ChatPromptTemplate.from_messages([
    ("system", '''You are an expert in all manga One piece knowledge and trivia and you must provide an accurate answer to the user. 
                  Think step by step when answering. Gather all information from the context. If the answer is not in the context or in your knowledge. Do not try to offer false information or old information.
                  Make sure that if the user ask the same question, you must recheck your previous answer
                  :
                  
                  \n\n{context}'''),
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
])
document_chain = create_stuff_documents_chain(llm, prompt)

retrieval_chain = create_retrieval_chain(retriever_chain, document_chain)

In [285]:
# chat_history = [HumanMessage(content="What is One Piece? and What is the latest arc of it?"), AIMessage(content="One Piece is a manga and anime series created by Eiichiro Oda. The latest arc of One Piece is the Egghead Arc, which follows the aftermath of the Levely and the Raid on Onigashima, leading to major shifts across the world that could potentially lead to a global war.")]
response = retrieval_chain.invoke({
    "chat_history": chat_history,
    # "input": "Who are the main characters introduced in this arc?",
    "input": "Summarise the backstory of Kuma. Make sure you use latest information"
})

print(response['answer'])
chat_history.append(AIMessage(content = response['answer']))

Bartholomew Kuma, also known as "PX-0," was originally a human pirate who later became a Warlord of the Sea under the World Government. He was a fearsome pirate known for his strength and power. However, after the events of the Thriller Bark Arc, Kuma underwent a mysterious transformation orchestrated by the World Government.

During the events of the Sabaody Archipelago Arc, it was revealed that Kuma was turned into a Pacifista cyborg by the World Government. His consciousness and personality were removed, and he was programmed to serve the World Government as a mindless weapon. Despite losing his free will, Kuma carried out various missions for the World Government, including assisting in the capture of the Straw Hat Pirates.

Later in the story, it was revealed that Kuma willingly accepted this transformation to protect the Thousand Sunny and its crew. He used his last moments of consciousness to program his own mind into a mission to protect the ship and ensure the safety of Luffy'