# Web Scraping

In [10]:
import requests
from bs4 import BeautifulSoup
def get_episode_links(show_url, timeout=30):
    """Collect links to individual episode articles from a Wikipedia episode-list page.

    Every table carrying the classes ``wikitable plainrowheaders wikiepisodetable``
    is scanned. For each row, the first link found in the "Title" column is
    converted into an absolute ``https://en.wikipedia.org`` URL.

    Args:
        show_url: URL of the Wikipedia page that lists the show's episodes.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A set of absolute URL strings. Rows whose title cell contains no link
        are skipped, so not every episode is guaranteed to be represented.

    Raises:
        requests.HTTPError: If the page answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(show_url, timeout=timeout)
    # Fail loudly instead of silently returning an empty set from an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    episode_links = set()

    # Iterate through all episode tables
    for table in soup.find_all('table', {'class': 'wikitable plainrowheaders wikiepisodetable'}):
        # Position of the "Title" column among the table's header cells
        headers = table.find_all('th')
        title_index = next(
            (i for i, header in enumerate(headers) if 'Title' in header.text),
            None,
        )
        if title_index is None:
            continue

        for row in table.find_all('tr')[1:]:  # Skip the header row
            cells = row.find_all('td')
            # `title_index` counts every header column, but `cells` holds only
            # <td> elements. Leading row-header <th> cells are therefore not in
            # `cells`, so shift the index by the number of <th> in this row.
            # (With "Title" as the third header and one row header this yields
            # index 1, the value that was previously hard-coded.)
            td_index = title_index - len(row.find_all('th'))
            if not 0 <= td_index < len(cells):
                continue

            # Only consider anchors that actually carry an href attribute.
            link = cells[td_index].find('a', href=True)
            if link:
                full_link = f'https://en.wikipedia.org{link["href"]}'  # make it a full link
                episode_links.add(full_link)

    return episode_links

# Wikipedia's list-of-episodes page for The Office (American TV series).
show_url = 'https://en.wikipedia.org/wiki/List_of_The_Office_(American_TV_series)_episodes'
episode_links = get_episode_links(show_url)

# Sanity check: print every collected URL. `episode_links` is a set, so the
# order of the printed lines is arbitrary.
for link in episode_links:
    print(link)

https://en.wikipedia.org/wiki/Tallahassee_(The_Office)
https://en.wikipedia.org/wiki/Search_Committee
https://en.wikipedia.org/wiki/The_Convict
https://en.wikipedia.org/wiki/Boys_and_Girls_(The_Office)
https://en.wikipedia.org/wiki/Classy_Christmas
https://en.wikipedia.org/wiki/The_Dundies
https://en.wikipedia.org/wiki/The_Meeting_(The_Office)
https://en.wikipedia.org/wiki/Grief_Counseling_(The_Office)
https://en.wikipedia.org/wiki/The_Banker_(The_Office)
https://en.wikipedia.org/wiki/Christmas_Party_(The_Office)
https://en.wikipedia.org/wiki/The_Negotiation_(The_Office)
https://en.wikipedia.org/wiki/China_(The_Office)
https://en.wikipedia.org/wiki/The_Incentive
https://en.wikipedia.org/wiki/Office_Olympics
https://en.wikipedia.org/wiki/Shareholder_Meeting
https://en.wikipedia.org/wiki/Branch_Wars
https://en.wikipedia.org/wiki/Women%27s_Appreciation
https://en.wikipedia.org/wiki/Initiation_(The_Office)
https://en.wikipedia.org/wiki/Phyllis%27_Wedding
https://en.wikipedia.org/wiki/The_I

In [12]:
# Pages to scrape: every episode article found above, followed by the two
# general pages about the show (the episode list and the main series article).
all_links = list(episode_links) + [
    'https://en.wikipedia.org/wiki/List_of_The_Office_(American_TV_series)_episodes',
    'https://en.wikipedia.org/wiki/The_Office_(American_TV_series)',
]

In [13]:
#for each website url, we extract the html information between <p> and </p> which is our paragraph information 
# and iteratively append it to the text variable to create one string variable with all the web scraped information and put it in a txt file
def web_scraper(url, text, timeout=30):
    """Append the paragraph text of one web page to an accumulated string.

    Args:
        url: Address of the page to download.
        text: Text gathered so far. It is not modified; a new string is returned.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        ``text`` followed by the text of every ``<p>`` element on the page
        (concatenated without a separator) and a single trailing newline.

    Raises:
        requests.HTTPError: If the page answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Do not let the body of an error page end up in the scraped corpus.
    response.raise_for_status()
    # Name the parser explicitly: the bare feature string 'html' lets
    # BeautifulSoup pick whichever HTML parser happens to be installed, so the
    # result could differ between environments. 'html.parser' matches the
    # parser used by get_episode_links.
    soup = BeautifulSoup(response.text, 'html.parser')
    page_text = ''.join(paragraph.get_text() for paragraph in soup.find_all('p'))
    return f'{text}{page_text}\n'

In [15]:
# Sanity check: number of pages queued for scraping
# (the collected episode links plus the two general pages appended to the list).
len(all_links)

187

In [16]:
web_text = ''  # accumulates the paragraph text of every scraped page
i = 0  # count of URLs processed so far (remains 0 when all_links is empty)
for i, url in enumerate(all_links, start=1):
    web_text = web_scraper(url, web_text)
    if not i % 10:
        # Progress marker after every tenth URL, to confirm the loop is still moving.
        print('10 urls done!')

10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!
10 urls done!


In [17]:
# Spot-check a 500-character slice (characters 5000-5499) of the scraped text
# to confirm that it looks like readable article prose.
print(web_text[5000:5500])

lar character, having previously appeared in the seventh season finale, "Search Committee" as a candidate to replace Michael as office manager.[3] Tate was initially the top choice to join the cast as the manager, but due to a commitment to the West End production of the Shakespeare play, Much Ado About Nothing she was unable to join the series at the start of the season.[4][5] Her character, Nellie is re-introduced as the Head of Special Projects for Sabre, and subsequently works in the Scranto


In [19]:
# Save the scraped text to a UTF-8 encoded file so that the scraping step
# does not have to be repeated.
with open('theofficedata.txt', 'w', encoding = 'utf-8') as f:
    f.write(web_text)

# Vector DB creation with Chroma 

In [1]:
from langchain_community.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
import shutil
from getpass import getpass

# Directory in which the Chroma database is stored (relative path, so it is
# resolved against the current working directory).
db_path = "db"
# Text file that holds the web-scraped text.
doc_path = "theofficedata.txt"

In [2]:
# Read back the scraped corpus. theofficedata.txt is written with
# encoding='utf-8', so decode it as UTF-8 explicitly instead of relying on the
# platform's default encoding (which is not UTF-8 everywhere).
with open(doc_path, encoding='utf-8') as f:
    text = f.read()  # all web-scraped information in a single string

# Split the corpus into overlapping chunks (chunk_size=800, chunk_overlap=100)
# that will later be embedded individually.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunks = text_splitter.create_documents([text])

In [3]:
# Ask for the OpenAI API key interactively. getpass does not echo the typed
# input, which keeps the key out of the notebook source.
api_key_openai= getpass('Give OPENAI API key:')

In [4]:
# Start from a clean slate: remove any database left over from a previous run.
if os.path.exists(db_path):
    shutil.rmtree(db_path)

# Embed every chunk with OpenAI embeddings and store the vectors in a Chroma
# database located at db_path.
db = Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(openai_api_key=api_key_openai),
    persist_directory=db_path,
)
# Flush the database to disk.
# NOTE(review): newer Chroma releases may persist automatically - confirm
# whether this explicit call is still required for the installed version.
db.persist()