In [None]:
!pip install bs4
!pip install requests
!pip install wikipedia
!pip install llama-index
!pip install chromadb

In [None]:
import requests
from bs4 import BeautifulSoup
import wikipedia

# Release years whose "List of American films of <year>" pages are scraped.
years = [2020, 2021, 2022, 2023, 2024]
# Scraped movie records keyed by movie title; filled in by updateMovies().
data = {}
# Site root prepended to the relative hrefs found on the list pages.
prefix = 'https://en.wikipedia.org'

def scrapeArticle(url, timeout=1):
  """Scrape a single Wikipedia film article into a metadata record.

  Args:
    url: Absolute URL of the article to fetch.
    timeout: Seconds to wait for the HTTP response (defaults to the
      original hard-coded value of 1).

  Returns:
    A dict with the keys 'Title', 'URL', 'Summary' and 'Cast' (a list of
    non-empty strings), or None when the page cannot be fetched, has no
    usable heading, has no 'Starring' list in its tables, or its summary
    cannot be retrieved through the `wikipedia` package.
  """
  try:
    response = requests.get(url=url, timeout=timeout)
    # Treat HTTP error statuses like any other failed fetch.
    response.raise_for_status()
  except requests.RequestException:
    return None

  soup = BeautifulSoup(response.content, 'html.parser')

  heading = soup.find(id="firstHeading")
  if heading is None:
    return None
  # The full heading text is the page's own title (e.g. "Tenet (film)") and is
  # what the summary lookup should use; the italic part is the film's name.
  page_title = heading.get_text(strip=True)
  italic = heading.i
  title = (italic if italic is not None else heading).get_text(strip=True)
  if not title or not page_title:
    return None

  cast_list = []
  for th in soup.find_all('th'):
    # get_text() copes with empty headers and with headers wrapping their
    # label in nested tags, where indexing th.contents would fail.
    if th.get_text(strip=True) == 'Starring':
      # The list items live in the same table row as the header cell.
      cast_list = [li.get_text(' ', strip=True) for li in th.parent.find_all('li')]
      break
  # Drop blank entries so the cast never contains empty/None names.
  cast_list = [name for name in cast_list if name]
  if not cast_list:
    return None

  # Fetch the summary only after the page is known to be usable; this is a
  # second network round-trip made by the `wikipedia` package.
  try:
    summary = wikipedia.WikipediaPage(page_title).summary
  except wikipedia.exceptions.WikipediaException:
    return None

  return {'Title':title, 'URL':url, 'Summary':summary, 'Cast':cast_list}

def updateMovies():
  """Populate the module-level `data` dict with one record per scraped movie.

  For every year in `years`, fetches the corresponding
  "List of American films of <year>" page, follows the first link found in
  the first cell of each table row, and stores the record returned by
  scrapeArticle() under its 'Title'. Rows without a usable article link and
  articles that cannot be scraped are skipped; a list page that cannot be
  fetched is reported and skipped as well.
  """
  for year in years:
    list_url = prefix + '/wiki/List_of_American_films_of_' + str(year)
    try:
      response = requests.get(url=list_url, timeout=10)
      response.raise_for_status()
    except requests.RequestException as exc:
      print(f"Skipping {year}: could not fetch {list_url} ({exc!r})")
      continue

    soup = BeautifulSoup(response.content, 'html.parser')
    for row in soup.find_all('tr'):
      cells = row.find_all('td')
      if not cells:
        continue
      anchor = cells[0].find('a')
      if anchor is None:
        continue
      href = anchor.get('href')
      # Only follow internal article links: skip anchors, red links, external
      # URLs and links to other "List of ..." pages.
      if not href or not href.startswith('/wiki/') or 'List' in href:
        continue
      link = prefix + href

      # Best-effort scraping: one broken article must not stop the run, but
      # catch Exception (not a bare except) so Ctrl-C / kernel interrupt works.
      try:
        movieMeta = scrapeArticle(link)
      except Exception as exc:
        print(f"Skipping {link}: {exc!r}")
        continue
      if movieMeta is None:
        continue

      data[movieMeta['Title']] = movieMeta
      print(movieMeta)

# Run the scrape, which fills the module-level `data` dict, then report how
# many movies were collected.
updateMovies()
print(len(data))


In [None]:
import chromadb

from llama_index.core import Document
from llama_index.core import SimpleDirectoryReader
from llama_index.core.schema import MetadataMode

from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext, load_index_from_storage

documents = []

# Creation of custom documents: one llama-index Document per scraped movie,
# with the article summary as text and title/URL/cast as metadata.

for title, movie in data.items():
  # Metadata is expected to be a flat dict of scalar values (flat-metadata
  # vector stores such as Chroma reject lists), so the cast list is joined
  # into a single comma-separated string; missing names are dropped.
  cast_text = ', '.join(str(name) for name in movie['Cast'] if name)
  movieDocument = Document(
      text=movie['Summary'],
      metadata={'Title': movie['Title'], 'URL': movie['URL'], 'Cast': cast_text}
  )
  # Keep the URL in the stored metadata but hide it from the text sent to the LLM.
  movieDocument.excluded_llm_metadata_keys = ["URL"]
  # Use the movie title as the document id.
  movieDocument.doc_id = movie['Title']
  documents.append(movieDocument)


# Build a vector index over all movie documents.
index = VectorStoreIndex.from_documents(documents)
# Persist the index's storage context to disk.
# NOTE(review): "<persist_dir>" looks like a placeholder path — confirm the
# intended directory before running.
index.storage_context.persist(persist_dir="<persist_dir>")
  