## Imports

In [None]:
pip install beautifulsoup4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from requests import get
from bs4 import BeautifulSoup
from warnings import warn
from time import sleep
from random import randint
import numpy as np, pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Scraping

## Title scraping function (per genre)

In [None]:
# Browser-like identity sent with every HTTP request made in this notebook.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0"
headers = {
    "user-agent": USER_AGENT,
    # Ask for English pages; the original author noted that without this header
    # the responses defaulted to Chinese.
    'Accept-Language': 'en-US,en;q=0.8',
}

# Values substituted into the `start=` query parameter of the search URL: 1 and 51.
pages = np.arange(start=1, stop=100, step=50)

# Accumulator for the scraped titles; title() appends to this list in place.
titles = []

def title(genre):
  """Scrape IMDb search-result titles for one genre into the global ``titles`` list.

  For every value in the module-level ``pages`` sequence this requests the IMDb
  title-search listing for ``genre`` (using the module-level ``headers``),
  pauses for a random 8-15 seconds, and appends the text of each result's
  title link to ``titles``.

  Args:
    genre: Value placed in the ``genres=`` query parameter, e.g. ``'action'``.

  Returns:
    None. Results are accumulated in the module-level ``titles`` list.
  """
  for page in pages:
    # Example URL:
    # https://www.imdb.com/search/title/?genres=action&start=51&explore=title_type,genres&ref_=adv_nxt
    url = f"https://www.imdb.com/search/title?genres={genre}&start={page}&explore=title_type,genres&ref_=adv_nxt"
    response = get(url, headers=headers)

    # Randomised pause between consecutive requests.
    sleep(randint(8, 15))

    if response.status_code != 200:
      # Report the failing URL. The previous code formatted the name `requests`,
      # which is never imported here, so the warning itself raised NameError.
      warn('Request: {}; Status code: {}'.format(url, response.status_code))
      # An error page contains no result containers; skip parsing it.
      continue

    page_html = BeautifulSoup(response.text, 'html.parser')
    movie_containers = page_html.find_all('div', class_='lister-item mode-advanced')

    for container in movie_containers:
      # The title text lives in the <a> inside the container's <h3>.
      # A distinct local name avoids shadowing this function's own name.
      heading = container.h3
      link = heading.a if heading is not None else None
      if link is not None:
        titles.append(link.text)


In [None]:
genre = ['action', 'adventure', 'drama', 'comedy']

# title() appends to the shared `titles` list, so empty it first; otherwise
# re-running this cell would add every title a second time.
titles.clear()
for genre_name in genre:
  title(genre_name)

In [None]:
# Wrap the scraped titles in a one-column DataFrame for inspection.
# NOTE(review): this rebinds the name `title`, which up to this point referred to the
# scraping function defined earlier; calling title(...) again after this cell fails
# because the name then refers to a DataFrame.
title = pd.DataFrame({'movie': titles})
title

Unnamed: 0,movie
0,Guardians of the Galaxy Vol. 3
1,Citadel
2,Peter Pan & Wendy
3,Dungeons & Dragons: Honor Among Thieves
4,Dune: Part Two
...,...
395,Not Dead Yet
396,The Wolf of Wall Street
397,Scream 4
398,Only Murders in the Building


In [None]:
# One-column frame ('movie') holding every scraped title; later cells clean this
# column and iterate over it.
title_df = pd.DataFrame(data={'movie': titles})
title_df

Unnamed: 0,movie
0,Guardians of the Galaxy Vol. 3
1,Citadel
2,Peter Pan & Wendy
3,Dungeons & Dragons: Honor Among Thieves
4,Dune: Part Two
...,...
395,Not Dead Yet
396,The Wolf of Wall Street
397,Scream 4
398,Only Murders in the Building


In [None]:
# Drop every run of characters that is neither a word character nor whitespace.
# regex=True is passed explicitly: without it, pandas versions whose default is
# literal matching would search for the pattern text itself and strip nothing.
title_df['movie'] = title_df['movie'].str.replace(r'[^\w\s]+', '', regex=True)
title_df

  title_df['movie'] = title_df['movie'].str.replace(r'[^\w\s]+', '')


Unnamed: 0,movie
0,Guardians of the Galaxy Vol 3
1,Citadel
2,Peter Pan Wendy
3,Dungeons Dragons Honor Among Thieves
4,Dune Part Two
...,...
395,Not Dead Yet
396,The Wolf of Wall Street
397,Scream 4
398,Only Murders in the Building


## Scraping Content 

In [None]:
def find_imdb_url(query):
  """Return the first imdb.com link found in a Google search for ``query``.

  The search is issued as ``<query> imdb`` using the module-level ``headers``.
  Each ``div.g`` element of the response is inspected in order, and the
  ``href`` of its first anchor is returned if it contains ``"imdb.com"``.

  Args:
    query: Free-text search string, typically a movie or series title.

  Returns:
    The matching URL as a string, or ``None`` when the request does not
    return HTTP 200 or no result links to imdb.com.
  """
  from urllib.parse import quote_plus  # local import keeps this cell self-contained

  # quote_plus turns spaces into '+' (as before) and also escapes characters
  # such as '&' that would otherwise corrupt the query string.
  search_url = f"https://google.com/search?q={quote_plus(query)}+imdb"

  resp = get(search_url, headers=headers)
  if resp.status_code != 200:
    # Previously `soup` was left undefined here and the function crashed.
    return None

  soup = BeautifulSoup(resp.content, "html.parser")
  for result in soup.find_all('div', class_='g'):
    anchors = result.find_all('a')
    if not anchors:
      # Nothing to inspect; never fall back to a link from an earlier result.
      continue
    link = anchors[0].get('href')
    if link and "imdb.com" in link:
      return link

  # No IMDb link among the results; callers test for None.
  return None

In [None]:
# Smoke test: look up a single known title. This performs a live web request.
query = 'Nope'
find_imdb_url(query)

'https://www.imdb.com/title/tt10954984/'

In [None]:
def imdb_scrape(URL):
  """Fetch an IMDb title page and extract its genres, storyline text and plot.

  The page is requested with the module-level ``headers``. Fields are read
  from ``<section>`` elements carrying the page-background class string used
  below; if several sections match, the values from the last one are returned.

  Args:
    URL: Address of the IMDb title page.

  Returns:
    A ``(movie_genre, storyline, plot)`` tuple: a list of genre strings and two
    strings. Fields that cannot be found are ``[]`` / ``''``; the same empty
    triple is returned when the response status is not 200.
  """
  # Defaults, so a failed request or a page without a matching section still
  # yields a well-formed result instead of an UnboundLocalError.
  movie_genre, storyline, plot = [], '', ''

  resp = get(URL, headers=headers)
  if resp.status_code != 200:
    return movie_genre, storyline, plot

  soup = BeautifulSoup(resp.content, "html.parser")

  # NOTE(review): the 'sc-…' class fragments here and in the plot selector look
  # build-generated and may change when the site is redeployed — confirm they still match.
  for section in soup.find_all('section', class_='ipc-page-background ipc-page-background--base sc-f9e7f53-0 ifXVtO'):
    # Each matching section starts from a clean slate, as before.
    movie_genre, storyline, plot = [], '', ''

    # Genres: the text of every chip inside each chip-list scroller.
    for scroller in section.find_all('div', class_="ipc-chip-list__scroller"):
      for chip in scroller.find_all('span', class_="ipc-chip__text"):
        movie_genre.append(chip.text)

    # Storyline: the last html-content block containing an inner <div> wins.
    # Missing nodes are skipped explicitly rather than via a blanket `except`,
    # so a missing storyline no longer prevents the plot from being read.
    for story in section.find_all('div', class_="ipc-html-content ipc-html-content--base"):
      inner = story.find('div')
      if inner is not None:
        storyline = inner.text

    # Plot: the last matching paragraph containing a <span> wins.
    for paragraph in section.find_all('p', class_="sc-5f699a2-3 lopbTB"):
      span = paragraph.find('span')
      if span is not None:
        plot = span.text

  return movie_genre, storyline, plot

In [None]:
# Smoke test: scrape one known title page. This performs a live web request.
URL = 'https://www.imdb.com/title/tt10954984/'
imdb_scrape(URL)

(['Horror', 'Mystery', 'Sci-Fi'],
 'The amount of reviews calling this pointless and terrible clearly didn\'t read much into what happens here. Peele is pointing a mirror up to you and the most ironic part is a lot of don\'t even seem to notice. No, this isn\'t a typical horror movie. It\'s more of a Spielbergian blockbuster satire with some amusing moments sprinkled throughout. The "entity" is creative and can be quite terrifying. The cinematography is top notch and probably the standout feature of the film, while Kaluuya can say an immense amount without even speaking a word. The screenplay is odd to say the least, but it hits more than it misses.While I still think this is a lesser film than Get Out or Us and it ran a little too long, you have to commend its originality in a summer sea of IP-based films. Peele attempts something profound and unique here, which should be respected regardless of your thoughts on the film.',
 'The residents of a lonely gulch in inland California bear w

In [None]:
# Look up and scrape every cleaned title, keeping exactly one result row per title
# so that row i of `all_result` always belongs to row i of `title_df`.
all_result = []
for ind, movie_title in enumerate(title_df['movie']):
  # The loop variable is deliberately not called `title`: that name is already
  # used at notebook level and must not be overwritten with a plain string.
  print('-----------')
  print(ind, movie_title)
  try:
    url = find_imdb_url(movie_title)
    value = imdb_scrape(url) if url is not None else None
  except Exception as err:
    # One failing lookup must not abort the whole (slow) run; record it and move on.
    print(f'  skipped: {type(err).__name__}: {err}')
    value = None
  # Placeholder rows have the same three fields as real results.
  all_result.append(value if value is not None else (None, None, None))

-----------
0 Guardians of the Galaxy Vol 3
-----------
1 Citadel
-----------
2 Peter Pan  Wendy
-----------
3 Dungeons  Dragons Honor Among Thieves
-----------
4 Dune Part Two
-----------
5 Sweet Tooth
-----------
6 The Hunger Games The Ballad of Songbirds and Snakes
-----------
7 Ghosted
-----------
8 The Mandalorian
-----------
9 Barry
-----------
10 The Night Agent
-----------
11 AKA
-----------
12 John Wick Chapter 4
-----------
13 The Rookie
-----------
14 Game of Thrones
-----------
15 Sisu
-----------
16 The Last of Us
-----------
17 The Covenant
-----------
18 The Flash
-----------
19 Rabbit Hole
-----------
20 The Last Kingdom
-----------
21 The Flash
-----------
22 Star Trek Picard
-----------
23 Gran Turismo
-----------
24 Guardians of the Galaxy
-----------
25 Dune
-----------
26 Avatar The Way of Water
-----------
27 AntMan and the Wasp Quantumania
-----------
28 Demon Slayer Kimetsu no Yaiba
-----------
29 Transformers Rise of the Beasts
-----------
30 The Boys
---------

UnboundLocalError: ignored

In [None]:
# One row per scraped title. A bare None placeholder in `all_result` is expanded
# to an all-None row so that every record supplies the three expected fields.
final_result = pd.DataFrame(
    [row if row is not None else (None, None, None) for row in all_result],
    columns=["Genre", "Storyline", 'plot'],
)

In [None]:
# Display the scraped Genre / Storyline / plot table for a visual check.
final_result

Unnamed: 0,Genre,Storyline,plot
0,"[Action, Adventure, Comedy]","""There is no God. That's why I stepped in."" I ...","Still reeling from the loss of Gamora, Peter Q..."
1,"[Action, Drama, Thriller]","Like others, I found the opening sequence on t...","Global spy agency Citadel has fallen, and its ..."
2,"[Action, Adventure, Comedy]",Much of what you expect from pretty much every...,"Follow the adventures of Peter Pan, a boy who ..."
3,"[Action, Adventure, Comedy]","I've never played the game, and not sure if I ...",A charming thief and a band of unlikely advent...
4,"[Action, Adventure, Drama]",,A boy becomes the Messiah of nomads on a deser...
...,...,...,...
351,"[Comedy, Fantasy, Musical]",I'm here for keagen but it's pretty good I can...,A couple on a backpacking trip discovers a mag...
352,"[Action, Comedy, Thriller]",Joining Terminator 2: Judgement Day as one of ...,"A fearless, globe-trotting, terrorist-battling..."
353,"[Comedy, Drama, Sport]","Despite mixed reviews, I absolutely loved this...",A former minor-league basketball coach is orde...
354,[Comedy],The best sitcom ever to be produced and aired ...,The continuing misadventures of neurotic New Y...


In [None]:
# Original (uncleaned) titles, trimmed to the number of rows actually scraped.
# The frame is rebuilt from `titles` because the name `title` is reused for
# several different objects in this notebook and is not reliably a DataFrame
# at this point; len(final_result) replaces the hard-coded row count.
title_df_new = pd.DataFrame({'movie': titles}).iloc[:len(final_result)]
title_df_new

Unnamed: 0,movie
0,Guardians of the Galaxy Vol. 3
1,Citadel
2,Peter Pan & Wendy
3,Dungeons & Dragons: Honor Among Thieves
4,Dune: Part Two
...,...
351,Schmigadoon!
352,True Lies
353,Champions
354,Seinfeld


In [None]:
# Pair each title with its scraped fields by joining the two frames on their row index.
df = title_df_new.merge(final_result, left_index=True, right_index=True)
df

Unnamed: 0,movie,Genre,Storyline,plot
0,Guardians of the Galaxy Vol. 3,"[Action, Adventure, Comedy]","""There is no God. That's why I stepped in."" I ...","Still reeling from the loss of Gamora, Peter Q..."
1,Citadel,"[Action, Drama, Thriller]","Like others, I found the opening sequence on t...","Global spy agency Citadel has fallen, and its ..."
2,Peter Pan & Wendy,"[Action, Adventure, Comedy]",Much of what you expect from pretty much every...,"Follow the adventures of Peter Pan, a boy who ..."
3,Dungeons & Dragons: Honor Among Thieves,"[Action, Adventure, Comedy]","I've never played the game, and not sure if I ...",A charming thief and a band of unlikely advent...
4,Dune: Part Two,"[Action, Adventure, Drama]",,A boy becomes the Messiah of nomads on a deser...
...,...,...,...,...
351,Schmigadoon!,"[Comedy, Fantasy, Musical]",I'm here for keagen but it's pretty good I can...,A couple on a backpacking trip discovers a mag...
352,True Lies,"[Action, Comedy, Thriller]",Joining Terminator 2: Judgement Day as one of ...,"A fearless, globe-trotting, terrorist-battling..."
353,Champions,"[Comedy, Drama, Sport]","Despite mixed reviews, I absolutely loved this...",A former minor-league basketball coach is orde...
354,Seinfeld,[Comedy],The best sitcom ever to be produced and aired ...,The continuing misadventures of neurotic New Y...


In [None]:
# Write the merged table to data_film.csv in the working directory, without the row index.
# NOTE(review): the Genre column holds Python lists; presumably they are written as their
# string form and need parsing when the file is read back — confirm.
df.to_csv('data_film.csv', index=False)