# Scraping TMDB Movies

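This project is a web scraper that collects movie links from "The Movie Database" (TMDB) website and then extracts details for each movie (title, genres, country, release date, user score and overview).
It uses Selenium to navigate the listing pages and BeautifulSoup (with requests) to parse the HTML of each movie page.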

## Technologies Used
- Python
- Selenium
- BeautifulSoup
- ThreadPoolExecutor
- pickle module for caching

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import (
    TimeoutException,
    ElementClickInterceptedException,
    ElementNotInteractableException,
    NoSuchElementException
)
import time
import pickle

In [51]:
# In-memory page-number -> movie-links mapping; starts empty and is replaced by
# the pickled copy from a previous run when that file exists.
cache = {}

try:
    f = open('movie_links_cache.pkl', 'rb')
except FileNotFoundError:
    # No cache saved yet: keep the empty dictionary.
    f = None

if f is not None:
    with f:
        cache = pickle.load(f)

if not cache:
    print('Empty Cache Dictionary')

In [5]:
def create_driver():
    """Build and return a Chrome WebDriver started with the headless flag.

    The driver binary is taken from "chromedriver.exe" (relative path) and
    Chrome is launched with the "--headless" and "--disable-cache" arguments.
    """
    browser_options = Options()
    for flag in ("--headless", "--disable-cache"):
        browser_options.add_argument(flag)

    return webdriver.Chrome(
        service=Service(executable_path="chromedriver.exe"),
        options=browser_options,
    )

In [6]:
def scrape_movie_cards(driver):
    """Return the href of every movie-card link on the page loaded in *driver*.

    Waits up to 10 seconds for the card links to be present, then reads the
    ``href`` attribute of each one.

    Args:
        driver: a Selenium WebDriver already pointed at a listing page.

    Returns:
        A list with one href value per matched link, in document order
        (duplicates are not removed here).

    Raises:
        TimeoutException: if no card link appears within the wait.
    """
    # Single source of truth for the selector used by both the wait and the lookup.
    card_link_selector = 'div.card.style_1 div.image a.image'

    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_link_selector))
    )

    card_links = driver.find_elements(By.CSS_SELECTOR, card_link_selector)
    return [card_link.get_attribute("href") for card_link in card_links]

In [7]:
def navigate_and_scrape(page_number):
    """Return the de-duplicated movie links found on one TMDB listing page.

    A page already present in the module-level ``cache`` is served from it
    without opening a browser. Otherwise a fresh driver loads the page,
    refreshes it, scrapes the card links, stores them in ``cache`` and returns
    them.

    Args:
        page_number: listing page to scrape (appended to ``?page=``).

    Returns:
        A list of unique links, or ``[]`` when waiting for or scraping the
        cards fails.
    """
    if page_number in cache:
        print(f"Using Cached data for page {page_number}")
        return cache[page_number]

    url = f"https://www.themoviedb.org/movie?page={page_number}"
    driver = create_driver()

    # Always release the browser: previously the driver was never quit, so every
    # scraped page left a Chrome session running.
    try:
        driver.get(url)
        time.sleep(3)
        print(f"Loaded page: {driver.current_url}")

        driver.refresh()
        time.sleep(2)

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.card.style_1')))
            movie_links = list(set(scrape_movie_cards(driver)))
            cache[page_number] = movie_links
            return movie_links
        except Exception as e:
            # Best effort: report the failure and let the caller carry on.
            print(f"an error occured on page number {page_number}:{e}")

        return []
    finally:
        driver.quit()

In [8]:
def navigate_concurrently(start_page=1, max_pages=500, max_workers=10):
    """Scrape a range of listing pages in a thread pool and merge the links.

    Submits navigate_and_scrape for each page in
    ``range(start_page, start_page + max_pages)`` and gathers the results as
    they complete. A failing page is reported and skipped.

    Returns:
        A list of the distinct links collected from all pages (no defined order).
    """
    page_numbers = range(start_page, start_page + max_pages)
    collected = set()

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for number in page_numbers:
            pending.append(pool.submit(navigate_and_scrape, number))

        for finished in as_completed(pending):
            try:
                page_links = finished.result()
                if not page_links:
                    continue
                collected.update(page_links)
            except Exception as e:
                print(f"An error occurred: {e}")

    return list(collected)

In [9]:
# Scrape the listing pages with the function's default arguments and keep the
# merged list of distinct movie links.
all_links = navigate_concurrently()

Using Cached data for page 1
Using Cached data for page 2
Using Cached data for page 3
Using Cached data for page 4
Using Cached data for page 5
Using Cached data for page 6
Using Cached data for page 7
Using Cached data for page 8
Using Cached data for page 9
Using Cached data for page 10
Using Cached data for page 11
Using Cached data for page 12
Using Cached data for page 13
Using Cached data for page 14
Using Cached data for page 15
Using Cached data for page 16
Using Cached data for page 17
Using Cached data for page 18
Using Cached data for page 19
Using Cached data for page 20
Using Cached data for page 21
Using Cached data for page 22
Using Cached data for page 23
Using Cached data for page 24
Using Cached data for page 25
Using Cached data for page 26
Using Cached data for page 27
Using Cached data for page 28
Using Cached data for page 29
Using Cached data for page 30
Using Cached data for page 31
Using Cached data for page 32
Using Cached data for page 33
Using Cached data f

In [10]:
# Persist the page -> links cache to the same file the loading cell reads, so a
# later run can reuse pages that were already scraped.
with open('movie_links_cache.pkl', 'wb') as f:
    pickle.dump(cache, f)

In [52]:
len(cache), len(all_links)

(500, 10000)

In [13]:
all_links[:10]

['https://www.themoviedb.org/movie/974950-emilia-perez',
 'https://www.themoviedb.org/movie/30159-i-confess',
 'https://www.themoviedb.org/movie/588228-the-tomorrow-war',
 'https://www.themoviedb.org/movie/758330-good-luck-to-you-leo-grande',
 'https://www.themoviedb.org/movie/10070-feast',
 'https://www.themoviedb.org/movie/12524-titus',
 'https://www.themoviedb.org/movie/10757',
 'https://www.themoviedb.org/movie/765-evil-dead-ii',
 'https://www.themoviedb.org/movie/11387-lord-jim',
 'https://www.themoviedb.org/movie/1234783-nahir']

In [14]:
import numpy as np
# Hold the links in a NumPy array and display it.
all_links_np = np.array(all_links)
all_links_np

array(['https://www.themoviedb.org/movie/974950-emilia-perez',
       'https://www.themoviedb.org/movie/30159-i-confess',
       'https://www.themoviedb.org/movie/588228-the-tomorrow-war', ...,
       'https://www.themoviedb.org/movie/750253-my-son',
       'https://www.themoviedb.org/movie/180299-the-raid-2-berandal',
       'https://www.themoviedb.org/movie/37265-cosi-fan-tutte'],
      dtype='<U132')

In [15]:
# Sanity check: the number of unique links should equal len(all_links),
# i.e. no link was collected more than once.
all_links_np2 = np.unique(all_links)
len(all_links_np2)

10000

### Ensuring that some of the functions work correctly

In [None]:
# Fetch the first collected movie page with plain requests to try the parsing
# steps on it, then show the HTTP status.
# NOTE(review): no timeout or User-agent header is passed here, unlike in
# get_movie_details — confirm that is intended for this quick check.
response = requests.get(all_links[0])
response.status_code

In [None]:
# Parse the fetched HTML and look up the div with class 'title ott_false' to
# inspect its markup.
soup = BeautifulSoup(response.text, 'html.parser')
div_title_tags = soup.find('div', class_='title ott_false')
print(div_title_tags)

In [None]:
# Keep the text before the first '(' of the 'release' span and trim it —
# presumably the date, with the country code inside the parentheses.
release_span_tags = soup.find('span', class_='release')
release = release_span_tags.text
release_date = release.split('(')[0].strip(')\n ').strip()
release_date

In [None]:
# Locate the div with class 'user_score_chart' and display it.
div_score_tag = soup.find('div', class_='user_score_chart')
div_score_tag

In [None]:
# Read the score from that element's 'data-percent' attribute.
percentage = div_score_tag.get('data-percent')
percentage

### Processing and scraping 10K input links 

- Defining the extraction functions
- Using ThreadPoolExecutor and caching
- Scraping the 10K links
- Putting the final data into a DataFrame and ensuring that there are no duplicates
- Finally, exporting the data to a CSV file

In [16]:
def get_overview(soup):
    """Return the text of the first <p> inside the div with class 'overview'.

    Raises AttributeError when the div or the paragraph is missing, because
    the next lookup is then attempted on None.
    """
    return soup.find('div', class_='overview').find('p').text

In [17]:
def get_title(soup):
    """Return the movie title taken from the link inside the title div.

    The div is looked up with class 'title ott_false' first and with
    'title ott_true' as a fallback. Returns None when neither div is found.
    """
    title_div = (
        soup.find('div', class_='title ott_false')
        or soup.find('div', class_='title ott_true')
    )
    if not title_div:
        return None
    return title_div.find('a').get_text()

In [18]:
def get_genres(soup):
    """Return the text of every link inside the span with class 'genres'.

    Raises AttributeError when the span is missing, because ``find_all`` is
    then called on None.
    """
    genre_links = soup.find('span', class_='genres').find_all('a')
    return [link.text for link in genre_links]

In [19]:
def get_country(soup):
    """Return the parenthesised country code from the 'release' span.

    The span text is expected to look like ``'03/25/2011 (US)'``; the part
    after the last '(' is returned with the closing parenthesis and
    surrounding whitespace removed.

    Returns:
        The country string, or None when the text contains no '(' at all.
        Previously the whole text (i.e. the date) was returned as the country
        in that case.

    Raises:
        AttributeError: when the span is missing (``.text`` read on None).
    """
    release_text = soup.find('span', class_='release').text
    _, open_paren, after_paren = release_text.rpartition('(')
    if not open_paren:
        # No parenthesised part, so there is no country to report.
        return None
    return after_paren.strip(')\n ').strip()
    

In [20]:
def get_percentage(soup):
    """Return the 'data-percent' attribute of the div with class 'user_score_chart'.

    Raises AttributeError when the div is missing, because ``get`` is then
    called on None.
    """
    return soup.find('div', class_='user_score_chart').get('data-percent')

In [21]:
def get_release_date(soup):
    """Return the part of the 'release' span text that precedes the first '('.

    The result is trimmed of ')' characters, newlines, spaces and any other
    surrounding whitespace. Raises AttributeError when the span is missing.
    """
    release_text = soup.find('span', class_='release').text
    before_paren, _, _ = release_text.partition('(')
    return before_paren.strip(')\n ').strip()

In [22]:
import random


def _extract_field(extractor, soup, label, movie_url):
    """Run one field extractor; report and return None when the markup is missing."""
    try:
        return extractor(soup)
    except AttributeError:
        # The extractors read attributes on the result of soup.find(); a missing
        # element makes that result None, which surfaces as AttributeError.
        print(f"Error getting {label} for {movie_url}")
        return None


def get_movie_details(movie_url, timeout=30):
    """Download one movie page and extract its details.

    Args:
        movie_url: URL of the movie page to fetch.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot block a worker thread indefinitely.

    Returns:
        A tuple ``(overview, genre, country, name, release_date, percentage,
        url)`` where any field that could not be extracted is None, or None
        when the response status is not 200 or the request itself fails.
    """
    try:
        response = requests.get(
            movie_url,
            headers={'User-agent': 'your bot 0.1'},
            timeout=timeout,
        )
        # Random 3-7 s pause after every request (presumably to go easy on the site).
        time.sleep(random.uniform(3, 7))

        if response.status_code != 200:
            print(f"Failed to load page for {movie_url}, status code: {response.status_code}")
            return None

        detailed_soup = BeautifulSoup(response.text, 'html.parser')

        overview = _extract_field(get_overview, detailed_soup, 'overview', movie_url)
        genre = _extract_field(get_genres, detailed_soup, 'genres', movie_url)
        country = _extract_field(get_country, detailed_soup, 'country', movie_url)
        name = _extract_field(get_title, detailed_soup, 'title', movie_url)
        release_date = _extract_field(get_release_date, detailed_soup, 'release date', movie_url)
        percentage = _extract_field(get_percentage, detailed_soup, 'user score', movie_url)

        print('done processing:', movie_url)

        # The URL goes last: process_urls uses data[-1] to recognise cached entries.
        return overview, genre, country, name, release_date, percentage, movie_url

    except requests.RequestException as e:
        # Include the reason; it was previously captured but never shown.
        print(f"Request failed for {movie_url}: {e}")
        return None

In [23]:
from concurrent.futures import ThreadPoolExecutor, as_completed
def process_url(url):
    """Worker entry point: fetch and parse the details of a single movie URL.

    Delegates to get_movie_details and hands its result back unchanged.
    """
    details = get_movie_details(url)
    return details

In [24]:
import pickle
import os
def process_urls(urls, cache_file='movie_details_cache.pkl'):
    """Scrape details for *urls*, reusing and updating a pickle cache on disk.

    Entries already stored in *cache_file* are reused; only URLs that are not
    the last element of a cached entry are fetched, using a pool of 10 threads.
    After each successful fetch the full result list is written back to
    *cache_file*, so an interrupted run keeps its progress.

    Args:
        urls: iterable of movie page URLs.
        cache_file: path of the pickle file holding previously scraped entries.

    Returns:
        ``(results, cached_data)``: all entries (cached plus newly scraped) and
        the entries that were loaded from the cache file. ``cached_data`` is an
        empty list when the cache file does not exist; previously that case
        raised UnboundLocalError at the return statement.
    """
    cached_data = []
    if os.path.exists(cache_file):
        # NOTE: pickle.load can execute arbitrary code; only point this at cache
        # files produced by this notebook.
        with open(cache_file, 'rb') as f:
            cached_data = pickle.load(f)

    results = list(cached_data)
    processed_urls = {data[-1] for data in cached_data}
    urls_to_process = [url for url in urls if url not in processed_urls]

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(process_url, url) for url in urls_to_process]

        for future in as_completed(futures):
            try:
                result = future.result()
                if result:
                    results.append(result)
                    # Checkpoint after every new entry so progress survives a crash.
                    with open(cache_file, 'wb') as f:
                        pickle.dump(results, f)
            except Exception as e:
                print(f"Error: {e}")
    return results, cached_data

In [25]:
# Scrape (or load from the details cache) the details of every collected link;
# cached_data holds the entries that came from the cache file.
all_movie_details, cached_data = process_urls(all_links)

In [26]:
print(cached_data[:5])

[("Wimpy Greg Heffley, now in seventh grade, thinks he has it all together. He has mastered middle school and gotten rid of the Cheese Touch. However, Greg's older brother, Rodrick, is itching to cut him down to size. He gets the perfect opportunity when their mother tries to force the boys to bond. Rodrick may be Greg's chief tormentor, but he feels his constant pranks are just what his little brother needs to prepare him for life's hard knocks.", ['Family', 'Comedy'], 'US', 'Diary of a Wimpy Kid: Rodrick Rules', '03/25/2011', '67', 'https://www.themoviedb.org/movie/60307-diary-of-a-wimpy-kid-rodrick-rules'), ('The true story of boxer Jim Braddock who, following his retirement in the 1930s, makes a surprise comeback in order to lift his family out of poverty.', ['Romance', 'Drama', 'History'], 'US', 'Cinderella Man', '06/02/2005', '76', 'https://www.themoviedb.org/movie/921-cinderella-man'), ('Aa story about a newlywed that receive a surprise guest that will eventually turn the couple

In [53]:
len(cached_data), len(all_movie_details)

(10000, 10000)

In [55]:
all_movie_details[:3]

[("Wimpy Greg Heffley, now in seventh grade, thinks he has it all together. He has mastered middle school and gotten rid of the Cheese Touch. However, Greg's older brother, Rodrick, is itching to cut him down to size. He gets the perfect opportunity when their mother tries to force the boys to bond. Rodrick may be Greg's chief tormentor, but he feels his constant pranks are just what his little brother needs to prepare him for life's hard knocks.",
  ['Family', 'Comedy'],
  'US',
  'Diary of a Wimpy Kid: Rodrick Rules',
  '03/25/2011',
  '67',
  'https://www.themoviedb.org/movie/60307-diary-of-a-wimpy-kid-rodrick-rules'),
 ('The true story of boxer Jim Braddock who, following his retirement in the 1930s, makes a surprise comeback in order to lift his family out of poverty.',
  ['Romance', 'Drama', 'History'],
  'US',
  'Cinderella Man',
  '06/02/2005',
  '76',
  'https://www.themoviedb.org/movie/921-cinderella-man'),
 ('Aa story about a newlywed that receive a surprise guest that will 

In [30]:
import pandas as pd

# Column title -> position of that field inside each scraped detail tuple.
column_positions = {
    'Movie Name': 3,
    'Genre': 1,
    'Country': 2,
    'Release Date': 4,
    'Percentage User score': 5,
    'Description': 0,
    'Movie link': 6,
}
movie_database = {
    column: [md[position] for md in all_movie_details]
    for column, position in column_positions.items()
}
movie_Dataframe = pd.DataFrame(movie_database)

In [31]:
movie_Dataframe

Unnamed: 0,Movie Name,Genre,Country,Release Date,Percentage User score,Description,Movie link
0,Diary of a Wimpy Kid: Rodrick Rules,"[Family, Comedy]",US,03/25/2011,67,"Wimpy Greg Heffley, now in seventh grade, thin...",https://www.themoviedb.org/movie/60307-diary-o...
1,Cinderella Man,"[Romance, Drama, History]",US,06/02/2005,76,"The true story of boxer Jim Braddock who, foll...",https://www.themoviedb.org/movie/921-cinderell...
2,Package Deal,[Drama],PH,08/09/2024,50,Aa story about a newlywed that receive a surpr...,https://www.themoviedb.org/movie/1312863-packa...
3,Insidious: Chapter 2,"[Horror, Thriller]",US,09/13/2013,67,The haunted Lambert family seeks to uncover th...,https://www.themoviedb.org/movie/91586-insidio...
4,The Last Duel,"[History, Drama, Action]",EG,10/13/2021,74,King Charles VI declares that Knight Jean de C...,https://www.themoviedb.org/movie/617653-the-la...
...,...,...,...,...,...,...,...
9995,Thunder Force,"[Comedy, Action, Adventure, Science Fiction]",US,04/09/2021,55,In a world where supervillains are commonplace...,https://www.themoviedb.org/movie/615678-thunde...
9996,Dead End,"[Mystery, Horror, Thriller]",US,12/12/2003,64,Christmas Eve. On his way to his in-laws with ...,https://www.themoviedb.org/movie/11427-dead-end
9997,Grotesque,"[Horror, Thriller]",JP,01/17/2009,56,An unnamed doctor has always had everything he...,https://www.themoviedb.org/movie/27297
9998,Driving Lessons,"[Comedy, Drama]",US,09/08/2006,62,A shy teenage boy trying to escape the influen...,https://www.themoviedb.org/movie/11404-driving...


In [32]:
# Count rows whose 'Movie Name' repeats that of an earlier row. This compares
# titles only, so distinct movies sharing a title are counted as well.
duplicate_count = movie_Dataframe.duplicated(subset=['Movie Name']).sum()

print(f'Total number of duplicate rows: {duplicate_count}')

Total number of duplicate rows: 348


In [40]:
# NOTE(review): `df` (the frame without the list-valued 'Genre' column) is not
# used in this cell; the check below runs on movie_Dataframe itself — confirm
# whether `df` is needed elsewhere.
df = movie_Dataframe.drop('Genre', axis=1)
# Count rows that repeat both the title and the link of an earlier row.
duplicate_count = movie_Dataframe.duplicated(subset=['Movie Name','Movie link']).sum()
print(f'Total number of duplicate rows: {duplicate_count}')

Total number of duplicate rows: 0


In [41]:
# Convert the 'Release Date' strings to datetime values in place. No explicit
# format is passed, so pandas infers it (the scraped samples look like MM/DD/YYYY).
movie_Dataframe['Release Date'] = pd.to_datetime(movie_Dataframe['Release Date'])

In [42]:
movie_Dataframe.dtypes

Movie Name                       object
Genre                            object
Country                          object
Release Date             datetime64[ns]
Percentage User score            object
Description                      object
Movie link                       object
dtype: object

In [44]:
movie_Dataframe['Release Date'].head()

0   2011-03-25
1   2005-06-02
2   2024-08-09
3   2013-09-13
4   2021-10-13
Name: Release Date, dtype: datetime64[ns]

In [45]:
# Export the table to CSV. index=False is not passed, so the DataFrame index is
# written as an extra leading column.
movie_Dataframe.to_csv('Movie Database.csv')