# Scrape movie information from IMDb using Python and Beautiful Soup.

#### Import the necessary modules

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Browser-like request headers passed to the requests.get calls in this notebook.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

#### Building the search URL for each genre.

In [2]:
# Genres to scrape; each name is substituted into the search URL as-is.
genres = [
    "Adventure", "Animation", "Biography", "Comedy", "Crime",
    "Drama", "Family", "Fantasy", "Film-Noir", "History",
    "Horror", "Music", "Musical", "Mystery", "Romance",
    "Sci-Fi", "Sport", "Thriller", "War", "Western",
]

# IMDb search URL with a single `{}` placeholder for the genre name.
SEARCH_URL_TEMPLATE = "https://www.imdb.com/search/title/?genres={}&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=N97GEQS6R7J9EV7V770D&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_16"

# Map every genre name to its fully formatted search URL, in list order.
url_dict = {name: SEARCH_URL_TEMPLATE.format(name) for name in genres}

print(url_dict)

{'Adventure': 'https://www.imdb.com/search/title/?genres=Adventure&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=N97GEQS6R7J9EV7V770D&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_16', 'Animation': 'https://www.imdb.com/search/title/?genres=Animation&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=N97GEQS6R7J9EV7V770D&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_16', 'Biography': 'https://www.imdb.com/search/title/?genres=Biography&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=N97GEQS6R7J9EV7V770D&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_16', 'Comedy': 'https://www.imdb.com/search/title/?genres=Comedy&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5

#### Using requests and Beautiful Soup, download and parse the search-results page for a single genre (Adventure).

In [3]:
url = "https://www.imdb.com/search/title/?genres=Adventure&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=N97GEQS6R7J9EV7V770D&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_16"

# Each output field is read from the N-th element matching a CSS selector
# inside one movie entry: field name -> (selector, match index).
field_selectors = {
    "title": ('.lister-item-header', 0),
    "year": ('.lister-item-year', 0),
    "certificate": ('.certificate', 0),
    "time": ('.runtime', 0),
    "genre": ('.genre', 0),
    "rating": ('.ratings-imdb-rating', 0),
    "metascore": ('.ratings-metascore', 0),
    "simple_desc": ('.text-muted', 2),
    "votes": ('.sort-num_votes-visible', 0),
}

# Send a request to the specified URL using the browser-like HEADERS.
resp = requests.get(url, headers=HEADERS)

# Convert the response body into a Beautiful Soup object (lxml parser).
content = BeautifulSoup(resp.content, 'lxml')

# Iterate through the movie entries found on the page.
for movie in content.select('.lister-item-content'):
    try:
        # Build one dictionary per movie holding the stripped text of each field.
        data = {
            field: movie.select(selector)[match_index].get_text().strip()
            for field, (selector, match_index) in field_selectors.items()
        }
    except IndexError:
        # A selector had too few matches in this entry, so the entry is skipped.
        continue

    print(data)

{'title': '1.\nThe Lord of the Rings: The Return of the King\n(2003)', 'year': '(2003)', 'certificate': 'U', 'time': '201 min', 'genre': 'Action, Adventure, Drama', 'rating': '9.0', 'metascore': '94        \n        Metascore', 'simple_desc': "Gandalf and Aragorn lead the World of Men against Sauron's army to draw his gaze from Frodo and Sam as they approach Mount Doom with the One Ring.", 'votes': 'Votes:\n1,903,834\n| Gross:\n$377.85M'}
{'title': '2.\nSpider-Man: Across the Spider-Verse\n(2023)', 'year': '(2023)', 'certificate': 'U', 'time': '140 min', 'genre': 'Animation, Action, Adventure', 'rating': '8.9', 'metascore': '86        \n        Metascore', 'simple_desc': 'Miles Morales catapults across the Multiverse, where he encounters a team of Spider-People charged with protecting its very existence. When the heroes clash on how to handle a new threat, Miles must redefine what it means to be a hero.', 'votes': 'Votes:\n197,314'}
{'title': '3.\nInception\n(2010)', 'year': '(2010)', 

#### Extract information such as the title, genre, year of release, rating, certificate, Metascore, votes, and so on.

In [4]:
import time

def get_movies(url, interval, file_name, timeout=30):
    """Scrape one IMDb search-results page and save its movies to a CSV file.

    Parameters
    ----------
    url : str
        Address of the search-results page to download.
    interval : int or float
        Seconds to wait before the HTTP request is sent. Use it to space out
        consecutive calls when scraping several pages in a row.
    file_name : str
        Path of the CSV file to write.
    timeout : int or float, optional
        Seconds to wait for the server before giving up (default 30).

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx or 5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    # Throttle the network request itself. Only one request is made per call,
    # so sleeping inside the parsing loop below would merely slow down local
    # work without reducing the load on the server.
    time.sleep(interval)

    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    # Fail loudly on an error response instead of silently writing an empty CSV.
    resp.raise_for_status()
    content = BeautifulSoup(resp.content, 'lxml')

    movie_list = []

    for movie in content.select('.lister-item-content'):
        try:
            data = {
                "title": movie.select('.lister-item-header')[0].get_text().strip(),
                "year": movie.select('.lister-item-year')[0].get_text().strip(),
                "certificate": movie.select('.certificate')[0].get_text().strip(),
                "time": movie.select('.runtime')[0].get_text().strip(),
                "genre": movie.select('.genre')[0].get_text().strip(),
                "rating": movie.select('.ratings-imdb-rating')[0].get_text().strip(),
                "metascore": movie.select('.ratings-metascore')[0].get_text().strip(),
                "simple_desc": movie.select('.text-muted')[2].get_text().strip(),
                "votes": movie.select('.sort-num_votes-visible')[0].get_text().strip(),
            }
        except IndexError:
            # An entry that lacks any one of the fields above is left out of
            # the CSV entirely.
            continue

        movie_list.append(data)

    # One row per fully parsed movie; columns follow the key order above.
    dataframe = pd.DataFrame(movie_list)
    dataframe.to_csv(file_name)

# Search-results page for the Adventure genre, sorted by user rating (descending).
url = "https://www.imdb.com/search/title/?genres=Adventure&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=N97GEQS6R7J9EV7V770D&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_16"

# Scrape that single page with no delay and store the result as a CSV file.
get_movies(url=url, interval=0, file_name='Adventure_movies.csv')

#### Convert all data into a pandas data frame and save as a CSV file.

In [None]:
# Scrape every genre page in turn, writing one "<genre>.csv" file per genre
# and passing an interval of 1 second to each call.
for genre, url in url_dict.items():
    csv_name = genre + '.csv'
    get_movies(url, 1, csv_name)
    print("Saved:", csv_name)