# Data Wrangling Project - Scraping IMDB/ IMDB Kaggle Dataset
Sidda Patel & Mary Dorenbos

## This notebook contains code for scraping data from the IMDB website we used, and translates it to a csv. 
## Link: https://www.imdb.com/list/ls055294306/

In [3]:
# Required libraries for web scraping, data handling and processing
import requests        # For making HTTP requests
from bs4 import BeautifulSoup  # For parsing HTML content
import json           # For handling JSON data
import pandas as pd   # For data manipulation and CSV creation
import time           # For adding delays between requests

def parse_duration(duration_str):
    """
    Converts IMDb's duration format to minutes
    Args:
        duration_str (str): Duration in format 'PTxHyM' (e.g., 'PT2H30M')
    Returns:
        int: Total duration in minutes (e.g., 150 for 2h30m)
    """
    if not duration_str:
        return None
    
    import re
    # Extract hours and minutes using regex patterns
    hours = re.search(r'(\d+)H', duration_str)     
    minutes = re.search(r'(\d+)M', duration_str)   
    
    total_minutes = 0
    if hours:
        total_minutes += int(hours.group(1)) * 60   
    if minutes:
        total_minutes += int(minutes.group(1))      
    
    return total_minutes

def scrape_imdb_movies(url):
    """
    Scrapes movie information from an IMDb webpage
    Args:
        url (str): URL of the IMDb page to scrape
    Returns:
        pandas.DataFrame: DataFrame containing movie details
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    try:
        time.sleep(1)
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        script_tag = soup.find('script', type='application/ld+json')
        if not script_tag:
            print("Could not find movie data in the page")
            return None
            
        json_data = json.loads(script_tag.string)
        movies = []
        
        for item in json_data.get('itemListElement', []):
            movie_data = item.get('item', {})
            rating_data = movie_data.get('aggregateRating', {})
            
            # Basic movie information
            movie = {
                'Title': movie_data.get('name', ''),
                'Duration (mins)': parse_duration(movie_data.get('duration', '')),
                'Content Rating': movie_data.get('contentRating', ''),
                # Correctly join genres with commas between genres
                'Genre': ', '.join(movie_data.get('genre', [])) if isinstance(movie_data.get('genre', []), list) else movie_data.get('genre', ''),
                'Rating': rating_data.get('ratingValue'),
                'Vote Count': rating_data.get('ratingCount', 'N/A'),
                'URL': movie_data.get('url', '')
            }
            movies.append(movie)
        
        df = pd.DataFrame(movies)
        csv_filename = 'imdb_movies.csv'
        df.to_csv(csv_filename, index=False)
        print(f"\nData saved to {csv_filename}")
        
        return df
        
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON data: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

if __name__ == "__main__":
    # Example URL - you can replace with any IMDb movie list URL
    url = "https://www.imdb.com/list/ls055294306/"
    
    print("Scraping IMDb Movies...")
    df = scrape_imdb_movies(url)
    
    if df is not None:
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', None)
        display(df)
        print(f"\nTotal movies scraped: {len(df)}")

Scraping IMDb Movies...

Data saved to imdb_movies.csv


Unnamed: 0,Title,Duration (mins),Content Rating,Genre,Rating,Vote Count,URL
0,The Dark Knight,152,PG-13,"Action, Crime, Drama",9.0,2947590,https://www.imdb.com/title/tt0468569/
1,Schindler&apos;s List,195,R,"Biography, Drama, History",9.0,1487411,https://www.imdb.com/title/tt0108052/
2,The Lord of the Rings: The Return of the King,201,PG-13,"Action, Adventure, Drama",9.0,2030566,https://www.imdb.com/title/tt0167260/
3,Se7en,127,R,"Crime, Drama, Mystery",8.6,1853916,https://www.imdb.com/title/tt0114369/
4,Braveheart,178,R,"Biography, Drama, War",8.3,1110604,https://www.imdb.com/title/tt0112573/
...,...,...,...,...,...,...,...
87,Persona,83,Not Rated,"Drama, Thriller",8.0,133641,https://www.imdb.com/title/tt0060827/
88,Andrey Rublyov,189,R,"Biography, Drama, History",8.0,58032,https://www.imdb.com/title/tt0060107/
89,Spartacus,197,PG-13,"Adventure, Biography, Drama",7.9,145822,https://www.imdb.com/title/tt0054331/
90,Le trou,131,Not Rated,"Crime, Drama, Thriller",8.5,21152,https://www.imdb.com/title/tt0054407/



Total movies scraped: 92


In [4]:
df.to_csv('scraped_imdb_raw.csv', index=False, encoding='utf-8')

In [5]:
imdb_kaggle = pd.read_csv('imdb_top_1000.csv', sep = ',', encoding = 'utf-8')
display(imdb_kaggle)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BNGEwMT...,Breakfast at Tiffany's,1961,A,115 min,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,166544,
996,https://m.media-amazon.com/images/M/MV5BODk3Yj...,Giant,1956,G,201 min,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,34075,
997,https://m.media-amazon.com/images/M/MV5BM2U3Yz...,From Here to Eternity,1953,Passed,118 min,"Drama, Romance, War",7.6,"In Hawaii in 1941, a private is cruelly punish...",85.0,Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,43374,30500000
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97 min,"Drama, War",7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,
