In [2]:
import requests
from bs4 import BeautifulSoup
import json
from pymongo import MongoClient


In [52]:
!pip install pymongo

In [3]:
# Predefined string constants used by the scraping code below.
# They are sent to IMDb as URL parameters or used as JSON keys; do not change them.

# Keys for fetch_genres: pick one to get the genre list for that category
# (e.g. genre_movies yields the genres available for movies).
genre_movies = "movieGenreList"
genre_tv_shows = "tvGenreList"
genre_video_games = "videoGameGenreList"


# Title types: pick one to decide what the top-20 list contains
# (e.g. title_video_game yields the top 20 video games).

title_video_game = "video_game"
title_tv_series = "tv_series,tv_miniseries"
title_movies = "feature"

# Sort keys available when listing titles (movies / TV shows / video games).

sort_titles_by_popularity = "moviemeter"
sort_titles_by_a_to_z = "alpha"
sort_titles_by_user_ratings = "user_rating"
sort_titles_by_number_of_ratings = "num_votes"
sort_titles_by_us_box_office = "boxoffice_gross_us"
sort_title_by_runtime = "runtime"
sort_title_by_year = "year"
sort_title_by_release_date = "release_date"


# Sort keys available when listing reviews.

sort_reviews_by_date = "submissionDate"
sort_reviews_by_featured = "curated"
sort_reviews_by_prolific_reviewer = "reviewVolume"
sort_reviews_by_user_rating = "userRating"
sort_reviews_by_total_votes = "totalVotes"



# Review filters.
hide_spoiler = "/&spoiler=hide"  # not referenced by any code visible in this notebook
is_hide_spoiler = False  # reassigned to True in the review-settings cell further down
rating = -1  # -1 means "no rating filter" (see fetch_reviews_for_title)

# Sort directions.

ascending = "asc"
descending = "desc"

# Shared request settings.
base_url = "https://www.imdb.com"
headers = {'User-Agent': 'Mozilla/5.0'}





In [4]:
# Function to fetch genres with given choice
def fetch_genres(base_url, headers, choices, timeout=10):
    """Fetch the genre list IMDb exposes for one title category.

    Downloads ``{base_url}/feature/genre/`` and reads the JSON embedded in the
    page's ``__NEXT_DATA__`` script tag.

    Args:
        base_url: Site root, e.g. ``"https://www.imdb.com"``.
        headers: HTTP headers passed to ``requests.get``.
        choices: Key to read under ``props.pageProps`` (one of the ``genre_*``
            constants, e.g. ``"movieGenreList"``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list stored under ``choices`` (the sample output shows dicts with
        ``genreId`` and ``displayText``). Returns ``[]`` when the request
        fails, the script tag is missing or empty, the key is absent, or any
        other error occurs; failures are reported with ``print``.
    """
    try:
        url = f"{base_url}/feature/genre/"

        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve genres. Status code: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, 'html.parser')
        script_tag = soup.find('script', {'type': 'application/json', 'id': '__NEXT_DATA__'})

        # `.string` is None for an empty tag; treat that like a missing tag
        # instead of letting json.loads(None) raise.
        if script_tag is None or not script_tag.string:
            print("Script tag with required data not found.")
            return []

        data = json.loads(script_tag.string)
        # `or {}` / `or []` also cover keys that are present but null.
        page_props = (data.get("props") or {}).get("pageProps") or {}
        return page_props.get(choices) or []

    except Exception as e:
        # Best-effort scraper: report the problem and hand back an empty list.
        print(f"Error occurred while fetching genres: {e}")
        return []

In [5]:
# Pick the movie genre list and print it as a quick check.
# `genre_choice` is also read as a global by main() further down.
genre_choice = genre_movies
print(fetch_genres(base_url, headers, genre_choice))

[{'genreId': 'Action', 'displayText': 'Action'}, {'genreId': 'Adventure', 'displayText': 'Adventure'}, {'genreId': 'Animation', 'displayText': 'Animation'}, {'genreId': 'Biography', 'displayText': 'Biography'}, {'genreId': 'Comedy', 'displayText': 'Comedy'}, {'genreId': 'Crime', 'displayText': 'Crime'}, {'genreId': 'Documentary', 'displayText': 'Documentary'}, {'genreId': 'Drama', 'displayText': 'Drama'}, {'genreId': 'Family', 'displayText': 'Family'}, {'genreId': 'Fantasy', 'displayText': 'Fantasy'}, {'genreId': 'Film-Noir', 'displayText': 'Film-Noir'}, {'genreId': 'History', 'displayText': 'History'}, {'genreId': 'Horror', 'displayText': 'Horror'}, {'genreId': 'Music', 'displayText': 'Music'}, {'genreId': 'Musical', 'displayText': 'Musical'}, {'genreId': 'Mystery', 'displayText': 'Mystery'}, {'genreId': 'Romance', 'displayText': 'Romance'}, {'genreId': 'Sci-Fi', 'displayText': 'Sci-Fi'}, {'genreId': 'Short', 'displayText': 'Short'}, {'genreId': 'Sport', 'displayText': 'Sport'}, {'gen

In [6]:

def fetch_top_20_for_genre(base_url, headers, genre, title_choice, titles_sort_by, sorting, limit=20, timeout=10):
    """Fetch the first titles of an IMDb title search for one genre.

    Args:
        base_url: Site root, e.g. ``"https://www.imdb.com"``.
        headers: HTTP headers passed to ``requests.get``.
        genre: Genre name placed in the ``genres`` query parameter.
        title_choice: Value for ``title_type`` (one of the ``title_*`` constants).
        titles_sort_by: Sort key (one of the ``sort_title*`` constants).
        sorting: Sort direction, ``"asc"`` or ``"desc"``.
        limit: Maximum number of titles to collect (20 by default).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict mapping title ID to title name, in page order, holding at most
        ``limit`` entries. Returns ``{}`` when the request fails, the embedded
        data is missing, or any other error occurs; failures are reported with
        ``print``.
    """
    tt_ids_and_names = {}

    try:
        url = f"{base_url}/search/title/?title_type={title_choice}&genres={genre}&sort={titles_sort_by},{sorting}"
        print(f"Fetching URL: {url}")

        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            return {}

        soup = BeautifulSoup(response.content, 'html.parser')
        script_tag = soup.find('script', {'type': 'application/json', 'id': '__NEXT_DATA__'})

        # `.string` is None for an empty tag; treat that like a missing tag.
        if script_tag is None or not script_tag.string:
            print("Script tag with required data not found.")
            return {}

        # Walk props -> pageProps -> searchResults -> titleResults, tolerating
        # keys that are absent or null at any level.
        node = json.loads(script_tag.string)
        for key in ("props", "pageProps", "searchResults", "titleResults"):
            node = node.get(key) or {}
        title_items = node.get("titleListItems") or []

        for item in title_items:
            if len(tt_ids_and_names) >= limit:
                break  # enough titles collected; no need to scan the rest
            tt_id = item.get("titleId")
            if not tt_id:
                continue
            # A missing title name must not discard the whole result set.
            tt_ids_and_names[tt_id] = item.get("originalTitleText", "N/A")

        return tt_ids_and_names

    except Exception as e:
        # Best-effort scraper: report the problem and hand back an empty dict.
        print(f"Error occurred while fetching top movies for genre: {e}")
        return {}

# fetch_details_from_url and fetch_reviews_for_title are defined in later cells of this notebook.


In [7]:

# Parameters for the manual test of fetch_top_20_for_genre in the next cell.
# main() also reads title_choice, titles_sort_by and titles_asc_or_desc as globals.

temp_genre = "Action"  # genre to query
title_choice = title_movies  # list feature films ("feature")
titles_sort_by = sort_titles_by_popularity  # sort key "moviemeter"
titles_asc_or_desc = ascending  # sends "asc" (the earlier comment said "decreasing order", which did not match this value)

In [8]:

# Manual check: print the {title ID: title name} mapping for the test parameters.
print(
    fetch_top_20_for_genre(
        base_url,
        headers,
        temp_genre,
        title_choice,
        titles_sort_by,
        titles_asc_or_desc,
    )
)

Fetching URL: https://www.imdb.com/search/title/?title_type=feature&genres=Action&sort=moviemeter,asc


{'tt14998742': 'Rebel Moon - Part One: A Child of Fire', 'tt9663764': 'Aquaman and the Lost Kingdom', 'tt13751694': 'Animal', 'tt10545296': 'The Hunger Games: The Ballad of Songbirds & Snakes', 'tt0095016': 'Die Hard', 'tt13927994': 'Salaar: Cease Fire - Part 1', 'tt16431870': 'The Family Plan', 'tt23289160': 'Gojira -1.0', 'tt1462764': 'Indiana Jones and the Dial of Destiny', 'tt11858890': 'The Creator', 'tt6495056': 'Migration', 'tt12003946': 'Violent Night', 'tt15799866': 'Silent Night', 'tt1745960': 'Top Gun: Maverick', 'tt13287846': 'Napoleon', 'tt23137904': 'Rebel Moon: Part Two - The Scargiver', 'tt4495098': 'Gran Turismo', 'tt9362722': 'Spider-Man: Across the Spider-Verse', 'tt1392170': 'The Hunger Games', 'tt1136617': 'The Killer'}


In [9]:

def _dig(mapping, *keys, default=None):
    """Follow `keys` through nested dicts; return `default` if a level is missing, null, or not a dict."""
    node = mapping
    for key in keys:
        if not isinstance(node, dict):
            return default
        node = node.get(key)
    return default if node is None else node


def _blank_title_details(message=None):
    """Build a placeholder details dict with the same keys as a successful result, plus an optional 'Message'."""
    details = {
        'Title': 'N/A',
        'Release Year': 'N/A',
        'Director': 'N/A',
        'Cast': [],
        'Ratings': 'N/A',
        'Total Ratings': 'N/A',
        'Runtime': 'N/A',
        'Genres': [],
    }
    if message is not None:
        details['Message'] = message
    return details


def fetch_details_from_url(base_url, title_id, headers=None, timeout=10):
    """Fetch the details of one title from its IMDb page.

    Reads the JSON embedded in the page's ``__NEXT_DATA__`` script tag.

    Args:
        base_url: Site root, e.g. ``"https://www.imdb.com"``.
        title_id: IMDb title ID, e.g. ``"tt14998742"``.
        headers: HTTP headers for the request; defaults to a
            ``Mozilla/5.0`` User-Agent.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict that always has the keys 'Title', 'Release Year', 'Director',
        'Cast', 'Ratings', 'Total Ratings', 'Runtime' and 'Genres'. Values that
        cannot be found are 'N/A' (or an empty list). When the request fails or
        an error occurs, the dict holds placeholders and an extra 'Message' key
        describing the failure.
    """
    url = f"{base_url}/title/{title_id}/"
    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            return _blank_title_details(f"Failed to retrieve the page. Status code: {response.status_code}")

        soup = BeautifulSoup(response.content, 'html.parser')
        script_tag = soup.find('script', {'type': 'application/json', 'id': '__NEXT_DATA__'})

        # `.string` is None for an empty tag; treat that like a missing tag.
        if script_tag is None or not script_tag.string:
            return _blank_title_details()

        page_props = _dig(json.loads(script_tag.string), 'props', 'pageProps', default={})
        above_fold_data = _dig(page_props, 'aboveTheFoldData', default={})
        main_column_data = _dig(page_props, 'mainColumnData', default={})

        # Basic information
        title = _dig(above_fold_data, 'titleText', 'text', default='N/A')
        year = str(_dig(above_fold_data, 'releaseYear', 'year', default='N/A'))[:4]

        # Director: first credit of the first principal-credit group.
        # NOTE(review): assumes that group is the director group — confirm against the page data.
        credit_groups = _dig(above_fold_data, 'principalCredits', default=[])
        first_group = credit_groups[0] if credit_groups else {}
        group_credits = _dig(first_group, 'credits', default=[])
        first_credit = group_credits[0] if group_credits else {}
        director = _dig(first_credit, 'name', 'nameText', 'text', default='N/A')

        # Cast: keep only entries that actually carry a name
        top_cast = []
        for edge in _dig(main_column_data, 'cast', 'edges', default=[]):
            cast_name = _dig(edge, 'node', 'name', 'nameText', 'text', default='')
            if cast_name:
                top_cast.append(cast_name)

        # Runtime arrives in seconds; format it as "<hours>h <minutes>m"
        runtime_seconds = _dig(above_fold_data, 'runtime', 'seconds', default=0)
        hours, remainder = divmod(runtime_seconds, 3600)
        minutes, _ = divmod(remainder, 60)
        runtime_formatted = f"{hours}h {minutes}m" if runtime_seconds else 'N/A'

        # Ratings (a null value becomes 'N/A' rather than the string 'None')
        ratings = str(_dig(above_fold_data, 'ratingsSummary', 'aggregateRating', default='N/A'))
        total_ratings = str(_dig(above_fold_data, 'ratingsSummary', 'voteCount', default='N/A'))

        return {
            'Title': title,
            'Release Year': year,
            'Director': director,
            'Cast': top_cast,
            'Ratings': ratings,
            'Total Ratings': total_ratings,
            'Runtime': runtime_formatted,
            'Genres': []
        }

    except Exception as e:
        # Best-effort scraper: report the problem and return placeholders so
        # callers that read 'Title', 'Runtime', ... keep working.
        print(f"Error occurred while fetching movie details: {e}")
        return _blank_title_details(f"Error occurred: {e}")


In [10]:

# Example usage
print(fetch_details_from_url(base_url, "tt14998742"))

{'Title': 'Rebel Moon: Part One - A Child of Fire', 'Release Year': '2023', 'Director': 'Zack Snyder', 'Cast': ['Sofia Boutella', 'Djimon Hounsou', 'Ed Skrein', 'Michiel Huisman', 'Bae Doona', 'Ray Fisher', 'Charlie Hunnam', 'Anthony Hopkins', 'Staz Nair', 'Fra Fee', 'Cleopatra Coleman', 'Stuart Martin', 'Ingvar Sigurdsson', 'Alfonso Herrera', 'Cary Elwes', 'Rhian Rees', 'E. Duffy', 'Jena Malone'], 'Ratings': '5.7', 'Total Ratings': '70944', 'Runtime': '2h 13m', 'Genres': []}


In [11]:
# Function to fetch reviews for a given title ID
def fetch_reviews_for_title(title_id,sort,dir,reviews_rating,is_hide_spoiler): # reviews_rating == -1 means "no rating filter"
    """Scrape up to 10 user reviews for one title from m.imdb.com.

    NOTE(review): the next cell defines a function with this same name. When
    the cells run in order, that later definition replaces this one, so this
    version is effectively unused. Consider deleting one of the two.

    Args:
        title_id: IMDb title ID, e.g. "tt14998742".
        sort: Value for the `sort` query parameter.
        dir: Value for the `dir` query parameter (shadows the `dir` builtin
            inside this function).
        reviews_rating: Rating to filter by; -1 leaves `ratingFilter` off the URL.
        is_hide_spoiler: When truthy, `&spoiler=hide` is appended to the URL.

    Returns:
        A list of dicts with the keys 'Title ID', 'Reviewer Name',
        'Review Date', 'Rating' and 'Review'. The list is empty when the
        response status is not 200.
    """
    print(f"Fetching reviews for title ID: {title_id}")  # Tracking function calls
    reviews_data = []

    # Build the reviews URL; the optional filters are appended only when requested
    url = "https://m.imdb.com" + "/title/" + title_id + "/reviews?sort=" + sort + "&dir=" + dir
    if(reviews_rating != -1):
        url = url + "&ratingFilter="  + str(reviews_rating)
    if(is_hide_spoiler):
        url = url + "&spoiler=hide"
    print(url)
    # No headers or timeout are passed, and this call is outside any try block,
    # so a network error propagates to the caller.
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        review_elements = soup.find_all('div', class_='imdb-user-review')[:10]  # Limit to 10 reviews
        for review_element in review_elements:
            try:
                # A missing name or date element makes `.text` raise; the except
                # below then skips this review.
                name = review_element.find('a', class_='display-name-link').text.strip()
                date = review_element.find('span', class_='review-date').text.strip()
                rating = review_element.find('span', class_='rating-other-user-rating')
                rating = rating.span.text.strip() if rating else "No rating"
                review_text_div = review_element.find('div', class_='text')
                if review_text_div:
                    # Turn <br> markup into newlines, then re-parse to strip the remaining tags
                    review_html = str(review_text_div).replace('<br/><br/>', '\n').replace('<br>', '\n')
                    review_soup = BeautifulSoup(review_html, 'html.parser')
                    review_text = review_soup.get_text(separator="\n").strip()
                else:
                    review_text = "Review text not found."

                reviews_data.append({
                    'Title ID': title_id,
                    'Reviewer Name': name,
                    'Review Date': date,
                    'Rating': rating,
                    'Review': review_text
                })
                
            except Exception as e:
                print(f"Failed to process a review: {e}")
    else:
        print(f"Failed to retrieve reviews for title ID {title_id}. Status code: {response.status_code}")

    return reviews_data

In [12]:
def fetch_reviews_for_title(title_id, sort, dir, reviews_rating, is_hide_spoiler, headers=None, max_reviews=10, timeout=10):
    """Scrape user reviews for one title from m.imdb.com.

    Args:
        title_id: IMDb title ID, e.g. ``"tt14998742"``.
        sort: Value for the ``sort`` query parameter (one of the
            ``sort_reviews_*`` constants).
        dir: Value for the ``dir`` query parameter, ``"asc"`` or ``"desc"``.
        reviews_rating: Rating to filter by; ``-1`` leaves ``ratingFilter`` off
            the URL.
        is_hide_spoiler: When truthy, ``&spoiler=hide`` is appended to the URL.
        headers: Optional HTTP headers for the request; ``None`` keeps the
            requests defaults, as before.
        max_reviews: Maximum number of reviews to return (10 by default).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys 'Title ID', 'Reviewer Name',
        'Review Date', 'Rating' and 'Review'. Reviews that cannot be parsed are
        skipped. The list is empty when the request fails.
    """
    print(f"Fetching reviews for title ID: {title_id}")  # Tracking function calls
    reviews_data = []

    # Build the reviews URL; the optional filters are appended only when requested
    url = f"https://m.imdb.com/title/{title_id}/reviews?sort={sort}&dir={dir}"
    if reviews_rating != -1:
        url += f"&ratingFilter={reviews_rating}"
    if is_hide_spoiler:
        url += "&spoiler=hide"

    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve reviews for title ID {title_id}. Status code: {response.status_code}")
            return reviews_data

        soup = BeautifulSoup(response.content, 'html.parser')
        review_elements = soup.find_all('div', class_='imdb-user-review')[:max_reviews]

        for review_element in review_elements:
            try:
                # A missing name or date element makes `.text` raise; the
                # except below then skips only this review.
                name = review_element.find('a', class_='display-name-link').text.strip()
                date = review_element.find('span', class_='review-date').text.strip()
                rating_element = review_element.find('span', class_='rating-other-user-rating')
                rating = rating_element.span.text.strip() if rating_element else "No rating"

                # Look the text container up once and reuse it
                text_element = review_element.find('div', class_='text')
                if text_element:
                    review_text = text_element.get_text(separator="\n").strip()
                else:
                    review_text = "Review text not found."

                reviews_data.append({
                    'Title ID': title_id,
                    'Reviewer Name': name,
                    'Review Date': date,
                    'Rating': rating,
                    'Review': review_text
                })

            except Exception as e:
                print(f"Failed to process a review: {e}")

    except Exception as e:
        # Best-effort scraper: report the problem and return what was collected.
        print(f"Error occurred while fetching reviews: {e}")

    return reviews_data

In [13]:
# Review-fetch settings for the manual test in the next cell.
# fetch_reviews() also reads is_hide_spoiler, review_choice and review_sorting as globals.
is_hide_spoiler = True  # overrides the False default from the constants cell
review_choice = sort_reviews_by_date
review_sorting = descending
review_rating = -1  # -1 leaves the rating filter off (see fetch_reviews_for_title)

In [14]:
# Manual check: print the reviews fetched for one title with the settings above.
print(
    fetch_reviews_for_title(
        "tt14998742",
        review_choice,
        review_sorting,
        review_rating,
        is_hide_spoiler,
    )
)

Fetching reviews for title ID: tt14998742


[{'Title ID': 'tt14998742', 'Reviewer Name': 'simitchellart', 'Review Date': '3 January 2024', 'Rating': '8', 'Review': "What you have here is a very strong very good start of a sci fi story, I see many reviews bashing it - but most of them are hitting at the director - because for some reason they don't like him.......it's not the movie - it's their distain for the director - I guess (apart from mine) if the review focuses on the director - ignore it!\nThe reviews who focus on the story or the movie as a whole are very positive!!!!!! - so my advice - if you can understand this is the first half of a story and you don't wanna........for some reason hit at the director\nWatch it - you will enjoy it\nIt's like dune mixed with seven samurai a sprinkling of star wars and king arther - if you like that stuff......\nI watched it and loved it - I can't wait for the second half!\nRemember if the review is bashing the director - however subtle - it's just their hate for him - not the movie!"}, 

In [51]:
# In-memory stores filled by fetch_all_details() and fetch_reviews().
# Re-running this cell resets all of them to empty.
global_tt_ids = set()  # title IDs already seen
global_movie_details = {}  # title ID -> details dict whose "Genres" list is merged across genres
global_reviews = {}  # title ID -> list of review dicts
accumulated_movie_details = []  # one entry per fetch_all_details call, so a title repeats once per genre

In [29]:

def fetch_all_details(tt_id, genre_display_text):
    """Record the details of one title under one genre.

    Updates three module-level stores:

    * ``global_tt_ids`` / ``global_movie_details`` hold one entry per title,
      whose "Genres" list collects every genre the title was seen under.
    * ``accumulated_movie_details`` gets one new entry per call, tagged with
      only the current genre and with the title ID under "TitleID".

    Details are downloaded only when the title is new or when the cached entry
    is a failed fetch (it carries a 'Message' key); otherwise the cached
    details are reused.

    Args:
        tt_id: IMDb title ID.
        genre_display_text: Name of the genre the title was listed under.
    """
    cached = global_movie_details.get(tt_id)

    if cached is None or "Message" in cached:
        # New title, or the earlier attempt failed: download (again) and keep
        # any genres already recorded for it.
        movie_details = fetch_details_from_url(base_url, tt_id)
        movie_details["Genres"] = list(cached["Genres"]) if cached else []
        global_tt_ids.add(tt_id)
        global_movie_details[tt_id] = movie_details
    else:
        # Already fetched under another genre: no second network round trip.
        movie_details = cached

    if genre_display_text not in movie_details["Genres"]:
        movie_details["Genres"].append(genre_display_text)

    # Per-call snapshot: a shallow copy tagged with just the current genre and
    # with the title ID, which insert_accumulated_movies_into_mongo reads.
    movie_copy_for_accumulation = movie_details.copy()
    movie_copy_for_accumulation["Genres"] = [genre_display_text]
    movie_copy_for_accumulation["TitleID"] = tt_id
    accumulated_movie_details.append(movie_copy_for_accumulation)

    print(f"Total movies fetched: {len(global_movie_details)}")


def fetch_reviews(tt_id, genre_display_text, movie_name):
    """Record the reviews of one title under one genre in ``global_reviews``.

    Reviews are downloaded once per title. Every stored review is tagged with
    the movie name and with a "Genres" list that collects each genre the title
    was seen under.

    Args:
        tt_id: IMDb title ID.
        genre_display_text: Name of the genre the title was listed under.
        movie_name: Title name stored on each review as "MovieName".
    """
    if tt_id not in global_reviews:
        # Use `review_rating`, the setting defined together with review_choice
        # and review_sorting, so that changing the review settings takes effect
        # here too (previously the separate default `rating` was passed).
        reviews = fetch_reviews_for_title(tt_id, review_choice, review_sorting, review_rating, is_hide_spoiler)
        for review in reviews:
            review["Genres"] = [genre_display_text]
            review["MovieName"] = movie_name
        global_reviews[tt_id] = reviews
        return

    # Title already fetched: only add the genre where it is not recorded yet.
    for review in global_reviews[tt_id]:
        if genre_display_text not in review["Genres"]:
            review["Genres"].append(genre_display_text)

def main():
    """Scrape details and reviews for the top titles of every genre.

    Reads the module-level settings genre_choice, title_choice, titles_sort_by
    and titles_asc_or_desc. For each genre returned by fetch_genres it fetches
    the top titles and hands each one to fetch_all_details and fetch_reviews.
    """
    for genre_entry in fetch_genres(base_url, headers, genre_choice):
        genre_name = genre_entry['displayText']
        top_titles = fetch_top_20_for_genre(
            base_url, headers, genre_name, title_choice, titles_sort_by, titles_asc_or_desc
        )

        for title_id, title_name in top_titles.items():
            fetch_all_details(title_id, genre_name)
            fetch_reviews(title_id, genre_name, title_name)


In [53]:
# Run the full scrape; this fills global_movie_details, global_reviews and accumulated_movie_details.
main()

In [31]:
# Sanity check before storing anything in the database, one count per line:
# unique titles, titles with a reviews entry, per-genre entries.
print(
    len(global_movie_details),
    len(global_reviews),
    len(accumulated_movie_details),
    sep="\n",
)

264
245
440


In [40]:

# Connect to the MongoDB server on localhost:27017
client = MongoClient('localhost', 27017)

# Use the database named 'newest'
db = client['newest']

# Collection handles: genres, unique movies, per-genre movie entries, and reviews
genres_collection = db['genres']
movies_collection = db['movies']
acc_movies_collection = db['acc_movies']
reviews_collection = db['reviews']


In [41]:
#part - 1 of PROBLEM A
#this one is mandatory to run, here we will find all genre, which is useful in any case

def insert_genres_into_mongo(genres=None):
    """Replace the contents of the 'genres' collection with the given genres.

    Args:
        genres: Iterable of dicts with a 'displayText' key. Defaults to the
            module-level ``genres_list``.

    Each genre is stored as ``{"genreName": <displayText>}``. The documents are
    built before the collection is dropped, so a malformed entry raises before
    any existing data is removed.
    """
    if genres is None:
        genres = genres_list

    genre_docs = [{"genreName": genre['displayText']} for genre in genres]

    genres_collection.drop()
    # insert_many rejects an empty list, so skip the call when there is nothing to store.
    if genre_docs:
        genres_collection.insert_many(genre_docs)

# Fetch the movie genres and store them; insert_genres_into_mongo() reads the global genres_list.
genres_list = fetch_genres(base_url,headers,genre_movies)
insert_genres_into_mongo()

In [42]:
#this will insert compact movies list in database

def insert_movies_into_mongo():
    """Replace the contents of the 'movies' collection with the unique titles.

    Writes one document per entry of ``global_movie_details``. The genres of
    each title are stored as the ``_id`` values of the matching documents in
    the 'genres' collection.
    """
    movies_collection.drop()

    for tt_id, movie in global_movie_details.items():
        # Resolve genre names to the ObjectIds of the stored genre documents
        genre_names = movie.get("Genres") or []
        genre_docs = genres_collection.find({"genreName": {"$in": genre_names}})
        genre_ids = [genre["_id"] for genre in genre_docs]

        movie_doc = {
            "titleID": tt_id,
            # .get(): an entry from a failed fetch may carry only a 'Message'
            # key; store None for the missing fields instead of raising KeyError.
            "title": movie.get("Title"),
            "releaseYear": movie.get("Release Year"),
            "director": movie.get("Director"),
            "cast": movie.get("Cast"),
            "ratings": movie.get("Ratings"),
            "totalRatings": movie.get("Total Ratings"),
            "runtime": movie.get("Runtime"),
            "genres": genre_ids  # Referencing genres by ObjectID
        }
        movies_collection.insert_one(movie_doc)
# Write the unique-title documents to the 'movies' collection now.
insert_movies_into_mongo()

In [49]:
#part - 2 of problem A
#caution !! - only apply this, if you want to store genre wise top 20 movies, in order 
#for 23 genre - there will be 20 movies for each, so this will store 460 movies

def insert_accumulated_movies_into_mongo():
    """Replace the contents of the 'acc_movies' collection with the per-genre entries.

    Writes one document per entry of ``accumulated_movie_details``, in list
    order. Each document references a single genre: the ``_id`` of the first
    document in the 'genres' collection whose name is in the entry's "Genres"
    list, or ``None`` when no stored genre matches.
    """
    # Drop the existing 'acc_movies' collection to start fresh
    acc_movies_collection.drop()

    for movie in accumulated_movie_details:
        # Look up the first matching genre document; find_one returns None
        # when nothing matches, which avoids indexing into an empty list.
        genre_names = movie.get("Genres") or []
        genre_doc = genres_collection.find_one({"genreName": {"$in": genre_names}})
        genre_id = genre_doc["_id"] if genre_doc else None

        movie_doc = {
            # Read with .get(), so this is None unless the accumulated entry
            # carries a "TitleID" key.
            "titleID": movie.get("TitleID"),
            "title": movie.get("Title"),
            "releaseYear": movie.get("Release Year"),
            "director": movie.get("Director"),
            "cast": movie.get("Cast"),
            "ratings": movie.get("Ratings"),
            "totalRatings": movie.get("Total Ratings"),
            "runtime": movie.get("Runtime"),
            "genres": genre_id  # a single genre ObjectId, not a list
        }

        # Insert the movie document into the 'acc_movies' collection
        acc_movies_collection.insert_one(movie_doc)

# Write the per-genre entries to the 'acc_movies' collection now (this drops and refills it).
insert_accumulated_movies_into_mongo()


In [50]:
#insert reviews in database
#part - 3 of problem A
def insert_reviews_into_mongo():
    """Replace the contents of the 'reviews' collection with the scraped reviews.

    Writes one document per review in ``global_reviews``. The genres of each
    review are stored as the ``_id`` values of the matching documents in the
    'genres' collection.
    """
    # (document field, scraped review key) pairs that are copied as they are
    copied_fields = (
        ("movieName", "MovieName"),
        ("reviewerName", "Reviewer Name"),
        ("reviewDate", "Review Date"),
        ("rating", "Rating"),
        ("content", "Review"),
    )

    db.reviews.drop()

    for title_id, title_reviews in global_reviews.items():
        for entry in title_reviews:
            matching_genres = genres_collection.find({"genreName": {"$in": entry.get("Genres")}})

            document = {"titleID": title_id}
            document.update((field, entry.get(key)) for field, key in copied_fields)
            document["genres"] = [genre_doc["_id"] for genre_doc in matching_genres]

            reviews_collection.insert_one(document)

# Write the review documents to the 'reviews' collection now (this drops and refills it).
insert_reviews_into_mongo()



In [56]:

# Close the MongoDB connection once all inserts above have run
client.close()

In [37]:

# # Connect to MongoDB
# client = MongoClient('localhost', 27017)
# db = client['noway']

# # Aggregation pipeline to match movies with more than 2 genres
# pipeline = [
#     {
#         "$match": {
#             # Match documents where the size of the 'genres' array is greater than 2
#             "genres": {"$exists": True, "$not": {"$size": 0}},  # Ensure genres field exists and is not empty
#             "$expr": {"$gt": [{"$size": "$genres"}, 2]}  # '$size' gets the size of the 'genres' array
#         }
#     }
# ]

# # Execute the aggregation pipeline
# movies_with_more_than_two_genres = list(db.reviews.aggregate(pipeline))

# # Print the results
# # for movie in movies_with_more_than_two_genres:
# print(len(movies_with_more_than_two_genres))

# # Close the MongoDB connection
# client.close()
