In [1]:
import pandas as pd

# Read the two input tables from disk
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

# Per-movie statistics: how many ratings each movieId received and their mean
grouped_ratings = (
    ratings_df
    .groupby('movieId')['rating']
    .agg(rating_count='count', rating_mean='mean')
    .reset_index()
)

# Inner join keeps only movies that appear in both tables
merged_df = movies_df.merge(grouped_ratings, how='inner', on='movieId')

# Keep movies with strictly more than 50 user ratings
filtered_df = merged_df.loc[merged_df['rating_count'].gt(50)]
print(filtered_df)


      movieId                             title  \
0           1                  Toy Story (1995)   
1           2                    Jumanji (1995)   
2           3           Grumpier Old Men (1995)   
5           6                       Heat (1995)   
6           7                    Sabrina (1995)   
...       ...                               ...   
8287   106782   Wolf of Wall Street, The (2013)   
8354   109374  Grand Budapest Hotel, The (2014)   
8358   109487               Interstellar (2014)   
8457   112852    Guardians of the Galaxy (2014)   
8673   122904                   Deadpool (2016)   

                                           genres  rating_count  rating_mean  
0     Adventure|Animation|Children|Comedy|Fantasy           215     3.920930  
1                      Adventure|Children|Fantasy           110     3.431818  
2                                  Comedy|Romance            52     3.259615  
5                           Action|Crime|Thriller           102     3.9

In [3]:
# Print a structural summary of the filtered frame (index, columns, non-null counts, dtypes)
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 436 entries, 0 to 8673
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movieId       436 non-null    int64  
 1   title         436 non-null    object 
 2   genres        436 non-null    object 
 3   rating_count  436 non-null    int64  
 4   rating_mean   436 non-null    float64
dtypes: float64(1), int64(2), object(2)
memory usage: 20.4+ KB


In [4]:
# Order by average rating, highest first, and keep only the leading row
most_popular_movie = (
    filtered_df
    .sort_values('rating_mean', ascending=False)
    .iloc[:1]
)

# Report the winner with its identifying and rating columns
print("Most Popular Movie based on Average User Ratings:")
print(most_popular_movie.loc[:, ['movieId', 'title', 'rating_mean', 'rating_count']])

Most Popular Movie based on Average User Ratings:
     movieId                             title  rating_mean  rating_count
277      318  Shawshank Redemption, The (1994)     4.429022           317


In [5]:
# Order by number of ratings, largest first, and keep the first five rows
top_5_popular_movies = (
    filtered_df
    .sort_values('rating_count', ascending=False)
    .iloc[:5]
)

# Report the five most-rated movies
print("Top 5 Popular Movies based on Number of User Ratings:")
print(top_5_popular_movies.loc[:, ['movieId', 'title', 'rating_mean', 'rating_count']])

Top 5 Popular Movies based on Number of User Ratings:
      movieId                             title  rating_mean  rating_count
314       356               Forrest Gump (1994)     4.164134           329
277       318  Shawshank Redemption, The (1994)     4.429022           317
257       296               Pulp Fiction (1994)     4.197068           307
510       593  Silence of the Lambs, The (1991)     4.161290           279
1938     2571                Matrix, The (1999)     4.192446           278


In [6]:
# Keep rows whose genres string mentions Sci-Fi (case-insensitive; missing genres count as no match)
sci_fi_movies = filtered_df.loc[
    lambda frame: frame['genres'].str.contains('Sci-Fi', case=False, na=False)
]

# Most-rated Sci-Fi movies first
sci_fi_sorted = sci_fi_movies.sort_values('rating_count', ascending=False)

# Position 2 (zero-based) is the third row of the ranking
third_most_popular_sci_fi_movie = sci_fi_sorted.iloc[2]

# Report the selected movie as a labelled Series
print("Third Most Popular Sci-Fi Movie based on Number of User Ratings:")
print(third_most_popular_sci_fi_movie.loc[['movieId', 'title', 'rating_mean', 'rating_count']])

Third Most Popular Sci-Fi Movie based on Number of User Ratings:
movieId                          480
title           Jurassic Park (1993)
rating_mean                     3.75
rating_count                     238
Name: 418, dtype: object


In [33]:
# Persist the filtered movies to disk, leaving the index out of the file
filtered_df.to_csv(path_or_buf='top_50_movies.csv', index=False)


In [43]:
# Read the links table and attach its columns to the filtered movies via movieId
links_df = pd.read_csv('links.csv')
movie_with_links = filtered_df.merge(links_df, how='inner', on='movieId')

In [42]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time

def scrapper(imdbId, retries=3, delay=5):
    """Fetch the rating text shown on a title's IMDb page.

    Parameters
    ----------
    imdbId : int or float
        Numeric IMDb identifier without the ``tt`` prefix. It is converted
        with ``int()`` and left-padded with zeros to at least seven digits.
    retries : int, default 3
        Maximum number of HTTP attempts before giving up.
    delay : int or float, default 5
        Seconds to wait between failed attempts.

    Returns
    -------
    str or float
        Text of the rating element when it is found; ``np.nan`` when the
        element is absent from the page or every attempt failed.
    """
    # zfill pads to seven digits and leaves longer ids untouched
    padded_id = str(int(imdbId)).zfill(7)
    URL = f"https://www.imdb.com/title/tt{padded_id}/"

    print(f"Scraping URL: {URL}")  # Debugging output

    # Accept-Encoding is deliberately not overridden: requests advertises only
    # the encodings it can decode, whereas forcing 'br' can yield an unreadable
    # body when no Brotli decoder is installed.
    request_header = {
        'Content-Type': 'text/html; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
    }

    for attempt in range(retries):
        try:
            response = requests.get(URL, headers=request_header, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            # RequestException also covers the HTTPError raised by
            # raise_for_status(), so one bad status code no longer aborts
            # the whole scraping run.
            print(f"Error on attempt {attempt + 1}: {e}")
            # Waiting is only useful when another attempt will follow
            if attempt + 1 < retries:
                time.sleep(delay)
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        # NOTE(review): this class name looks auto-generated and may change when
        # the site is redeployed; confirm it still matches the rating element.
        imdb_rating = soup.find('span', attrs={'class': 'sc-eb51e184-1 ljxVSS'})

        if imdb_rating:
            return imdb_rating.text

        # The page loaded but has no matching element; retrying would not help
        print(f"Could not find rating for {URL}")
        return np.nan

    print(f"Failed to retrieve {URL} after {retries} attempts")
    return np.nan

# Rebuild the scraper input from disk: the saved filtered movies joined to
# the links table on movieId
filtered_df = pd.read_csv('top_50_movies.csv')
links_df = pd.read_csv('links.csv')
movie_with_links = filtered_df.merge(links_df, how='inner', on='movieId')

# Scrape one rating per row, keyed by the row's imdbId
movie_with_links['imdb_rating'] = movie_with_links['imdbId'].apply(scrapper)

# Write the enriched table to disk without the index column
movie_with_links.to_csv(path_or_buf='movies_with_imdb_ratings.csv', index=False)

print("Data saved to 'movies_with_imdb_ratings.csv'")


Scraping URL: https://www.imdb.com/title/tt0114709/
Scraping URL: https://www.imdb.com/title/tt0113497/
Scraping URL: https://www.imdb.com/title/tt0113228/
Scraping URL: https://www.imdb.com/title/tt0113277/
Scraping URL: https://www.imdb.com/title/tt0114319/
Scraping URL: https://www.imdb.com/title/tt0113189/
Scraping URL: https://www.imdb.com/title/tt0112346/
Scraping URL: https://www.imdb.com/title/tt0112641/
Scraping URL: https://www.imdb.com/title/tt0114388/
Scraping URL: https://www.imdb.com/title/tt0112281/
Scraping URL: https://www.imdb.com/title/tt0113161/
Scraping URL: https://www.imdb.com/title/tt0113627/
Scraping URL: https://www.imdb.com/title/tt0114746/
Scraping URL: https://www.imdb.com/title/tt0112431/
Scraping URL: https://www.imdb.com/title/tt0112818/
Scraping URL: https://www.imdb.com/title/tt0112697/
Scraping URL: https://www.imdb.com/title/tt0114369/
Scraping URL: https://www.imdb.com/title/tt0114148/
Scraping URL: https://www.imdb.com/title/tt0114814/
Scraping URL

Scraping URL: https://www.imdb.com/title/tt0086879/
Scraping URL: https://www.imdb.com/title/tt0075686/
Scraping URL: https://www.imdb.com/title/tt0070735/
Scraping URL: https://www.imdb.com/title/tt0088247/
Scraping URL: https://www.imdb.com/title/tt0097165/
Scraping URL: https://www.imdb.com/title/tt0061722/
Scraping URL: https://www.imdb.com/title/tt0071315/
Scraping URL: https://www.imdb.com/title/tt0081505/
Scraping URL: https://www.imdb.com/title/tt0092005/
Scraping URL: https://www.imdb.com/title/tt0107048/
Scraping URL: https://www.imdb.com/title/tt0088763/
Scraping URL: https://www.imdb.com/title/tt0061512/
Scraping URL: https://www.imdb.com/title/tt0072431/
Scraping URL: https://www.imdb.com/title/tt0032455/
Scraping URL: https://www.imdb.com/title/tt0088258/
Scraping URL: https://www.imdb.com/title/tt0097576/
Scraping URL: https://www.imdb.com/title/tt0097351/
Scraping URL: https://www.imdb.com/title/tt0064115/
Scraping URL: https://www.imdb.com/title/tt0098635/
Scraping URL

Scraping URL: https://www.imdb.com/title/tt0086250/
Scraping URL: https://www.imdb.com/title/tt0126029/
Scraping URL: https://www.imdb.com/title/tt0203009/
Scraping URL: https://www.imdb.com/title/tt0212720/
Scraping URL: https://www.imdb.com/title/tt0250494/
Scraping URL: https://www.imdb.com/title/tt0230600/
Scraping URL: https://www.imdb.com/title/tt0139654/
Scraping URL: https://www.imdb.com/title/tt0196229/
Scraping URL: https://www.imdb.com/title/tt0166924/
Scraping URL: https://www.imdb.com/title/tt0246578/
Scraping URL: https://www.imdb.com/title/tt0198781/
Scraping URL: https://www.imdb.com/title/tt0241527/
Scraping URL: https://www.imdb.com/title/tt0240772/
Scraping URL: https://www.imdb.com/title/tt0211915/
Scraping URL: https://www.imdb.com/title/tt0265666/
Scraping URL: https://www.imdb.com/title/tt0120737/
Scraping URL: https://www.imdb.com/title/tt0268978/
Scraping URL: https://www.imdb.com/title/tt0268380/
Scraping URL: https://www.imdb.com/title/tt0259446/
Scraping URL