In [None]:
import os
import zipfile

import pandas as pd

# Extract every member of movie_data.zip into the current working directory.
with zipfile.ZipFile('movie_data.zip', mode='r') as zip_ref:
    zip_ref.extractall()

# Show what the working directory now contains so the extracted files can be checked.
extracted_entries = os.listdir()
print(extracted_entries)

In [None]:
# Read movies.csv into a DataFrame and report its (rows, columns) shape.
movies_df = pd.read_csv('movies.csv')
movies_shape = movies_df.shape
print('Shape of movies.csv:', movies_shape)

In [None]:
# Read ratings.csv into a DataFrame and report its (rows, columns) shape.
ratings_df = pd.read_csv('ratings.csv')
ratings_shape = ratings_df.shape
print('Shape of ratings.csv:', ratings_shape)

In [None]:
# Count distinct raters: each userId contributes once, however many rows it appears in.
user_id_column = ratings_df['userId']
unique_user_ids = user_id_column.nunique()
print('Number of unique user IDs:', unique_user_ids)

In [None]:
# Tally how many rating rows each movieId has.
movie_rating_counts = ratings_df['movieId'].value_counts()

# The index label holding the largest tally is the most-rated movieId;
# the tally itself is the number of ratings that movie received.
max_ratings_count = movie_rating_counts.max()
max_ratings_movie_id = movie_rating_counts.idxmax()

# Look the title up in movies_df; .iloc[0] takes the first matching row.
is_most_rated = movies_df['movieId'].eq(max_ratings_movie_id)
movie_title = movies_df.loc[is_most_rated, 'title'].iloc[0]

# Report the result.
print('Movie with the maximum number of user ratings:')
print('Movie ID:', max_ratings_movie_id)
print('Title:', movie_title)
print('Number of ratings:', max_ratings_count)

In [None]:
# Load tags.csv, then list the distinct tags attached to the movie whose title
# contains 'Matrix, The (1999)'.
tags_df = pd.read_csv('tags.csv')

# Match the title as a plain substring (regex=False) so the parentheses need no
# escaping; the previous pattern relied on '\(' inside a non-raw string literal,
# which Python reports as an invalid escape sequence. na=False treats missing
# titles as non-matching instead of letting NaN leak into the boolean mask.
matrix_title_mask = movies_df['title'].str.contains('Matrix, The (1999)', regex=False, na=False)

# .iloc[0] takes the first matching row and raises IndexError when nothing matches.
matrix_movie_id = movies_df[matrix_title_mask]['movieId'].iloc[0]

# Distinct tag values recorded for that movieId.
matrix_tags = tags_df[tags_df['movieId'] == matrix_movie_id]['tag'].unique()
print(matrix_tags)

In [None]:
# Resolve the movieId of 'Terminator 2: Judgment Day (1991)' by exact title match;
# .iloc[0] takes the first matching row.
terminator_title_mask = movies_df['title'].eq('Terminator 2: Judgment Day (1991)')
terminator_movie_id = movies_df.loc[terminator_title_mask, 'movieId'].iloc[0]

# Average every rating recorded for that movieId.
terminator_ratings = ratings_df.loc[ratings_df['movieId'].eq(terminator_movie_id), 'rating']
terminator_avg_rating = terminator_ratings.mean()
print('Average user rating for Terminator 2: Judgment Day (1991):', terminator_avg_rating)

In [None]:
import matplotlib.pyplot as plt

# Resolve the movieId of 'Fight Club (1999)' by exact title match;
# .iloc[0] takes the first matching row.
fight_club_title_mask = movies_df['title'].eq('Fight Club (1999)')
fight_club_movie_id = movies_df.loc[fight_club_title_mask, 'movieId'].iloc[0]

# Collect every rating recorded for that movieId.
fight_club_ratings = ratings_df.loc[ratings_df['movieId'].eq(fight_club_movie_id), 'rating']

# Draw the rating histogram on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(fight_club_ratings, bins=20, edgecolor='black')
ax.set_title('Rating Distribution for Fight Club (1999)')
ax.set_xlabel('Rating')
ax.set_ylabel('Number of Ratings')
# Horizontal grid lines only, slightly transparent, to make bar heights easier to read.
ax.grid(axis='y', alpha=0.75)
plt.show()

In [None]:
# Per-movie rating statistics: number of ratings ('count') and average rating ('mean'),
# indexed by movieId.
ratings_by_movie = ratings_df.groupby('movieId')['rating']
grouped_ratings = ratings_by_movie.agg(['count', 'mean'])

# Inner-join the movie metadata with the per-movie statistics, matching the
# movieId column of movies_df against the movieId index of grouped_ratings.
movies_with_ratings = pd.merge(
    movies_df,
    grouped_ratings,
    how='inner',
    left_on='movieId',
    right_index=True,
)

# Keep only movies that received strictly more than 50 ratings.
has_enough_ratings = movies_with_ratings['count'].gt(50)
movies_with_more_than_50_ratings = movies_with_ratings[has_enough_ratings]

# Preview the first rows of the filtered frame.
print(movies_with_more_than_50_ratings.head())

In [None]:
# Among the movies with more than 50 ratings, pick the row whose average rating
# ('mean') is highest: idxmax() gives its index label, .loc fetches the full row.
highest_mean_label = movies_with_more_than_50_ratings['mean'].idxmax()
most_popular_movie = movies_with_more_than_50_ratings.loc[highest_mean_label]
print(most_popular_movie)

In [None]:
# Rank the filtered movies from most to fewest ratings, then keep the first five rows.
ranked_by_count = movies_with_more_than_50_ratings.sort_values(by='count', ascending=False)
top_5_movies_by_count = ranked_by_count.head(5)

# Show only the title and its number of ratings.
print(top_5_movies_by_count[['title', 'count']])