In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Ratings table: tab-separated; `names=` supplies the column labels.
ratings = pd.read_csv(
    "ml-100k/u.data",
    sep="\t",
    names=["user_id", "movie_id", "rating", "timestamp"],
)

# Movie table: pipe-separated, latin-1 encoded; only the first two columns are read.
movies = pd.read_csv(
    "ml-100k/u.item",
    sep="|",
    names=["movie_id", "movie_title"],
    usecols=[0, 1],
    encoding="latin-1",
)

In [None]:
# Bare last expression: Jupyter renders the DataFrame as a rich table instead of plain text.
ratings.head()

In [None]:
# Bare last expression: Jupyter renders the DataFrame as a rich table instead of plain text.
movies.head()

In [20]:
# Join every rating with its movie's title (default inner join on movie_id).
total_df = pd.merge(ratings, movies, on="movie_id")

In [None]:
# Bare last expression: Jupyter renders the DataFrame as a rich table instead of plain text.
total_df.head()

In [None]:
num_ratings = len(ratings)
num_movies = ratings["movie_id"].nunique()
num_users = ratings["user_id"].nunique()

# Size of the full user x movie matrix (one cell per possible rating).
matrix_size = num_users * num_movies

# Density is the share of cells that actually hold a rating;
# sparsity is the complement (share of empty cells).
density_pct = num_ratings / matrix_size * 100
sparsity_pct = 100 - density_pct

print(f"Number of ratings: {num_ratings}")
print(f"Number of movies: {num_movies}")
print(f"Number of users: {num_users}")
print(f"The number of elements within the user-movie matrix is: {matrix_size} elements")
print(f"The density of the user-movie matrix is: {density_pct:.2f}%")
print(f"The sparsity of the user-movie matrix is: {sparsity_pct:.2f}%")
print(f"We can conclude from a density of only {density_pct:.2f}% that the user-movie matrix is very sparse.")

In [None]:
# Bar chart of how many times each rating value occurs.
ax = sns.countplot(data=ratings, x="rating")
ax.set_title("Distribution of movie ratings", fontsize=14)
plt.show()

In [None]:
# Mean over every individual rating, rounded to two decimals for display.
global_mean_rating = round(ratings["rating"].mean(), 2)
print(f"Mean global rating: {global_mean_rating}")

In [None]:
# Average each user's ratings first, then average those per-user means.
mean_ratings = ratings.groupby("user_id")["rating"].mean()
mean_of_user_means = round(mean_ratings.mean(), 2)
print(f"Mean rating per user: {mean_of_user_means}")

In [79]:
# Five titles with the most ratings (value_counts sorts in descending order).
total_df["movie_title"].value_counts().head(5)

movie_title
Star Wars (1977)             583
Contact (1997)               509
Fargo (1996)                 508
Return of the Jedi (1983)    507
Liar Liar (1997)             485
Name: count, dtype: int64

In [None]:
# Raw mean rating of every movie, indexed by movie_id.
mean_ratings = ratings.groupby("movie_id")["rating"].mean()

# Single lowest entry: its index label is the movie_id, its value the mean rating.
lowest_rated_movies = mean_ratings.nsmallest(1)
lowest_movie_id = lowest_rated_movies.index[0]
lowest_rating = lowest_rated_movies.values[0]

# Look the title up in the movies table.
lowest_title = movies.loc[movies["movie_id"] == lowest_movie_id, "movie_title"].values[0]
print(f"Lowest rated movie: {lowest_title} with a rating of {lowest_rating}")

In [None]:
# Single highest entry of the per-movie means computed in the previous cell.
highest_rated_movie = mean_ratings.nlargest(1)
highest_rated_movie_id = highest_rated_movie.index[0]
highest_rating = highest_rated_movie.values[0]

# Look the title up in the movies table.
highest_title = movies.loc[movies["movie_id"] == highest_rated_movie_id, "movie_title"].values[0]
print(f"Highest rated movie: {highest_title} with a rating of {highest_rating}")

Use a Bayesian average to make the per-movie rating statistics more realistic: movies with only a few ratings are pulled toward the overall average.

In [None]:
# Number of ratings and raw mean rating for each movie, indexed by movie_id.
ratings_by_movie = ratings.groupby("movie_id")["rating"]
movie_stats = ratings_by_movie.agg(["count", "mean"])

# Priors read by bayesian_average: C is the average number of ratings per movie,
# m is the average of the per-movie mean ratings.
C = movie_stats["count"].mean()
m = movie_stats["mean"].mean()

print(f"The average number of ratings per movie is: {C:.2f}")
print(f"The average rating for a given movie is {m:.2f}")

In [76]:
def bayesian_average(ratings, prior_count=None, prior_mean=None):
    """Return the Bayesian (prior-weighted) average of a group of ratings.

    Computes ``(prior_count * prior_mean + ratings.sum()) /
    (prior_count + ratings.count())`` and rounds the result to 3 decimals.

    Parameters
    ----------
    ratings : pandas.Series
        The ratings of one group (any object exposing ``sum()`` and
        ``count()`` works).
    prior_count : float, optional
        Weight given to the prior. Defaults to the notebook-level ``C``.
    prior_mean : float, optional
        Prior rating value. Defaults to the notebook-level ``m``.

    Returns
    -------
    float
        The prior-weighted average, rounded to 3 decimal places.
    """
    # The globals are looked up at call time, so single-argument calls
    # (e.g. through ``groupby(...).agg(bayesian_average)``) behave as before.
    if prior_count is None:
        prior_count = C
    if prior_mean is None:
        prior_mean = m

    # Distinct local name: do not shadow the function itself.
    weighted_avg = (prior_count * prior_mean + ratings.sum()) / (prior_count + ratings.count())
    return round(weighted_avg, 3)

In [77]:
# Bayesian average per movie as a two-column frame: movie_id, bayesian_average.
bayesian_avg_ratings = (
    ratings.groupby("movie_id")["rating"]
    .agg(bayesian_average)
    .rename("bayesian_average")
    .reset_index()
)

# Drop any column left by an earlier run of this cell before merging; otherwise
# re-running would produce bayesian_average_x / bayesian_average_y and break
# the later sort_values(by="bayesian_average").
movie_stats = (
    movie_stats.drop(columns="bayesian_average", errors="ignore")
    .merge(bayesian_avg_ratings, on="movie_id")
)

In [None]:
# Preview the first five rows of the per-movie statistics.
movie_stats.head(5)

In [None]:
# Attach titles; with no explicit key, merge joins on the columns both frames share.
movie_titles = movies[["movie_id", "movie_title"]]
movie_stats = movie_stats.merge(movie_titles)

# Best movies first according to the Bayesian average.
movie_stats.sort_values("bayesian_average", ascending=False)

In [None]:
# Worst movies first according to the Bayesian average (ascending is the default order).
movie_stats.sort_values("bayesian_average")