Exploratory Data Analysis

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For displaying plots nicely
%matplotlib inline
sns.set(style='whitegrid')

# Make the project's src/ directory importable from this notebook.
# The path is relative, so it is resolved against the kernel's working
# directory. The membership check keeps repeated runs of this cell from
# appending duplicate entries to sys.path.
import sys

SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from data_loader import load_ratings, load_movies, merge_datasets

Step 1: Load & Merge Data

In [None]:
# Load the ratings and movies files, then combine them with merge_datasets
# into the single frame (`df`) that the later cells work on.
ratings, movies = (
    load_ratings('../data/ratings.csv'),
    load_movies('../data/movies.csv'),
)
df = merge_datasets(ratings, movies)

# Peek at the first rows of the combined frame.
df.head()


Step 2: Quick Overview

# Summarise the merged frame: dimensions, column dtypes and per-column null counts.
# f-strings are used instead of print(label, obj): with the default separator,
# print() inserts a space after the label's trailing "\n", which shifts the
# first row of each multi-line listing one column to the right.
print(f"Shape: {df.shape}")
print(f"\nData types:\n{df.dtypes}")
print(f"\nMissing values:\n{df.isnull().sum()}")

Step 3: Ratings Distribution

In [None]:
# Histogram of individual rating values.
# The previous bin edges (0.5, 1.0, ..., 5.0) imply a half-star scale from 0.5
# to 5.0 — NOTE(review): confirm against the dataset. Because the last
# histogram bin is closed on both sides, those edges counted ratings of 4.5 and
# 5.0 in the same bar. Centring the edges on each value (0.25, 0.75, ..., 5.25)
# gives every rating its own bar.
rating_values = np.arange(0.5, 5.5, 0.5)    # 0.5, 1.0, ..., 5.0
bin_edges = np.arange(0.25, 5.5, 0.5)       # 0.25, 0.75, ..., 5.25

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['rating'], bins=bin_edges, kde=False, ax=ax)
ax.set_title('Distribution of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
# One tick per rating value so each bar is labelled with the rating it counts.
ax.set_xticks(rating_values)
plt.show()

Step 4: Top 10 Most Rated Movies

In [None]:
# Count the non-null ratings for each title and keep the ten largest counts.
ratings_per_title = df.groupby('title')['rating'].count()
top_movies = ratings_per_title.sort_values(ascending=False).head(10)

# Horizontal bars; the y-axis is flipped so the most-rated title is on top.
ax = top_movies.plot(kind='barh', figsize=(8, 5), color='skyblue')
ax.set_title('Top 10 Most Rated Movies')
ax.set_xlabel('Number of Ratings')
ax.invert_yaxis()
plt.show()

Step 5: Users & Items Overview

In [None]:
# Number of distinct (non-null) user ids and movie ids in the merged frame.
n_users, n_movies = (df[column].nunique() for column in ('userId', 'movieId'))
print(f"Unique users: {n_users}, Unique movies: {n_movies}")

Step 6: Average Rating per Movie

In [None]:
# Mean rating per title, highest first.
# A plain mean is not weighted by how many ratings a title received, so titles
# with only one or two ratings can dominate the top of this ranking. The rating
# count is therefore shown next to each mean so the reader can judge how much
# evidence is behind it.
rating_stats = df.groupby('title')['rating'].agg(avg_rating='mean', n_ratings='count')

# `avg_ratings` keeps its earlier definition: a Series named 'rating', indexed
# by title, sorted by mean rating in descending order.
avg_ratings = rating_stats['avg_rating'].sort_values(ascending=False).rename('rating')

# Top ten titles by mean rating, with the number of ratings behind each mean.
rating_stats.loc[avg_ratings.index[:10]]