## Load and preprocess data 

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
movies = pd.read_csv('./ml-32m/movies.csv')
ratings = pd.read_csv('./ml-32m/ratings.csv')

In [9]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [12]:
df = pd.merge(ratings,movies,on='movieId')
print(movies.shape)
print(ratings.shape)
print(df.shape)

(87585, 3)
(32000204, 4)
(32000204, 6)


In [13]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,17,4.0,944249077,Sense and Sensibility (1995),Drama|Romance
1,1,25,1.0,944250228,Leaving Las Vegas (1995),Drama|Romance
2,1,29,2.0,943230976,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
3,1,30,5.0,944249077,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,Crime|Drama
4,1,32,5.0,943228858,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


In [14]:
# Check for missing values
df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

## Step 2: Explore basic statistics

### Unique values

In [24]:
num_unique_users = df['userId'].unique()
num_unique_movies = df['movieId'].unique()
num_ratings = df.shape[0]

print(f"Number of unique users: {num_unique_users}")
print(f"Number of unique movies: {num_unique_movies}")
print(f"Total number of ratings: {num_ratings}")

Number of unique users: [     1      2      3 ... 200946 200947 200948]
Number of unique movies: [    17     25     29 ... 175771 157917 274343]
Total number of ratings: 32000204


## Step 3: Visualizing data

### Distribution of Ratings

In [None]:
plt.figure(figsize=(8,5))
# KDE (Kernel Density Estimation): overlay a smooth curve that estimates PDF
# sns.histplot(df['rating'], bins=10, kde=True, color="blue")
plt.hist(df['rating'], bins=10, color="blue", edgecolor="black", alpha=0.7)
plt.xlabel('Rating')
plt.ylabel('Count')
plt.title('Distribution of Movie Ratings')
plt.show()