# Notebook 1: Data Loading & Exploration

**Movie Recommendation System**  
Author: Mohamed Hedi Foughai

Initial exploration of the MovieLens 25M dataset.

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import Counter
import os
import warnings
import json

# Notebook-wide presentation defaults.
# Silence all warnings so library notices do not clutter cell output.
warnings.filterwarnings(action='ignore')
# Show floats with two decimals in pandas output.
pd.options.display.float_format = '{:.2f}'.format
# Shared plot look: seaborn grid theme and a 12x6 default figure size.
sns.set_style(style='whitegrid')
plt.rc('figure', figsize=(12, 6))

## 1. Setup & Load Data

In [None]:
# Resolve project paths
# The notebook may be started from notebooks/ or from the project root;
# in both cases BASE_DIR must end up as the project root.
cwd = os.getcwd()
BASE_DIR = os.path.dirname(cwd) if os.path.basename(cwd) == 'notebooks' else cwd

RAW_DIR = os.path.join(BASE_DIR, 'data', 'raw')
PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')

# Create the output folder up front so later cells can write into it.
os.makedirs(PROCESSED_DIR, exist_ok=True)

print(f"Current directory: {os.getcwd()}")
print(f"Project root: {BASE_DIR}")
print(f"Looking for data in: {RAW_DIR}")

# Read the four CSV files from RAW_DIR, in the same order as they are unpacked.
ratings, movies, tags, links = (
    pd.read_csv(os.path.join(RAW_DIR, csv_name))
    for csv_name in ('ratings.csv', 'movies.csv', 'tags.csv', 'links.csv')
)

# Report how many rows each table holds.
print(f"\nRatings: {ratings.shape[0]:,} rows")
print(f"Movies: {movies.shape[0]:,} rows")
print(f"Tags: {tags.shape[0]:,} rows")
print(f"Links: {links.shape[0]:,} rows")

In [None]:
# Preview the first rows of every loaded table
# Each entry pairs the heading to print with the DataFrame to show.
previews = (
    ("Ratings:", ratings),
    ("\nMovies:", movies),
    ("\nTags:", tags),
    ("\nLinks (IMDB/TMDB IDs):", links),
)
for heading, frame in previews:
    print(heading)
    display(frame.head())

## 2. Dataset Statistics

In [None]:
# Headline numbers for the user x movie rating matrix
n_ratings = ratings.shape[0]
n_users, n_movies = (ratings[col].nunique() for col in ('userId', 'movieId'))

# Sparsity is the percentage of user/movie pairs that carry no rating.
matrix_size = n_users * n_movies
fill_ratio = n_ratings / matrix_size
sparsity = (1 - fill_ratio) * 100

print(f"Total ratings: {n_ratings:,}")
print(f"Unique users: {n_users:,}")
print(f"Unique movies: {n_movies:,}")
print(f"Matrix sparsity: {sparsity:.2f}%")
print(f"\nMean rating: {ratings['rating'].mean():.2f}")
print(f"Median rating: {ratings['rating'].median():.2f}")
print(f"\nRating distribution:")
# Number of ratings per rating value, ordered by the value itself.
per_value = ratings['rating'].value_counts().sort_index()
print(per_value)

In [None]:
# Bar chart: number of ratings per rating value
fig, ax = plt.subplots(figsize=(10, 5))
rating_frequencies = ratings['rating'].value_counts().sort_index()
# rot=0 keeps the x tick labels horizontal.
rating_frequencies.plot(kind='bar', color='steelblue', ax=ax, rot=0)
ax.set_title('Rating Distribution')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

## 3. User & Movie Activity

In [None]:
# Ratings per user and ratings per movie
user_counts, movie_counts = (
    ratings.groupby(key).size() for key in ('userId', 'movieId')
)

# Print descriptive statistics for both count series.
activity_reports = (
    ("User activity:", user_counts),
    (f"\nMovie popularity:", movie_counts),
)
for heading, counts in activity_reports:
    print(heading)
    print(counts.describe())

In [None]:
# Ten movies with the most ratings, listed by title
# movieId -> title lookup built from the movies table.
movie_titles = movies.set_index('movieId')['title']
top_movies = movie_counts.nlargest(10)

print("Top 10 most rated movies:")
ranked = enumerate(top_movies.items(), start=1)
for i, (movie_id, count) in ranked:
    # Rank, title padded to 45 characters, then the rating count.
    print(f"{i:2d}. {movie_titles[movie_id]:45s} {count:>6,}")

## 4. Genre Analysis

In [None]:
# Flatten the pipe-separated genre strings into one list of genre names
# Rows carrying the '(no genres listed)' placeholder are left out.
listed_entries = (
    entry for entry in movies['genres'] if entry != '(no genres listed)'
)
all_genres = [name for entry in listed_entries for name in entry.split('|')]

# Number of movies carrying each genre.
genre_counts = Counter(all_genres)

print(f"Total unique genres: {len(genre_counts)}")
print(f"\nTop 10 genres:")
# The percentage is relative to all movies, including those without genres.
for genre, count in genre_counts.most_common(10):
    print(f"{genre:15s}: {count:>5,} ({count/len(movies)*100:5.1f}%)")

In [None]:
# Horizontal bar chart of the ten most frequent genres
top_genres = pd.Series(dict(genre_counts.most_common(10)))

fig, ax = plt.subplots(figsize=(10, 6))
top_genres.plot(kind='barh', color='steelblue', ax=ax)
ax.set_title('Top 10 Genres')
ax.set_xlabel('Number of Movies')
# Flip the y-axis so the first (most common) genre is drawn at the top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 5. Tag Analysis

In [None]:
# Tag overview: row count, distinct tags, tagged movies, most frequent tags
print(
    f"Total tags: {len(tags):,}",
    f"Unique tags: {tags['tag'].nunique():,}",
    f"Movies with tags: {tags['movieId'].nunique():,}",
    f"\nTop 15 tags:",
    sep="\n",
)
# value_counts orders by frequency, so head(15) is the 15 most used tags.
most_used_tags = tags['tag'].value_counts().head(15)
print(most_used_tags)

## 6. Temporal Patterns

In [None]:
# Derive datetime and year columns from the raw timestamps
# The timestamp column is interpreted as seconds (unit='s').
rated_at = pd.to_datetime(ratings['timestamp'], unit='s')
ratings['datetime'] = rated_at
ratings['year'] = rated_at.dt.year

print(f"Date range: {ratings['datetime'].min()} to {ratings['datetime'].max()}")
# Counts calendar years from the first to the last rating, inclusive.
print(f"Years covered: {ratings['year'].max() - ratings['year'].min() + 1}")

In [None]:
# Line chart: number of ratings submitted in each calendar year
yearly = ratings.groupby('year').size()

fig, ax = plt.subplots(figsize=(12, 5))
yearly.plot(kind='line', marker='o', color='steelblue', ax=ax)
ax.set_title('Ratings per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Ratings')
# Faint grid lines for reading values off the chart.
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 7. Data Quality

In [None]:
# Quality checks
# Total number of missing cells in each loaded table. links is reported as
# well: it is loaded together with the other three files and would otherwise
# be the only table whose gaps go unnoticed.
print("Missing values:")
print(f"Ratings: {ratings.isnull().sum().sum()}")
print(f"Movies: {movies.isnull().sum().sum()}")
print(f"Tags: {tags.isnull().sum().sum()}")
print(f"Links: {links.isnull().sum().sum()}")

# Rows that repeat an earlier (userId, movieId) pair.
print(f"\nDuplicate ratings: {ratings.duplicated(subset=['userId', 'movieId']).sum()}")
# Ratings outside the closed range [0.5, 5].
print(f"Invalid ratings: {((ratings['rating'] < 0.5) | (ratings['rating'] > 5)).sum()}")
# Movies whose genres field holds only the placeholder text.
print(f"Movies without genres: {(movies['genres'] == '(no genres listed)').sum()}")

## 8. Summary

In [None]:
# Create and save summary
# Compute the shared figures once so the JSON file and the printed findings
# are guaranteed to agree (and the rating mean is not recomputed).
mean_rating = float(ratings['rating'].mean())
unique_tags = int(tags['tag'].nunique())
first_rating = ratings['datetime'].min()
last_rating = ratings['datetime'].max()
years_spanned = int(ratings['year'].max() - ratings['year'].min() + 1)

# Measure data quality here instead of asserting it, so the verdict printed
# below always reflects the data that was actually loaded.
missing_values = sum(
    int(frame.isnull().sum().sum()) for frame in (ratings, movies, tags, links)
)
duplicate_ratings = int(ratings.duplicated(subset=['userId', 'movieId']).sum())

summary = {
    'total_ratings': int(n_ratings),
    'unique_users': int(n_users),
    'unique_movies': int(n_movies),
    'sparsity_percent': float(sparsity),
    'mean_rating': mean_rating,
    'unique_genres': int(len(genre_counts)),
    'unique_tags': unique_tags,
    'date_range': f"{first_rating} to {last_rating}",
}

# Save to JSON
# Every value above is already a native int/float/str, so no `default=`
# fallback is needed; an unexpected type should fail loudly rather than be
# silently stringified.
summary_path = os.path.join(PROCESSED_DIR, 'dataset_summary.json')
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

# Print key findings
print("KEY FINDINGS:")
print(f"- {n_ratings:,} ratings from {n_users:,} users on {n_movies:,} movies")
print(f"- Matrix sparsity: {sparsity:.2f}% (high cold-start challenge)")
print(f"- Mean rating: {mean_rating:.2f}/5.0")
print(f"- {len(genre_counts)} genres, {unique_tags:,} unique tags")
print(f"- Dataset spans {years_spanned} years")
if missing_values == 0 and duplicate_ratings == 0:
    print(f"- Data quality: Excellent (no missing values or duplicates)")
else:
    # Report the measured problems instead of claiming a clean dataset.
    print(
        f"- Data quality: {missing_values:,} missing values, "
        f"{duplicate_ratings:,} duplicate ratings (review before modelling)"
    )
print(f"\nReady for preprocessing and feature engineering.")