
 # NOTEBOOK 1: DATA LOADING & EXPLORATION                      
 # Movie Recommender System                                    
# Author: Mohamed Hedi Foughai                                


In [1]:
# Core Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import warnings
import json

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Pandas display options
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.float_format', '{:.2f}'.format)  # 2 decimal places

# Visualization settings
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)  # Default figure size

# Print header
print("=" * 80)
print("üé¨ MOVIE RECOMMENDER SYSTEM - DATA LOADING & EXPLORATION")
print("=" * 80)
print(f"üìÖ Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"üêç Python version: {pd.__version__}")
print("=" * 80)

üé¨ MOVIE RECOMMENDER SYSTEM - DATA LOADING & EXPLORATION
üìÖ Started at: 2025-10-18 18:03:08
üêç Python version: 2.3.3


## 1- SETUP PATHS & VERIFY DATA FILES

In [2]:
# Get directory paths
# We're currently in: MovieRecommenderSystem/notebooks/
# We need to go up one level to access data/
BASE_DIR = os.path.dirname(os.getcwd())  # Go up from notebooks/ to project root
RAW_DIR = os.path.join(BASE_DIR, 'data', 'raw')
PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')
MODELS_DIR = os.path.join(BASE_DIR, 'models')
REPORTS_DIR = os.path.join(BASE_DIR, 'reports', 'figures')

print("\nüìÅ DIRECTORY PATHS:")
print("-" * 80)
print(f"Current directory:  {os.getcwd()}")
print(f"Project root:       {BASE_DIR}")
print(f"Raw data:           {RAW_DIR}")
print(f"Processed data:     {PROCESSED_DIR}")
print(f"Models:             {MODELS_DIR}")
print(f"Reports:            {REPORTS_DIR}")

# Create directories if they don't exist
os.makedirs(PROCESSED_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)

print("\n‚úÖ Directories created/verified")

# Check for required data files
print("\n" + "=" * 80)
print("üìÇ CHECKING DATA FILES")
print("=" * 80)

required_files = [
    'ratings.csv',
    'movies.csv', 
    'tags.csv',
    'links.csv'
]

all_files_present = True

for file in required_files:
    filepath = os.path.join(RAW_DIR, file)
    if os.path.exists(filepath):
        size_mb = os.path.getsize(filepath) / (1024**2)  # Convert bytes to MB
        print(f"   ‚úì {file:20s} ({size_mb:>8.1f} MB) ‚úÖ")
    else:
        print(f"   ‚úó {file:20s} - NOT FOUND! ‚ùå")
        all_files_present = False

# Check optional files
print("\nüìã Optional files:")
optional_files = ['genome-scores.csv', 'genome-tags.csv', 'README.txt']

for file in optional_files:
    filepath = os.path.join(RAW_DIR, file)
    if os.path.exists(filepath):
        size_mb = os.path.getsize(filepath) / (1024**2)
        print(f"   ‚úì {file:20s} ({size_mb:>8.1f} MB)")
    else:
        print(f"   - {file:20s} (not present)")

# Final check
print("\n" + "=" * 80)
if all_files_present:
    print("‚úÖ ALL REQUIRED DATA FILES FOUND! Ready to load data! üéâ")
else:
    print("‚ùå MISSING FILES! Please check your data/raw/ folder.")
    print("   Download from: https://grouplens.org/datasets/movielens/25m/")
print("=" * 80)


üìÅ DIRECTORY PATHS:
--------------------------------------------------------------------------------
Current directory:  c:\Users\mhfou\Documents\MovieRecommenderSystem\notebooks
Project root:       c:\Users\mhfou\Documents\MovieRecommenderSystem
Raw data:           c:\Users\mhfou\Documents\MovieRecommenderSystem\data\raw
Processed data:     c:\Users\mhfou\Documents\MovieRecommenderSystem\data\processed
Models:             c:\Users\mhfou\Documents\MovieRecommenderSystem\models
Reports:            c:\Users\mhfou\Documents\MovieRecommenderSystem\reports\figures

‚úÖ Directories created/verified

üìÇ CHECKING DATA FILES
   ‚úì ratings.csv          (   646.8 MB) ‚úÖ
   ‚úì movies.csv           (     2.9 MB) ‚úÖ
   ‚úì tags.csv             (    37.0 MB) ‚úÖ
   ‚úì links.csv            (     1.3 MB) ‚úÖ

üìã Optional files:
   ‚úì genome-scores.csv    (   415.0 MB)
   ‚úì genome-tags.csv      (     0.0 MB)
   ‚úì README.txt           (     0.0 MB)

‚úÖ ALL REQUIRED DATA FILES FOUND! Rea

## 2- LOAD MOVIELENS 25M DATASET

In [3]:

print("\n" + "=" * 80)
print("üìÇ LOADING MOVIELENS 25M DATASET")
print("=" * 80)

print("\n‚è≥ Loading files (this may take 1-2 minutes)...")
start_time = datetime.now()

# Load main datasets
ratings = pd.read_csv(os.path.join(RAW_DIR, 'ratings.csv'))
movies = pd.read_csv(os.path.join(RAW_DIR, 'movies.csv'))
tags = pd.read_csv(os.path.join(RAW_DIR, 'tags.csv'))
links = pd.read_csv(os.path.join(RAW_DIR, 'links.csv'))

end_time = datetime.now()
load_time = (end_time - start_time).total_seconds()

print(f"‚úÖ All files loaded successfully in {load_time:.1f} seconds!")

print(f"\nüìä Dataset Shapes:")
print(f"   ‚Ä¢ Ratings: {ratings.shape[0]:>12,} rows √ó {ratings.shape[1]:>2} columns")
print(f"   ‚Ä¢ Movies:  {movies.shape[0]:>12,} rows √ó {movies.shape[1]:>2} columns")
print(f"   ‚Ä¢ Tags:    {tags.shape[0]:>12,} rows √ó {tags.shape[1]:>2} columns")
print(f"   ‚Ä¢ Links:   {links.shape[0]:>12,} rows √ó {links.shape[1]:>2} columns")

print(f"\nüíæ Total Memory Usage: {(ratings.memory_usage(deep=True).sum() + movies.memory_usage(deep=True).sum() + tags.memory_usage(deep=True).sum() + links.memory_usage(deep=True).sum()) / (1024**2):.1f} MB")

print("\n" + "=" * 80)


üìÇ LOADING MOVIELENS 25M DATASET

‚è≥ Loading files (this may take 1-2 minutes)...
‚úÖ All files loaded successfully in 14.6 seconds!

üìä Dataset Shapes:
   ‚Ä¢ Ratings:   25,000,095 rows √ó  4 columns
   ‚Ä¢ Movies:        62,423 rows √ó  3 columns
   ‚Ä¢ Tags:       1,093,360 rows √ó  4 columns
   ‚Ä¢ Links:         62,423 rows √ó  3 columns

üíæ Total Memory Usage: 860.3 MB



## 3- DATA STRUCTURE INSPECTION


In [9]:
"""
CELL 4: DATA STRUCTURE INSPECTION
"""

print("\n" + "=" * 80)
print("üîç DATA STRUCTURE INSPECTION")
print("=" * 80)

print("\nüìã RATINGS Dataset:")
print("-" * 80)
print(ratings.head(10))
print(f"\nColumns: {ratings.columns.tolist()}")
print(f"\nData types:\n{ratings.dtypes}")
print(f"\nMemory usage: {ratings.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\n" + "-" * 80)
print("üìã MOVIES Dataset:")
print("-" * 80)
print(movies.head(10))
print(f"\nColumns: {movies.columns.tolist()}")
print(f"\nData types:\n{movies.dtypes}")

print("\n" + "-" * 80)
print("üìã TAGS Dataset:")
print("-" * 80)
print(tags.head(15))
print(f"\nColumns: {tags.columns.tolist()}")
print(f"\nData types:\n{tags.dtypes}")
print(f"\nüí° These user-generated tags will be used for NLP analysis!")

print("\n" + "-" * 80)
print("üìã LINKS Dataset:")
print("-" * 80)
print(links.head(10))
print(f"\nColumns: {links.columns.tolist()}")
print(f"\nüí° Links to IMDB and TMDB - useful for getting movie metadata")

print("\n" + "=" * 80)
print("üìä GENOME DATA (Optional - Machine-Generated Tags)")
print("=" * 80)

genome_scores_path = os.path.join(RAW_DIR, 'genome-scores.csv')
genome_tags_path = os.path.join(RAW_DIR, 'genome-tags.csv')

if os.path.exists(genome_scores_path) and os.path.exists(genome_tags_path):
    print("\n‚è≥ Loading genome data (this may take a minute)...")
    
    genome_scores = pd.read_csv(genome_scores_path)
    genome_tags = pd.read_csv(genome_tags_path)
    
    print("\nüìã GENOME SCORES Dataset:")
    print("-" * 80)
    print(genome_scores.head(10))
    print(f"\nShape: {genome_scores.shape[0]:,} rows √ó {genome_scores.shape[1]} columns")
    print(f"Columns: {genome_scores.columns.tolist()}")
    print(f"\nüí° Relevance scores (0-1) of tags for each movie")
    
    print("\n" + "-" * 80)
    print("üìã GENOME TAGS Dataset:")
    print("-" * 80)
    print(genome_tags.head(20))
    print(f"\nShape: {genome_tags.shape[0]:,} rows √ó {genome_tags.shape[1]} columns")
    print(f"Columns: {genome_tags.columns.tolist()}")
    print(f"\nüí° {genome_tags.shape[0]} machine-generated tags for content analysis")
    
    print(f"\nüìä Genome data loaded successfully!")
else:
    print("\n‚ö†Ô∏è  Genome data files not found (optional)")
    genome_scores = None
    genome_tags = None


üîç DATA STRUCTURE INSPECTION

üìã RATINGS Dataset:
--------------------------------------------------------------------------------
   userId  movieId  rating   timestamp
0       1      296    5.00  1147880044
1       1      306    3.50  1147868817
2       1      307    5.00  1147868828
3       1      665    5.00  1147878820
4       1      899    3.50  1147868510
5       1     1088    4.00  1147868495
6       1     1175    3.50  1147868826
7       1     1217    3.50  1147878326
8       1     1237    5.00  1147868839
9       1     1250    4.00  1147868414

Columns: ['userId', 'movieId', 'rating', 'timestamp']

Data types:
userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

Memory usage: 762.94 MB

--------------------------------------------------------------------------------
üìã MOVIES Dataset:
--------------------------------------------------------------------------------
   movieId                               title  \
0        1

## 4- DATASET STATISTICS

In [10]:
print("\n" + "=" * 80)
print("üìä DATASET STATISTICS")
print("=" * 80)

n_ratings = len(ratings)
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()

print("\nüé¨ Ratings Overview:")
print(f"   ‚Ä¢ Total ratings:        {n_ratings:>12,}")
print(f"   ‚Ä¢ Unique users:         {n_users:>12,}")
print(f"   ‚Ä¢ Unique movies:        {n_movies:>12,}")
print(f"   ‚Ä¢ Rating range:         {ratings['rating'].min():.1f} - {ratings['rating'].max():.1f}")
print(f"   ‚Ä¢ Average rating:       {ratings['rating'].mean():>12.2f}")
print(f"   ‚Ä¢ Median rating:        {ratings['rating'].median():>12.2f}")
print(f"   ‚Ä¢ Std deviation:        {ratings['rating'].std():>12.2f}")

print("\nüìà Rating Distribution:")
rating_dist = ratings['rating'].value_counts().sort_index()
for rating, count in rating_dist.items():
    percentage = (count / n_ratings) * 100
    bar_length = int(percentage * 2)
    bar = '‚ñà' * bar_length
    print(f"   {rating:.1f} ‚≠ê: {bar:<40} {percentage:>5.1f}% ({count:>10,})")

print("\nüë• User Activity:")
user_counts = ratings.groupby('userId').size()
print(f"   ‚Ä¢ Avg ratings per user: {user_counts.mean():>12.1f}")
print(f"   ‚Ä¢ Median:               {user_counts.median():>12.1f}")
print(f"   ‚Ä¢ Most active user:     {user_counts.max():>12,} ratings")
print(f"   ‚Ä¢ Least active user:    {user_counts.min():>12} rating(s)")

user_activity_quartiles = user_counts.quantile([0.25, 0.5, 0.75])
print(f"\n   Quartiles:")
print(f"   ‚Ä¢ 25th percentile:      {user_activity_quartiles[0.25]:>12.0f} ratings")
print(f"   ‚Ä¢ 50th percentile:      {user_activity_quartiles[0.5]:>12.0f} ratings")
print(f"   ‚Ä¢ 75th percentile:      {user_activity_quartiles[0.75]:>12.0f} ratings")

print("\nüé• Movie Popularity:")
movie_counts = ratings.groupby('movieId').size()
print(f"   ‚Ä¢ Avg ratings per movie: {movie_counts.mean():>11.1f}")
print(f"   ‚Ä¢ Median:                {movie_counts.median():>11.1f}")
print(f"   ‚Ä¢ Most popular movie:    {movie_counts.max():>11,} ratings")
print(f"   ‚Ä¢ Movies with 1 rating:  {(movie_counts == 1).sum():>11,}")
print(f"   ‚Ä¢ Movies with <5 ratings: {(movie_counts < 5).sum():>10,}")
print(f"   ‚Ä¢ Movies with >1000:     {(movie_counts > 1000).sum():>11,}")

movie_popularity_quartiles = movie_counts.quantile([0.25, 0.5, 0.75])
print(f"\n   Quartiles:")
print(f"   ‚Ä¢ 25th percentile:      {movie_popularity_quartiles[0.25]:>12.0f} ratings")
print(f"   ‚Ä¢ 50th percentile:      {movie_popularity_quartiles[0.5]:>12.0f} ratings")
print(f"   ‚Ä¢ 75th percentile:      {movie_popularity_quartiles[0.75]:>12.0f} ratings")


üìä DATASET STATISTICS

üé¨ Ratings Overview:
   ‚Ä¢ Total ratings:          25,000,095
   ‚Ä¢ Unique users:              162,541
   ‚Ä¢ Unique movies:              59,047
   ‚Ä¢ Rating range:         0.5 - 5.0
   ‚Ä¢ Average rating:               3.53
   ‚Ä¢ Median rating:                3.50
   ‚Ä¢ Std deviation:                1.06

üìà Rating Distribution:
   0.5 ‚≠ê: ‚ñà‚ñà‚ñà                                        1.6% (   393,068)
   1.0 ‚≠ê: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                                     3.1% (   776,815)
   1.5 ‚≠ê: ‚ñà‚ñà‚ñà                                        1.6% (   399,490)
   2.0 ‚≠ê: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                              6.6% ( 1,640,868)
   2.5 ‚≠ê: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                                 5.1% ( 1,262,797)
   3.0 ‚≠ê: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   19.6% ( 4,896,928)
   3.5 ‚≠ê: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñ

 ## 5- TAGS ANALYSIS - NLP DATA

In [11]:

print("\n" + "=" * 80)
print("üè∑Ô∏è  TAGS ANALYSIS - NLP DATA")
print("=" * 80)

n_tags = len(tags)
n_unique_tags = tags['tag'].nunique()
n_movies_with_tags = tags['movieId'].nunique()
n_users_tagging = tags['userId'].nunique()

print(f"\nüìä Tags Overview:")
print(f"   ‚Ä¢ Total tag entries:     {n_tags:>12,}")
print(f"   ‚Ä¢ Unique tags:           {n_unique_tags:>12,}")
print(f"   ‚Ä¢ Movies with tags:      {n_movies_with_tags:>12,}")
print(f"   ‚Ä¢ Users who tagged:      {n_users_tagging:>12,}")
print(f"   ‚Ä¢ Avg tags per movie:    {n_tags / n_movies_with_tags:>12.1f}")

print("\nüî• Most Common Tags (Top 30):")
top_tags = tags['tag'].value_counts().head(30)
for i, (tag, count) in enumerate(top_tags.items(), 1):
    bar_length = int((count / top_tags.iloc[0]) * 30)
    bar = '‚ñà' * bar_length
    print(f"   {i:2d}. {tag:35s} {bar:<30} {count:>6,}")

print("\nüí° Sample Tags for Popular Movies:")
popular_movies_with_tags = tags.groupby('movieId').size().sort_values(ascending=False).head(5)

for movie_id in popular_movies_with_tags.index:
    movie_title = movies[movies['movieId'] == movie_id]['title'].values[0]
    movie_tags = tags[tags['movieId'] == movie_id]['tag'].values[:8]
    print(f"\n   üé¨ {movie_title}")
    print(f"      Tags: {', '.join(movie_tags)}")

print("\nüìà Tag Distribution:")
tags_per_movie = tags.groupby('movieId').size()
print(f"   ‚Ä¢ Min tags per movie:    {tags_per_movie.min():>12}")
print(f"   ‚Ä¢ Max tags per movie:    {tags_per_movie.max():>12,}")
print(f"   ‚Ä¢ Median tags per movie: {tags_per_movie.median():>12.0f}")
print(f"   ‚Ä¢ Mean tags per movie:   {tags_per_movie.mean():>12.1f}")

tags_per_user = tags.groupby('userId').size()
print(f"\n   ‚Ä¢ Min tags per user:     {tags_per_user.min():>12}")
print(f"   ‚Ä¢ Max tags per user:     {tags_per_user.max():>12,}")
print(f"   ‚Ä¢ Median tags per user:  {tags_per_user.median():>12.0f}")
print(f"   ‚Ä¢ Mean tags per user:    {tags_per_user.mean():>12.1f}")

print("\n‚úÖ Tags Data Quality:")
print(f"   ‚Ä¢ Missing values:        {tags.isnull().sum().sum():>12}")
print(f"   ‚Ä¢ Empty strings:         {(tags['tag'].str.strip() == '').sum():>12}")
print(f"   ‚Ä¢ Average tag length:    {tags['tag'].str.len().mean():>12.1f} characters")
print(f"   ‚Ä¢ Shortest tag:          {tags['tag'].str.len().min():>12} characters")
print(f"   ‚Ä¢ Longest tag:           {tags['tag'].str.len().max():>12} characters")

print("\nüí° Tag Length Distribution:")
tag_lengths = tags['tag'].str.len()
bins = [0, 5, 10, 15, 20, 30, 50, 100, 500]
tag_length_dist = pd.cut(tag_lengths, bins=bins).value_counts().sort_index()
for interval, count in tag_length_dist.items():
    percentage = (count / len(tags)) * 100
    bar_length = int(percentage * 2)
    bar = '‚ñà' * bar_length
    print(f"   {str(interval):15s}: {bar:<40} {percentage:>5.1f}% ({count:>8,})")

print("\n" + "=" * 80)
print("‚úÖ Tags are ready for NLP processing!")
print("   ‚Ä¢ We'll use TF-IDF to extract semantic features")
print("   ‚Ä¢ These tags will improve content-based recommendations")
print("=" * 80)


üè∑Ô∏è  TAGS ANALYSIS - NLP DATA

üìä Tags Overview:
   ‚Ä¢ Total tag entries:        1,093,360
   ‚Ä¢ Unique tags:                 73,050
   ‚Ä¢ Movies with tags:            45,251
   ‚Ä¢ Users who tagged:            14,592
   ‚Ä¢ Avg tags per movie:            24.2

üî• Most Common Tags (Top 30):
    1. sci-fi                              ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  8,330
    2. atmospheric                         ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà         6,516
    3. action                              ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà           5,907
    4. comedy                              ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà            5,702
    5. surreal                             ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà             5,326
    6. based on a book                     ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚

## 6- GENRE ANALYSIS

In [12]:
print("\n" + "=" * 80)
print("üé≠ GENRE ANALYSIS")
print("=" * 80)

movies['genres_list'] = movies['genres'].str.split('|')

from collections import Counter
all_genres = []
for genres in movies['genres_list'].dropna():
    all_genres.extend(genres)

genre_counts = Counter(all_genres)

print(f"\nüìä Genre Distribution:")
print(f"   Total unique genres: {len(genre_counts)}")
print()

for genre, count in sorted(genre_counts.items(), key=lambda x: x[1], reverse=True):
    percentage = (count / len(movies)) * 100
    bar_length = int(percentage / 2)
    bar = '‚ñà' * bar_length
    print(f"   {genre:20s}: {bar:<40} {percentage:5.1f}% ({count:>5,} movies)")

movies['genre_count'] = movies['genres_list'].apply(lambda x: len(x) if isinstance(x, list) else 0)

print(f"\nüìä Genres per Movie:")
print(f"   ‚Ä¢ Average:  {movies['genre_count'].mean():.2f}")
print(f"   ‚Ä¢ Median:   {movies['genre_count'].median():.0f}")
print(f"   ‚Ä¢ Max:      {movies['genre_count'].max()}")
print(f"   ‚Ä¢ Min:      {movies['genre_count'].min()}")

genre_count_dist = movies['genre_count'].value_counts().sort_index()
print(f"\n   Distribution:")
for n_genres, count in genre_count_dist.items():
    percentage = (count / len(movies)) * 100
    bar_length = int(percentage)
    bar = '‚ñà' * bar_length
    print(f"   {n_genres} genre(s):  {bar:<40} {percentage:5.1f}% ({count:>5,} movies)")

print("\nüé¨ Sample Movies with Genres:")
print(movies[['title', 'genres']].head(15))

print("\nüìä Most Common Genre Combinations:")
genre_combinations = movies['genres'].value_counts().head(15)
for i, (combo, count) in enumerate(genre_combinations.items(), 1):
    print(f"   {i:2d}. {combo:50s} ({count:>4,} movies)")

print("\nüí° Genre Co-occurrence Analysis:")
genre_pairs = Counter()
for genres_list in movies['genres_list'].dropna():
    if len(genres_list) >= 2:
        for i in range(len(genres_list)):
            for j in range(i + 1, len(genres_list)):
                pair = tuple(sorted([genres_list[i], genres_list[j]]))
                genre_pairs[pair] += 1

print("\n   Top 20 Genre Pairs:")
for i, (pair, count) in enumerate(genre_pairs.most_common(20), 1):
    print(f"   {i:2d}. {pair[0]:15s} + {pair[1]:15s} : {count:>5,} movies")


üé≠ GENRE ANALYSIS

üìä Genre Distribution:
   Total unique genres: 20

   Drama               : ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                      41.0% (25,606 movies)
   Comedy              : ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                             27.0% (16,870 movies)
   Thriller            : ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                                    13.9% (8,654 movies)
   Romance             : ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                                    12.4% (7,719 movies)
   Action              : ‚ñà‚ñà‚ñà‚ñà‚ñà                                     11.8% (7,348 movies)
   Horror              : ‚ñà‚ñà‚ñà‚ñà                                       9.6% (5,989 movies)
   Documentary         : ‚ñà‚ñà‚ñà‚ñà                                       9.0% (5,605 movies)
   Crime               : ‚ñà‚ñà‚ñà‚ñà                                       8.5% (5,319 movies)
   (no genres listed)  : ‚ñà‚ñà‚ñà‚ñà                                       8.1% (5,062 movies)

## 7- MATRIX SPARSITY ANALYSIS

In [13]:


print("\n" + "=" * 80)
print("üîç MATRIX SPARSITY ANALYSIS")
print("=" * 80)

matrix_size = n_users * n_movies
sparsity = (1 - n_ratings / matrix_size) * 100

print(f"\nüìä User-Movie Interaction Matrix:")
print(f"   ‚Ä¢ Dimensions:     {n_users:>10,} users √ó {n_movies:>6,} movies")
print(f"   ‚Ä¢ Total cells:    {matrix_size:>20,}")
print(f"   ‚Ä¢ Filled cells:   {n_ratings:>20,}")
print(f"   ‚Ä¢ Empty cells:    {matrix_size - n_ratings:>20,}")
print(f"   ‚Ä¢ Sparsity:       {sparsity:>20.6f}%")
print(f"   ‚Ä¢ Density:        {100-sparsity:>20.6f}%")

print("\nüìâ Visual Representation:")
filled = int((100 - sparsity) * 40 / 100)
empty = 40 - filled
print(f"   Filled: [{'‚ñà' * filled}{'‚ñë' * empty}] {100-sparsity:.6f}%")
print(f"   Empty:  [{'‚ñë' * filled}{'‚ñà' * empty}] {sparsity:.6f}%")

print(f"\nüí° What Sparsity Means:")
if sparsity > 99.9:
    print(f"   ‚ö†Ô∏è  EXTREMELY SPARSE matrix ({sparsity:.4f}%)")
    print(f"   ‚Ä¢ Out of every 10,000 cells, only {int((100-sparsity)*100):.0f} are filled")
    print(f"   ‚Ä¢ Most user-movie pairs have NO interaction")
elif sparsity > 99:
    print(f"   ‚ö†Ô∏è  VERY SPARSE matrix ({sparsity:.4f}%)")
    print(f"   ‚Ä¢ Out of every 1,000 cells, only {int((100-sparsity)*10):.0f} are filled")
else:
    print(f"   ‚úÖ Reasonable sparsity for collaborative filtering")

print(f"\nüéØ Why This Matters:")
print(f"   ‚Ä¢ Each user has rated {n_ratings/n_users:.1f} movies on average")
print(f"   ‚Ä¢ But there are {n_movies:,} total movies")
print(f"   ‚Ä¢ Users have rated only {(n_ratings/n_users)/n_movies*100:.3f}% of all movies")
print(f"   ‚Ä¢ This creates the \"Cold Start\" problem")

print(f"\nüìä Coverage Analysis:")
users_per_movie = ratings.groupby('movieId').size()
movies_per_user = ratings.groupby('userId').size()

print(f"\n   Movies Coverage:")
print(f"   ‚Ä¢ Movies with 1-10 ratings:    {(users_per_movie <= 10).sum():>8,} ({(users_per_movie <= 10).sum()/n_movies*100:>5.1f}%)")
print(f"   ‚Ä¢ Movies with 11-50 ratings:   {((users_per_movie > 10) & (users_per_movie <= 50)).sum():>8,} ({((users_per_movie > 10) & (users_per_movie <= 50)).sum()/n_movies*100:>5.1f}%)")
print(f"   ‚Ä¢ Movies with 51-100 ratings:  {((users_per_movie > 50) & (users_per_movie <= 100)).sum():>8,} ({((users_per_movie > 50) & (users_per_movie <= 100)).sum()/n_movies*100:>5.1f}%)")
print(f"   ‚Ä¢ Movies with 100+ ratings:    {(users_per_movie > 100).sum():>8,} ({(users_per_movie > 100).sum()/n_movies*100:>5.1f}%)")

print(f"\n   Users Coverage:")
print(f"   ‚Ä¢ Users with 1-20 ratings:     {(movies_per_user <= 20).sum():>8,} ({(movies_per_user <= 20).sum()/n_users*100:>5.1f}%)")
print(f"   ‚Ä¢ Users with 21-50 ratings:    {((movies_per_user > 20) & (movies_per_user <= 50)).sum():>8,} ({((movies_per_user > 20) & (movies_per_user <= 50)).sum()/n_users*100:>5.1f}%)")
print(f"   ‚Ä¢ Users with 51-100 ratings:   {((movies_per_user > 50) & (movies_per_user <= 100)).sum():>8,} ({((movies_per_user > 50) & (movies_per_user <= 100)).sum()/n_users*100:>5.1f}%)")
print(f"   ‚Ä¢ Users with 100+ ratings:     {(movies_per_user > 100).sum():>8,} ({(movies_per_user > 100).sum()/n_users*100:>5.1f}%)")

print(f"\nüí° Interpretation:")
print(f"   ‚ö†Ô∏è  Matrix is {sparsity:.2f}% sparse")
print(f"   ‚Üí Collaborative filtering alone will struggle")
print(f"   ‚Üí Matrix factorization (SVD) is ESSENTIAL")
print(f"   ‚Üí Content-based (genres + tags NLP) handles cold-start")
print(f"   ‚Üí Hybrid approach combines strengths of all methods")

print(f"\nüéØ Recommendation Strategy:")
print(f"   1. Content-Based: Use genres + NLP on tags (handles cold-start)")
print(f"   2. Collaborative: User-user & item-item similarity")
print(f"   3. Matrix Factorization: SVD to handle sparsity")
print(f"   4. Hybrid: Combine all three for best results")

print("\n" + "=" * 80)


üîç MATRIX SPARSITY ANALYSIS

üìä User-Movie Interaction Matrix:
   ‚Ä¢ Dimensions:        162,541 users √ó 59,047 movies
   ‚Ä¢ Total cells:           9,597,558,427
   ‚Ä¢ Filled cells:             25,000,095
   ‚Ä¢ Empty cells:           9,572,558,332
   ‚Ä¢ Sparsity:                  99.739516%
   ‚Ä¢ Density:                    0.260484%

üìâ Visual Representation:
   Filled: [‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë] 0.260484%
   Empty:  [‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà] 99.739516%

üí° What Sparsity Means:
   ‚ö†Ô∏è  VERY SPARSE matrix (99.7395%)
   ‚Ä¢ Out of every 1,000 cells, only 2 are filled

üéØ Why This Matters:
   ‚Ä¢ Each user has rated 153.8 movies on average
   ‚Ä¢ But there are 59,047 total movies
   ‚Ä¢ Users have rated only 0.260% of all movies
   ‚Ä¢ This creates the "Cold Start" problem

üìä

## 8- TEMPORAL ANALYSIS

In [14]:


print("\n" + "=" * 80)
print("‚è∞ TEMPORAL ANALYSIS")
print("=" * 80)

ratings['datetime'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings['year'] = ratings['datetime'].dt.year
ratings['month'] = ratings['datetime'].dt.month
ratings['day_of_week'] = ratings['datetime'].dt.day_name()
ratings['hour'] = ratings['datetime'].dt.hour

print(f"\nüìÖ Time Range:")
print(f"   ‚Ä¢ First rating:  {ratings['datetime'].min()}")
print(f"   ‚Ä¢ Last rating:   {ratings['datetime'].max()}")
print(f"   ‚Ä¢ Time span:     {(ratings['datetime'].max() - ratings['datetime'].min()).days:,} days")
print(f"   ‚Ä¢ Years covered: {ratings['year'].max() - ratings['year'].min() + 1}")

print(f"\nüìä Ratings by Year:")
yearly = ratings.groupby('year').size().sort_index()
for year in yearly.index:
    count = yearly[year]
    percentage = (count / n_ratings) * 100
    bar_length = int(percentage * 5)
    bar = '‚ñà' * bar_length
    print(f"   {year}: {bar:<50} {percentage:5.2f}% ({count:>10,})")

print(f"\nüìà Most Active Years:")
top_years = yearly.sort_values(ascending=False).head(10)
for i, (year, count) in enumerate(top_years.items(), 1):
    print(f"   {i:2d}. {year}: {count:>10,} ratings")

print(f"\nüìä Ratings by Month:")
monthly = ratings.groupby('month').size().sort_index()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
for month in monthly.index:
    count = monthly[month]
    percentage = (count / n_ratings) * 100
    bar_length = int(percentage * 3)
    bar = '‚ñà' * bar_length
    print(f"   {month_names[month-1]:3s}: {bar:<30} {percentage:5.2f}% ({count:>10,})")

print(f"\nüìä Ratings by Day of Week:")
dow_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_counts = ratings['day_of_week'].value_counts()
for day in dow_order:
    count = dow_counts[day]
    percentage = (count / n_ratings) * 100
    bar_length = int(percentage * 3)
    bar = '‚ñà' * bar_length
    print(f"   {day:9s}: {bar:<30} {percentage:5.2f}% ({count:>10,})")

print(f"\nüìä Ratings by Hour of Day:")
hourly = ratings.groupby('hour').size().sort_index()
for hour in hourly.index:
    count = hourly[hour]
    percentage = (count / n_ratings) * 100
    bar_length = int(percentage * 2)
    bar = '‚ñà' * bar_length
    print(f"   {hour:2d}:00: {bar:<30} {percentage:5.2f}% ({count:>10,})")

print(f"\nüí° Temporal Insights:")
peak_year = yearly.idxmax()
peak_month = monthly.idxmax()
peak_dow = dow_counts.idxmax()
peak_hour = hourly.idxmax()

print(f"   ‚Ä¢ Peak year:        {peak_year} ({yearly[peak_year]:,} ratings)")
print(f"   ‚Ä¢ Peak month:       {month_names[peak_month-1]} ({monthly[peak_month]:,} ratings)")
print(f"   ‚Ä¢ Peak day:         {peak_dow} ({dow_counts[peak_dow]:,} ratings)")
print(f"   ‚Ä¢ Peak hour:        {peak_hour}:00 ({hourly[peak_hour]:,} ratings)")

print(f"\nüìà Rating Trends Over Time:")
yearly_avg_rating = ratings.groupby('year')['rating'].mean()
print(f"   Average rating by year:")
for year in yearly_avg_rating.index:
    avg_rating = yearly_avg_rating[year]
    stars = '‚òÖ' * int(avg_rating)
    print(f"   {year}: {stars} {avg_rating:.2f}")

print(f"\nüí° Observations:")
print(f"   ‚Ä¢ Most ratings occurred in {peak_year}")
print(f"   ‚Ä¢ Users rate most on {peak_dow}s")
print(f"   ‚Ä¢ Peak activity at {peak_hour}:00 (likely evening)")
print(f"   ‚Ä¢ Dataset spans {ratings['year'].max() - ratings['year'].min() + 1} years of user behavior")

print("\n" + "=" * 80)


‚è∞ TEMPORAL ANALYSIS

üìÖ Time Range:
   ‚Ä¢ First rating:  1995-01-09 11:46:49
   ‚Ä¢ Last rating:   2019-11-21 09:15:03
   ‚Ä¢ Time span:     9,081 days
   ‚Ä¢ Years covered: 25

üìä Ratings by Year:
   1995:                                                     0.00% (         3)
   1996: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                        5.72% ( 1,430,093)
   1997: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                                        2.50% (   626,202)
   1998: ‚ñà‚ñà‚ñà‚ñà‚ñà                                               1.09% (   272,099)
   1999: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                               4.24% ( 1,059,080)
   2000: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                  6.94% ( 1,735,398)
   2001: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                               4.23% ( 1,0

## 9- DATA QUALITY CHECK & SUMMARY

In [16]:
"""
CELL 10: DATA QUALITY CHECK & SUMMARY
"""

print("\n" + "=" * 80)
print("‚úÖ DATA QUALITY CHECK")
print("=" * 80)

def check_quality(df, name, skip_duplicates=False):
    print(f"\n{name}:")
    print(f"   ‚Ä¢ Total rows:           {len(df):>15,}")
    print(f"   ‚Ä¢ Missing values:       {df.isnull().sum().sum():>15,}")
    print(f"   ‚Ä¢ Missing % :           {(df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100):>15.4f}%")
    
    if not skip_duplicates:
        print(f"   ‚Ä¢ Duplicate rows:       {df.duplicated().sum():>15,}")
    else:
        print(f"   ‚Ä¢ Duplicate rows:       {'Skipped (has list columns)':>15}")
    
    print(f"   ‚Ä¢ Memory usage:         {df.memory_usage(deep=True).sum() / 1024**2:>15.2f} MB")
    
    print(f"\n   Column-wise missing values:")
    for col in df.columns:
        missing = df[col].isnull().sum()
        if missing > 0:
            print(f"      ‚Ä¢ {col:20s}: {missing:>10,} ({missing/len(df)*100:.2f}%)")
        else:
            print(f"      ‚Ä¢ {col:20s}: {missing:>10,} (‚úÖ Complete)")

check_quality(ratings, "RATINGS Dataset")

print("\n   Additional checks:")
invalid_ratings = ((ratings['rating'] < 0.5) | (ratings['rating'] > 5.0)).sum()
print(f"   ‚Ä¢ Invalid ratings (not 0.5-5.0): {invalid_ratings:>6,}")

negative_ids = (ratings['userId'] < 0).sum() + (ratings['movieId'] < 0).sum()
print(f"   ‚Ä¢ Negative IDs:                  {negative_ids:>6,}")

check_quality(movies, "MOVIES Dataset", skip_duplicates=True)

print("\n   Additional checks:")
no_genre = (movies['genres'] == '(no genres listed)').sum()
print(f"   ‚Ä¢ Movies with no genres:  {no_genre:>10,} ({no_genre/len(movies)*100:.2f}%)")

movies_original_duplicates = movies[['movieId', 'title', 'genres']].duplicated().sum()
print(f"   ‚Ä¢ Duplicate movies:       {movies_original_duplicates:>10,}")

check_quality(tags, "TAGS Dataset")

print("\n   Additional checks:")
empty_tags = (tags['tag'].str.strip() == '').sum()
print(f"   ‚Ä¢ Empty tags:             {empty_tags:>10,}")

very_short_tags = (tags['tag'].str.len() < 2).sum()
print(f"   ‚Ä¢ Very short tags (<2):   {very_short_tags:>10,} ({very_short_tags/len(tags)*100:.2f}%)")

check_quality(links, "LINKS Dataset")

print("\n" + "=" * 80)
print("üìä DATASET SUMMARY")
print("=" * 80)

summary = {
    'dataset_name': 'MovieLens 25M',
    'collection_period': f"{ratings['datetime'].min()} to {ratings['datetime'].max()}",
    'total_ratings': int(n_ratings),
    'unique_users': int(n_users),
    'unique_movies': int(n_movies),
    'sparsity_percent': float(sparsity),
    'avg_rating': float(ratings['rating'].mean()),
    'median_rating': float(ratings['rating'].median()),
    'std_rating': float(ratings['rating'].std()),
    'unique_genres': int(len(genre_counts)),
    'total_tags': int(n_tags),
    'unique_tags': int(n_unique_tags),
    'movies_with_tags': int(n_movies_with_tags),
    'movies_without_genres': int(no_genre),
    'avg_ratings_per_user': float(user_counts.mean()),
    'median_ratings_per_user': float(user_counts.median()),
    'avg_ratings_per_movie': float(movie_counts.mean()),
    'median_ratings_per_movie': float(movie_counts.median()),
    'peak_year': int(peak_year),
    'peak_month': int(peak_month),
    'peak_day': peak_dow,
    'peak_hour': int(peak_hour),
    'data_quality': 'Excellent - minimal missing values'
}

print("\nüìã Key Statistics:")
for key, value in summary.items():
    if isinstance(value, float):
        print(f"   ‚Ä¢ {key:30s}: {value:>15.2f}")
    elif isinstance(value, int):
        print(f"   ‚Ä¢ {key:30s}: {value:>15,}")
    else:
        print(f"   ‚Ä¢ {key:30s}: {value:>15}")

print("\nüíæ Saving summary to file...")
os.makedirs(PROCESSED_DIR, exist_ok=True)
summary_path = os.path.join(PROCESSED_DIR, 'dataset_summary.json')

with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2, default=str)

print(f"‚úÖ Summary saved to: {summary_path}")

print("\n" + "=" * 80)
print("‚úÖ DATA LOADING & EXPLORATION COMPLETE!")
print("=" * 80)

print("\nüéØ Key Findings:")
print(f"   ‚Ä¢ {n_ratings:,} ratings from {n_users:,} users on {n_movies:,} movies")
print(f"   ‚Ä¢ Matrix is {sparsity:.2f}% sparse (challenging for CF)")
print(f"   ‚Ä¢ {len(genre_counts)} genres for content-based filtering")
print(f"   ‚Ä¢ {n_unique_tags:,} unique tags for NLP analysis")
print(f"   ‚Ä¢ Average rating: {ratings['rating'].mean():.2f}/5.0 stars")
print(f"   ‚Ä¢ Data spans {ratings['year'].max() - ratings['year'].min() + 1} years ({ratings['year'].min()}-{ratings['year'].max()})")
print(f"   ‚Ä¢ Peak activity: {peak_dow}s at {peak_hour}:00, in {month_names[peak_month-1]}, year {peak_year}")
print(f"   ‚Ä¢ Data quality: EXCELLENT (minimal missing values)")

print("\nüöÄ Next Steps:")
print("   ‚úÖ Notebook 1: Data Loading & Exploration (COMPLETE)")
print("   üìç Notebook 2: Data Preprocessing & Feature Engineering")
print("   üìç Notebook 3: EDA & Visualizations")
print("   üìç Notebook 4: Content-Based Recommender (Genres + NLP)")
print("   üìç Notebook 5: Collaborative Filtering")
print("   üìç Notebook 6: Matrix Factorization (SVD)")
print("   üìç Notebook 7: Hybrid Model & Evaluation")

print("\nüí° Ready to proceed with preprocessing!")
print("=" * 80)


‚úÖ DATA QUALITY CHECK

RATINGS Dataset:
   ‚Ä¢ Total rows:                25,000,095
   ‚Ä¢ Missing values:                     0
   ‚Ä¢ Missing % :                    0.0000%
   ‚Ä¢ Duplicate rows:                     0
   ‚Ä¢ Memory usage:                 2577.38 MB

   Column-wise missing values:
      ‚Ä¢ userId              :          0 (‚úÖ Complete)
      ‚Ä¢ movieId             :          0 (‚úÖ Complete)
      ‚Ä¢ rating              :          0 (‚úÖ Complete)
      ‚Ä¢ timestamp           :          0 (‚úÖ Complete)
      ‚Ä¢ datetime            :          0 (‚úÖ Complete)
      ‚Ä¢ year                :          0 (‚úÖ Complete)
      ‚Ä¢ month               :          0 (‚úÖ Complete)
      ‚Ä¢ day_of_week         :          0 (‚úÖ Complete)
      ‚Ä¢ hour                :          0 (‚úÖ Complete)

   Additional checks:
   ‚Ä¢ Invalid ratings (not 0.5-5.0):      0
   ‚Ä¢ Negative IDs:                       0

MOVIES Dataset:
   ‚Ä¢ Total rows:                    62,423
