# MovieLens Dataset Exploration

This notebook explores the MovieLens dataset to understand its structure and characteristics.

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from pathlib import Path

# Add the project root to the Python path
notebook_path = Path(os.getcwd())
project_root = notebook_path.parent
sys.path.append(str(project_root))

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)
%matplotlib inline

In [None]:
# Load configuration
def load_config(config_path="configs/data_config.yaml"):
    """Read a YAML configuration file located relative to the project root.

    Args:
        config_path: Path to the YAML file, joined onto ``project_root``.

    Returns:
        The parsed document exactly as produced by ``yaml.safe_load``.
    """
    config_file = project_root / config_path
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default text encoding.
    with open(config_file, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)


config = load_config()
movielens_config = config["movielens"]
# Release identifier; the loader uses it to pick the ``ml-<size>`` folder and layout.
dataset_size = movielens_config["dataset_size"]
# Directory that holds the raw dataset folders, resolved against the project root.
raw_dir = project_root / movielens_config["paths"]["raw"]

## Loading the MovieLens Dataset

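Let's load the ratings, movies, and users data. The file format depends on the dataset size, and users data is only loaded for the 100K and 1M releases; for the larger datasets `users_df` is `None`.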

In [None]:
def load_movielens_data(size=None, data_dir=None):
    """Load the MovieLens ratings, movies and (when available) users tables.

    Every returned frame uses snake_case key columns (``user_id``,
    ``movie_id``) regardless of the release, so downstream cells can rely on
    a single set of column names.

    Args:
        size: Release identifier that selects the ``ml-<size>`` folder and the
            on-disk layout ("100k", "1m", otherwise the CSV layout).
            Defaults to the notebook-level ``dataset_size``.
        data_dir: Directory containing the ``ml-<size>`` folder. Defaults to
            the notebook-level ``raw_dir``.

    Returns:
        Tuple ``(ratings, movies, users)`` of DataFrames. ``users`` is
        ``None`` for the CSV layout, where no users file is read.
    """
    size = dataset_size if size is None else size
    data_dir = raw_dir if data_dir is None else data_dir
    dataset_path = Path(data_dir) / f"ml-{size}"

    # Different file formats and structures for different dataset sizes
    if size == "100k":
        # MovieLens 100K: header-less, tab-separated ratings and
        # pipe-separated movies/users.
        ratings = pd.read_csv(
            dataset_path / "u.data",
            sep="\t",
            names=["user_id", "movie_id", "rating", "timestamp"],
        )

        # Five descriptive columns followed by 19 genre flag columns
        # (encoding Latin-1 for special characters in titles).
        movies_cols = [
            "movie_id", "title", "release_date", "video_release_date", "IMDb_URL",
        ] + [f"genre_{i}" for i in range(19)]
        movies = pd.read_csv(
            dataset_path / "u.item", sep="|", encoding="latin-1", names=movies_cols
        )

        users = pd.read_csv(
            dataset_path / "u.user",
            sep="|",
            names=["user_id", "age", "gender", "occupation", "zip_code"],
        )

    elif size == "1m":
        # MovieLens 1M uses a double-colon separator; a multi-character
        # separator requires the python parsing engine.
        ratings = pd.read_csv(
            dataset_path / "ratings.dat",
            sep="::",
            engine="python",
            names=["user_id", "movie_id", "rating", "timestamp"],
        )

        movies = pd.read_csv(
            dataset_path / "movies.dat",
            sep="::",
            engine="python",
            encoding="latin-1",
            names=["movie_id", "title", "genres"],
        )

        users = pd.read_csv(
            dataset_path / "users.dat",
            sep="::",
            engine="python",
            names=["user_id", "gender", "age", "occupation", "zip_code"],
        )

    else:  # 20m and 25m have similar CSV structure
        # The CSV files carry their own camelCase headers (userId, movieId).
        # Rename them to the snake_case names used by the other branches and
        # by every analysis cell; otherwise those cells fail with KeyError.
        key_columns = {"userId": "user_id", "movieId": "movie_id"}
        ratings = pd.read_csv(dataset_path / "ratings.csv").rename(columns=key_columns)
        movies = pd.read_csv(dataset_path / "movies.csv").rename(columns=key_columns)

        # No users file in larger datasets
        users = None

    return ratings, movies, users

# Load the dataset once; the cells below read these three names
# (users_df is None when the loader takes its CSV branch).
ratings_df, movies_df, users_df = load_movielens_data()

## Data Overview

Let's explore the basic statistics and structures of our datasets.

In [None]:
# Headline facts first, then a rich preview followed by the numeric summary.
print(
    "=== Ratings Dataset ===",
    f"Shape: {ratings_df.shape}",
    "\nFirst 5 rows:",
    sep="\n",
)
display(ratings_df.head())

print("\nSummary statistics:")
display(ratings_df.describe())

In [None]:
# Headline facts first, then a rich preview followed by the column dtypes.
print(
    "=== Movies Dataset ===",
    f"Shape: {movies_df.shape}",
    "\nFirst 5 rows:",
    sep="\n",
)
display(movies_df.head())

print("\nData types:")
display(movies_df.dtypes)

In [None]:
# The users table is optional: the loader returns None when no users file is read.
if users_df is not None:
    print("=== Users Dataset ===")
    print(f"Shape: {users_df.shape}")
    print("\nFirst 5 rows:")
    display(users_df.head())
    print("\nData types:")
    display(users_df.dtypes)
else:
    # Say so explicitly, so an empty cell output is not mistaken for a failure.
    print("=== Users Dataset ===")
    print("No users data was loaded for this dataset size.")

## Data Analysis

Let's analyze key aspects of the dataset.

In [None]:
# Rating distribution: one bar per distinct rating value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=ratings_df, x='rating', ax=ax)
ax.set(title='Distribution of Ratings', xlabel='Rating', ylabel='Count')
plt.show()

In [None]:
# User activity distribution: number of ratings submitted by each user.
user_activity = (
    ratings_df['user_id']
    .value_counts()
    .rename_axis('user_id')
    .reset_index(name='rating_count')
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=user_activity, x='rating_count', bins=50, ax=ax)
ax.set(
    title='Distribution of User Activity (Number of Ratings per User)',
    xlabel='Number of Ratings',
    ylabel='Count of Users',
)
plt.show()

In [None]:
# Movie popularity distribution: number of ratings received by each movie.
movie_popularity = (
    ratings_df['movie_id']
    .value_counts()
    .rename_axis('movie_id')
    .reset_index(name='rating_count')
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=movie_popularity, x='rating_count', bins=50, ax=ax)
ax.set(
    title='Distribution of Movie Popularity (Number of Ratings per Movie)',
    xlabel='Number of Ratings',
    ylabel='Count of Movies',
)
plt.show()

In [None]:
# Top-rated movies (with minimum 100 ratings)
# Per-movie mean rating and number of ratings, one row per movie_id.
movie_stats = (
    ratings_df.groupby('movie_id')['rating']
    .agg(avg_rating='mean', rating_count='count')
    .reset_index()
)

# Keep movies with enough ratings, take the 20 best averages, then attach titles.
popular_movies = (
    movie_stats[movie_stats['rating_count'] >= 100]
    .sort_values('avg_rating', ascending=False)
    .head(20)
    .merge(movies_df[['movie_id', 'title']], on='movie_id')
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=popular_movies, x='avg_rating', y='title', ax=ax)
ax.set(
    title='Top 20 Highest Rated Movies (with at least 100 ratings)',
    xlabel='Average Rating',
    ylabel='Movie Title',
)
fig.tight_layout()
plt.show()

## Data Insights Summary

Key observations from our data exploration:

1. Dataset size and completeness
2. Rating distribution patterns
3. User activity patterns
4. Movie popularity distribution
5. Popular and highly-rated movies

Next steps:
- Data preprocessing and cleaning
- Feature engineering
- Initial model implementation