In [None]:
import pandas as pd
import torch 

## Dataset Analysis
This notebook analyzes the train, validation, and test splits of the dataset and extracts key summary statistics for the report.

In [2]:
# Read the serialized train / validation / test splits from disk, in that order.
# NOTE(review): torch.load unpickles its input, so only point it at trusted files.
train_data, val_data, test_data = (
    torch.load(path)
    for path in (
        'datasets/train_dataset.pt',
        'datasets/val_dataset.pt',
        'datasets/test_dataset.pt',
    )
)

In [3]:
# Column labels applied to every split. The spelling of `colums` is kept as-is
# so that any other cell referring to this name keeps working.
colums = ['userId', 'movieId', 'rating', 'gender', 'age', 'occupation', 'zip_code']


def _as_frame(dataset, column_names):
    """Convert one loaded split into a DataFrame of plain Python values.

    Args:
        dataset: Object returned by ``torch.load`` for one split; it is passed
            straight to ``pd.DataFrame``.
        column_names: Labels assigned to the resulting columns, in order.

    Returns:
        A DataFrame in which every tensor cell has been replaced by the result
        of its ``.item()`` call and whose columns are ``column_names``.
    """
    frame = pd.DataFrame(dataset)
    # Element-wise unwrap: tensor cells become Python scalars, everything else
    # is passed through untouched. (DataFrame.map was named applymap before
    # pandas 2.1.)
    frame = frame.map(lambda x: x.item() if torch.is_tensor(x) else x)
    frame.columns = column_names
    return frame


# Same conversion for all three splits; the names are rebound from the raw
# loaded objects to DataFrames, which is what the following cells expect.
train_data = _as_frame(train_data, colums)
val_data = _as_frame(val_data, colums)
test_data = _as_frame(test_data, colums)

train_data.head()

Unnamed: 0,userId,movieId,rating,gender,age,occupation,zip_code
0,3313,2786,2.0,1,2,7,6
1,47,646,3.0,1,2,4,92
2,3769,1078,4.0,0,3,3,50
3,523,1125,1.0,1,1,0,91
4,3433,2469,3.0,1,1,4,48


In [5]:
def summarize_datasets(train_df, val_df, test_df, gender_col='gender', occupation_col='occupation',
                       age_col='age', zip_col='zip_code', user_col='userId', movie_col='movieId',
                       rating_col='rating'):
    """Build a table of descriptive statistics for each dataset split.

    The same metrics are computed for the train, validation and test frames
    and for their row-wise concatenation (reported as ``'Total'``).

    Args:
        train_df: DataFrame holding the training split.
        val_df: DataFrame holding the validation split.
        test_df: DataFrame holding the test split.
        gender_col: Column whose value ``0`` is reported as ``'Female %'`` and
            whose value ``1`` is reported as ``'Male %'``.
        occupation_col: Column used for the occupation mode / unique count.
        age_col: Column averaged into ``'Avg Age'``.
        zip_col: Column used for the zip-code mode / unique count.
        user_col: Column identifying the user of each rating row.
        movie_col: Column identifying the movie of each rating row.
        rating_col: Column holding the rating value.

    Returns:
        A DataFrame indexed by ``'Train'``, ``'Validation'``, ``'Test'`` and
        ``'Total'`` with one column per metric. Count-like metrics keep an
        integer dtype.
    """
    def _most_common(series):
        """Return the first modal value of ``series``, or None if it has none."""
        modes = series.mode()
        return None if modes.empty else modes.iloc[0]

    def compute_metrics(df):
        """Compute the metric dictionary for a single frame."""
        gender_counts = df[gender_col].value_counts(normalize=True)
        ratings = df[rating_col]
        result = {
            'Count': int(len(df)),
            'User Count': int(df[user_col].nunique()),
            'Movie Count': int(df[movie_col].nunique()),
            # count() skips missing values, so this can be lower than 'Count'.
            'Rating Count': int(ratings.count()),
            'Female %': round(gender_counts.get(0, 0) * 100, 2),
            'Male %': round(gender_counts.get(1, 0) * 100, 2),
            'Most Common Occupation': _most_common(df[occupation_col]),
            'Unique Occupations': int(df[occupation_col].nunique()),
            # NOTE(review): this is the plain mean of age_col; if that column
            # stores encoded age groups the value is a mean of codes - confirm.
            'Avg Age': round(df[age_col].mean(), 2),
            'Most Common Zip Code': _most_common(df[zip_col]),
            'Unique Zip Codes': int(df[zip_col].nunique()),
            'Average Rating': round(ratings.mean(), 2),
            'Median Rating': round(ratings.median(), 2),
            'Rating Std Dev': round(ratings.std(), 2),
            'Rating Variance': round(ratings.var(), 2),
            'Avg Number of Ratings per User': round(df.groupby(user_col)[rating_col].count().mean(), 2),
            'Avg Number of Ratings per Movie': round(df.groupby(movie_col)[rating_col].count().mean(), 2),
        }
        return result

    splits = {
        'Train': train_df,
        'Validation': val_df,
        'Test': test_df,
    }

    # Combine all datasets for "Total"
    splits['Total'] = pd.concat([train_df, val_df, test_df])

    # One row per split, one column per metric. Building the frame row-wise
    # lets pandas infer a dtype per metric; building it column-per-split and
    # transposing would mix ints and floats in one column and turn every
    # count into a float.
    rows = [compute_metrics(frame) for frame in splits.values()]
    return pd.DataFrame(rows, index=list(splits))

# Summarize the three converted splits (plus their union); the resulting
# table is this cell's displayed output.
summary_df = summarize_datasets(
    train_df=train_data,
    val_df=val_data,
    test_df=test_data,
)
summary_df.head()

Unnamed: 0,Count,User Count,Movie Count,Rating Count,Female %,Male %,Most Common Occupation,Unique Occupations,Avg Age,Most Common Zip Code,Unique Zip Codes,Average Rating,Median Rating,Rating Std Dev,Rating Variance,Avg Number of Ratings per User,Avg Number of Ratings per Movie
Train,700146.0,6040.0,3664.0,700146.0,24.62,75.38,4.0,21.0,2.5,55.0,100.0,3.58,4.0,1.12,1.25,115.92,191.09
Validation,150032.0,6030.0,3408.0,150032.0,24.81,75.19,0.0,21.0,2.5,55.0,100.0,3.58,4.0,1.12,1.25,24.88,44.02
Test,150031.0,6021.0,3409.0,150031.0,24.56,75.44,4.0,21.0,2.5,55.0,100.0,3.58,4.0,1.12,1.26,24.92,44.01
Total,1000209.0,6040.0,3706.0,1000209.0,24.64,75.36,4.0,21.0,2.5,55.0,100.0,3.58,4.0,1.12,1.25,165.6,269.89
