Reduce the size of the MovieLens ratings data by random subsampling.
Data Source: https://files.grouplens.org/datasets/movielens/ml-32m.zip

Read in all ~32 million ratings and randomly sample 256K (262,144) of them.

In [2]:
import pandas as pd
import numpy as np

# Down-sample the full MovieLens ratings file to a fixed-size random subset,
# print summary statistics for the full data and the sample, and write the
# sample to a new CSV.

# --- Configuration ---
# A single seed feeds both NumPy's global RNG and DataFrame.sample, so the
# two values cannot drift apart when one of them is edited.
random_seed = 42
input_path = 'data/ml-32m/ratings.csv'
output_path = 'data/ratings-256k.csv'
sample_size = 262144  # "256K" = 256 * 1024 rows

# Set random seed for reproducibility. The sample below is seeded explicitly
# through random_state; the global seed is kept for any code that draws from
# np.random directly.
np.random.seed(random_seed)

# Read the large ratings file
print("Reading ratings data...")
ratings_full = pd.read_csv(input_path)

print(f"Original dataset shape: {ratings_full.shape}")
print(f"Users: {ratings_full['userId'].nunique()}")
print(f"Movies: {ratings_full['movieId'].nunique()}")
print(f"Total ratings: {len(ratings_full)}")

# Sampling without replacement cannot return more rows than exist. Fail with
# a message that names both numbers instead of the generic pandas error.
if sample_size > len(ratings_full):
    raise ValueError(
        f"Cannot sample {sample_size:,} rows from a dataset with only "
        f"{len(ratings_full):,} rows"
    )

# Randomly sample 256K entries (without replacement)
print(f"\nSampling {sample_size:,} entries...")

ratings_sample = ratings_full.sample(n=sample_size, random_state=random_seed)

# Check sample statistics
print(f"\nSample dataset shape: {ratings_sample.shape}")
print(f"Sample users: {ratings_sample['userId'].nunique()}")
print(f"Sample movies: {ratings_sample['movieId'].nunique()}")
print("Rating distribution:")
print(ratings_sample['rating'].value_counts().sort_index())

# Save to new file; index=False drops the original row labels from the CSV
ratings_sample.to_csv(output_path, index=False)
print(f"\nSample saved to: {output_path}")

# Display first few rows
print("\nFirst 5 rows of sample:")
print(ratings_sample.head())


Reading ratings data...
Original dataset shape: (32000204, 4)
Users: 200948
Movies: 84432
Total ratings: 32000204

Sampling 262,144 entries...

Sample dataset shape: (262144, 4)
Sample users: 102430
Sample movies: 16462
Rating distribution:
rating
0.5     4343
1.0     7883
1.5     4326
2.0    16357
2.5    13850
3.0    49698
3.5    35177
4.0    68521
4.5    24411
5.0    37578
Name: count, dtype: int64

Sample saved to: data/ratings-256k.csv

First 5 rows of sample:
          userId  movieId  rating   timestamp
10685861   66954      781     5.0   850944577
1552723     9877      574     4.0   945495614
6145184    38348     1088     2.0   999974867
16268584  101952     2706     1.0  1203077565
22418634  140400   275079     3.5  1653782463
