# Netflix Prize Dataset
## Extract a small sub-dataset
Here, we select the 1000 most-rated movies, using the size of each movie's rating file as a proxy for its number of ratings. Then, among those movies, we select the 1000 users who rated the most of them.

In [None]:
import os

directory = 'Raw/training_set'

# Pair every entry of the directory with its size on disk (in bytes)
file_lengths = [
    (filename, os.path.getsize(os.path.join(directory, filename)))
    for filename in os.listdir(directory)
]

# Order the (filename, size) pairs from largest to smallest
file_lengths.sort(key=lambda entry: entry[1], reverse=True)

# Keep only the 1000 largest files
top_1000_longest_files = file_lengths[:1000]

In [None]:
# Lookup tables from raw id to consecutive index, one for users and one for movies
user_mapping = dict()
movie_mapping = dict()

# Number of ratings counted so far for each raw user id
user_ratings = dict()

def count_user_ratings(file_path, counts=None):
    """Tally the ratings in one movie file per user and return the movie id.

    The first line of the file is read as ``<movie_id>:``. Every following
    non-blank line is treated as comma-separated with the user id in the
    first field; each such line adds one to that user's count.

    Args:
        file_path: Path of the movie rating file to read.
        counts: Dictionary mapping user id (str) to rating count, updated in
            place. Defaults to the module-level ``user_ratings`` dictionary.

    Returns:
        The movie id as a string (the text before the first ``:`` on the
        first line).

    Raises:
        ValueError: If the file is empty and therefore has no header line.
    """
    if counts is None:
        counts = user_ratings

    with open(file_path, 'r') as handle:
        header = handle.readline()
        if not header:
            raise ValueError(f'{file_path} is empty; expected a "<movie_id>:" header line')
        movie_id = header.split(':')[0]

        # Stream the remaining lines instead of loading the whole file
        for line in handle:
            # A blank line would otherwise be counted as a rating by user "\n"
            if not line.strip():
                continue
            user_id = line.split(',')[0]
            counts[user_id] = counts.get(user_id, 0) + 1

    return movie_id

# Walk the selected movie files in order: tally each file's per-user ratings
# and give the movie id it returns the next consecutive index
for movie_index, file_entry in enumerate(top_1000_longest_files):
    movie_path = os.path.join(directory, file_entry[0])
    movie_id = count_user_ratings(movie_path)
    movie_mapping[movie_id] = movie_index

# Keep the 1000 users with the highest rating counts, highest count first
ranked_users = sorted(user_ratings.items(), key=lambda pair: pair[1], reverse=True)
top_1000_users = ranked_users[:1000]

# Number those users consecutively, starting from 0
for user_index, user_entry in enumerate(top_1000_users):
    user_mapping[user_entry[0]] = user_index

In [None]:
import pandas as pd

dataset = []

# Loop through the top 1000 longest files
for file_entry in top_1000_longest_files:
    file_path = os.path.join(directory, file_entry[0])

    # Use a distinct name for the handle so it does not shadow the loop variable
    with open(file_path, 'r') as movie_file:
        # The first line is "<movie_id>:"; translate it to the movie index
        movie_index = movie_mapping[movie_file.readline().split(':')[0]]

        # Stream the remaining "<user_id>,<rating>,<third field>" lines
        for line in movie_file:
            # Skip blank lines, which would break the three-way unpack below
            if not line.strip():
                continue
            user_id, rating, _ = line.split(',')

            # Users outside the selected set have no index and are dropped
            user_index = user_mapping.get(user_id)
            if user_index is not None:
                dataset.append((movie_index, user_index, int(rating)))

# Create a pandas DataFrame from the dataset
df = pd.DataFrame(dataset, columns=['Movie Index', 'User Index', 'Rating'])

In [None]:
# Preview the first rows of the extracted ratings table
df.head()

In [None]:
# Inspect the data density: the share of (user, movie) pairs that carry a rating
num_ratings = len(df)
num_movies = len(movie_mapping)
num_users = len(user_mapping)

possible_pairs = num_users * num_movies
density = num_ratings / possible_pairs

print(f'Number of users: {num_users}')
print(f'Number of movies: {num_movies}')
print(f'Number of ratings: {num_ratings}')
print(f'Density: {density:.6f}')

In [None]:
# Save the dataset to a CSV file, creating the output directory if it is
# missing so that a first run does not fail on the write
os.makedirs('Small', exist_ok=True)
df.to_csv('Small/dataset.csv', index=False)

In [3]:
import pandas as pd

# Load the extracted ratings
df = pd.read_csv('Small/dataset.csv')

# Replace the values of each index column by their dense rank minus one,
# stored as integers, so that the smallest value becomes 0
for column in ('User Index', 'Movie Index'):
    df[column] = (df[column].rank(method='dense') - 1).astype(int)

# Overwrite the CSV file with the remapped dataset
df.to_csv('Small/dataset.csv', index=False)

In [4]:
# Reload the dataset from disk
df = pd.read_csv('Small/dataset.csv')

movie_column = df['Movie Index']
user_column = df['User Index']

# Smallest and largest index in each column
min_movie_id, max_movie_id = movie_column.min(), movie_column.max()
min_user_id, max_user_id = user_column.min(), user_column.max()

# Number of distinct indices in each column
unique_movie_ids = movie_column.nunique()
unique_user_ids = user_column.nunique()

# Report the ranges and the distinct counts
print("Movie ID range:", min_movie_id, "-", max_movie_id)
print("User ID range:", min_user_id, "-", max_user_id)
print("Unique Movie IDs:", unique_movie_ids)
print("Unique User IDs:", unique_user_ids)

Movie ID range: 0 - 999
User ID range: 0 - 999
Unique Movie IDs: 1000
Unique User IDs: 1000


## Extract the full dataset

In [None]:
import os
from tqdm import tqdm

directory = 'Raw/training_set'
output_filepath = 'Full/dataset.csv'

# Make sure the output directory exists before the file is created
os.makedirs(os.path.dirname(output_filepath), exist_ok=True)

# List the directory once so the progress total and the iteration agree,
# and sort the names so the output row order is reproducible
filenames = sorted(os.listdir(directory))
total_files = len(filenames)

# Open the output once (creating or truncating it) instead of re-opening it
# in append mode for every input file
with open(output_filepath, 'w') as output_file:
    output_file.write('Movie Index,User Index,Rating\n')

    # Loop through each file in the directory with a progress bar
    for filename in tqdm(filenames, total=total_files, desc='Processing files'):
        filepath = os.path.join(directory, filename)

        with open(filepath, 'r') as movie_file:
            # The first line is "<movie_id>:"
            movie_id = movie_file.readline().split(':')[0]

            # Stream the remaining "<user_id>,<rating>,<third field>" lines
            for line in movie_file:
                # Skip blank lines, which would break the three-way unpack
                if not line.strip():
                    continue
                user_id, rating, _ = line.split(',')

                # Write the raw movie id, raw user id, and rating as one CSV row
                output_file.write(f'{movie_id},{user_id},{rating}\n')

In [1]:
import pandas as pd

# Load the extracted ratings
df = pd.read_csv('Full/dataset.csv')

# Replace the values of each index column by their dense rank minus one,
# stored as integers, so that the smallest value becomes 0
for column in ('User Index', 'Movie Index'):
    df[column] = (df[column].rank(method='dense') - 1).astype(int)

# Overwrite the CSV file with the remapped dataset
df.to_csv('Full/dataset.csv', index=False)

In [2]:
# Reload the dataset from disk
df = pd.read_csv('Full/dataset.csv')

movie_column = df['Movie Index']
user_column = df['User Index']

# Smallest and largest index in each column
min_movie_id, max_movie_id = movie_column.min(), movie_column.max()
min_user_id, max_user_id = user_column.min(), user_column.max()

# Number of distinct indices in each column
unique_movie_ids = movie_column.nunique()
unique_user_ids = user_column.nunique()

# Report the ranges and the distinct counts
print("Movie ID range:", min_movie_id, "-", max_movie_id)
print("User ID range:", min_user_id, "-", max_user_id)
print("Unique Movie IDs:", unique_movie_ids)
print("Unique User IDs:", unique_user_ids)

Movie ID range: 0 - 17769
User ID range: 0 - 480188
Unique Movie IDs: 17770
Unique User IDs: 480189


In [None]:
# import pandas as pd

# # Read the dataset from the CSV file
# df = pd.read_csv('Full/dataset.csv')

In [None]:
# # Get some statistics about the data
# import json

# # Number of unique users
# num_users = df['User Index'].nunique()

# # Number of unique movies
# num_movies = df['Movie Index'].nunique()

# # Mean rating
# mean_rating = df['Rating'].astype(int).mean()

# # Density
# density = len(df) / (num_users * num_movies)

# # Save these statistics to a JSON file
# stats = {
#     'num_users': num_users,
#     'num_movies': num_movies,
#     'mean_rating': mean_rating,
#     'density': density
# }

# with open('Full/stats.json', 'w') as file:
#     json.dump(stats, file)

In [None]:
# from sklearn.model_selection import train_test_split

# # Split the dataset into train and test data
# train_data, test_data = train_test_split(df, test_size=0.25, random_state=42)

# # Print the shapes of train and test data
# print("Train data shape:", train_data.shape)
# print("Test data shape:", test_data.shape)

In [None]:
# from tqdm import tqdm

# # Group the DataFrame by User Index
# train_data = train_data.groupby('User Index')

# # Save each user's ratings to a separate CSV file
# for user_index, user_data in tqdm(train_data, desc='Saving user ratings', total=num_users):
#     user_data.to_csv(f'Full/Train/{user_index}.csv', index=False)

# # Group the DataFrame by User Index
# test_data = test_data.groupby('User Index')

# # Save each user's ratings to a separate CSV file
# for user_index, user_data in tqdm(test_data, desc='Saving user ratings', total=num_users):
#     user_data.to_csv(f'Full/Test/{user_index}.csv', index=False)

In [None]:
# # Delete all the files in the generated data directory
# import shutil

# shutil.rmtree('Full/Train')
# shutil.rmtree('Full/Test')