In [94]:
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset

In [81]:
# Column names passed positionally (via `names=`) when reading the
# header-less MovieLens .dat files.
ratings_columns = [
    'userId',
    'movieId',
    'rating',
    'timestamp',
]
users_columns = [
    'userId',
    'gender',
    'age',
    'occupation',
    'zip_code',
]
# NOTE(review): movie_columns is not referenced by any visible cell.
movie_columns = [
    'movieId',
    'title',
    'genres',
]

def load_movielens(data_dir='data/ml-1m'):
    """Load the MovieLens ratings and users tables as numeric DataFrames.

    Reads ``ratings.dat`` and ``users.dat`` from ``data_dir``. Both files are
    parsed as ``::``-separated, header-less, latin-1 encoded text.

    Args:
        data_dir: Directory containing the ``.dat`` files. The default is the
            location this notebook previously hard-coded, so calling
            ``load_movielens()`` with no arguments behaves as before.

    Returns:
        A ``(ratings_df, users_df)`` tuple.

        ``ratings_df`` has columns ``userId`` and ``movieId`` (integers, each
        shifted down by one relative to the file) and ``rating`` (float). The
        ``timestamp`` column is dropped.

        ``users_df`` has columns ``userId`` (integer, shifted down by one),
        ``gender``, ``age`` and ``zip_code`` (label-encoded to integer codes;
        zip codes are truncated to their first two characters before
        encoding) and ``occupation`` (cast to int, otherwise unchanged).
    """
    def _read_dat(filename, columns):
        # '::' is a multi-character separator, which pandas only supports
        # with the python parsing engine.
        return pd.read_csv(
            filepath_or_buffer=f'{data_dir}/{filename}',
            sep='::',
            header=None,
            names=columns,
            encoding='latin-1',
            engine='python',
        )

    # --- ratings -----------------------------------------------------------
    ratings_df = _read_dat('ratings.dat', ratings_columns).drop(columns='timestamp')
    ratings_df['rating'] = ratings_df['rating'].astype(float)
    # Shift ids down by one so they can be used directly as 0-based indices.
    ratings_df['userId'] = ratings_df['userId'].astype(int) - 1
    ratings_df['movieId'] = ratings_df['movieId'].astype(int) - 1

    # --- users -------------------------------------------------------------
    users_df = _read_dat('users.dat', users_columns)
    users_df['userId'] = users_df['userId'].astype(int) - 1  # same shift as in ratings_df
    # A fresh LabelEncoder per column maps the sorted distinct values to 0..k-1.
    users_df['gender'] = LabelEncoder().fit_transform(users_df['gender'])
    users_df['age'] = LabelEncoder().fit_transform(users_df['age'])
    # occupation is only cast to int; it is NOT re-encoded.
    users_df['occupation'] = users_df['occupation'].astype(int)
    # Keep only the first two characters of the zip code before encoding it.
    users_df['zip_code'] = users_df['zip_code'].astype(str).str[:2]
    users_df['zip_code'] = LabelEncoder().fit_transform(users_df['zip_code'])

    return ratings_df, users_df

# Load both tables, then attach each user's attributes to every rating row
# (pandas' default inner join on the shared userId key).
ratings_df, users_df = load_movielens()
df = pd.merge(ratings_df, users_df, on='userId')

# Preview the first five merged rows.
print(df.head())


   userId  movieId  rating  gender  age  occupation  zip_code
0       0     1192     5.0       0    0          10        48
1       0      660     3.0       0    0          10        48
2       0      913     3.0       0    0          10        48
3       0     3407     4.0       0    0          10        48
4       0     2354     5.0       0    0          10        48


In [82]:
def _column_as_tensor(column, dtype):
    """Copy one column of the merged frame ``df`` into a tensor of ``dtype``."""
    return torch.tensor(df[column].values, dtype=dtype)


# Integer-coded features become int64 (torch.long) tensors; the rating
# becomes a float32 tensor.
user_ids = _column_as_tensor('userId', torch.long)
movie_ids = _column_as_tensor('movieId', torch.long)
ratings = _column_as_tensor('rating', torch.float32)
genders = _column_as_tensor('gender', torch.long)
ages = _column_as_tensor('age', torch.long)
occupations = _column_as_tensor('occupation', torch.long)
zip_codes = _column_as_tensor('zip_code', torch.long)

In [83]:
# Bundle the per-rating feature tensors; each item is one
# (user, movie, rating, gender, age, occupation, zip_code) sample.
dataset = TensorDataset(user_ids, movie_ids, ratings, genders, ages, occupations, zip_codes)

# Split the dataset into train, test, and validation sets. 70% train, 15% test, 15% validation
# The held-out 30% is split in half to give the test and validation sets.
# NOTE(review): train_test_split is given a TensorDataset rather than arrays;
# it appears to index it sample by sample and return plain Python lists of
# per-sample tuples, which is slow and produces large files at this size.
# Splitting an index array and slicing the tensors would be cheaper, but it
# changes the type of the saved objects -- confirm what downstream loaders
# expect before changing it.
train_dataset, temp_test_dataset = train_test_split(dataset, test_size=0.3, random_state=42, shuffle=True)
test_dataset, val_dataset = train_test_split(temp_test_dataset, test_size=0.5, random_state=42)

# Save the datasets
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing into it.
os.makedirs('datasets', exist_ok=True)
torch.save(train_dataset, 'datasets/train_dataset.pt')
torch.save(test_dataset, 'datasets/test_dataset.pt')
torch.save(val_dataset, 'datasets/val_dataset.pt')




In [89]:
# Save metadata
# Category counts for each feature, stored next to the datasets
# (presumably used downstream as embedding-table sizes -- confirm).
# Ids are sized by "largest index + 1" so every index that occurs is in range.
n_users = user_ids.max().item() + 1
n_movies = movie_ids.max().item() + 1
# gender, age and zip_code were label-encoded to contiguous codes 0..k-1,
# so the number of distinct values equals the number of codes.
n_genders = users_df['gender'].nunique()
n_ages = users_df['age'].nunique()
# occupation keeps its raw integer code (it is not label-encoded), so size it
# like the ids: nunique() would come up short if any code were unused.
n_occupations = int(users_df['occupation'].max()) + 1
n_zip_codes = users_df['zip_code'].nunique()
metadata = {
    'n_users': n_users,
    'n_movies': n_movies,
    'n_genders': n_genders,
    'n_ages': n_ages,
    'n_occupations': n_occupations,
    'n_zip_codes': n_zip_codes
}
torch.save(metadata, 'datasets/metadata.pt')

# Print metadata
print(f"Number of users: {n_users}")
print(f"Number of movies: {n_movies}")
print(f"Number of unique genders: {n_genders}")
print(f"Number of unique ages: {n_ages}")
print(f"Number of unique occupations: {n_occupations}")
print(f"Number of unique zip codes: {n_zip_codes}")

Number of users: 6040
Number of movies: 3952
Number of unique genders: 2
Number of unique ages: 7
Number of unique occupations: 21
Number of unique zip codes: 100


In [93]:
# NOTE(review): this import currently fails with the AttributeError shown in
# this cell's output, and nothing in the visible cells uses
# tensorflow_privacy. Presumably the installed tensorflow and
# tensorflow_privacy versions are incompatible -- confirm, then either pin
# compatible versions or drop this cell so the notebook survives
# Restart & Run All.
import tensorflow_privacy


AttributeError: module 'tensorflow.python.util.tf_export' has no attribute 'ESTIMATOR_API_NAME'