In [12]:
import os
import pandas as pd
import scipy
import torch
import itertools
import tqdm

**DATA PROCESSING**

*Processing Users*

In [2]:
# Specify the delimiter (e.g., ',' for CSV, '\t' for tab, or '::' for MovieLens)
file_path = 'dataset/original_1m/users.dat'
users_df = pd.read_csv(file_path, delimiter='::', engine='python', header=None)
users_df.columns = ["UserID", "Gender", "Age", "Occupation","Zip-code"]
users_df.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [3]:
# Map Gender to categorical values: F to 0, M to 1    
users_df['Gender'] = users_df['Gender'].map({'F': 0, 'M': 1})
'''
Mapping age to continuous values
	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"
'''
#Age mapping into 0-1, interval = 1/7
age_mapping = {}
age_index = [1, 18, 25, 35, 45, 50, 56]
for i in range(0, 7):
    age_mapping[age_index[i-1]] = i/6
print(age_mapping)
users_df['Age'] = users_df['Age'].map(age_mapping)

users_df.pop('Zip-code')
users_df.head()

{56: 0.0, 1: 0.16666666666666666, 18: 0.3333333333333333, 25: 0.5, 35: 0.6666666666666666, 45: 0.8333333333333334, 50: 1.0}


Unnamed: 0,UserID,Gender,Age,Occupation
0,1,0,0.166667,10
1,2,1,0.0,16
2,3,1,0.5,15
3,4,1,0.833333,7
4,5,1,0.5,20


In [4]:
# users_df.to_csv('dataset/preprocessed_data/node_users.csv', index=False)

*Processing Movies*

In [5]:
#  File being encoded in a different format, such as 'latin-1' or 'ISO-8859-1'.
file_path = 'dataset/original_1m/movies.dat'
movies_df = pd.read_csv(file_path, delimiter='::', engine='python', header=None, encoding='latin-1')
movies_df.columns = ["MovieID", "Title", "Genres"]
print(movies_df.head())

   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


In [6]:
movie_node_df = movies_df.copy(deep=True)
# TODO Kan Hon pls help map movieID to embID
movie_node_df['EmbID'] = [i for i in range(len(movie_node_df))]
movie_node_df.pop('Genres')
movie_node_df.pop('Title')

# movie_node_df.to_csv('dataset/preprocessed_data/node_movies.csv', index=False)

0                         Toy Story (1995)
1                           Jumanji (1995)
2                  Grumpier Old Men (1995)
3                 Waiting to Exhale (1995)
4       Father of the Bride Part II (1995)
                       ...                
3878               Meet the Parents (2000)
3879            Requiem for a Dream (2000)
3880                      Tigerland (2000)
3881               Two Family House (2000)
3882                 Contender, The (2000)
Name: Title, Length: 3883, dtype: object

*Process Rating Edges*

In [7]:
file_path = 'dataset/original_1m/ratings.dat'
ratings_df = pd.read_csv(file_path, delimiter='::', engine='python', header=None)
ratings_df.columns = ["UserID","MovieID","Rating","Timestamp"]
print(ratings_df.head())

   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [8]:
#Normalize by 5
ratings_df['Rating']= [x/5 for x in ratings_df['Rating']]
ratings_df.pop('Timestamp')
# ratings_df.to_csv('dataset/preprocessed_data/u2m_ratings.csv', index=False)


0          978300760
1          978302109
2          978301968
3          978300275
4          978824291
             ...    
1000204    956716541
1000205    956704887
1000206    956704746
1000207    956715648
1000208    956715569
Name: Timestamp, Length: 1000209, dtype: int64

*Processing Genre Edges*

In [13]:
genre_dict = {}
for i in tqdm.tqdm(range(len(movies_df))):
    all_cur_genres = movies_df.iloc[i]['Genres'].split('|')
    for genre in all_cur_genres:
        if genre not in genre_dict:
            genre_dict[genre] = [movies_df.iloc[i]['MovieID']]
        else:
            genre_dict[genre].append(movies_df.iloc[i]['MovieID'])

  0%|          | 0/3883 [00:00<?, ?it/s]

100%|██████████| 3883/3883 [00:02<00:00, 1724.32it/s]


In [17]:
genre_dict.keys()
all_genres = list(genre_dict.keys())
with open('dataset/preprocessed_data/genre_mapping.txt', 'w') as f:
    for i, genre in enumerate(all_genres):
        f.write(genre + ' ' + str(i) + '\n')

# Save dictionary
import pickle
with open('dataset/preprocessed_data/genre_dict.pkl', 'wb') as f:
    pickle.dump(genre_dict, f)

In [25]:
genre_edge_df = pd.DataFrame(columns=['MovieID_0', 'MovieID_1', 'GenreID'])
genre_output_path = 'dataset/preprocessed_data/m2m_genre.csv'
movieid_0 = []
movieid_1 = []
genreids = []
for genre in tqdm.tqdm(all_genres):
    cur_genreid = all_genres.index(genre)
    combins = list(itertools.combinations(genre_dict[genre], 2))
    movieid_0.extend([x[0] for x in combins])
    movieid_1.extend([x[1] for x in combins])
    genreids.extend([cur_genreid for x in range(len(combins))])

genre_edge_df['MovieID_0'] = movieid_0
genre_edge_df['MovieID_1'] = movieid_1
genre_edge_df['GenreID'] = genreids
genre_edge_df.to_csv(genre_output_path, index=False)

  0%|          | 0/18 [00:00<?, ?it/s]

100%|██████████| 18/18 [00:01<00:00,  9.82it/s]


*Processing kNN Edges*

Processing Genre+Plot_kNN Edges