In [102]:
import os
import re
import argparse
import pickle
import pandas as pd
import numpy as np
import scipy.sparse as ssp
import dgl
import torch
import torchtext
from builder import PandasGraphBuilder
from data_utils import *

In [103]:
# get directory of data files
directory = './ml-1m'
# get output file path
output_path = './processed.pkl'

In [104]:
users = []
with open(os.path.join(directory, 'users.dat'), encoding='latin1') as f:
    # UserID::Gender::Age::Occupation::Zip-code
    for l in f:
        id_, gender, age, occupation, zip_ = l.strip().split('::')
        users.append({
            'user_id': int(id_),
            'gender': gender,
            'age': age,
            'occupation': occupation,
            'zip': zip_,
            })
users = pd.DataFrame(users).astype('category')

In [105]:
users

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [106]:
movies = []
with open(os.path.join(directory, 'movies.dat'), encoding='latin1') as f:
    for l in f:
        # MovieID::Title::Genres
        id_, title, genres = l.strip().split('::')
        
        # get unique genres
        genres_set = set(genres.split('|'))

        # extract year
        assert re.match(r'.*\([0-9]{4}\)$', title)
        year = title[-5:-1]
        title = title[:-6].strip()

        data = {'movie_id': int(id_), 'title': title, 'year': year}
        for g in genres_set:
            data[g] = True
        movies.append(data)
movies = pd.DataFrame(movies).astype({'year': 'category'})

In [107]:
movies

Unnamed: 0,movie_id,title,year,Comedy,Children's,Animation,Adventure,Fantasy,Romance,Drama,...,Thriller,Crime,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story,1995,True,True,True,,,,,...,,,,,,,,,,
1,2,Jumanji,1995,,True,,True,True,,,...,,,,,,,,,,
2,3,Grumpier Old Men,1995,True,,,,,True,,...,,,,,,,,,,
3,4,Waiting to Exhale,1995,True,,,,,,True,...,,,,,,,,,,
4,5,Father of the Bride Part II,1995,True,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents,2000,True,,,,,,,...,,,,,,,,,,
3879,3949,Requiem for a Dream,2000,,,,,,,True,...,,,,,,,,,,
3880,3950,Tigerland,2000,,,,,,,True,...,,,,,,,,,,
3881,3951,Two Family House,2000,,,,,,,True,...,,,,,,,,,,


In [108]:
ratings = []
with open(os.path.join(directory, 'ratings.dat'), encoding='latin1') as f:
    # UserID::MovieID::Rating::Timestamp
    for l in f:
        user_id, movie_id, rating, timestamp = [int(_) for _ in l.split('::')]
        ratings.append({
            'user_id': user_id,
            'movie_id': movie_id,
            'rating': rating,
            'timestamp': timestamp,
            })
ratings = pd.DataFrame(ratings)

In [109]:
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [110]:
# Filter the users and items that never appear in the rating table.
distinct_users_in_ratings = ratings['user_id'].unique()
distinct_movies_in_ratings = ratings['movie_id'].unique()
users = users[users['user_id'].isin(distinct_users_in_ratings)]
movies = movies.copy()[movies['movie_id'].isin(distinct_movies_in_ratings)]

In [111]:
users

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [112]:
movies

Unnamed: 0,movie_id,title,year,Comedy,Children's,Animation,Adventure,Fantasy,Romance,Drama,...,Thriller,Crime,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story,1995,True,True,True,,,,,...,,,,,,,,,,
1,2,Jumanji,1995,,True,,True,True,,,...,,,,,,,,,,
2,3,Grumpier Old Men,1995,True,,,,,True,,...,,,,,,,,,,
3,4,Waiting to Exhale,1995,True,,,,,,True,...,,,,,,,,,,
4,5,Father of the Bride Part II,1995,True,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents,2000,True,,,,,,,...,,,,,,,,,,
3879,3949,Requiem for a Dream,2000,,,,,,,True,...,,,,,,,,,,
3880,3950,Tigerland,2000,,,,,,,True,...,,,,,,,,,,
3881,3951,Two Family House,2000,,,,,,,True,...,,,,,,,,,,


In [119]:
# Group the movie features into genres (a vector), year (a category), title (a string)
genre_columns = movies.columns.drop(['movie_id', 'title', 'year'])
movies.loc[:, genre_columns] = movies.loc[:, genre_columns].fillna(False).astype(bool)
movies_categorical = movies.drop('title', axis=1)

In [121]:
# Build graph
graph_builder = PandasGraphBuilder()
graph_builder.add_entities(users, 'user_id', 'user')
graph_builder.add_entities(movies_categorical, 'movie_id', 'movie')
graph_builder.add_binary_relations(ratings, 'user_id', 'movie_id', 'watched')
graph_builder.add_binary_relations(ratings, 'movie_id', 'user_id', 'watched-by')

In [122]:
g = graph_builder.build()

In [124]:
g

Graph(num_nodes={'movie': 3706, 'user': 6040},
      num_edges={('movie', 'watched-by', 'user'): 1000209, ('user', 'watched', 'movie'): 1000209},
      metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'watched')])