# Integrate ArangoDB with PyTorch Geometric to Build Recommendation Systems

References:
* https://sachinsharma9780.medium.com/integrate-arangodb-with-pytorch-geometric-to-build-recommendation-systems-dd69db688465
* https://python.plainenglish.io/python-for-datascientist-ignoring-warnings-can-backfire-d0463cdf4364

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from arango import ArangoClient
from tqdm import tqdm
import numpy as np
import itertools
import requests
import sys
# pip install oasis -> # oasis-0.1.3 sklearn-0.0
import oasis
from arango import ArangoClient

import torch
import torch.nn.functional as F
from torch.nn import Linear
from arango import ArangoClient
import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from sentence_transformers import SentenceTransformer
from torch_geometric.data import HeteroData
import yaml

2022-04-15 16:20:07.507889: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.10.1.dylib


In [2]:
## 1. Load the data present in the CSV files (to be stored in ArangoDB later)
metadata_path = './sampled_movie_dataset/movies_metadata.csv'
df = pd.read_csv(metadata_path)
# on these rows metadata information is missing
df = df.drop([19730, 29503, 35587])
# sampled from links.csv file
links_small = pd.read_csv('./sampled_movie_dataset/links_small.csv')
# keep only the non-null values of the tmdbId column, cast to integers
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
df['id'] = df['id'].astype('int')
# Take an explicit copy of the filtered rows: assigning into a slice of `df`
# is chained assignment, which warns on older pandas and silently does
# nothing under pandas Copy-on-Write.
sampled_md = df[df['id'].isin(links_small)].copy()
# Reassign the result instead of calling fillna(inplace=True) on a temporary
# column object, so the frame is guaranteed to be updated.
sampled_md['tagline'] = sampled_md['tagline'].fillna('')
# NOTE: NaN + str is NaN, so a row with a missing overview gets an empty
# description even when it has a tagline (unchanged from the original code).
sampled_md['description'] = (sampled_md['overview'] + sampled_md['tagline']).fillna('')
sampled_md = sampled_md.reset_index()
# positional row index of sampled_md, looked up by title / by genres
indices = pd.Series(sampled_md.index, index=sampled_md['title'])
ind_gen = pd.Series(sampled_md.index, index=sampled_md['genres'])

In [3]:
## 2. Load the ratings file
# ratings_path is reused below by node_mappings(); ratings_df holds the full
# contents of the ratings CSV as a DataFrame.
ratings_path = './sampled_movie_dataset/ratings_small.csv'
ratings_df = pd.read_csv(ratings_path)
# builds the user / movie id -> consecutive index mappings
def node_mappings(path, index_col):
    """Map each distinct value of ``index_col`` to a consecutive integer.

    Args:
        path: Path of the CSV file to read.
        index_col: Column of the CSV used as the DataFrame index.

    Returns:
        A dict ``{value: position}`` where positions start at 0 and follow
        the order in which each distinct value first appears in the file.
    """
    frame = pd.read_csv(path, index_col=index_col)
    distinct_ids = frame.index.unique()
    return dict(zip(distinct_ids, range(len(distinct_ids))))

# id -> consecutive index for every user and every movie in the ratings file
movie_mapping = node_mappings(ratings_path, index_col='movieId')
user_mapping = node_mappings(ratings_path, index_col='userId')
# all unique movie ids present in the ratings file; dict.fromkeys drops
# duplicates while keeping first-seen order
m_id = list(dict.fromkeys(ratings_df['movieId'].tolist()))
print(f'len(m_id): {len(m_id)}')

def convert_int(x):
    """Convert ``x`` to ``int``, or return ``np.nan`` if it is not convertible.

    Args:
        x: Any value accepted by ``int()`` (number, numeric string, ...).

    Returns:
        ``int(x)`` on success, otherwise ``np.nan``.
    """
    try:
        return int(x)
    # Only the failures int() itself produces are treated as "not a number":
    # TypeError (e.g. None), ValueError (e.g. "abc", NaN), OverflowError (inf).
    # A bare except would also swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return np.nan

# Build id_map: one row per movie of links_small that also appears in
# sampled_md, indexed by title, with columns movieId and id.
# The tmdbId of links_small is the same identifier as `id` in sampled_md, so
# the column is renamed to `id` and used as the merge key.
id_map = (
    pd.read_csv('./sampled_movie_dataset/links_small.csv')[['movieId', 'tmdbId']]
    .assign(tmdbId=lambda links: links['tmdbId'].apply(convert_int))
    .rename(columns={'tmdbId': 'id'})
    .merge(sampled_md[['title', 'id']], on='id')
    .set_index('title')
)
# same rows as id_map, indexed by `id` instead of title
indices_map = id_map.set_index('id')

len(m_id): 9066


In [None]:
## 3. ArangoDB setup
# get temporary credentials for ArangoDB on cloud
login = oasis.getTempCredentials(
    tutorialName="MovieRecommendations",
    credentialProvider="https://tutorials.arangodb.cloud:8529/_db/_system/tutorialDB/tutorialDB",
)

# Connect to the temp database
# Please note that we use the python-arango driver as it has better support for ArangoSearch
movie_rec_db = oasis.connect_python_arango(login)
# URL to access the ArangoDB Web UI, followed by the login details
# NOTE(review): the password is printed in clear text to the cell output.
print(f'https://{login["hostname"]}:{login["port"]}')
print(f'Username: {login["username"]}')
print(f'Password: {login["password"]}')
print(f'Database: {login["dbName"]}')
# find the ids which don't have metadata information
def remove_movies(m_id, movie_ids_with_metadata=None):
    """Return the ids in ``m_id`` that have no metadata entry.

    Despite its name, this function removes nothing: ``m_id`` is left
    untouched and the caller decides what to do with the returned ids.

    Args:
        m_id: Iterable of movie ids to check.
        movie_ids_with_metadata: Optional iterable of the movie ids that do
            have metadata. Defaults to the ``movieId`` column of the
            module-level ``id_map`` DataFrame.

    Returns:
        A list of the ids from ``m_id`` that are absent from the known ids,
        in their original order.
    """
    if movie_ids_with_metadata is None:
        movie_ids_with_metadata = id_map['movieId'].tolist()
    # Build the lookup set once instead of filtering the DataFrame per id.
    known_ids = set(movie_ids_with_metadata)
    return [movie_id for movie_id in m_id if movie_id not in known_ids]
no_metadata = remove_movies(m_id)
## drop, in place, every id that has no metadata information
for missing_id in no_metadata:
    if missing_id not in m_id:
        continue
    print("ids with no metadata information:", missing_id)
    m_id.remove(missing_id)
# movie id -> consecutive index, restricted to the ids that have metadata
# (distinct from movie_mapping, which covers every movie in the ratings file)
movie_mappings = {movie_id: position for position, movie_id in enumerate(m_id)}