In [39]:
# Install cornac
!pip install --quiet cornac==1.14.2 adjustText


In [40]:
# Mount Google Drive at /content/drive and switch the working directory to the
# project folder, so that the relative data paths configured in this notebook
# resolve against it. Colab-only: relies on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Recommender/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Recommender


In [43]:
# Path configuration: locations of the two rating CSV files, relative to the
# current working directory (the Drive project folder once the mount cell has
# run). Both files are read with pandas for exploration and with cornac's
# Reader for modelling.
ip_train_path = "./Assign1/cs608_ip_train_v3.csv"  # training ratings
ip_probe_path = "./Assign1/cs608_ip_probe_v3.csv"  # probe ratings, used as the test split

In [42]:
# Import the libraries used throughout the notebook.
# NOTE(review): os, sys, movielens, RatioSplit, MF, NMF, WMF and tensorflow are
# not referenced by any cell visible in this notebook -- confirm before pruning.
import os
import sys

# Data handling (pandas / numpy) and the cornac recommender toolkit.
import numpy as np
import pandas as pd
import cornac
from cornac.data import Reader
from cornac.datasets import movielens
from cornac.eval_methods import RatioSplit, BaseMethod
from cornac.models import MF, NMF, BPR, WMF

# Notebook magic requesting the TensorFlow 1.x line before importing it.
# NOTE(review): presumably only available on Colab runtimes that still ship
# TensorFlow 1.x -- confirm it works on the current runtime.
%tensorflow_version 1.x
import tensorflow as tf

# Passed as `verbose=` to the BPR model; False keeps per-iteration logging off.
VERBOSE = False

Data Preparation & Exploration

In [44]:
# Load both rating files into pandas DataFrames for the exploration cells.
train_data, probe_data = (
    pd.read_csv(csv_path) for csv_path in (ip_train_path, ip_probe_path)
)

In [47]:
# Report (rows, columns) for each table.
train_shape, probe_shape = train_data.shape, probe_data.shape
print("Train data:", train_shape)
print("Probe data:", probe_shape)

Train data: (147244, 3)
Probe data: (79118, 3)


In [46]:
# Count the distinct values in every column (users, items, rating levels) of
# each table. A single print with sep="\n" reproduces the line-by-line layout,
# including the blank lines emitted between the two reports.
train_unique = train_data.nunique()
probe_unique = probe_data.nunique()
print("Train data:", train_unique, "\n", "Probe data:", probe_unique, sep="\n")

Train data:
user_id    46612
item_id    21820
rating         5
dtype: int64


Probe data:
user_id    46061
item_id    18230
rating         5
dtype: int64


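The probe set holds 79,118 ratings from 46,061 users on 18,230 items, and the evaluation log further below reports no unknown users or items relative to the training set, so it is large and representative enough to be used as test data.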

In [48]:
# Count missing cells (over all rows and columns) in each table.
train_null_cells = int(train_data.isnull().to_numpy().sum())
probe_null_cells = int(probe_data.isnull().to_numpy().sum())
print("Train data:", train_null_cells)
print("Probe data:", probe_null_cells)

Train data: 0
Probe data: 0


In [49]:
# Count fully duplicated rows in each table.
# Each frame must be filtered with its OWN duplicated() mask: the previous
# version indexed probe_data with train_data.duplicated(), which reports on
# the training rows (re-aligned by index, with a pandas warning) and therefore
# never checked the probe set for duplicates at all.
print("Train data:", train_data.duplicated().sum())
print("Probe data:", probe_data.duplicated().sum())

Train data: 0
Probe data: 0


  This is separate from the ipykernel package so we can avoid doing imports until


In [50]:
# Show the column dtypes of both tables, laid out as two titled reports
# separated by blank lines.
dtype_reports = (
    ("Train data:", train_data.dtypes),
    ("Probe data:", probe_data.dtypes),
)
print("\n\n\n".join(f"{title}\n{dtypes}" for title, dtypes in dtype_reports))

Train data:
user_id    int64
item_id    int64
rating     int64
dtype: object


Probe data:
user_id    int64
item_id    int64
rating     int64
dtype: object


In [51]:
# Check for abnormal rating values (e.g. negative ratings) by listing the
# distinct ratings that occur in each table.
for split_name, ratings_frame in (("train", train_data), ("probe", probe_data)):
    print(f'Unique ratings in {split_name} data:', ratings_frame['rating'].unique())

Unique ratings in train data: [4 5 1 3 2]
Unique ratings in probe data: [4 2 5 1 3]


We did not find issues with the datasets. They are ready to be used for recommendation models.

Building recommendation models: the cells below train and evaluate the model configuration that achieved the best results.

In [52]:
# Re-read both CSV files with cornac's Reader, skipping the first (header)
# line. Note that train_data / probe_data are rebound here: from this cell on
# they hold the Reader output rather than the pandas DataFrames loaded earlier.
reader = Reader()
train_data, probe_data = (
    reader.read(csv_path, sep=',', skip_lines=1)
    for csv_path in (ip_train_path, ip_probe_path)
)

In [53]:
# Train a BPR model on the training split and evaluate it on the probe split.

# Hyperparameters (forwarded to the BPR constructor below).
K = 400                # passed as k
Max_iter = 10000       # passed as max_iter
Learning_rate = 0.1    # passed as learning_rate
Lambda = 0.05          # passed as lambda_reg
SEED = 19              # passed as seed, keeps runs repeatable

# Evaluation protocol: use the provided train / probe files as a fixed split.
eval_method = BaseMethod.from_splits(
    train_data=train_data,
    test_data=probe_data,
    exclude_unknowns=True,
    verbose=True,
)

# The model under test.
bpr = BPR(
    k=K,
    max_iter=Max_iter,
    learning_rate=Learning_rate,
    lambda_reg=Lambda,
    verbose=VERBOSE,
    seed=SEED,
    name=f"BPR(K={K})",
)

# Metrics reported for every model; the top-k metrics all use a cut-off of 50.
# NOTE(review): BPR is trained as a ranking model, so the RMSE of its scores
# against the 1-5 ratings (3.34 in the recorded run) is presumably not a
# meaningful number -- confirm whether it should be reported at all.
eval_metrics = [
    cornac.metrics.RMSE(),
    cornac.metrics.AUC(),
    *(
        metric_cls(k=50)
        for metric_cls in (
            cornac.metrics.Precision,
            cornac.metrics.Recall,
            cornac.metrics.FMeasure,
        )
    ),
    cornac.metrics.NDCG(k=[50]),
]
models = [bpr]

# Fit and score each model, then print its test-set result table.
# evaluate() hands back a (test, validation) pair; only the test part is shown.
for model in models:
    test_result, val_result = eval_method.evaluate(
        model=model,
        metrics=eval_metrics,
        user_based=True,
    )
    print(test_result)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 46612
Number of items = 21820
Number of ratings = 147244
Max rating = 5.0
Min rating = 1.0
Global mean = 4.6
---
Test data:
Number of users = 46061
Number of items = 18230
Number of ratings = 79118
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 46612
Total items = 21820

[BPR(K=400)] Training started!

[BPR(K=400)] Evaluation started!


Rating:   0%|          | 0/79118 [00:00<?, ?it/s]

Ranking:   0%|          | 0/46061 [00:00<?, ?it/s]

           |   RMSE |    AUC |  F1@50 | NDCG@50 | Precision@50 | Recall@50 | Train (s) | Test (s)
---------- + ------ + ------ + ------ + ------- + ------------ + --------- + --------- + --------
BPR(K=400) | 3.3386 | 0.8304 | 0.0109 |  0.0673 |       0.0057 |    0.1770 | 1755.2945 | 339.0668



In [54]:
# Export the top-50 recommendations per user to submission.txt (one line per
# user, raw item ids separated by spaces) and download the file from Colab.
from google.colab import files

# Reverse lookup: cornac's internal item index -> raw item id from the CSV.
iid2raw = {
    internal_idx: raw_iid
    for raw_iid, internal_idx in eval_method.train_set.iid_map.items()
}
n_users = len(eval_method.train_set.uid_map)

with open('submission.txt', 'w') as f:
    # NOTE(review): assumes the raw user ids are exactly "1".."n_users"; any
    # gap in the ids would raise KeyError on the uid_map lookup -- confirm
    # against the data specification.
    for uid in range(1, n_users + 1):
        user_idx = eval_method.train_set.uid_map[str(uid)]
        # rank() returns a sequence whose first element is the ranked item
        # indices.
        # NOTE(review): verify whether items the user already rated in the
        # training set should be filtered out before taking the top 50.
        ranked_iids = bpr.rank(user_idx)[0]
        top50ranked_raw_iids = [iid2raw[iid] for iid in ranked_iids[:50]]
        f.write(' '.join(top50ranked_raw_iids) + '\n')

files.download('submission.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>