In [1]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-3.0.0-py3-none-any.whl (8.5 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.0.0


In [2]:
import os
import sys
import csv, jsonlines
import numpy as np
import copy
import random

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt

Matplotlib is building the font cache; this may take a moment.


In [4]:
%%bash
# Download and unpack the MovieLens "latest-small" dataset into ./ml-latest-small.
# Abort on the first failing command so a failed download is never unzipped.
set -euo pipefail

# -f: fail on HTTP errors, -L: follow redirects, -sS: no progress meter but still report errors.
curl -fsSL -o ml-latest-small.zip https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# -o: overwrite files from a previous run instead of prompting, so the cell can be re-run.
unzip -o ml-latest-small.zip
rm ml-latest-small.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0 48  955k   48  459k    0     0   298k      0  0:00:03  0:00:01  0:00:02  298k100  955k  100  955k    0     0   560k      0  0:00:01  0:00:01 --:--:--  560k


In [5]:
## some utility functions


def load_csv_data(filename, delimiter, verbose=True):
    """
    Load a ratings file into the list-of-dicts format used for training.

    The file must be readable as csv with the given delimiter, start with a
    single header row, and have columns users - movies - ratings - etc.
    Blank lines are ignored.

    Args:
        filename: path of the csv file to read.
        delimiter: column separator handed to csv.reader.
        verbose: when True, print the number of ratings, their mean / median /
            variance, and the number of unique users and movies.

    Returns:
        A list with one entry per rating, each of the form
        {'in0': [userID], 'in1': [movieID], 'label': rating}
        where the IDs are ints and the rating is a float.
    """
    to_data_list = list()
    ratings = list()
    unique_users = set()
    unique_movies = set()
    # newline="" is what the csv module expects from files it reads
    with open(filename, "r", newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter)
        # skip the header row; the default avoids StopIteration on an empty file
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline)
                continue
            rating = float(row[2])
            to_data_list.append({"in0": [int(row[0])], "in1": [int(row[1])], "label": rating})
            ratings.append(rating)
            unique_users.add(row[0])
            unique_movies.add(row[1])
    if verbose:
        print("In file {}, there are {} ratings".format(filename, len(ratings)))
        if ratings:
            # statistics are undefined (NaN plus a warning) when there are no ratings
            print(
                "The ratings have mean: {}, median: {}, and variance: {}".format(
                    round(np.mean(ratings), 2),
                    round(np.median(ratings), 2),
                    round(np.var(ratings), 2),
                )
            )
        print(
            "There are {} unique users and {} unique movies".format(
                len(unique_users), len(unique_movies)
            )
        )
    return to_data_list


def csv_to_augmented_data_dict(filename, delimiter):
    """
    Index a ratings file by user and by movie.

    Input: a file that must be readable as csv and separated by delimiter (to make columns),
    starts with one header row and has format users - movies - ratings - etc.
    Blank lines are ignored.

    Output (all IDs and ratings are kept as the strings read from the file):
      Users dictionary: keys are user IDs; each value is the list of
        (movie ID, rating) pairs given by that user
      Movies dictionary: keys are movie IDs; each value is the list of IDs of
        the users who rated that movie
    """
    to_users_dict = dict()
    to_movies_dict = dict()
    with open(filename, "r", newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter)
        # skip the header row; the default avoids StopIteration on an empty file
        next(reader, None)
        for row in reader:
            if not row:
                continue
            user, movie, rating = row[0], row[1], row[2]
            to_users_dict.setdefault(user, []).append((movie, rating))
            # Start the list with the whole user ID; list(user) would split a
            # multi-digit ID such as "12" into the characters "1" and "2".
            to_movies_dict.setdefault(movie, []).append(user)
    return to_users_dict, to_movies_dict


def user_dict_to_data_list(user_dict):
    """
    Flatten a user dictionary into the data list format accepted by the algorithm.

    Input: a dictionary mapping each user ID to a list of (movie ID, rating) pairs
    Output: a list with one {'in0': [userID], 'in1': [movieID], 'label': rating}
    entry per pair, IDs converted to int and ratings to float
    """
    return [
        {"in0": [int(user_id)], "in1": [int(movie_id)], "label": float(score)}
        for user_id, rated_movies in user_dict.items()
        for movie_id, score in rated_movies
    ]


def divide_user_dicts(user_dict, sp_ratio_dict):
    """
    Split every user's rating list into consecutive subsets.

    Input: A user dictionary, a ratio dictionary
         - format of sp_ratio_dict = {'train':0.8, "test":0.2}
         - the ratios must sum to 1 (up to floating-point rounding)
    Output:
        A dictionary of dictionaries, with one key per key of sp_ratio_dict;
        each key maps to a subdivided user dictionary. For each user, every
        subset except the last receives int(len(ratings) * ratio) consecutive
        ratings and the last subset receives whatever remains, so no rating
        is dropped. Every subset name is present even when user_dict is empty.
    """
    subset_names = list(sp_ratio_dict.keys())
    ratios = [sp_ratio_dict[name] for name in subset_names]
    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 evaluates to
    # 0.9999999999999999 in floating point and would fail an exact == 1 check.
    assert np.isclose(np.sum(ratios), 1.0), "the sampling ratios must sum to 1!"
    divided_dict = {name: {} for name in subset_names}
    for user, movie_rating_list in user_dict.items():
        start = 0
        for name, ratio in zip(subset_names[:-1], ratios[:-1]):
            end = start + int(len(movie_rating_list) * ratio)
            divided_dict[name][user] = movie_rating_list[start:end]
            start = end
        # the last subset absorbs the rounding remainder
        divided_dict[subset_names[-1]][user] = movie_rating_list[start:]

    return divided_dict


def write_csv_to_jsonl(jsonl_fname, csv_fname, csv_delimiter):
    """
    Convert a ratings csv file into a jsonline file.

    Input: a file readable as csv and separated by csv_delimiter (to make columns)
        - has one header row, then format users - movies - ratings - etc
    Output: jsonl_fname is written with one
        {'in0': [userID], 'in1': [movieID], 'label': rating} object per csv row
    """
    with jsonlines.open(jsonl_fname, mode="w") as jsonl_out:
        with open(csv_fname, "r") as csv_in:
            csv_rows = csv.reader(csv_in, delimiter=csv_delimiter)
            next(csv_rows)  # discard the header row
            for fields in csv_rows:
                record = {
                    "in0": [int(fields[0])],
                    "in1": [int(fields[1])],
                    "label": float(fields[2]),
                }
                jsonl_out.write(record)
        print("Created {} jsonline file".format(jsonl_fname))


def write_data_list_to_jsonl(data_list, to_fname):
    """
    Save a data list as a jsonline file.

    Input: a data list, where each row is a Python dictionary containing at least
    the keys 'in0' (userID), 'in1' (movieID) and 'label' (rating)
    Output: to_fname is written with one object per row, holding only those three keys
    """
    kept_keys = ("in0", "in1", "label")
    with jsonlines.open(to_fname, mode="w") as jsonl_out:
        for record in data_list:
            jsonl_out.write({key: record[key] for key in kept_keys})
    print("Created {} jsonline file".format(to_fname))


def data_list_to_inference_format(data_list, binarize=True, label_thres=3):
    """
    Split a data list into an inference payload and its labels.

    Input:
        data_list: rows of the form {'in0': ..., 'in1': ..., 'label': ...}
        binarize: when True, labels are converted to 0/1 with get_binarized_label
        label_thres: threshold handed to get_binarized_label (label > thres -> 1)
    Output:
        infer_data: {'instances': (...)} holding one {'in0', 'in1'} dict per row
        label: the labels in row order; a tuple of the raw labels when binarize
            is False, a list of 0/1 values when binarize is True

    An empty data_list yields an empty payload and empty labels.
    """
    instances = []
    labels = []
    # Collect both sequences in one pass; unpacking zip(*rows) raises on empty input.
    for row in data_list:
        instances.append({"in0": row["in0"], "in1": row["in1"]})
        labels.append(row["label"])
    infer_data = {"instances": tuple(instances)}
    label = tuple(labels)
    if binarize:
        label = get_binarized_label(list(label), label_thres)
    return infer_data, label


def get_binarized_label(data_list, thres):
    """
    Binarize ratings in place for the recommendation task.

    Input: a data list whose entries are either dictionaries with a 'label'
    key or bare numeric labels, and a threshold
    Output: the same list object, where every label strictly greater than
    thres has been replaced by 1 and every other label by 0
    """
    for position, entry in enumerate(data_list):
        if type(entry) is dict:
            # dictionary rows keep their other keys; only 'label' is rewritten
            entry["label"] = 1 if entry["label"] > thres else 0
        else:
            data_list[position] = 1 if entry > thres else 0
    return data_list

In [16]:
## Load data and shuffle
prefix = "ml-latest-small"
rating_data_path = os.path.join(prefix, "ratings.csv")

# Seed the shuffle so a fresh Restart & Run All reproduces the same ordering.
random.seed(42)

# Load from the path defined above and shuffle the list that was just loaded
# (the previous names, train_path and train_data_list, were not defined at this point).
rating_data_list = load_csv_data(rating_data_path, ",")
random.shuffle(rating_data_list)

In file ml-latest-small/ratings.csv, there are 100836 ratings
The ratings have mean: 3.5, median: 3.5, and variance: 1.09
There are 610 unique users and 9724 unique movies


In [17]:
from sklearn.model_selection import train_test_split

# No test_size is given, so scikit-learn's default hold-out fraction is used for validation.
# random_state pins the split so re-running the notebook reproduces the same files and metrics.
train_data_list, validation_data_list = train_test_split(rating_data_list, random_state=42)

In [27]:
## Write the rating-prediction (regression) splits to local jsonline files

for split_name, split_rows in (
    ("train", train_data_list),
    ("validation", validation_data_list),
):
    write_data_list_to_jsonl(copy.deepcopy(split_rows), "{}_r.jsonl".format(split_name))

Created train_r.jsonl jsonline file
Created validation_r.jsonl jsonline file


In [18]:
## Write the recommendation (classification) splits to local jsonline files

### ratings above 3.0 become label 1, all others label 0; the copies keep the
### original rating lists untouched for the regression baselines

train_c, valid_c = (
    get_binarized_label(copy.deepcopy(split_rows), 3.0)
    for split_rows in (train_data_list, validation_data_list)
)

for binarized_rows, jsonl_name in ((train_c, "train_c.jsonl"), (valid_c, "validation_c.jsonl")):
    write_data_list_to_jsonl(binarized_rows, jsonl_name)

Created train_c.jsonl jsonline file
Created validation_c.jsonl jsonline file


In [19]:
# Share of positive (label 1) examples in each binarized split.
train_c_label = [row["label"] for row in train_c]
valid_c_label = [row["label"] for row in valid_c]

train_positive_fraction = np.count_nonzero(train_c_label) / len(train_c_label)
print(
    "There are {} fraction of positive ratings in train_c.jsonl".format(train_positive_fraction)
)

valid_positive_fraction = np.sum(valid_c_label) / len(valid_c_label)
print(
    "There are {} fraction of positive ratings in validation_c.jsonl".format(
        valid_positive_fraction
    )
)

There are 0.6127441257751861 fraction of positive ratings in train_c.jsonl
There are 0.6099408941251141 fraction of positive ratings in validation_c.jsonl


In [20]:
def get_mse_loss(res, labels):
    """
    Mean squared error between predictions and labels, rounded to 2 decimals.

    res is either a sequence of predictions or a dict holding that sequence
    under 'predictions'. Each prediction is either a number or a dict whose
    'scores' list carries the predicted value in position 0.
    """
    if type(res) is dict:
        res = res["predictions"]
    assert len(res) == len(labels), "result and label length mismatch!"
    squared_errors = (
        ((row["scores"][0] if type(row) is dict else row) - label) ** 2
        for row, label in zip(res, labels)
    )
    return round(sum(squared_errors) / float(len(labels)), 2)

In [21]:
# Inference payload and raw (non-binarized) ratings of the validation split,
# used to score the regression baselines below.
valid_r_data, valid_r_label = data_list_to_inference_format(
    copy.deepcopy(validation_data_list), binarize=False
)


In [22]:
# Baseline 1: predict the global mean training rating for every validation example.
train_r_label = [row["label"] for row in copy.deepcopy(train_data_list)]

bs1_prediction = round(np.mean(train_r_label), 2)
print("The Baseline 1 (global rating average) prediction is {}".format(bs1_prediction))

bs1_validation_mse = get_mse_loss(len(valid_r_label) * [bs1_prediction], valid_r_label)
print("The validation mse loss of the Baseline 1 is {}".format(bs1_validation_mse))


The Baseline 1 (global rating average) prediction is 3.5
The validation mse loss of the Baseline 1 is 1.1


In [23]:
def bs2_predictor(test_data, user_dict, is_classification=False, thres=3):
    """
    Baseline 2: predict each rating as the average rating of the same user.

    Args:
        test_data: inference payload {'instances': [{'in0': [userID], ...}, ...]};
            it is only read, never modified.
        user_dict: maps str(userID) to a list of (movie ID, rating) pairs;
            ratings may be numbers or numeric strings.
        is_classification: when True, return 0/1 predictions instead of averages.
        thres: a user average strictly greater than thres is predicted as 1
            (only used when is_classification is True).

    Returns:
        A list with one prediction per instance, in instance order.

    Raises:
        KeyError: if an instance refers to a user that is not in user_dict.
    """
    user_mean_cache = {}  # each user's average is computed once, not once per row
    predictions = list()
    for row in test_data["instances"]:
        userID = str(row["in0"][0])
        if userID not in user_mean_cache:
            user_mean_cache[userID] = np.mean([float(score) for _, score in user_dict[userID]])
        prediction = user_mean_cache[userID]
        if is_classification:
            # honour the caller's threshold (it used to be hard-coded to 3)
            prediction = int(prediction > thres)
        predictions.append(prediction)
    return predictions

In [24]:
# Build the per-user rating history that bs2_predictor needs (it was never defined
# at notebook level). Only the training split is used so the baseline does not see
# validation labels; a validation user with no training rating raises KeyError.
to_users_dict = {}
for train_row in train_data_list:
    to_users_dict.setdefault(str(train_row["in0"][0]), []).append(
        (str(train_row["in1"][0]), train_row["label"])
    )

bs2_prediction = bs2_predictor(valid_r_data, to_users_dict, is_classification=False)
print(
    "The validation loss of the Baseline 2 (user-based rating average) is {}".format(
        get_mse_loss(bs2_prediction, valid_r_label)
    )
)

The validation loss of the Baseline 2 (user-based rating average) is 0.89


In [25]:
import boto3
import os
import sagemaker

# Default S3 bucket of the current SageMaker session; the cells below upload the
# datasets to it and point the training output at it.
bucket = sagemaker.session.Session().default_bucket()
# Key prefixes inside the bucket for the uploaded datasets and the training output.
input_prefix = "object2vec/movielens/input"
output_prefix = "object2vec/movielens/output"

In [28]:
from sagemaker.inputs import TrainingInput

s3_client = boto3.client("s3")
input_paths = {}
# S3 URIs and object keys always use "/" as separator, so they are built
# explicitly instead of with os.path.join, which follows the local OS convention.
output_path = "s3://{}/{}".format(bucket, output_prefix)

# Upload the rating-prediction (regression) files and register one channel per split.
for data_name in ["train", "validation"]:
    pre_key = "/".join([input_prefix, "rating", data_name])
    fname = "{}_r.jsonl".format(data_name)
    s3_key = "{}/{}".format(pre_key, fname)
    data_path = "s3://{}/{}".format(bucket, s3_key)
    s3_client.upload_file(fname, bucket, s3_key)
    input_paths[data_name] = TrainingInput(
        data_path, distribution="ShardedByS3Key", content_type="application/jsonlines"
    )
    print("Uploaded {} data to {} and defined input path".format(data_name, data_path))

print("Trained model will be saved at", output_path)

Uploaded train data to s3://sagemaker-ap-northeast-1-073956268323/object2vec/movielens/input/rating/train/train_r.jsonl and defined input path
Uploaded validation data to s3://sagemaker-ap-northeast-1-073956268323/object2vec/movielens/input/rating/validation/validation_r.jsonl and defined input path
Trained model will be saved at s3://sagemaker-ap-northeast-1-073956268323/object2vec/movielens/output


In [29]:
import sagemaker
from sagemaker import get_execution_role, image_uris

# SageMaker session and the IAM role the training job and endpoint will run under
sess = sagemaker.Session()

role = get_execution_role()
print(role)

## Docker image of the ObjectToVec algorithm for the current region
current_region = boto3.Session().region_name
container = image_uris.retrieve(framework="object2vec", region=current_region)

arn:aws:iam::073956268323:role/service-role/AmazonSageMaker-ExecutionRole-20220425T161571


In [37]:
# Upload the binarized (classification) files and point the training channels at
# them; this replaces the rating-prediction channels stored in input_paths.
# S3 keys always use "/" as separator, so they are built explicitly instead of
# with os.path.join, which follows the local OS convention.
for data_name in ["train", "validation"]:
    fname = "{}_c.jsonl".format(data_name)
    pre_key = "/".join([input_prefix, "recommendation", data_name])
    s3_key = "{}/{}".format(pre_key, fname)
    data_path = "s3://{}/{}".format(bucket, s3_key)
    s3_client.upload_file(fname, bucket, s3_key)
    input_paths[data_name] = TrainingInput(
        data_path, distribution="ShardedByS3Key", content_type="application/jsonlines"
    )
    print("Uploaded data to {}".format(data_path))

Uploaded data to s3://sagemaker-ap-northeast-1-073956268323/object2vec/movielens/input/recommendation/train/train_c.jsonl
Uploaded data to s3://sagemaker-ap-northeast-1-073956268323/object2vec/movielens/input/recommendation/validation/validation_c.jsonl


In [38]:
from sagemaker.session import s3_input

# Hyperparameters of the Object2Vec classification (recommendation) job.
# NOTE(review): enc0 presumably encodes the "in0" (user) field and enc1 the "in1"
# (movie) field of each jsonline record - confirm against the Object2Vec docs.
hyperparameters_c = {
    "_kvstore": "device",
    "_num_gpus": "auto",
    "_num_kv_servers": "auto",
    "bucket_width": 0,
    "early_stopping_patience": 3,
    "early_stopping_tolerance": 0.01,
    "enc0_cnn_filter_width": 3,
    "enc0_layers": "auto",
    "enc0_max_seq_len": 1,  # every record carries a single-element "in0" list
    "enc0_network": "pooled_embedding",
    "enc0_token_embedding_dim": 300,
    # the ratings file has 610 unique users; presumably 611 = largest user ID + 1 - TODO confirm
    "enc0_vocab_size": 611,
    "enc1_cnn_filter_width": 3,
    "enc1_layers": "auto",
    "enc1_max_seq_len": 1,  # every record carries a single-element "in1" list
    "enc1_network": "pooled_embedding",
    "enc1_token_embedding_dim": 300,
    # presumably largest movie ID + 1 (movie IDs are sparse) - TODO confirm against ratings.csv
    "enc1_vocab_size": 193610,
    "enc_dim": 2048,
    "epochs": 20,
    "learning_rate": 0.001,
    "mini_batch_size": 2048,
    "mlp_activation": "relu",
    "mlp_dim": 1024,
    "mlp_layers": 1,
    "num_classes": 2,  # the binarized labels are 0 and 1
    "optimizer": "adam",
    "output_layer": "softmax",
}

In [40]:
## estimator for the Object2Vec container; model artifacts are written to output_path
classifier = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m5.4xlarge",
    output_path=output_path,
    sagemaker_session=sess,
)

## set hyperparameters
classifier.set_hyperparameters(**hyperparameters_c)

## launch the training job on the "train" and "validation" channels in input_paths
## (input_paths must hold the classification channels at this point)
classifier.fit(input_paths)

2022-04-25 13:37:53 Starting - Starting the training job...
2022-04-25 13:38:16 Starting - Preparing the instances for trainingProfilerReport-1650893873: InProgress
......
2022-04-25 13:39:23 Downloading - Downloading input data...
2022-04-25 13:39:38 Training - Downloading the training image........[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[04/25/2022 13:41:05 INFO 140532052252480 integration.py:636] worker started[0m
[34m[04/25/2022 13:41:05 INFO 140532052252480] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/default-input.json: {'enc_dim': 4096, 'mlp_dim': 512, 'mlp_activation': 'linear', 'mlp_layers': 2, 'output_layer': 'softmax', 'optimizer': 'adam', 'learning_rate': 0.0004, 'mini_batch_size': 32, 'epochs': 30, 'bucket_width': 0, 'early_stopping_tolerance': 0.01, 'early_stopping_patience': 3, 'dropout': 0, 'weight_decay': 0, 'negative_sampling_rate': 0, 'comparato


2022-04-25 13:41:17 Training - Training image download completed. Training in progress.[34m[04/25/2022 13:41:56 INFO 140532052252480] **************[0m
[34m[04/25/2022 13:41:56 INFO 140532052252480] Completed Epoch: 0, time taken: 0:00:47.340911[0m
[34m[04/25/2022 13:41:56 INFO 140532052252480] Epoch 0 Training metrics:   perplexity: 1.785 cross_entropy: 0.580 accuracy: 0.696 [0m
[34m[04/25/2022 13:41:56 INFO 140532052252480] #quality_metric: host=algo-1, epoch=0, train cross_entropy <loss>=0.579517438605025[0m
[34m[04/25/2022 13:41:56 INFO 140532052252480] #quality_metric: host=algo-1, epoch=0, train accuracy <score>=0.6962758657094594[0m
[34m[04/25/2022 13:41:58 INFO 140532052252480] Epoch 0 Validation metrics: perplexity: 1.715 cross_entropy: 0.539 accuracy: 0.728 [0m
[34m[04/25/2022 13:41:58 INFO 140532052252480] #quality_metric: host=algo-1, epoch=0, validation cross_entropy <loss>=0.5392816800337571[0m
[34m[04/25/2022 13:41:58 INFO 140532052252480] #quality_metric

[34m[04/25/2022 13:45:08 INFO 140532052252480] **************[0m
[34m[04/25/2022 13:45:08 INFO 140532052252480] Completed Epoch: 4, time taken: 0:00:43.096698[0m
[34m[04/25/2022 13:45:08 INFO 140532052252480] Epoch 4 Training metrics:   perplexity: 1.026 cross_entropy: 0.025 accuracy: 0.994 [0m
[34m[04/25/2022 13:45:08 INFO 140532052252480] #quality_metric: host=algo-1, epoch=4, train cross_entropy <loss>=0.02542099356651306[0m
[34m[04/25/2022 13:45:08 INFO 140532052252480] #quality_metric: host=algo-1, epoch=4, train accuracy <score>=0.9937447212837838[0m
[34m[04/25/2022 13:45:11 INFO 140532052252480] Epoch 4 Validation metrics: perplexity: 2.504 cross_entropy: 0.918 accuracy: 0.719 [0m
[34m[04/25/2022 13:45:11 INFO 140532052252480] #quality_metric: host=algo-1, epoch=4, validation cross_entropy <loss>=0.9177479468859159[0m
[34m[04/25/2022 13:45:11 INFO 140532052252480] #quality_metric: host=algo-1, epoch=4, validation accuracy <score>=0.7186373197115384[0m
[34m[04/25


2022-04-25 13:47:02 Uploading - Uploading generated training model
2022-04-25 13:50:33 Completed - Training job completed
Training seconds: 670
Billable seconds: 670


In [42]:
# import numpy as np
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

In [43]:
# Wrap the trained artifacts in a model and deploy it behind a real-time endpoint.
# The endpoint is billed while it exists - it is deleted in the last cell.
classification_model = classifier.create_model()

# Requests are serialized to JSON and responses parsed from JSON.
predictor_2 = classification_model.deploy(
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
    content_type="application/json",
    initial_instance_count=1,
    instance_type="ml.m5.4xlarge",
)

-------!

In [44]:
# Inference payload and 0/1 labels (rating > 3 -> 1) of the validation split.
valid_c_data, valid_c_label = data_list_to_inference_format(
    copy.deepcopy(validation_data_list), label_thres=3, binarize=True
)
# The whole validation split is sent to the endpoint in a single request.
predictions = predictor_2.predict(valid_c_data)

In [45]:
def get_class_accuracy(res, labels, thres, label_thres=None):
    """
    Fraction of predictions whose thresholded class matches the thresholded label.

    Args:
        res: a sequence of predictions, or a dict holding that sequence under
            'predictions'. Each prediction is either a dict whose 'scores'
            list carries the positive-class score in position 1, or a bare
            positive-class score.
        labels: one label per prediction.
        thres: a score strictly greater than thres is predicted as class 1.
        label_thres: a label strictly greater than label_thres counts as
            class 1. Defaults to thres, which is only appropriate for labels
            that are already 0/1; pass the rating threshold for raw ratings.

    Returns:
        The accuracy as a float in [0, 1].
    """
    if type(res) is dict:
        res = res["predictions"]
    assert len(res) == len(labels), "result and label length mismatch!"
    if label_thres is None:
        label_thres = thres
    correct = 0
    for row, label in zip(res, labels):
        # Bare scores are scored too (as get_mse_loss does); they used to be
        # skipped silently while still counting in the denominator.
        score = row["scores"][1] if type(row) is dict else row
        prediction = 1 if score > thres else 0
        target = 1 if label > label_thres else 0
        correct += int(prediction == target)
    return correct / float(len(res))


# A positive-class score above 0.5 counts as a recommendation; valid_c_label is
# already 0/1, so the same 0.5 threshold leaves the labels unchanged.
print(
    "The accuracy on the binarized validation set is %.3f"
    % get_class_accuracy(predictions, valid_c_label, 0.5)
)

The accuracy on the binarized validation set is 0.729


In [50]:
def get_movie_embedding_dict(movie_ids, trained_model):
    """
    Fetch the embedding of every movie in movie_ids from a deployed model.

    All IDs are sent in one predict call as {'in1': [movie_id]} instances; the
    i-th entry of the response's 'predictions' is taken as the embedding of
    the i-th ID. Returns a dict mapping movie ID to a numpy array.
    """
    request = {"instances": [{"in1": [movie_id]} for movie_id in movie_ids]}
    response = trained_model.predict(request)
    return {
        movie_id: np.array(prediction["embeddings"])
        for movie_id, prediction in zip(movie_ids, response["predictions"])
    }


def load_movie_id_name_map(item_file, encoding="ISO-8859-1"):
    """
    Read a movies csv file into a {movie ID (int): title} dictionary.

    Args:
        item_file: path of a comma-separated file with one header row whose
            first two columns are the movie ID and the movie title.
        encoding: text encoding used to read the file. The default is kept
            for backward compatibility; pass "utf-8" for UTF-8 encoded files.

    Returns:
        A dict mapping each integer movie ID to its title. Rows with fewer
        than two columns (e.g. blank lines) are skipped.
    """
    movieID_name_map = {}
    with open(item_file, "r", encoding=encoding, newline="") as f:
        # csv.reader understands quoted fields, so a title such as
        # "American President, The (1995)" is not cut at its inner comma.
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) < 2:
                continue
            movieID_name_map[int(row[0])] = row[1]
    return movieID_name_map


def get_nn_of_movie(movie_id, candidate_movie_ids, embedding_dict):
    """
    Find the candidate whose embedding is closest (Euclidean norm) to movie_id's.

    Returns (closest candidate ID, its distance). On ties the earliest
    candidate wins. candidate_movie_ids must not be empty.
    """
    query_emb = embedding_dict[movie_id]
    nearest_id, nearest_dist = candidate_movie_ids[0], float("Inf")
    for other_id in candidate_movie_ids:
        dist = np.linalg.norm(embedding_dict[other_id] - query_emb)
        if dist < nearest_dist:
            nearest_id, nearest_dist = other_id, dist
    return nearest_id, nearest_dist


def get_unique_movie_ids(data_list):
    """Return the distinct movie IDs (first element of each row's 'in1') as a list."""
    return list({row["in1"][0] for row in data_list})

In [48]:
# Embed every movie that appears in the ratings file. The data is read from
# rating_data_path (train_path was never defined) and stored under its own name,
# so the train/validation split in train_data_list is not overwritten.
all_rating_data_list = load_csv_data(rating_data_path, ",", verbose=False)
unique_movie_ids = get_unique_movie_ids(all_rating_data_list)
embedding_dict = get_movie_embedding_dict(unique_movie_ids, predictor_2)
candidate_movie_ids = unique_movie_ids.copy()

In [51]:
# ID of the movie whose nearest neighbour in embedding space is looked up below.
movie_id_to_examine = 195 

In [52]:
# Search among every movie except the examined one. A filtered copy is used
# instead of remove()/append() on candidate_movie_ids, so the shared list is never
# left modified if the cell fails midway and the cell can be re-run safely.
other_movie_ids = [m_id for m_id in candidate_movie_ids if m_id != movie_id_to_examine]
best_id, min_dist = get_nn_of_movie(movie_id_to_examine, other_movie_ids, embedding_dict)
movieID_name_map = load_movie_id_name_map("ml-latest-small/movies.csv")
print(
    "The closest movie to {} in the embedding space is {}".format(
        movieID_name_map[movie_id_to_examine], movieID_name_map[best_id]
    )
)

The closest movie to Something to Talk About (1995) in the embedding space is Loaded Weapon 1 (National Lampoon's Loaded Weapon 1) (1993)


In [54]:
# Tear down the endpoint created by the deploy cell; predictor_2 cannot be used afterwards.
predictor_2.delete_endpoint()