Link to Drive for CDs dataset: https://drive.google.com/drive/folders/1t2Y24NSpGlT2M29zJUDvvEg2KfP0JqkN?usp=share_link

# Preprocessing...

In [None]:
import pandas as pd
import numpy as np
import torch
import json
import matplotlib.pyplot as plt
import os
import tqdm
import pickle
from pathlib import Path
from torch.utils.data import DataLoader
from sklearn.metrics import ndcg_score
np.random.seed(0)

In [None]:
# Colab-only step: mount Google Drive at /content/drive. The Sentires file and the saved splits
# configured further down are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Preprocessing the Amazon "CDs and Vinyl" reviews


In [None]:
!wget https://jmcauley.ucsd.edu/data/amazon_v2/categoryFiles/CDs_and_Vinyl.json.gz --no-check-certificate 

--2023-01-21 11:16:26--  https://jmcauley.ucsd.edu/data/amazon_v2/categoryFiles/CDs_and_Vinyl.json.gz
Resolving jmcauley.ucsd.edu (jmcauley.ucsd.edu)... 137.110.160.73
Connecting to jmcauley.ucsd.edu (jmcauley.ucsd.edu)|137.110.160.73|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 1172666826 (1.1G) [application/x-gzip]
Saving to: ‘CDs_and_Vinyl.json.gz’

CDs_and_Vinyl.json.   2%[                    ]  33.23M  27.4MB/s               ^C


In [None]:
!gunzip CDs_and_Vinyl.json.gz

In [None]:
!wget https://jmcauley.ucsd.edu/data/amazon_v2/metaFiles2/meta_CDs_and_Vinyl.json.gz --no-check-certificate 

--2023-01-21 09:43:16--  https://jmcauley.ucsd.edu/data/amazon_v2/metaFiles2/meta_CDs_and_Vinyl.json.gz
Resolving jmcauley.ucsd.edu (jmcauley.ucsd.edu)... 137.110.160.73
Connecting to jmcauley.ucsd.edu (jmcauley.ucsd.edu)|137.110.160.73|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 161716387 (154M) [application/x-gzip]
Saving to: ‘meta_CDs_and_Vinyl.json.gz’


2023-01-21 09:43:26 (15.9 MB/s) - ‘meta_CDs_and_Vinyl.json.gz’ saved [161716387/161716387]



In [None]:
# ---- input / output locations ----
review_dir = '/content/CDs_and_Vinyl.json'  # raw reviews, parsed as one JSON record per line
sentires_dir = '/content/drive/Shareddrives/Unlimited Drive | @LicenseMarket/Recommender/cds/CDs_and_Vinyl'  # "user@item@feature|opinion|sentiment..." lines
save_path = '/content/drive/Shareddrives/Unlimited Drive | @LicenseMarket/Recommender/cds/'  # holds the stored splits and the pickled dataset
dataset = 'cds_and_vinyl'  # dataset key consumed by dataset_init()

# ---- filtering thresholds (arguments of sentiment_data_filtering) ----
user_thresh = 15
feature_thresh = 10

# ---- split / sampling sizes ----
# NOTE(review): in the visible cells these are referenced only by commented-out sampling code.
test_length = 5
val_length = 1
sample_ratio = 2
neg_length = 100

In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt

np.random.seed(0)

In [None]:
def get_feature_list(sentiment_data):
    """
    from user sentiment data, get all the features [F1, F2, ..., Fk] mentioned in the reviews
    :param sentiment_data: [user, item, [feature1, opinion1, sentiment1], [feature2, opinion2, sentiment2] ...]
    :return: numpy array with every distinct feature, in order of first appearance
    """
    # A dict keeps insertion order and gives O(1) membership checks; the previous
    # "feature not in list" scan was quadratic in the number of distinct features.
    first_seen = {}
    for row in sentiment_data:
        for fos in row[2:]:
            first_seen.setdefault(fos[0], None)
    return np.array(list(first_seen))

In [None]:
def get_user_attention_matrix(sentiment_data, user_num, feature_list, max_range=5):
    """
    build user attention matrix
    :param sentiment_data: [user, item, [feature1, opinion1, sentiment1], [feature2, opinion2, sentiment2] ...]
        users and features must already be integer indices, they are used to index the matrix
    :param user_num: number of users
    :param feature_list: [F1, F2, ..., Fk] (only its length is used)
    :param max_range: normalize the attention value to [1, max_range]
    :return: float32 user attention matrix, Xij is user i's attention on feature j (0 if never mentioned)
    """
    # tij = x if user i mentions feature j x times
    mention_counts = np.zeros((user_num, len(feature_list)))
    for row in sentiment_data:
        user = row[0]
        for fos in row[2:]:
            mention_counts[user, fos[0]] += 1

    # Sigmoid-style rescaling of the counts, computed for the whole matrix at once instead of
    # one Python-level iteration per (user, feature) cell.
    scaled = 1 + (max_range - 1) * ((2 / (1 + np.exp(-mention_counts))) - 1)
    # features a user never mentioned stay at exactly 0
    user_attention_matrix = np.where(mention_counts == 0, 0, scaled)
    return user_attention_matrix.astype('float32')

In [None]:
def get_item_quality_matrix(sentiment_data, item_num, feature_list, max_range=5):
    """
    build item quality matrix
    :param sentiment_data: [user, item, [feature1, opinion1, sentiment1], [feature2, opinion2, sentiment2] ...]
        items and features must already be integer indices, they are used to index the matrix
    :param item_num: number of items
    :param feature_list: [F1, F2, ..., Fk] (only its length is used)
    :param max_range: normalize the quality value to [1, max_range]
    :return: float32 item quality matrix, Yij is item i's quality on feature j (0 if never mentioned)
    :raises ValueError: if a sentiment value is neither '+1' nor '-1'
    """
    # kij = x if item i's feature j is mentioned x times
    mention_counts = np.zeros((item_num, len(feature_list)))
    # sij = (#positive mentions) - (#negative mentions)
    sentiment_sums = np.zeros((item_num, len(feature_list)))
    for row in sentiment_data:
        item = row[1]
        for fos in row[2:]:
            feature = fos[0]
            sentiment = fos[2]
            if sentiment == '+1':
                delta = 1
            elif sentiment == '-1':
                delta = -1
            else:
                # Raise instead of print + exit(): terminating the interpreter from a helper
                # is hostile inside a notebook kernel and impossible to handle by callers.
                raise ValueError("sentiment data error: the sentiment value can only be +1 or -1")
            mention_counts[item, feature] += 1
            sentiment_sums[item, feature] += delta

    # Logistic rescaling of the summed sentiment, evaluated on the whole matrix at once instead
    # of one Python-level iteration per (item, feature) cell.
    scaled = 1 + ((max_range - 1) / (1 + np.exp(-sentiment_sums)))
    # unmentioned features stay at exactly 0 (a mentioned feature with net sentiment 0 does not)
    item_quality_matrix = np.where(mention_counts == 0, 0, scaled)
    return item_quality_matrix.astype('float32')

In [None]:
def get_user_item_dict(sentiment_data):
    """
    Group the interactions by user and by item.
    :param sentiment_data: [user, item, [feature1, opinion1, sentiment1], [feature2, opinion2, sentiment2] ...]
    :return: (user dictionary {u1: [i, i, ...], ...}, item dictionary {i1: [u, u, ...], ...});
             lists follow the row order of the input and keep duplicates
    """
    items_by_user, users_by_item = {}, {}
    for record in sentiment_data:
        uid, iid = record[0], record[1]
        items_by_user.setdefault(uid, []).append(iid)
        users_by_item.setdefault(iid, []).append(uid)
    return items_by_user, users_by_item

In [None]:
def get_user_item_set(sentiment_data):
    """
    Collect the distinct users and the distinct items.
    :param sentiment_data: [user, item, [feature1, opinion1, sentiment1], [feature2, opinion2, sentiment2] ...]
    :return: user_set = set(u1, u2, ..., um); item_set = set(i1, i2, ..., in)
    """
    # single pass over the input, then split the (user, item) pairs into the two sets
    pairs = [(record[0], record[1]) for record in sentiment_data]
    user_set = {uid for uid, _ in pairs}
    item_set = {iid for _, iid in pairs}
    return user_set, item_set

In [None]:
def sample_training_pairs(user, training_items, item_set, sample_ratio=10):
    """
    Build labelled (user, item) pairs for one user: every training item as a positive (label 1)
    plus ``sample_ratio`` times as many randomly drawn non-interacted items as negatives (label 0).
    :param user: the user the pairs are generated for
    :param training_items: items the user interacted with (duplicates are collapsed)
    :param item_set: all candidate items
    :param sample_ratio: number of negatives drawn per positive
    :return: list of [user, item, label] triplets, positives first
    """
    positives = set(training_items)
    # everything in the catalogue the user has not interacted with
    candidates = {candidate for candidate in item_set if candidate not in positives}
    draw_count = len(positives) * sample_ratio
    # drawn without replacement from numpy's global RNG
    drawn = np.random.choice(np.array(list(candidates)), draw_count, replace=False)
    pairs = [[user, positive, 1] for positive in positives]
    pairs.extend([user, negative, 0] for negative in drawn)
    return pairs

In [None]:
def check_string(string):
    """
    Tell whether the string contains at least one letter.
    After lower-casing, ``str.islower()`` is True exactly when a cased character is present.
    :param string: the text to inspect
    :return: True if the string contains letters, False otherwise
    """
    return string.lower().islower()

In [None]:
def visualization(train_losses, val_losses, path):
    """
    Plot the training and validation loss curves and write the figure to disk.
    :param train_losses: sequence of training losses, one per step/epoch
    :param val_losses: sequence of validation losses, one per step/epoch
    :param path: destination passed to ``plt.savefig``
    """
    plt.plot(np.arange(len(train_losses)), train_losses, label='training loss')
    # np.arange, not np.array: np.array(len(...)) is a 0-d scalar and cannot serve as the
    # x-axis for a series of losses.
    plt.plot(np.arange(len(val_losses)), val_losses, label='validation loss')
    plt.legend()
    plt.savefig(path)
    plt.clf()  # reset the implicit figure so the next call starts from a clean canvas

In [None]:
def get_mask_vec(user_attantion, k):
    """
    Build the top-k feature mask. Counterfactual explanations can only be chosen from this space.
    :param user_attantion: user's attention vector over all the features
    :param k: how many of the highest-attention features to keep
    :return: a 0/1 vector with 1's on those of the top-k features whose attention is positive
    """
    ranked = np.argsort(user_attantion)[::-1][:k]
    # features the user never mentioned (attention <= 0) are dropped even if ranked in the top k
    selected = {position for position in ranked if user_attantion[position] > 0}
    return np.array([1 if position in selected else 0 for position in range(len(user_attantion))])

In [None]:
def feature_filtering(sentiment_data, valid_features):
    """
    filter the sentiment data, remove the invalid features
    :param sentiment_data: [userID, itemID, [fos triplet 1], [fos triplet 2], ...]
    :param valid_features: set of valid features
    :return: object-dtype array with one row per input row; a row keeps its [user, item] prefix
             even when none of its triplets survive
    """
    cleaned_sentiment_data = []
    for row in sentiment_data:
        kept_row = [row[0], row[1]]
        kept_row.extend(fos for fos in row[2:] if fos[0] in valid_features)
        cleaned_sentiment_data.append(kept_row)
    # Rows have different lengths. Recent NumPy releases refuse to build an array from ragged
    # input unless dtype=object is requested explicitly.
    return np.array(cleaned_sentiment_data, dtype=object)

In [None]:
def sentiment_data_filtering(sentiment_data, user_thresh, feature_thresh):
    """
    Filter the sentiment data: features that do not contain any letter are removed.

    NOTE: ``user_thresh`` and ``feature_thresh`` are accepted for interface compatibility but are
    currently NOT applied - the corresponding checks were disabled in this notebook, so no user
    and no feature is dropped because of a low count.

    :param sentiment_data: [userID, itemID, [fos triplet 1], [fos triplet 2], ...]
    :param user_thresh: the threshold for user reviews (currently unused)
    :param feature_thresh: the threshold for features (currently unused)
    :return: the filtered sentiment data as an object-dtype numpy array
    """
    print('======================= filtering sentiment data =======================')
    # dtype=object: rows are ragged, which recent NumPy releases reject without it
    sentiment_data = np.array(sentiment_data, dtype=object)
    last_length = len(sentiment_data)
    un_change_count = 0  # iteratively filtering users and features, if the data stay unchanged twice, stop
    user_dict, item_dict = get_user_item_dict(sentiment_data)
    features = get_feature_list(sentiment_data)
    print("original review length: ", len(sentiment_data))
    print("original user length: ", len(user_dict))
    print("original item length: ", len(item_dict))
    print("original feature length: ", len(features))
    while True:
        # feature filtering: count the mentions, then keep the features that contain letters
        feature_count_dict = {}
        for row in sentiment_data:
            for fos in row[2:]:
                feature_count_dict[fos[0]] = feature_count_dict.get(fos[0], 0) + 1
        # the mention counts are not consulted here (feature_thresh is disabled)
        valid_features = {name for name in feature_count_dict if check_string(name)}
        sentiment_data = feature_filtering(sentiment_data, valid_features)
        length = len(sentiment_data)
        if length != last_length:
            last_length = length
            un_change_count = 0
        else:
            un_change_count += 1
            if un_change_count == 2:
                break
        # user filtering: every user is kept (user_thresh is disabled)
        user_dict, item_dict = get_user_item_dict(sentiment_data)
        valid_user = set(user_dict)
        sentiment_data = [x for x in sentiment_data if x[0] in valid_user]
        length = len(sentiment_data)
        if length != last_length:
            last_length = length
            un_change_count = 0
        else:
            un_change_count += 1
            if un_change_count == 2:
                break
    user_dict, item_dict = get_user_item_dict(sentiment_data)
    features = get_feature_list(sentiment_data)
    print('valid review length: ', len(sentiment_data))
    print("valid user: ", len(user_dict))
    print('valid item : ', len(item_dict))
    print("valid feature length: ", len(features))
    print('user dense is:', len(sentiment_data) / len(user_dict))
    sentiment_data = np.array(sentiment_data, dtype=object)
    return sentiment_data

In [None]:
from re import S
import torch
import numpy as np
import json
import pickle
# from torch._C import R
import tqdm
from torch.random import seed


class AmazonDataset():
    """
    Amazon review dataset prepared for the feature-based recommender.

    Constructing an instance runs the whole pipeline: parse the Sentires output, align it with the
    raw reviews, re-index users / items / features with integers, build the user and item feature
    matrices and load the stored training, test and validation splits.  File locations come from
    the notebook-level globals ``sentires_dir``, ``review_dir`` and ``save_path``.
    """

    def __init__(self):
        super().__init__()
        self.sentiment_data = None  # [userID, itemID, [fos triplet 1], [fos triplet 2], ...]

        self.user_name_dict = {}  # original user id -> integer index
        self.item_name_dict = {}  # original item id -> integer index
        self.feature_name_dict = {}  # feature word -> integer index

        self.features = []  # feature list
        self.users = []
        self.items = []

        # the interacted items for each user, sorted with date {user:[i1, i2, i3, ...], user:[i1, i2, i3, ...]}
        self.user_hist_inter_dict = {}
        # the interacted users for each item
        self.item_hist_inter_dict = {}

        self.user_num = None
        self.item_num = None
        self.feature_num = None  # number of features

        self.user_feature_matrix = None  # user aspect attention matrix
        self.item_feature_matrix = None  # item aspect quality matrix

        self.training_data = None
        self.test_data = None
        self.validation_data = None
        self.pre_processing()
        self.get_user_item_feature_matrix()
        self.sample_training()  # load the stored training pairs
        self.sample_test()  # load the stored test and validation candidates

    def pre_processing(self,):
        """
        Parse the Sentires file, keep the (user, item) pairs that also occur in the review file,
        rename users / items / features to integer indices and build the date-ordered interaction
        histories.  Results are stored on the instance.
        :return: True
        """
        sentiment_data = []  # [userID, itemID, [fos triplet 1], [fos triplet 2], ...]
        with open(sentires_dir, 'r') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    break  # reading stops at the first blank line, like the original readline loop
                fields = line.split('@')
                user, item = fields[0], fields[1]
                row = [user, item]
                # Skip "user@item@" plus one further character.
                # NOTE(review): presumably a leading separator of the Sentires format - confirm.
                fosr_data = line[len(user) + len(item) + 3:]
                for seg in fosr_data.split('||'):
                    parts = seg.split(':')[0].strip('|').split('|')
                    if len(parts) > 1:
                        row.append([parts[0], parts[1], parts[2]])  # feature, opinion, sentiment
                sentiment_data.append(row)
        # The rows stay a plain Python list: they are ragged, and they are mutated in place below.

        # O(1) membership test instead of scanning a per-user item list for every review line
        sentiment_pairs = {(row[0], row[1]) for row in sentiment_data}
        user_item_date_dict = {}   # {(user, item): date, ...}; the first occurrence wins (removes duplicates)
        with open(review_dir, 'r') as review_file:
            for line in review_file:
                record = json.loads(line)
                pair = (record['reviewerID'], record['asin'])
                date = record['unixReviewTime']
                if pair in sentiment_pairs and pair not in user_item_date_dict:
                    user_item_date_dict[pair] = date

        # remove the (user, item) not exist in the official dataset, possibly due to update?
        sentiment_data = [row for row in sentiment_data if (row[0], row[1]) in user_item_date_dict]
        user_dict, item_dict = get_user_item_dict(sentiment_data)  # not sorted with time

        # rename users, items, and features to integer names (indices follow first appearance)
        features = get_feature_list(sentiment_data)
        user_name_dict = {user: index for index, user in enumerate(user_dict)}
        item_name_dict = {item: index for index, item in enumerate(item_dict)}
        feature_name_dict = {feature: index for index, feature in enumerate(features)}

        for row in sentiment_data:
            row[0] = user_name_dict[row[0]]
            row[1] = item_name_dict[row[1]]
            for fos in row[2:]:  # the slice copies the row, but the triplets themselves are shared
                fos[0] = feature_name_dict[fos[0]]

        user_item_date_dict = {
            (user_name_dict[pair[0]], item_name_dict[pair[1]]): date
            for pair, date in user_item_date_dict.items()
        }

        # sort with date
        user_item_date_dict = dict(sorted(user_item_date_dict.items(), key=lambda item: item[1]))

        user_hist_inter_dict = {}  # {"u1": [i1, i2, i3, ...], "u2": [i1, i2, i3, ...]}, sort with time
        item_hist_inter_dict = {}
        for user, item in user_item_date_dict:
            user_hist_inter_dict.setdefault(user, []).append(item)
            item_hist_inter_dict.setdefault(item, []).append(user)

        user_hist_inter_dict = dict(sorted(user_hist_inter_dict.items()))
        item_hist_inter_dict = dict(sorted(item_hist_inter_dict.items()))

        users = list(user_hist_inter_dict.keys())
        items = list(item_hist_inter_dict.keys())

        self.sentiment_data = sentiment_data
        self.user_name_dict = user_name_dict
        self.item_name_dict = item_name_dict
        self.feature_name_dict = feature_name_dict
        self.user_hist_inter_dict = user_hist_inter_dict
        self.item_hist_inter_dict = item_hist_inter_dict
        self.users = users
        self.items = items
        self.features = features
        self.user_num = len(users)
        self.item_num = len(items)
        self.feature_num = len(features)
        return True

    def get_user_item_feature_matrix(self,):
        """
        Build the user attention matrix and the item quality matrix.

        NOTE(review): every interaction in ``user_hist_inter_dict`` is used, so interactions that
        also appear in the stored test/validation splits are not held out here - confirm that
        this is intended.
        :return: True
        """
        train_u_i_set = {
            (user, item)
            for user, items in self.user_hist_inter_dict.items()
            for item in items
        }
        train_sentiment_data = [row for row in self.sentiment_data if (row[0], row[1]) in train_u_i_set]
        self.user_feature_matrix = get_user_attention_matrix(
            train_sentiment_data,
            self.user_num,
            self.features,
            max_range=5)
        self.item_feature_matrix = get_item_quality_matrix(
            train_sentiment_data,
            self.item_num,
            self.features,
            max_range=5)
        print(len(train_sentiment_data))
        return True

    def sample_training(self):
        """
        Load the stored training pairs from ``save_path + 'training_data.txt'`` (columns: user id,
        item id, label) and translate the ids to integer indices.  Pairs whose user or item is
        unknown to this dataset are skipped.
        :return: True
        """
        print('======================= sample training data =======================')
        training_data = []
        training_pairs = np.loadtxt(save_path+'training_data.txt',dtype=str)
        for pair in training_pairs:
            if pair[0] in self.user_name_dict and pair[1] in self.item_name_dict:
                training_data.append([self.user_name_dict[pair[0]], self.item_name_dict[pair[1]], int(pair[2])])
        print('# training samples :', len(training_data))
        self.training_data = np.array(training_data)
        return True

    def _load_eval_split(self, file_name):
        """
        Read one pickled evaluation split ({user id: (item ids, labels, ...)}) from ``save_path``
        and return an object array with one [user index, item indices, labels] row per known user.
        """
        with open(save_path + file_name, 'rb') as f:
            # NOTE(review): pickle.load can execute arbitrary code - only load trusted split files.
            split_pairs = pickle.load(f)
        rows = []  # [[u, [item1, item2, ...], [l1, l2, ...]], ...]
        for user_id, entry in split_pairs.items():
            if user_id not in self.user_name_dict:
                continue
            # Drop unknown items TOGETHER with their labels; filtering only the items would leave
            # the two sequences misaligned (and of different length).
            kept = [
                (self.item_name_dict[item], float(label))
                for item, label in zip(entry[0], entry[1])
                if item in self.item_name_dict
            ]
            items = np.array([item for item, _ in kept])
            labels = np.array([label for _, label in kept])
            rows.append([self.user_name_dict[user_id], items, labels])
        # Rows mix a scalar with two arrays, so fill an object array explicitly instead of
        # relying on np.array() to accept ragged input.
        table = np.empty((len(rows), 3), dtype=object)
        for index, row in enumerate(rows):
            table[index, 0] = row[0]
            table[index, 1] = row[1]
            table[index, 2] = row[2]
        return table

    def sample_test(self):
        """
        Load the stored test and validation candidates ('test_data.pickle' and
        'validation_data.pickle' under ``save_path``) into ``self.test_data`` and
        ``self.validation_data``.
        :return: True
        """
        print('======================= sample test data =======================')
        self.test_data = self._load_eval_split('test_data.pickle')
        print('# test samples :', len(self.test_data))
        self.validation_data = self._load_eval_split('validation_data.pickle')
        print('# validation samples :', len(self.validation_data))
        return True

    def save(self, save_path):
        """Placeholder: nothing is written; always returns True."""
        return True

    def load(self):
        """Placeholder: nothing is read; always returns False."""
        return False



In [None]:
def amazon_preprocessing():
    """Build and return a fully initialised AmazonDataset (its constructor runs the whole pipeline)."""
    return AmazonDataset()

In [None]:
def dataset_init():
    """
    Build the dataset object selected by the notebook-level ``dataset`` setting.
    :return: the preprocessed dataset
    :raises ValueError: if ``dataset`` names an unsupported dataset
    """
    if dataset == "yelp":
        # NOTE(review): yelp_preprocessing is not defined in the visible cells - confirm it exists.
        return yelp_preprocessing()
    # Membership test: the former `dataset == "cell_phones" or "kindle_store" or ...` was always
    # truthy, because a non-empty string literal is truthy on its own.
    if dataset in ("cell_phones", "kindle_store", "electronic", "cds_and_vinyl"):
        return amazon_preprocessing()
    raise ValueError("unsupported dataset: %r" % (dataset,))

# Training the base recommendation model

In [None]:
# ---- what to train on ----
dataset = "cds_and_vinyl"  # dataset key consumed by dataset_init()

# ---- hardware ----
gpu = True   # False selects the CPU
cuda = '0'   # CUDA device index, kept as a string (it is formatted into 'cuda:%s')

# ---- optimisation ----
lr = 0.01
weight_decay = 0.00001
batch_size = 64
epochs = 15

# ---- evaluation ----
rec_k = 5  # cut-off k passed to compute_ndcg

In [None]:
import numpy as np
from torch.utils.data import Dataset
class UserItemInterDataset(Dataset):
    """
    Torch dataset over labelled (user, item) pairs.

    Each sample is the user's row of the user feature matrix, the item's row of the item feature
    matrix and the label stored with the pair.
    """

    def __init__(self, data, user_feature_matrix, item_feature_matrix):
        # data: indexable collection of [user index, item index, label] records
        self.data = data
        self.user_feature_matrix = user_feature_matrix
        self.item_feature_matrix = item_feature_matrix

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = self.data[index]
        user_row = self.user_feature_matrix[record[0]]
        item_row = self.item_feature_matrix[record[1]]
        return user_row, item_row, record[2]

In [None]:
import numpy as np
import torch
from sklearn.metrics import ndcg_score

In [None]:
def compute_ndcg(test_data, user_feature_matrix, item_feature_matrix, k, model, device):
    """
    Average NDCG@k of the model over a set of per-user candidate lists.
    :param test_data: iterable of [user, items, labels] rows (candidate item indices and their labels)
    :param user_feature_matrix: per-user feature rows, indexed by user
    :param item_feature_matrix: per-item feature rows, indexed by item
    :param k: cut-off passed to ``ndcg_score``
    :param model: scoring model called as ``model(user_features, item_features)``; put in eval mode here
    :param device: device the feature tensors are moved to
    :return: mean of the per-user NDCG values
    """
    model.eval()
    per_user_ndcg = []
    with torch.no_grad():
        for entry in test_data:
            user, items, gt_labels = entry[0], entry[1], entry[2]
            # the same user row is paired with every candidate item
            user_block = np.array([user_feature_matrix[user]] * len(items))
            item_block = np.array([item_feature_matrix[item] for item in items])
            user_tensor = torch.from_numpy(user_block).to(device)
            item_tensor = torch.from_numpy(item_block).to(device)
            predicted = model(user_tensor, item_tensor).squeeze()
            predicted = np.array(predicted.to('cpu'))
            per_user_ndcg.append(ndcg_score([gt_labels], [predicted], k=k))
    return np.mean(per_user_ndcg)


In [None]:
import torch
import numpy as np
import os
import tqdm
import pickle
from pathlib import Path
from torch.utils.data import DataLoader

In [None]:
from numpy import core

class BaseRecModel(torch.nn.Module):
    """
    Base recommender: a three-layer MLP over the concatenated user and item feature vectors,
    ending in a sigmoid so the output is a score in (0, 1).
    """

    def __init__(self, feature_length):
        # feature_length: size of one feature vector; the network input is twice that size
        super().__init__()
        nn = torch.nn
        layers = [
            nn.Linear(feature_length * 2, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        ]
        # the attribute name and the layer order define the state_dict keys - keep both stable
        self.fc = nn.Sequential(*layers)

    def forward(self, user_feature, item_feature):
        # concatenate along the feature axis: (batch, f) + (batch, f) -> (batch, 2f)
        return self.fc(torch.cat([user_feature, item_feature], dim=1))


In [None]:
# Resolve the compute device from the `gpu` / `cuda` settings and echo the choice.
device = torch.device('cuda:%s' % cuda) if gpu else 'cpu'
print(device)

cuda:0


In [None]:
# Builds the dataset at notebook level (presumably for interactive inspection).
# train_base_recommendation() calls dataset_init() again and does not reuse this object.
rec_dataset = dataset_init()



268135
# training samples : 958920
# test samples : 5695




In [None]:
def train_base_recommendation():
    """
    Train the base recommendation model and report its ranking quality.

    Steps: build the dataset, pickle it under ``save_path``, train ``BaseRecModel`` with BCE loss
    and SGD for ``epochs`` epochs, print NDCG@``rec_k`` on the validation split every 10th epoch,
    save the weights to ``./logs/<dataset>_logs/model_main.model`` and print the test NDCG.
    All settings are read from the notebook-level configuration variables.
    :return: 0
    """
    device = torch.device('cuda:%s' % cuda) if gpu else 'cpu'

    rec_dataset = dataset_init()
    Path(save_path).mkdir(parents=True, exist_ok=True)
    with open(os.path.join(save_path, dataset + "_dataset_obj_main.pickle"), 'wb') as outp:
        pickle.dump(rec_dataset, outp, pickle.HIGHEST_PROTOCOL)

    train_loader = DataLoader(dataset=UserItemInterDataset(rec_dataset.training_data,
                                rec_dataset.user_feature_matrix,
                                rec_dataset.item_feature_matrix),
                          batch_size=batch_size,
                          shuffle=True)

    model = BaseRecModel(rec_dataset.feature_num).to(device)
    loss_fn = torch.nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)

    out_path = os.path.join("./logs", dataset + "_logs")
    Path(out_path).mkdir(parents=True, exist_ok=True)

    def ndcg_on(split):
        # NDCG@rec_k of the current model on one evaluation split
        return compute_ndcg(split,
                            rec_dataset.user_feature_matrix,
                            rec_dataset.item_feature_matrix,
                            rec_k,
                            model,
                            device)

    ndcg = ndcg_on(rec_dataset.test_data)
    print('init ndcg:', ndcg)
    for epoch in tqdm.trange(epochs):
        model.train()  # compute_ndcg switches the model to eval mode, so switch back every epoch
        optimizer.zero_grad()
        losses = []
        for user_behaviour_feature, item_aspect_feature, label in train_loader:
            user_behaviour_feature = user_behaviour_feature.to(device)
            item_aspect_feature = item_aspect_feature.to(device)
            label = label.float().to(device)
            # squeeze only the trailing size-1 axis; a bare squeeze() would also drop the batch
            # axis of a final batch holding a single sample and break the loss shape check
            out = model(user_behaviour_feature, item_aspect_feature).squeeze(-1)
            loss = loss_fn(out, label)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            losses.append(loss.to('cpu').detach().numpy())
        # averaged once per epoch (it used to be recomputed after every batch); an empty loader
        # yields nan instead of an undefined variable
        ave_train = np.mean(np.array(losses)) if losses else float('nan')
        print('epoch %d: ' % epoch, 'training loss: ', ave_train)
        # compute ndcg on the validation split every 10th epoch
        if epoch % 10 == 0:
            ndcg = ndcg_on(rec_dataset.validation_data)
            print('epoch %d: ' % epoch, 'training loss: ', ave_train, 'NDCG_validation: ', ndcg)
    torch.save(model.state_dict(), os.path.join(out_path, "model_main.model"))
    ndcg = ndcg_on(rec_dataset.test_data)
    print('ndcg_test:',ndcg)
    return 0


if __name__ == "__main__":
    # fix both random sources so runs are repeatable
    np.random.seed(0)
    torch.manual_seed(0)
    if not gpu:
        print("Using CPU")
    else:
        # restrict the process to the configured CUDA device
        os.environ["CUDA_VISIBLE_DEVICES"] = cuda
        print("Using CUDA", cuda)
    train_base_recommendation()

Using CUDA 0




268135
# training samples : 958920
# test samples : 5695
# validation samples : 5695




init ndcg: 0.19035359251113088


  0%|          | 0/15 [00:00<?, ?it/s]

epoch 0:  training loss:  0.37519625


  7%|▋         | 1/15 [01:12<16:51, 72.25s/it]

epoch 0:  training loss:  0.37519625 NDCG_validation:  0.46051518283695214


 13%|█▎        | 2/15 [02:16<14:39, 67.69s/it]

epoch 1:  training loss:  0.3359493


 20%|██        | 3/15 [03:19<13:07, 65.64s/it]

epoch 2:  training loss:  0.32185248


 27%|██▋       | 4/15 [04:23<11:52, 64.76s/it]

epoch 3:  training loss:  0.31146774


 33%|███▎      | 5/15 [05:27<10:46, 64.69s/it]

epoch 4:  training loss:  0.3026597


 40%|████      | 6/15 [06:32<09:40, 64.50s/it]

epoch 5:  training loss:  0.2948305


 47%|████▋     | 7/15 [07:35<08:33, 64.18s/it]

epoch 6:  training loss:  0.2874111


 53%|█████▎    | 8/15 [08:40<07:30, 64.31s/it]

epoch 7:  training loss:  0.2802841


 60%|██████    | 9/15 [09:44<06:25, 64.17s/it]

epoch 8:  training loss:  0.27290282


 67%|██████▋   | 10/15 [10:47<05:20, 64.04s/it]

epoch 9:  training loss:  0.26584113
epoch 10:  training loss:  0.2584132


 73%|███████▎  | 11/15 [11:59<04:25, 66.36s/it]

epoch 10:  training loss:  0.2584132 NDCG_validation:  0.518816797175427


 80%|████████  | 12/15 [13:02<03:16, 65.41s/it]

epoch 11:  training loss:  0.2514084


 87%|████████▋ | 13/15 [14:07<02:10, 65.20s/it]

epoch 12:  training loss:  0.24439223


 93%|█████████▎| 14/15 [15:10<01:04, 64.66s/it]

epoch 13:  training loss:  0.23698409


100%|██████████| 15/15 [16:13<00:00, 64.93s/it]

epoch 14:  training loss:  0.22957689





ndcg_test: 0.5522675455268641


# Training the explanation optimization (ExpOptimization) model

In [None]:
# ---- data / model locations ----
dataset = "cds_and_vinyl"
base_model_path = "/content/drive/Shareddrives/Unlimited Drive | @LicenseMarket/Recommender/cds/"
data_obj_path = "/content/drive/Shareddrives/Unlimited Drive | @LicenseMarket/Recommender/cds/"
save_path = "/content/drive/Shareddrives/Unlimited Drive | @LicenseMarket/Recommender/cds/"

# ---- hardware ----
gpu = True
cuda = '0'

# ---- explanation optimisation settings ----
# NOTE(review): the code consuming these values is outside the visible cells; the meanings of
# lam / gam / alp / step / mask_thresh / test_num should be documented where they are used.
rec_k = 5
lam = 100
gam = 1.0
alp = 0.2
user_mask = True
lr = 0.01
step = 1000
mask_thresh = 0.1
test_num = -1

In [None]:
import torch
import pickle
import os
from pathlib import Path

In [None]:
def evaluate_user_perspective(user_perspective_data, u_i_expl_dict):
    """
    Compute precision, recall and F1 of the generated explanations against the
    ground-truth features of each (user, item) pair.

    :param user_perspective_data: {(u, i): [f1, f2, ...], ...} ground-truth features
    :param u_i_expl_dict: {(u, i): [f1, f2, ...], ...} generated explanation features
    :return: (ave_pre, ave_rec, ave_f1), averaged over the pairs present in both
        dicts; (0.0, 0.0, 0.0) when there is no such pair
    """
    pres = []
    recs = []
    f1s = []
    for u_i, gt_features in user_perspective_data.items():
        if u_i not in u_i_expl_dict:
            continue
        pre_features = u_i_expl_dict[u_i]
        TP = sum(1 for feature in pre_features if feature in gt_features)
        # An empty explanation / ground-truth list scores 0 instead of raising
        # ZeroDivisionError.
        pre = TP / len(pre_features) if len(pre_features) != 0 else 0
        rec = TP / len(gt_features) if len(gt_features) != 0 else 0
        if (pre + rec) != 0:
            f1 = (2 * pre * rec) / (pre + rec)
        else:
            f1 = 0
        pres.append(pre)
        recs.append(rec)
        f1s.append(f1)
    if not pres:
        # np.mean([]) would return nan and emit a RuntimeWarning.
        return 0.0, 0.0, 0.0
    ave_pre = np.mean(pres)
    ave_rec = np.mean(recs)
    ave_f1 = np.mean(f1s)
    return ave_pre, ave_rec, ave_f1

In [None]:
def _counterfactual_rank(base_model, user_features, cf_item_features, target_index, device):
    """Return the 0-based rank of the target item after re-scoring the counterfactual item features."""
    scores = base_model(torch.from_numpy(user_features).to(device),
                        torch.from_numpy(cf_item_features).to(device))
    # reshape(-1) instead of squeeze() keeps a 1-D array even for a single-item list.
    score_list = scores.reshape(-1).to('cpu').detach().numpy()
    sorted_index = np.argsort(score_list)[::-1]
    return np.argwhere(sorted_index == target_index)[0, 0]


def evaluate_model_perspective(
        rec_dict,
        u_i_exp_dict,
        base_model,
        user_feature_matrix,
        item_feature_matrix,
        rec_k,
        device):
    """
    compute PN, PS and F_NS score for the explanations
    :param rec_dict: {u1: [i1, i2, i3, ...] , u2: [i1, i2, i3, ...]}
    :param u_i_exp_dict: {(u, i): [f1, f2, ...], ...}
    :param base_model: the trained base recommendation model
    :param user_feature_matrix: |u| x |p| matrix, the attention on each feature p for each user u
    :param item_feature_matrix: |i| x |p| matrix, the quality on each feature p for each item i
    :param rec_k: the length of the recommendation list, only generated explanations for the items on the list
    :param device: the device of the model
    :return: the mean of the PN, PS and FNS scores
    """
    pn_count = 0
    ps_count = 0
    for (user, target_item), fs in u_i_exp_dict.items():
        items = rec_dict[user]
        target_index = items.index(target_item)
        item_features = np.stack([np.asarray(item_feature_matrix[item]) for item in items])
        user_features = np.repeat(
            np.asarray(user_feature_matrix[user])[np.newaxis, :], len(items), axis=0)
        # Boolean mask over feature positions that belong to the explanation.
        in_explanation = np.isin(np.arange(item_features.shape[1]), list(set(fs)))

        # compute PN: zero out the explanation features of every candidate item;
        # the explanation is "necessary" if the target then drops out of the top-k.
        without_exp = np.where(in_explanation, 0, item_features).astype('float32')
        cf_rank = _counterfactual_rank(base_model, user_features, without_exp, target_index, device)
        if cf_rank > rec_k - 1:
            pn_count += 1

        # compute PS: keep only the explanation features; the explanation is
        # "sufficient" if the target still ranks inside the top-k.
        only_exp = np.where(in_explanation, item_features, 0).astype('float32')
        cf_rank = _counterfactual_rank(base_model, user_features, only_exp, target_index, device)
        if cf_rank < rec_k:
            ps_count += 1

    if len(u_i_exp_dict) != 0:
        pn = pn_count / len(u_i_exp_dict)
        ps = ps_count / len(u_i_exp_dict)
        if (pn + ps) != 0:
            fns = (2 * pn * ps) / (pn + ps)
        else:
            fns = 0
    else:
        pn = 0
        ps = 0
        fns = 0
    return pn, ps, fns

In [None]:
import json

In [None]:
class ExpOptimizationModel(torch.nn.Module):
    """
    Generates and evaluates feature-level explanations for the top-k
    recommendations of a frozen base recommendation model.

    Reads these notebook-level settings: rec_k, test_num, user_mask, lr, step,
    lam, alp, gam, mask_thresh.
    """

    def __init__(self, base_model, rec_dataset, device):
        """
        :param base_model: the trained base recommendation model
        :param rec_dataset: dataset object exposing user_feature_matrix, item_feature_matrix,
            test_data, sentiment_data and feature_num
        :param device: the device the feature matrices are moved to
        """
        super(ExpOptimizationModel, self).__init__()
        self.base_model = base_model
        self.rec_dataset = rec_dataset
        self.device = device
        self.u_i_exp_dict = {}  # {(user, item): [f1, f2, f3 ...], ...}
        self.user_feature_matrix = torch.from_numpy(self.rec_dataset.user_feature_matrix).to(self.device)
        self.item_feature_matrix = torch.from_numpy(self.rec_dataset.item_feature_matrix).to(self.device)
        self.rec_dict, self.user_perspective_test_data = self.generate_rec_dict()

    def generate_rec_dict(self):
        """
        Rank every user's candidate items with the base model.

        :return: (rec_dict, user_perspective_test_data) where rec_dict is
            {user: [items sorted by descending score]} and user_perspective_test_data is
            {(user, item): [features]} for the top-rec_k items labelled 1 that also
            appear in the sentiment data
        """
        rec_dict = {}
        correct_rec_dict = {}  # used for user-side evaluation
        for row in self.rec_dataset.test_data:
            user = row[0]
            items = row[1]
            labels = row[2]
            user_features = self.user_feature_matrix[user].repeat(len(items), 1)
            # reshape(-1) (not squeeze) keeps a 1-D score vector for single-item rows.
            scores = self.base_model(user_features,
                                     self.item_feature_matrix[items]).reshape(-1)
            scores = scores.detach().to('cpu').numpy()
            sort_index = sorted(range(len(scores)), key=lambda k: scores[k], reverse=True)
            rec_dict[user] = [items[i] for i in sort_index]
            # Slicing tolerates users with fewer than rec_k candidates.
            correct_rec_dict[user] = [items[i] for i in sort_index[:rec_k] if labels[i] == 1]

        user_item_feature_dict = {}  # {(u, i): [f, ...], ...}
        for row in self.rec_dataset.sentiment_data:
            user = row[0]
            item = row[1]
            # Entries after the first two are per-feature records; index 0 is the feature.
            user_item_feature_dict[(user, item)] = [fos[0] for fos in row[2:]]

        user_perspective_test_data = {}  # {(u, i): [f, ...], ...}
        for user, items in correct_rec_dict.items():
            for item in items:
                if (user, item) in user_item_feature_dict:
                    user_perspective_test_data[(user, item)] = user_item_feature_dict[(user, item)]
        return rec_dict, user_perspective_test_data

    def generate_explanation(self):
        """
        Generate explanations for the top-rec_k items of each user and store
        them in self.u_i_exp_dict.

        Processes the first `test_num` users of rec_dict, or all of them when
        test_num == -1. Users with no item at position rec_k (no margin item)
        are skipped.

        :return: True
        """
        exp_nums = []
        exp_complexities = []
        self.no_exp_count = 0
        user_items = list(self.rec_dict.items())
        if test_num != -1:
            user_items = user_items[:test_num]
        for user, items in tqdm.tqdm(user_items):
            if len(items) <= rec_k:
                # There is no (rec_k + 1)-th item to use as the ranking margin.
                continue
            margin_item = items[rec_k]
            margin_score = self.base_model(self.user_feature_matrix[user].unsqueeze(0),
                                           self.item_feature_matrix[margin_item].unsqueeze(0)).squeeze()
            if user_mask:
                # only choose exps from the user cared aspects
                mask_vec = torch.where(self.user_feature_matrix[user] > 0, 1., 0.).unsqueeze(0)
            else:
                mask_vec = torch.ones(self.rec_dataset.feature_num, device=self.device).unsqueeze(0)
            for item in items[: rec_k]:
                explanation_features, exp_num, exp_complexity = self.explain(
                    self.user_feature_matrix[user],
                    self.item_feature_matrix[item],
                    margin_score,
                    mask_vec)

                if explanation_features is None:
                    self.no_exp_count += 1
                else:
                    self.u_i_exp_dict[(user, item)] = explanation_features
                    exp_nums.append(exp_num)
                    exp_complexities.append(exp_complexity)

        # Avoid the RuntimeWarning np.mean raises on an empty list; nan is still reported.
        ave_num = np.mean(exp_nums) if exp_nums else float('nan')
        ave_complexity = np.mean(exp_complexities) if exp_complexities else float('nan')
        print('ave num: ', ave_num, 'ave complexity: ', ave_complexity, 'no_exp_count: ', self.no_exp_count)
        return True

    def explain(self, user_feature, item_feature, margin_score, mask_vec):
        """
        Optimise a perturbation of one item's features and derive the explanation.

        :param user_feature: feature vector of the user
        :param item_feature: feature vector of the item being explained
        :param margin_score: base-model score of the item at position rec_k
        :param mask_vec: 1 x |p| mask of the features that may be perturbed
        :return: (explanation_features, exp_num, exp_complexity), or (None, None, None)
            when the best ranking term found is still >= lam * alp
        """
        exp_generator = EXPGenerator(
            self.rec_dataset,
            self.base_model,
            user_feature,
            item_feature,
            margin_score,
            mask_vec,
            self.device).to(self.device)

        # optimization
        optimizer = torch.optim.SGD(exp_generator.parameters(), lr=lr, weight_decay=0)
        exp_generator.train()
        lowest_loss = None
        lowest_bpr = None
        lowest_l2 = None
        optimize_delta = None
        # Evaluate the initial point plus the point reached after each of the
        # `step` updates, so every recorded loss is paired with the delta that
        # produced it.
        for epoch in range(step + 1):
            exp_generator.zero_grad()
            score = exp_generator()
            bpr, l2, l1, loss = exp_generator.loss(score)
            if lowest_loss is None or loss < lowest_loss:
                lowest_loss = loss.detach()
                lowest_l2 = l2.detach()
                lowest_bpr = bpr.detach()
                # clone(): on CPU, .numpy() shares memory with the parameter and
                # would otherwise be overwritten by later optimizer steps.
                optimize_delta = exp_generator.delta.detach().to('cpu').clone().numpy()
            if epoch < step:
                loss.backward()
                optimizer.step()

        if lowest_bpr >= lam * alp:
            explanation_features = None
            exp_num = None
            exp_complexity = None
        else:
            explanation_features = np.argwhere(optimize_delta < - mask_thresh).squeeze(axis=1)
            if len(explanation_features) == 0:
                # Always report at least the most strongly reduced feature.
                explanation_features = np.array([np.argmin(optimize_delta)])
            exp_num = len(explanation_features)
            exp_complexity = lowest_l2.to('cpu').detach().numpy() + gam * exp_num
        return explanation_features, exp_num, exp_complexity

    def user_side_evaluation(self):
        """Print precision / recall / F1 of the explanations against the user-side test data."""
        ave_pre, ave_rec, ave_f1 = evaluate_user_perspective(self.user_perspective_test_data, self.u_i_exp_dict)
        print('user\'s perspective:')
        print('ave pre: ', ave_pre, '  ave rec: ', ave_rec, '  ave f1: ', ave_f1)

    def model_side_evaluation(self):
        """Print the PN, PS and F_NS scores of the explanations."""
        ave_pn, ave_ps, ave_fns = evaluate_model_perspective(
            self.rec_dict,
            self.u_i_exp_dict,
            self.base_model,
            self.rec_dataset.user_feature_matrix,
            self.rec_dataset.item_feature_matrix,
            rec_k,
            self.device)
        print('model\'s perspective:')
        print('ave PN: ', ave_pn, '  ave PS: ', ave_ps, '  ave F_{NS}: ', ave_fns)


In [None]:
class EXPGenerator(torch.nn.Module):
    """
    Holds a learnable vector `delta` that perturbs one item's feature vector,
    and scores the perturbed item for one user with the base model.

    The loss reads the notebook-level settings alp, lam and gam.
    """

    def __init__(self, rec_dataset, base_model, user_feature, item_feature, margin_score, mask_vec, device):
        super().__init__()
        self.rec_dataset = rec_dataset
        self.base_model = base_model
        self.user_feature = user_feature
        self.item_feature = item_feature
        self.margin_score = margin_score
        self.mask_vec = mask_vec
        self.device = device
        self.feature_range = [0, 5]  # hard coded, should be improved later
        lowest, highest = self.feature_range
        self.delta_range = highest - lowest  # the maximum feature value.
        # One perturbation entry per feature, initialised uniformly in [-delta_range, 0].
        initial_delta = torch.FloatTensor(len(self.user_feature)).uniform_(-self.delta_range, 0)
        self.delta = torch.nn.Parameter(initial_delta)

    def get_masked_item_feature(self):
        """Return the item features plus the masked, non-positive delta, clamped to feature_range."""
        lowest, highest = self.feature_range
        masked_delta = torch.clamp(self.delta * self.mask_vec, -self.delta_range, 0)
        return torch.clamp(self.item_feature + masked_delta, lowest, highest)

    def forward(self):
        """Score the perturbed item for the user with the base model."""
        perturbed_item = self.get_masked_item_feature()
        return self.base_model(self.user_feature.unsqueeze(0), perturbed_item)

    def loss(self, score):
        """
        :param score: base-model score of the perturbed item
        :return: (bpr, l2, l1, loss) where bpr is the lam-weighted hinge on
            alp + score - margin_score, l2 is the L2 norm of delta, l1 is the
            gam-weighted L1 norm of delta, and loss is their sum
        """
        hinge = torch.nn.functional.relu(alp + score - self.margin_score)
        bpr = hinge * lam
        l2 = torch.linalg.norm(self.delta)
        l1 = torch.linalg.norm(self.delta, ord=1) * gam
        loss = l2 + bpr + l1
        return bpr, l2, l1, loss

In [None]:
# Directory into which generate_explanation() writes the pickled explanation model.
save_path2='/content/drive/MyDrive/CDs'

In [None]:
def generate_explanation():
    """
    Load the dataset and the trained base model, generate and evaluate the
    explanations, and pickle the resulting optimization model.

    Reads the notebook-level settings gpu, cuda, data_obj_path, dataset,
    base_model_path and save_path2.

    :return: the ExpOptimizationModel holding the generated explanations
    """
    if gpu:
        device = torch.device('cuda:%s' % cuda)
    else:
        device = torch.device('cpu')
    print(device)

    # Create the directory that is actually written to, and do it before the
    # long-running optimization so a bad path fails early.
    Path(save_path2).mkdir(parents=True, exist_ok=True)

    # import dataset
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(os.path.join(data_obj_path, dataset + "_dataset_obj_main.pickle"), 'rb') as inp:
        rec_dataset = pickle.load(inp)

    base_model = BaseRecModel(rec_dataset.feature_num).to(device)
    base_model.load_state_dict(
        torch.load(os.path.join(base_model_path, "model_main.model"), map_location=device))
    base_model.eval()
    #  fix the rec model
    for param in base_model.parameters():
        param.requires_grad = False

    # Create optimization model
    opt_model = ExpOptimizationModel(
        base_model=base_model,
        rec_dataset=rec_dataset,
        device=device,
    )

    opt_model.generate_explanation()
    opt_model.user_side_evaluation()
    opt_model.model_side_evaluation()

    with open(os.path.join(save_path2, dataset + "_explanation_obj_main.pickle"), 'wb') as outp:
        pickle.dump(opt_model, outp, pickle.HIGHEST_PROTOCOL)
    return opt_model


if __name__ == "__main__":
    opt_model=generate_explanation()

cuda:0


100%|██████████| 20/20 [02:15<00:00,  6.79s/it]


ave num:  5.229166666666667 ave complexity:  11.135265263418356 no_exp_count:  4
user's perspective:
ave pre:  0.25303466691026494   ave rec:  0.3021791010921446   ave f1:  0.22085151503471323
model's perspective:
ave PN:  0.8854166666666666   ave PS:  0.59375   ave F_{NS}:  0.7108274647887324


cuda:0

100%|██████████| 5695/5695 [10:08:42<00:00,  6.41s/it]

ave num:  5.091492435832058 ave complexity:  11.402920393188065 no_exp_count:  4943
user's perspective:
ave pre:  0.22303986309932253   ave rec:  0.3291177828586174   ave f1:  0.22893694747165352
model's perspective:
ave PN:  0.7782593914669387   ave PS:  0.6795852456229815   ave F_{NS}:  0.7255829410796528


In [None]:
# Reload a previously pickled explanation model.
# NOTE(review): generate_explanation() writes this file under save_path2, but it is
# read back from save_path here; confirm which directory holds the intended run.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join(save_path, dataset + "_explanation_obj_main.pickle"), 'rb') as opt:
    opt_model = pickle.load(opt)

In [None]:
# Prints the full mapping from raw item IDs to integer item indices (very large output).
print(rec_dataset.item_name_dict)

{'0005164885': 0, '1573300411': 1, '1858705770': 2, '1858705789': 3, '1858705223': 4, '3937406875': 5, '5552622685': 6, '5558565314': 7, '5559664235': 8, '5558225718': 9, '5559550833': 10, '5559291986': 11, '5558160063': 12, '630025075X': 13, '6300342468': 14, '6301248252': 15, '6301334175': 16, '6301732901': 17, '6301846052': 18, '6301955013': 19, '6302158249': 20, '6302250056': 21, '6302404282': 22, '6302371619': 23, '6302482402': 24, '6302559359': 25, '6302626625': 26, '6302714222': 27, '6303234844': 28, '6303422748': 29, '6303415571': 30, '6303477259': 31, '6303553486': 32, '6303568041': 33, '6303694713': 34, '6303668380': 35, '6304051948': 36, '6304204000': 37, '6304281803': 38, '630463949X': 39, '6304683960': 40, '6304708025': 41, '6305026009': 42, '6305026734': 43, '6305131961': 44, '6305131147': 45, '6305198365': 46, '6305194513': 47, '6305205639': 48, '630522336X': 49, '6305226296': 50, '6305302499': 51, '6305269076': 52, '6305323615': 53, '6305323526': 54, '6305484554': 55, '

In [None]:
# Prints the full mapping from raw user IDs to integer user indices (very large output).
print(rec_dataset.user_name_dict)

{'A3LEN0P07MGJE2': 0, 'A2PBPFPMTEZYKP': 1, 'A1XETBGQF7SK2J': 2, 'A2LYK2N9IHPGZB': 3, 'A2GT0MCMRKLKDW': 4, 'A9VROMAWXIL9G': 5, 'A2KYG4U9VNPDVQ': 6, 'A1URYPVG5DLLZZ': 7, 'A2I6MHMAZZDCRX': 8, 'A27ZOCD5B63Y0P': 9, 'A1SCJWCMQ3W3KK': 10, 'A30BCRSABL9AYI': 11, 'A1BHJY46QTUS7J': 12, 'A2HRVE5MUWP0PN': 13, 'A2UGVZFKH8O5X4': 14, 'A3464G00K8ZYD1': 15, 'A1OKMIT8B373YD': 16, 'ALD1CRUBY1365': 17, 'A16YLL2PIINBVT': 18, 'A2FBJK2NDAD5M0': 19, 'A32TV3GASOEI3G': 20, 'A10872FHIJAKKD': 21, 'ARZ6WWAHX0J8K': 22, 'A3LYLRNW15DPU6': 23, 'A27LE5555J8X0I': 24, 'A3P81N48BDEYVK': 25, 'A16HM5IJDLDQOP': 26, 'A2FOTOS0E5GS7M': 27, 'A1MHTHU9A6OFN3': 28, 'A1EI65WJC85U68': 29, 'A8W9Q9K0TCVKT': 30, 'AWWN9XK9AHTG5': 31, 'A32STMMCLNJQTM': 32, 'A3VPISGFRPR0BH': 33, 'A2E6KYA1K2PHSF': 34, 'A3MEH37R79AD2S': 35, 'A2ERVCPWCHUOGP': 36, 'A1L6XHK8PNBK8P': 37, 'A3M459NLAT69P8': 38, 'A8Y17SG3G05PS': 39, 'A1YG9TB69OT00R': 40, 'A2RWGW6TGUX7E5': 41, 'A2ZF5D0KM2AWGU': 42, 'A39AN2T5IYT650': 43, 'A2GJXG6JMHBD60': 44, 'A2A0FBLKJN4ZY5': 45, 'AS

In [None]:
# Generated explanations: {(user index, item index): array of explanation feature indices}.
opt_model.u_i_exp_dict

In [None]:
# Generated explanations: {(user index, item index): array of explanation feature indices}.
opt_model.u_i_exp_dict

{(1552, 4251): array([  1,  33,  49,  50,  52,  91, 200]),
 (1552, 4098): array([  1,   3,  52,  61,  91, 367]),
 (1552, 9723): array([  1,  52, 450]),
 (1552, 9278): array([ 1,  3, 39]),
 (1552, 4194): array([ 3,  9, 49, 52, 91]),
 (0, 11243): array([  3,  10,  21,  49,  61,  66, 213, 367]),
 (0, 9484): array([61, 91]),
 (0, 12618): array([  1,  10, 131]),
 (0, 4260): array([ 1,  3, 61]),
 (0, 14044): array([213]),
 (4227, 11661): array([1]),
 (4227, 8316): array([1]),
 (4227, 1233): array([1]),
 (1, 8725): array([ 3,  6, 10]),
 (1, 3831): array([3]),
 (1, 8403): array([3, 4, 6]),
 (1, 7349): array([3, 6]),
 (1, 12090): array([ 6, 10]),
 (10365, 9928): array([ 6, 10]),
 (10365, 13255): array([ 3, 10]),
 (10365, 12449): array([ 3,  6, 10]),
 (10365, 5205): array([  3,   6, 131]),
 (3, 4251): array([ 1,  3, 10, 91]),
 (3, 9621): array([  3,  91, 131]),
 (3, 7930): array([91]),
 (3, 13674): array([ 3, 10]),
 (3, 2795): array([91]),
 (4, 7480): array([  1,   4,   6,  10,  34, 145]),
 (4, 

In [None]:
# Load the pickled dataset object into the notebook namespace; the cells below use its
# name dictionaries to translate indices back to raw IDs.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join(data_obj_path, dataset + "_dataset_obj_main.pickle"), 'rb') as inp:
        rec_dataset = pickle.load(inp)

In [None]:
# Invert the name -> index lookups so integer indices can be mapped back to
# raw user IDs, raw item IDs and feature words.
inv_user_name_dict = dict(zip(rec_dataset.user_name_dict.values(), rec_dataset.user_name_dict.keys()))
inv_item_name_dict = dict(zip(rec_dataset.item_name_dict.values(), rec_dataset.item_name_dict.keys()))
inv_feature_name_dict = dict(zip(rec_dataset.feature_name_dict.values(), rec_dataset.feature_name_dict.keys()))

In [None]:
# Print a readable view of the first 101 explained (user, item) pairs:
# raw IDs, integer indices, then one explanation feature name per line.
for shown, ((user, item), feas) in enumerate(opt_model.u_i_exp_dict.items()):
  if shown > 100:
    break
  user_id = inv_user_name_dict[user]
  item_id = inv_item_name_dict[item]
  print(user_id,item_id)
  print(user,item)
  for fea in feas:
    print(inv_feature_name_dict[fea])
  print('----------')

A30M3WWF54M74L B0000033O3
1552 4251
songs
music
sounds
fan
sound
concert
cd
talent
----------
A30M3WWF54M74L B0000032CU
1552 4098
songs
music
sounds
sound
cd
harmonies
----------
A30M3WWF54M74L B000FC2FUG
1552 9723
songs
cd
hits collection
----------
A30M3WWF54M74L B000A7X6U4
1552 9278
songs
music
fan
----------
A30M3WWF54M74L B0000033P8
1552 4194
music
band
sound
cd
hits
----------
A3LEN0P07MGJE2 B001G0LBXS
0 11243
album
haunting
performance
favorites
----------
A3LEN0P07MGJE2 B000BSM29I
0 9484
lyrics
release
hits
----------
A3LEN0P07MGJE2 B0096233ZW
0 12618
songs
album
collection
----------
A3LEN0P07MGJE2 B00000373U
0 4260
songs
music
release
----------
A3LEN0P07MGJE2 B016PUSFFK
0 14044
favorites
----------
AZOILH84GFKHO B000042OHB
4227 6298
songs
hits
----------
AZOILH84GFKHO B002UZXJA6
4227 11661
songs
----------
AZOILH84GFKHO B0000C9ZK6
4227 8316
songs
----------
AZOILH84GFKHO B0000024ZV
4227 1233
songs
----------
A2PBPFPMTEZYKP B000002VEC
1 3831
music
----------
A2PBPFPMTEZYKP B0

In [None]:
def generate_explanation_check_stability(num_runs=10):
    """
    Run the explanation generation several times on the same base model and
    dataset, to compare the explanations across runs.

    Reads the notebook-level settings gpu, cuda, data_obj_path, dataset and
    base_model_path.

    :param num_runs: number of independent explanation runs (default 10)
    :return: list with one {(user, item): explanation features} dict per run
    """
    if gpu:
        device = torch.device('cuda:%s' % cuda)
    else:
        device = torch.device('cpu')
    print(device)
    # import dataset
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(os.path.join(data_obj_path, dataset + "_dataset_obj_main.pickle"), 'rb') as inp:
        rec_dataset = pickle.load(inp)

    base_model = BaseRecModel(rec_dataset.feature_num).to(device)
    base_model.load_state_dict(
        torch.load(os.path.join(base_model_path, "model_main.model"), map_location=device))
    base_model.eval()
    #  fix the rec model
    for param in base_model.parameters():
        param.requires_grad = False

    features_found = []
    for _ in range(num_runs):
        # A fresh optimization model per run, so each run starts with an empty
        # explanation dict.
        opt_model = ExpOptimizationModel(
            base_model=base_model,
            rec_dataset=rec_dataset,
            device=device,
        )
        opt_model.generate_explanation()
        features_found.append(opt_model.u_i_exp_dict)

    return features_found


if __name__ == "__main__":
    features_found=generate_explanation_check_stability()

cuda:0


100%|██████████| 20/20 [02:10<00:00,  6.54s/it]


ave num:  5.09375 ave complexity:  10.94768756131331 no_exp_count:  4


100%|██████████| 20/20 [02:06<00:00,  6.32s/it]


ave num:  5.43298969072165 ave complexity:  11.407736933108458 no_exp_count:  3


100%|██████████| 20/20 [02:05<00:00,  6.28s/it]


ave num:  5.28125 ave complexity:  11.210509702563286 no_exp_count:  4


100%|██████████| 20/20 [02:06<00:00,  6.30s/it]


ave num:  5.11578947368421 ave complexity:  10.93648785791899 no_exp_count:  5


100%|██████████| 20/20 [02:05<00:00,  6.28s/it]


ave num:  5.061224489795919 ave complexity:  10.988381838311954 no_exp_count:  2


100%|██████████| 20/20 [02:05<00:00,  6.29s/it]


ave num:  5.288659793814433 ave complexity:  11.203695538117714 no_exp_count:  3


100%|██████████| 20/20 [02:15<00:00,  6.77s/it]


ave num:  5.195876288659794 ave complexity:  11.095307908107324 no_exp_count:  3


100%|██████████| 20/20 [02:05<00:00,  6.27s/it]


ave num:  4.625 ave complexity:  10.37072429060936 no_exp_count:  4


100%|██████████| 20/20 [02:05<00:00,  6.26s/it]


ave num:  5.208333333333333 ave complexity:  11.04939572016398 no_exp_count:  4


100%|██████████| 20/20 [02:05<00:00,  6.27s/it]

ave num:  5.333333333333333 ave complexity:  11.186549494663874 no_exp_count:  4





In [None]:
# Group the explanations by (user, item) pair: each value collects that pair's
# explanation from every run in which one was produced.
dict_features = {}
for run_explanations in features_found:
  for pair, explained in run_explanations.items():
    dict_features.setdefault(pair, []).append(explained)

In [None]:
# {(user, item): [explanation feature array from each run that produced one]}
dict_features

# Calculate stability (mean pairwise Jaccard similarity of the explanations across runs)

In [None]:
# Stability: for each (user, item) pair, sum the Jaccard similarity of its
# explanations over every ordered pair of runs, normalise by the number of
# ordered run pairs, then average over all explained pairs. Runs that produced
# no explanation for a pair contribute 0 similarity.
num_runs = len(features_found)
num_run_pairs = num_runs * (num_runs - 1)  # ordered (i, j) run pairs with i != j

stability = 0
for ui in dict_features.keys():
  feature_sets = [set(run_features) for run_features in dict_features[ui]]
  stabs = 0
  for i in range(len(feature_sets)):
    for j in range(len(feature_sets)):
      if i != j:
        union = feature_sets[i] | feature_sets[j]
        if union:
          stabs += len(feature_sets[i] & feature_sets[j]) / len(union)
  if num_run_pairs:
    stability += stabs / num_run_pairs

# Guard against an empty result set instead of dividing by zero.
stability = stability / len(dict_features) if dict_features else 0.0
print(stability)

0.6857752618053184


In [None]:
# Number of distinct (user, item) pairs that received an explanation in at least one run.
print(len( dict_features))

98
