In [1]:
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import AlgoBase
from surprise import PredictionImpossible
from surprise import Trainset

from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import os
# import sys
import math
import statistics
import collections

import sklearn as sk
from sklearn.metrics.pairwise import euclidean_distances
# scikit-learn spells the class `TruncatedSVD`; the lower-case `truncatedSVD`
# does not exist and makes this whole import cell fail with ImportError.
from sklearn.decomposition import NMF, TruncatedSVD

import numpy as np
import pandas as pd

import random
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

import datetime
from datetime import datetime
from time import time
import timeit

from collections import defaultdict

# from multiprocessing import Pool

In [2]:
class RatingDataset:
    """In-memory explicit-rating dataset parsed from a delimited text file.

    Raw (file) ids are kept as strings and mapped to dense integer inner ids.
    Inner ids are assigned in order of first appearance in the file, so the
    mapping is identical on every run (a plain ``set`` of strings would be
    ordered differently in every interpreter session).

    The ratings are stored in ``rating_mat``, a ``scipy.sparse.lil_matrix`` of
    shape ``(user_n, item_n)``; a stored value of 0 means "no rating".
    """
    # NOTE: imports in a class body only bind class attributes
    # (`RatingDataset.np`, `RatingDataset.sparse`). The methods below resolve
    # `np`, `sparse`, `random`, `defaultdict` and `Trainset` from module scope.
    import numpy as np
    from scipy import sparse

    def __init__(self):
        self.rating_mat = None
        # self.time_mat = None
        self._data_file_path = ''

        # lists of raw (dataset) ids; position in the list == inner id
        self.items = []
        self.users = []
        self.item_n = 0
        self.user_n = 0

        # maps raw user id to user inner id
        self.user_to_iid = {}
        # maps user inner id to raw user id
        self.user_to_ID = {}
        # maps raw item id to item inner id
        self.item_to_iid = {}
        # maps item inner id to raw item id
        self.item_to_ID = {}

        # Not filled by any method of this class; kept for callers that
        # store list_users_ratings / list_items_ratings results here.
        self.user_ratings = []
        self.item_ratings = []

        # (raw user id, raw item id, rating) triples in file order
        self.raw_ratings = []
        # Object exposing `.rating_scale`; must be assigned by the caller
        # before construct_trainset() is used.
        self.reader = None

    def __get_line_format_indices(self, line_format):
        """Return the column positions of user, item and rating.

        Args:
            line_format: space-separated column names, e.g. 'user item rating'.

        Returns:
            (user_idx, item_idx, rating_idx)

        Raises:
            Exception: on an unknown column name or when one of the three
                required columns is missing.
        """
        positions = {'user': -1, 'item': -1, 'rating': -1}
        for col, field in enumerate(line_format.split(' ')):
            if field not in positions:
                raise Exception(
                    'line_format must be a space-separated combination of '
                    'user, item and rating; got unknown field %r' % field)
            positions[field] = col

        missing = [name for name, col in positions.items() if col < 0]
        if missing:
            # Without this check a missing field would silently read column -1.
            raise Exception('line_format is missing field(s): %s' % ', '.join(missing))

        return positions['user'], positions['item'], positions['rating']

    def read_from_file(self, data_fn, skip_lines=0, sep=',', line_format='user item rating'):
        """Read the rating file, build the id mappings and the rating matrix.

        Args:
            data_fn: path of the text file to parse.
            skip_lines: number of leading lines (e.g. a header) to ignore.
            sep: column separator used in the file.
            line_format: order of the columns, see __get_line_format_indices.
        """
        user_fmt_idx, item_fmt_idx, rating_fmt_idx = self.__get_line_format_indices(line_format)

        users_lin = []
        items_lin = []
        ratings_lin = []
        self.raw_ratings = []

        # The context manager closes the file even if parsing fails.
        with open(data_fn, 'r') as file:
            for _ in range(skip_lines):
                file.readline()

            for line in file:
                # Drop only the line terminator (never `sep`), so the last
                # column does not carry a trailing newline into the ids.
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue  # tolerate blank lines
                lsp = line.split(sep)
                user_id = lsp[user_fmt_idx]
                item_id = lsp[item_fmt_idx]
                rating = float(lsp[rating_fmt_idx])

                users_lin.append(user_id)
                items_lin.append(item_id)
                ratings_lin.append(rating)
                self.raw_ratings.append((user_id, item_id, rating))

        self._data_file_path = data_fn

        # dict.fromkeys de-duplicates while keeping first-seen order.
        self.users = list(dict.fromkeys(users_lin))
        self.items = list(dict.fromkeys(items_lin))
        self.user_n = len(self.users)
        self.item_n = len(self.items)

        # Raw ids are strings, inner ids are integers. The maps are rebuilt
        # from scratch so a second read leaves no stale entries.
        self.user_to_iid = {raw: iid for iid, raw in enumerate(self.users)}
        self.user_to_ID = dict(enumerate(self.users))
        self.item_to_iid = {raw: iid for iid, raw in enumerate(self.items)}
        self.item_to_ID = dict(enumerate(self.items))

        # Fill the rating matrix; a repeated (user, item) pair keeps the last rating.
        self.rating_mat = sparse.lil_matrix((self.user_n, self.item_n))
        for raw_user, raw_item, rating in zip(users_lin, items_lin, ratings_lin):
            self.rating_mat[self.user_to_iid[raw_user], self.item_to_iid[raw_item]] = rating

    def list_users_ratings(self, rating_matrix):
        """Return, per user inner id, the list of (item_iid, rating) pairs
        that are non-zero in `rating_matrix`."""
        user_ratings = []
        for user_iid in range(self.user_n):
            user_ratings.append([])
            user_nonze = np.nonzero(rating_matrix[user_iid])
            for item_iid in user_nonze[1]:
                user_ratings[-1].append((item_iid, rating_matrix[user_iid, item_iid]))
                if rating_matrix[user_iid, item_iid] == 0:
                    raise Exception('Found zero rating in nonzero ratings of user with inner id %d and item iid %d!' % (user_iid, item_iid))
        return user_ratings

    def list_items_ratings(self, rating_matrix):
        """Return, per item inner id, the list of (user_iid, rating) pairs
        that are non-zero in `rating_matrix`."""
        # Transpose once instead of once per item.
        by_item = rating_matrix.T
        item_ratings = []
        for item_iid in range(self.item_n):
            item_ratings.append([])
            item_nonze = np.nonzero(by_item[item_iid])
            for user_iid in item_nonze[1]:
                item_ratings[-1].append((user_iid, rating_matrix[user_iid, item_iid]))
                if rating_matrix[user_iid, item_iid] == 0:
                    raise Exception('Found zero rating in nonzero ratings of user with inner id %d and item iid %d!' % (user_iid, item_iid))
        return item_ratings

    def train_test_split(self, test_percent=0.2, least_userlen_test=10):
        """Hold out a random share of each sufficiently active user's ratings.

        Users with fewer than `least_userlen_test` ratings stay entirely in
        the training matrix. Shuffling uses the global `random` module, so
        seed it beforehand for a reproducible split.

        Args:
            test_percent: fraction (0..1) of a user's ratings to hold out.
            least_userlen_test: minimum number of ratings a user needs to
                contribute to the test set.

        Returns:
            (mat, user_tests): the training lil_matrix and a dict mapping
            user inner id -> list of held-out (item_iid, rating) pairs.

        Raises:
            Exception: if test_percent is outside [0, 1].
        """
        if test_percent > 1 or test_percent < 0:
            raise Exception('test_percent should be between 0 and 1.')

        user_ratings = self.list_users_ratings(self.rating_mat)

        mat = sparse.lil_matrix((self.user_n, self.item_n))
        user_tests = {}
        n_users_in_test = 0
        n_ratings_in_test = 0
        n_ratings_in_train = 0

        for user_iid in range(self.user_n):
            len_u = len(user_ratings[user_iid])
            if len_u >= least_userlen_test:
                n_users_in_test += 1
                test_len = int(len_u * test_percent)
                shuffled = list(range(len_u))
                random.shuffle(shuffled)

                # first `test_len` shuffled positions are held out, the rest train
                train_set_u = shuffled[test_len:]
                test_set_u = shuffled[:test_len]

                for ir_idx in train_set_u:
                    # ir = the pair of (item, rating)
                    ir = user_ratings[user_iid][ir_idx]
                    mat[user_iid, ir[0]] = ir[1]
                    n_ratings_in_train += 1

                user_tests[user_iid] = []
                for ir_idx in test_set_u:
                    ir = user_ratings[user_iid][ir_idx]
                    user_tests[user_iid].append(ir)
                    n_ratings_in_test += 1

            else:  # too few ratings: everything goes to the training matrix
                for ir in user_ratings[user_iid]:
                    mat[user_iid, ir[0]] = ir[1]
                    n_ratings_in_train += 1

        print('\nNumber of users with some items in testset: %d' % n_users_in_test)
        print('Number of ratings in trainset: %d \t Number of ratings in testset: %d\n' % (n_ratings_in_train, n_ratings_in_test))
        return mat, user_tests

    def construct_trainset(self, raw_trainset):
        """Build a Surprise `Trainset` from raw rating tuples.

        Args:
            raw_trainset: sequence of (raw user id, raw item id, rating) or
                (raw user id, raw item id, rating, timestamp) tuples; any
                element after the rating is ignored.

        Raises:
            AttributeError: if `self.reader` has not been assigned.
        """
        if self.reader is None:
            raise AttributeError(
                'RatingDataset.reader must be set (an object with a '
                'rating_scale attribute) before calling construct_trainset().')

        raw2inner_id_users = {}
        raw2inner_id_items = {}

        ur = defaultdict(list)
        ir = defaultdict(list)

        for urid, irid, r, *_ in raw_trainset:
            # setdefault hands out the next free inner id on first sight
            uid = raw2inner_id_users.setdefault(urid, len(raw2inner_id_users))
            iid = raw2inner_id_items.setdefault(irid, len(raw2inner_id_items))

            ur[uid].append((iid, r))
            ir[iid].append((uid, r))

        n_users = len(ur)  # number of users
        n_items = len(ir)  # number of items
        n_ratings = len(raw_trainset)

        trainset = Trainset(
            ur,
            ir,
            n_users,
            n_items,
            n_ratings,
            self.reader.rating_scale,
            raw2inner_id_users,
            raw2inner_id_items,
        )

        return trainset

    def construct_testset(self, raw_testset):
        """Return (raw user id, raw item id, rating) triples, dropping any
        trailing fields (such as a timestamp) from each raw tuple."""
        return [(ruid, riid, r_ui_trans) for (ruid, riid, r_ui_trans, *_) in raw_testset]


In [3]:
# Load the three semicolon-separated MovieLens-100k tables (ratings, user
# metadata, genres); every file has its header on the first row.
df, user, genre = (
    pd.read_csv(csv_path, sep=";", header=0, engine="python")
    for csv_path in ('ml-100k/udata.csv', 'ml-100k/uuser.csv', 'ml-100k/ugenre.csv')
)

In [4]:
# Preview the first rows of the raw ratings frame (columns: user, item, rating).
df.head()

Unnamed: 0,user,item,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [5]:
# Activity threshold for the n-core filter: items and users must have
# strictly more than `n_core` ratings to be kept.
n_core = 45
# Boolean mask per item id; derived from n_core rather than a repeated literal
# so changing the threshold in one place is enough.
filter_items = df['item'].value_counts() > n_core

In [6]:
# Iterative n-core filtering: repeatedly drop items and users that have at
# most `n_core` ratings until a pass removes nothing. Dropping users can push
# items below the threshold (and vice versa), hence the loop.

min_ratings = n_core
min_user_ratings = n_core

init_df = df
init_shp = df.shape[0]
filt_shp = 0.0

while True:

    # ids whose rating count in the current frame is strictly above the threshold
    filter_items = (
        init_df['item'].value_counts()
        .loc[lambda counts: counts > min_ratings]
        .index.tolist()
    )
    filter_users = (
        init_df['user'].value_counts()
        .loc[lambda counts: counts > min_user_ratings]
        .index.tolist()
    )

    filt_df = init_df[init_df['item'].isin(filter_items) & init_df['user'].isin(filter_users)]

    print('The original data frame shape:\t{}'.format(init_df.shape))
    print('The new data frame shape:\t{}'.format(filt_df.shape))
    print()

    init_shp, filt_shp = init_df.shape[0], filt_df.shape[0]

    # fixed point reached: this pass removed no rows
    if init_shp == filt_shp:
        break

    init_df = filt_df


#------------------------------------------------------------
# From here on `df` refers to the filtered frame.
df = filt_df
print(filt_df.shape)
print(df.head())

print()
print('#users: ', np.unique(df['user']).shape)
print('#items: ', np.unique(df['item']).shape)

The original data frame shape:	(100000, 3)
The new data frame shape:	(76420, 3)

The original data frame shape:	(76420, 3)
The new data frame shape:	(73656, 3)

The original data frame shape:	(73656, 3)
The new data frame shape:	(73003, 3)

The original data frame shape:	(73003, 3)
The new data frame shape:	(72690, 3)

The original data frame shape:	(72690, 3)
The new data frame shape:	(72600, 3)

The original data frame shape:	(72600, 3)
The new data frame shape:	(72600, 3)

(72600, 3)
   user  item  rating
1   186   302       3
3   244    51       2
5   298   474       4
6   115   265       2
7   253   465       5

#users:  (551,)
#items:  (593,)


In [7]:
# Persist the filtered ratings as a comma-separated file (header row, no index
# column); the file name embeds the n_core threshold.
df.to_csv('./filtered_ml_%icore.csv'%n_core, sep=',', index=False)

In [8]:
# data = Dataset.load_from_df(df[['user', 'item', 'rating']], reader)

# Parse the filtered CSV into the custom RatingDataset container;
# skip_lines=1 skips the header row of the CSV.
dataset = RatingDataset()
data_fn = './filtered_ml_%icore.csv'%n_core
dataset.read_from_file(data_fn, skip_lines=1, line_format='user item rating', sep=',')


print('# users', dataset.user_n)
print('# items', dataset.item_n)


# train_mat: sparse user x item training matrix.
# test_mat: despite the name, a dict {user inner id: [(item inner id, rating), ...]}
# holding the held-out ratings of each user.
train_mat, test_mat = dataset.train_test_split(test_percent=0.2, least_userlen_test=10)


# Per-user (item, rating) pairs of the FULL matrix.
# NOTE(review): this value is overwritten by the next cell, which recomputes it
# from train_mat — confirm whether this call is still needed.
user_ratings = dataset.list_users_ratings(dataset.rating_mat)
print(dataset.rating_mat.shape)


# users 551
# items 593

Number of users with some items in testset: 551
Number of ratings in trainset: 58307 	 Number of ratings in testset: 14293

(551, 593)


In [9]:
# Flatten the training matrix into a long (user, item, rating) frame.
# All ids here are inner ids of `dataset`.
user_ratings = dataset.list_users_ratings(train_mat)

tr_lst = []

for user_iid, item_rating_pairs in enumerate(user_ratings):
    # users without any training rating contribute no rows
    if not item_rating_pairs:
        continue

    base_rec = pd.DataFrame(item_rating_pairs)   # columns: 0 = item, 1 = rating
    base_rec.insert(0, 2, user_iid)              # prepend column 2 = user -> [2, 0, 1]
    tr_lst.append(base_rec)


train_df = pd.concat(tr_lst, ignore_index=True)
train_df.columns = ['user','item','rating']
train_df.head()

Unnamed: 0,user,item,rating
0,0,1,2.0
1,0,6,3.0
2,0,7,4.0
3,0,9,5.0
4,0,11,4.0


In [10]:
# Flatten the held-out ratings (dict: user inner id -> [(item, rating), ...])
# into a long (user, item, rating) frame.
test_lst = []

for uiid, item_rating_pairs in test_mat.items():
    # train_test_split can return an empty list for a user (when the test
    # share rounds down to zero); an empty frame has no columns 0/1 to
    # select, so skip such users exactly as the training cell does.
    if not item_rating_pairs:
        continue

    base_rec = pd.DataFrame(item_rating_pairs)   # columns: 0 = item, 1 = rating
    base_rec[2] = uiid

    test_lst.append(base_rec[[2,0,1]])


test_df = pd.concat(test_lst, ignore_index=True)
test_df.columns = ['user','item','rating']
test_df.head()

Unnamed: 0,user,item,rating
0,0,4,4.0
1,0,379,3.0
2,0,93,4.0
3,0,159,3.0
4,0,515,2.0


In [17]:
# tstart = datetime.now()

# Ratings live on the 1..5 scale.
reader = Reader(rating_scale=(1, 5))
# oracle_SVD = SVD()

# rmse_lst = []

# Wrap the training frame for Surprise. This is the object later passed to
# GridSearchCV.fit; the call that would turn it into a full trainset is left
# commented out below.
# trainset_all = Dataset.load_from_df(train_df[['user', 'item', 'rating']], reader).build_full_trainset()
trainset_all = Dataset.load_from_df(train_df[['user','item','rating']], reader)

# Wrap the held-out frame the same way.
# NOTE(review): testset_all is not referenced by any later cell shown here.
# testset_all = Dataset.load_from_df(test_df[['user', 'item', 'rating']], reader).build_full_trainset().build_testset()
testset_all = Dataset.load_from_df(test_df[['user', 'item', 'rating']], reader)

# Disabled SVD "oracle" baseline, kept for reference:
# fit
# oracle_SVD.fit(trainset_all)

# predictions
# fin_preds = oracle_SVD.test(testset_all)

# get the RMSE
# fin_acc = accuracy.rmse(fin_preds, verbose=False)

# print
# print(fin_acc)

# tend = datetime.now()
# print("\n in ms : \n")
# print(tend-tstart)

In [12]:
import math
# ground_truth: list of items ordered by time
def nDCG_Time(ground_truth, _recList):
    """Position-graded nDCG of a recommendation list.

    The relevance of a ground-truth item is its reversed position in
    `ground_truth`: the first element gets grade len(ground_truth), the last
    gets grade 1. Gains are 2**grade - 1, discounted by ln(position + 1)
    with positions starting at 1.

    Args:
        ground_truth: ordered sequence of relevant (hashable) item ids.
        _recList: ordered sequence of recommended item ids.

    Returns:
        float in [0, 1]; 0.0 when either sequence is empty (the ideal DCG is
        then zero and the ratio would be undefined).
    """
    rec_num = len(_recList)  # topK
    n_relevant = len(ground_truth)

    idealDCG = 0.0
    for j in range(min(rec_num, n_relevant)):
        idealDCG += ((math.pow(2.0, n_relevant - j) - 1) / math.log(2.0 + j))

    if idealDCG == 0.0:
        return 0.0

    # First position of every ground-truth item, built once so each lookup is
    # O(1); setdefault keeps the first occurrence like list.index would.
    first_pos = {}
    for pos, gt_item in enumerate(ground_truth):
        first_pos.setdefault(gt_item, pos)

    recDCG = 0.0
    for j in range(rec_num):
        pos = first_pos.get(_recList[j])
        if pos is not None:
            rank = n_relevant - pos
            recDCG += ((math.pow(2.0, rank) - 1) / math.log(1.0 + j + 1))

    return (recDCG / idealDCG)


def Recall(_test_set, _recList):
    """Share of the test items that appear in the recommendation list.

    Returns 0.0 for an empty test set instead of dividing by zero.
    """
    if len(_test_set) == 0:
        return 0.0
    hit = len(set(_recList).intersection(set(_test_set)))
    return hit / float(len(_test_set))


def Precision(_test_set, _recList):
    """Share of the recommended items that appear in the test set.

    Returns 0.0 for an empty recommendation list instead of dividing by zero.
    """
    if len(_recList) == 0:
        return 0.0
    hit = len(set(_recList).intersection(set(_test_set)))
    return hit / float(len(_recList))

In [13]:
def recNMF_2(user_iid, _est, mat, topk):
    """Return the `topk` highest-estimated items for one user.

    Items the user has already rated in `mat` have their estimate forced to 0
    so they sink to the bottom of the ranking.

    Args:
        user_iid: row index of the user.
        _est: 2-D array of estimated ratings (users x items); not modified.
        mat: 2-D (sparse) matrix of known ratings with the same layout.
        topk: number of items to return.

    Returns:
        (user_iid, top_items, top_ratings) where top_items are column
        indices ordered by decreasing estimate and top_ratings are the
        matching estimates.
    """
    rated_before = np.nonzero(mat[user_iid, :])[1]

    # Work on a copy: indexing a row returns a view, and zeroing it in place
    # would silently overwrite the caller's estimate matrix.
    estimations = np.array(_est[user_iid])
    estimations[rated_before] = 0

    top_items = np.argsort(-estimations)[:topk]
    # Read the scores through the same indices instead of sorting a second time.
    top_ratings = estimations[top_items]

    return (user_iid, top_items, top_ratings)

In [45]:
# Baseline recommender: factorise the training matrix with NMF and rank the
# items each user has not rated by reconstructed score.
start = timeit.default_timer()
from sklearn.decomposition import NMF
feature_n = 40

mf = NMF(
    n_components=feature_n,
    init='random',
    random_state=2,
    tol=0.01,
    solver='cd',
    max_iter=1000,
    alpha=1,
    beta_loss='frobenius',
    l1_ratio=0,
)

# train_mat is approximated by user_f (users x factors) times H (factors x items)
user_f = mf.fit_transform(train_mat)
H = mf.components_
item_f = H.T

stop = timeit.default_timer()
print('Process Time: %.2f secs' % (stop - start))

# Restart the clock; the next reading of `start` happens in the evaluation cell.
start = timeit.default_timer()
est = np.dot(user_f, item_f.T)

# Keep a long candidate list (200 items) per user; the evaluation only looks at
# the first 10 of them.
res = [recNMF_2(u, est, train_mat, 200) for u in range(dataset.user_n)]

# user inner id -> ranked array of recommended item inner ids
user_recs_allinclude = {rec_user: rec_items for rec_user, rec_items, _ in res}



Process Time: 0.19 secs


In [46]:
# Turn the per-user recommendation tuples into one long frame whose columns
# are 2 = user, 0 = item, 1 = estimated score (in that order).
u_rec_list = []
for rec_entry in res:
    base_rec = pd.DataFrame(rec_entry[1:]).T   # one row per recommended item
    base_rec.insert(0, 2, rec_entry[0])        # prepend the user id column
    u_rec_list.append(base_rec)

u_rec_df = pd.concat(u_rec_list, ignore_index=True)

# u_rec_df.to_csv('./ml_results/nmf_base_rec_ML_.csv', index=False, header=None)

# u_rec_df.to_csv('./ml_results/nmf_base_rec_ML_.csv', index=False, header=None)

In [47]:
stop = timeit.default_timer()
print('Process Time: %.2f secs' % (stop - start))

# Per-user precision / recall / nDCG of the top-10 list, averaged over users
# that have at least one held-out item rated 4 or higher.
p, r, n = [], [], []

for u, held_out in test_mat.items():
    # only held-out items with rating >= 4 count as relevant
    test_items = [t[0] for t in held_out if t[1] >= 4]
    if not test_items:
        continue

    # the list size is fixed to 10 to stay comparable with the other algorithms
    top_items = user_recs_allinclude[u][:10]

    recall = Recall(test_items, top_items)
    precision = Precision(test_items, top_items)
    ndcg = nDCG_Time(test_items, top_items)

    p.append(precision)
    r.append(recall)
    n.append(ndcg)

print (" avg-precision %.3f\n avg-recall %.3f\n avg-nDCG %.3f" %
       (np.average(p),np.average(r),np.average(n)))

Process Time: 3.33 secs
 avg-precision 0.321
 avg-recall 0.243
 avg-nDCG 0.176


## Grid Search

### Active Learner Class (Surprise version)

In [23]:
class ActiveLearner(AlgoBase):
    """Prediction-based active learner built on NMF (Surprise algorithm).

    `fit` starts from a few randomly chosen ratings per user, then for
    `epochs` rounds asks every user for `query_n` items picked by `strategy`,
    adds the ratings that exist in the trainset to the known matrix and
    refits the factorisation.

    All matrices are sized by the trainset passed to `fit` (its inner ids),
    not by any global dataset object, so the class works on cross-validation
    folds.
    """

    def __init__(self, feature_n=40, random_state=0, max_iter=500,
                 strategy='MaxRating', initial_n=5, epochs=10, query_n=10):
        '''
        Args:
            feature_n: number of latent factors of the NMF.
            random_state: seed passed to the NMF.
            max_iter: maximum number of NMF iterations per refit.
            strategy: 'MaxRating' (query highest estimates), 'MinRating'
                (query lowest estimates); any other value queries half low,
                half high.
            initial_n: number of ratings per user revealed before the first fit.
            epochs: number of query / refit rounds.
            query_n: number of items asked per user and round.
        '''

        AlgoBase.__init__(self)

        self.feature_n = feature_n
        self.random_state = random_state
        self.max_iter = max_iter
        self.decomposer = NMF(n_components=self.feature_n, init='random', random_state=self.random_state, max_iter=self.max_iter, verbose=False)
        # self.decomposer = truncatedSVD(n_components=self.feature_n, algorithm='randomized', random_state=self.random_state, n_iter=self.max_iter)

        self.strategy = strategy
        self.initial_n = initial_n
        self.epochs = epochs
        self.query_n = query_n

        self.user_f = None
        self.item_f = None
        self.est = None
        self.known_mat = None

        # user inner id -> items that were queried but have no rating in the
        # trainset (kind of cheating?). Populated per trainset in fit().
        self.unavl = {}

    def fit(self, trainset):
        '''
        Run the active-learning loop on `trainset` and return self.

        The initial sample is drawn with the global `random` module, so seed
        it for reproducible runs.
        '''

        AlgoBase.fit(self, trainset)

        n_users = trainset.n_users
        # inner user id -> list of (inner item id, rating). It is a
        # defaultdict, so it must only be indexed with users that exist.
        candidate_user_ratings = trainset.ur
        self.unavl = {user_iid: [] for user_iid in range(n_users)}

        # initial fit: reveal up to `initial_n` random ratings per user
        initial_user_ratings = {}
        for user_iid in range(n_users):
            candidates = candidate_user_ratings[user_iid]
            order = list(range(len(candidates)))
            random.shuffle(order)
            initial_user_ratings[user_iid] = [candidates[idx] for idx in order[:self.initial_n]]

        initial_train_df = self._rating_dic_to_df(initial_user_ratings)
        self.known_mat = self._convert_df_to_mat(initial_train_df)
        self.train()

        # active learning process
        for epoch in range(self.epochs):
            for user_iid in range(n_users):

                # get the items to be queried
                query_item_lst = self.query(user_iid, self.query_n)[0]
                # explicit columns keep the frame usable when the list is empty
                candidate_df_u = pd.DataFrame(candidate_user_ratings[user_iid], columns=[0, 1])
                query_df = candidate_df_u.loc[candidate_df_u[0].isin(query_item_lst)].copy()
                query_df[2] = user_iid
                query_df = query_df[[2, 0, 1]]
                query_df.columns = [0, 1, 2]
                # add to known rating matrix
                self.add_query(query_df)

                # remember the queried items the trainset has no rating for
                answered = set(query_df[1])
                self.unavl[user_iid] += [i for i in query_item_lst if i not in answered]

            # refit once per epoch, after all users were queried
            self.train()

        return self

    def _rating_dic_to_df(self, rating_dic):
        '''
        Flatten {user inner id: [(item, rating), ...]} into a DataFrame with
        columns user, item, rating (users in ascending id order).
        '''
        lst = []

        for user_iid in sorted(rating_dic):
            if rating_dic[user_iid]:
                res = pd.DataFrame(rating_dic[user_iid])
                res[2] = user_iid

                lst.append(res[[2,0,1]])

        if not lst:
            # pd.concat refuses an empty list
            return pd.DataFrame(columns=['user', 'item', 'rating'])

        df = pd.concat(lst, ignore_index=True)
        df.columns = ['user', 'item', 'rating']
        return df

    def _convert_df_to_mat(self, df):
        '''
        Convert DataFrame to sparse matrix.

        Arg:
            df: DataFrame whose first three columns are user inner id,
                item inner id and rating.

        Return:
            mat: lil_matrix of shape (trainset.n_users, trainset.n_items)
        '''

        mat = sparse.lil_matrix((self.trainset.n_users, self.trainset.n_items))
        # itertuples is positional, independent of the column labels
        for user_iid, item_iid, rating in df.iloc[:, :3].itertuples(index=False, name=None):
            mat[int(user_iid), int(item_iid)] = rating

        return mat

    def add_query(self, query_df):
        '''
        Add queried data to known matrix.

        Arg:
            query_df: DataFrame whose first three columns are user inner id,
                item inner id and rating.
        '''

        for user_iid, item_iid, rating in query_df.iloc[:, :3].itertuples(index=False, name=None):
            self.known_mat[int(user_iid), int(item_iid)] = rating

    def train(self):
        '''
        Fit self.decomposer to known data and refresh the estimate matrix.
        '''

        self.user_f = self.decomposer.fit_transform(self.known_mat)
        self.item_f = self.decomposer.components_.T

        self.est = np.dot(self.user_f, self.item_f.T)

    def estimate(self, user_iid, item_iid):
        '''
        Estimate rating of user and item pair.

        Args:
            user_iid: inner id of the user to be predicted
            item_iid: inner id of the item to be predicted

        Return:
            float

        Raises:
            PredictionImpossible: if the user or the item is not part of the
                fitted trainset.
        '''

        if not (self.trainset.knows_user(user_iid) and self.trainset.knows_item(item_iid)):
            raise PredictionImpossible('User and/or item is unknown.')

        return self.est[user_iid, item_iid]

    def query(self, user_iid, topk):
        '''
        Find the indices of candidates which should be queried for true ratings.

        Items already in the known matrix and items remembered as unavailable
        are never returned, so fewer than `topk` items may come back. The
        estimate matrix itself is left untouched.

        Args:
            user_iid: int, inner id of the user to be queried
            topk:     int, number of queried candidates

        Return:
            (top_items, top_ratings): (numpy int array, numpy float array)
        '''

        scores = self.est[user_iid]

        excluded = np.zeros(scores.shape[0], dtype=bool)
        excluded[np.nonzero(self.known_mat[user_iid, :])[1]] = True
        excluded[np.asarray(self.unavl.get(user_iid, []), dtype=int)] = True

        # Rank only the remaining candidates. Overwriting excluded scores
        # with 0 would put them FIRST in an ascending ranking.
        candidates = np.flatnonzero(~excluded)
        cand_scores = scores[candidates]
        lowest_first = candidates[np.argsort(cand_scores)]
        highest_first = candidates[np.argsort(-cand_scores)]

        if self.strategy == 'MaxRating':
            top_items = highest_first[:topk]

        elif self.strategy == 'MinRating':
            top_items = lowest_first[:topk]

        else:
            # mixed: topk - k lowest estimates followed by the k highest
            k = int(topk/2)
            top_items = np.concatenate((lowest_first[:topk-k], highest_first[:k]))

        top_ratings = scores[top_items]

        return (top_items, top_ratings)

In [24]:
# Hyper-parameter grid for ActiveLearner: only feature_n and max_iter vary
# (2 x 2 combinations); every other parameter has a single value.
param_grid = {'feature_n': [40,50],
    'random_state': [0],
    'max_iter': [10,100],
    'strategy': ['MaxRating'],
    'initial_n': [5],
    'epochs': [10],
    'query_n': [10]}

# Grid search scored by RMSE.
gs = GridSearchCV(ActiveLearner, param_grid, measures=['rmse'])

In [25]:
# Run the grid search on the Surprise dataset built from train_df.
gs.fit(trainset_all)



In [26]:

# best RMSE score
# NOTE(review): the value recorded below this cell (about 2.77) is large for a
# 1-5 rating scale — worth confirming how the estimates are produced.
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

2.769972996263446
{'feature_n': 40, 'random_state': 0, 'max_iter': 100, 'strategy': 'MaxRating', 'initial_n': 5, 'epochs': 10, 'query_n': 10}


## Binary Prediction

In [None]:
class BinaryPredicitonLearner(AlgoBase):
    """Binary prediction-based active learner built on NMF (Surprise algorithm).

    `fit` starts from a few randomly chosen ratings per user, then for
    `epochs` rounds asks every user for the `query_n` items with the highest
    current estimate, adds the ratings that exist in the trainset to the
    known matrix and refits the factorisation.

    All matrices are sized by the trainset passed to `fit` (its inner ids),
    not by any global dataset object, so the class works on cross-validation
    folds.
    """

    def __init__(self, feature_n=40, random_state=0, max_iter=500,
                 initial_n=5, epochs=10, query_n=10):
        '''
        Args:
            feature_n: number of latent factors of the NMF.
            random_state: seed passed to the NMF.
            max_iter: maximum number of NMF iterations per refit.
            initial_n: number of ratings per user revealed before the first fit.
            epochs: number of query / refit rounds.
            query_n: number of items asked per user and round.
        '''

        AlgoBase.__init__(self)

        self.feature_n = feature_n
        self.random_state = random_state
        self.max_iter = max_iter
        self.decomposer = NMF(n_components=self.feature_n, init='random', random_state=self.random_state, max_iter=self.max_iter, verbose=False)
        # self.decomposer = truncatedSVD(n_components=self.feature_n, algorithm='randomized', random_state=self.random_state, n_iter=self.max_iter)

        self.initial_n = initial_n
        self.epochs = epochs
        self.query_n = query_n

        self.user_f = None
        self.item_f = None
        self.est = None
        self.known_mat = None

        # user inner id -> items that were queried but have no rating in the
        # trainset (kind of cheating?). Populated per trainset in fit().
        self.unavl = {}

    def fit(self, trainset):
        '''
        Run the active-learning loop on `trainset` and return self.

        The initial sample is drawn with the global `random` module, so seed
        it for reproducible runs.
        '''

        AlgoBase.fit(self, trainset)

        n_users = trainset.n_users
        # inner user id -> list of (inner item id, rating). It is a
        # defaultdict, so it must only be indexed with users that exist.
        candidate_user_ratings = trainset.ur
        self.unavl = {user_iid: [] for user_iid in range(n_users)}

        # initial fit: reveal up to `initial_n` random ratings per user
        initial_user_ratings = {}
        for user_iid in range(n_users):
            candidates = candidate_user_ratings[user_iid]
            order = list(range(len(candidates)))
            random.shuffle(order)
            initial_user_ratings[user_iid] = [candidates[idx] for idx in order[:self.initial_n]]

        initial_train_df = self._rating_dic_to_df(initial_user_ratings)
        self.known_mat = self._convert_df_to_mat(initial_train_df)
        self.train()

        # active learning process
        for epoch in range(self.epochs):
            for user_iid in range(n_users):

                # get the items to be queried
                query_item_lst = self.query(user_iid, self.query_n)[0]
                # explicit columns keep the frame usable when the list is empty
                candidate_df_u = pd.DataFrame(candidate_user_ratings[user_iid], columns=[0, 1])
                query_df = candidate_df_u.loc[candidate_df_u[0].isin(query_item_lst)].copy()
                query_df[2] = user_iid
                query_df = query_df[[2, 0, 1]]
                query_df.columns = [0, 1, 2]
                # add to known rating matrix
                self.add_query(query_df)

                # remember the queried items the trainset has no rating for
                answered = set(query_df[1])
                self.unavl[user_iid] += [i for i in query_item_lst if i not in answered]

            # refit once per epoch, after all users were queried
            self.train()

        return self

    def _rating_dic_to_df(self, rating_dic):
        '''
        Flatten {user inner id: [(item, rating), ...]} into a DataFrame with
        columns user, item, rating (users in ascending id order).
        '''
        lst = []

        for user_iid in sorted(rating_dic):
            if rating_dic[user_iid]:
                res = pd.DataFrame(rating_dic[user_iid])
                res[2] = user_iid

                lst.append(res[[2,0,1]])

        if not lst:
            # pd.concat refuses an empty list
            return pd.DataFrame(columns=['user', 'item', 'rating'])

        df = pd.concat(lst, ignore_index=True)
        df.columns = ['user', 'item', 'rating']
        return df

    def _convert_df_to_mat(self, df):
        '''
        Convert DataFrame to sparse matrix.

        Arg:
            df: DataFrame whose first three columns are user inner id,
                item inner id and rating.

        Return:
            mat: lil_matrix of shape (trainset.n_users, trainset.n_items)
        '''

        mat = sparse.lil_matrix((self.trainset.n_users, self.trainset.n_items))
        # itertuples is positional, independent of the column labels
        for user_iid, item_iid, rating in df.iloc[:, :3].itertuples(index=False, name=None):
            mat[int(user_iid), int(item_iid)] = rating

        return mat

    def add_query(self, query_df):
        '''
        Add queried data to known matrix.

        Arg:
            query_df: DataFrame whose first three columns are user inner id,
                item inner id and rating.
        '''

        for user_iid, item_iid, rating in query_df.iloc[:, :3].itertuples(index=False, name=None):
            self.known_mat[int(user_iid), int(item_iid)] = rating

    def train(self):
        '''
        Fit self.decomposer to known data and refresh the estimate matrix.
        '''

        self.user_f = self.decomposer.fit_transform(self.known_mat)
        self.item_f = self.decomposer.components_.T

        self.est = np.dot(self.user_f, self.item_f.T)

    def estimate(self, user_iid, item_iid):
        '''
        Estimate rating of user and item pair.

        Args:
            user_iid: inner id of the user to be predicted
            item_iid: inner id of the item to be predicted

        Return:
            float

        Raises:
            PredictionImpossible: if the user or the item is not part of the
                fitted trainset.
        '''

        if not (self.trainset.knows_user(user_iid) and self.trainset.knows_item(item_iid)):
            raise PredictionImpossible('User and/or item is unknown.')

        return self.est[user_iid, item_iid]

    def query(self, user_iid, topk):
        '''
        Find the indices of candidates which should be queried for true ratings.

        Items already in the known matrix and items remembered as unavailable
        are never returned, so fewer than `topk` items may come back. The
        estimate matrix itself is left untouched.

        Args:
            user_iid: int, inner id of the user to be queried
            topk:     int, number of queried candidates

        Return:
            (top_items, top_ratings): (numpy int array, numpy float array)
        '''

        scores = self.est[user_iid]

        excluded = np.zeros(scores.shape[0], dtype=bool)
        excluded[np.nonzero(self.known_mat[user_iid, :])[1]] = True
        excluded[np.asarray(self.unavl.get(user_iid, []), dtype=int)] = True

        # rank only the remaining candidates, highest estimate first
        candidates = np.flatnonzero(~excluded)
        top_items = candidates[np.argsort(-scores[candidates])][:topk]
        top_ratings = scores[top_items]

        return (top_items, top_ratings)