## Create function: Given user list and top n, generate "recommendations" (Pandas DataFrame)

Assumptions:

1. Row indices of the utility matrix corresponds to the user_id

2. Column indices of the utility matrix corresponds to the item_id

3. id starts from 0

Notes:

- I don't seem to need the item ids

In [1]:
import numpy as np
import pandas as pd

def random_user_list(n_user=100, sample_size=10, random_seed=1):
    
    """Generates random user-to-cluster assignment.
    
            Parameters:
                    n_user (int): Number of users
                    sample_size (int): Number of user to sample

            Returns:
                    user_list (numpy.ndarray): List of users to recommend to
    """
    
    np.random.seed(random_seed)
    user_list = np.random.choice(range(n_user), size=sample_size, replace=False)

    return user_list

def random_user_cluster(n_user=100, n_user_cluster=5, random_seed=1):
    
    """Generates random user-to-cluster assignment.
    
            Parameters:
                    n_user (int): Number of users
                    n_user_cluster (int): Number of user clusters

            Returns:
                    uc_assignment (numpy.ndarray): List of cluster assignments
    """
    
    np.random.seed(random_seed)
    uc_assignment = np.random.randint(low=0, high=n_user_cluster, size=n_user)
    
    return uc_assignment

def random_utility_matrix(n_user_cluster=5, n_item_cluster=5, random_seed=1):

    """Generates a random imputed utility matrix.
    
            Parameters:
                    n_user (int): Number of users
                    n_item (int): Number of users
                    n_user_cluster (int): Number of user clusters
                    n_item_cluster (int): Number of item clusters
                    random_seed (int): Random seed

            Returns:
                    utility_matrix_o (numpy.ndarray): A random utility matrix before imputation
                    utility_matrix (numpy.ndarray): A random utility matrix after imputation            
    """

    user_cluster_list = list(range(n_user_cluster))
    item_cluster_list = list(range(n_item_cluster))
    
    # Generate random utility matrix
    np.random.seed(random_seed)
    utility_matrix = np.eye(N=len(user_cluster_list), M=len(item_cluster_list))
    np.random.shuffle(utility_matrix)
    
    utility_matrix_o = utility_matrix.copy() # Assume that 1 indicates that it has been rated, everything else is imputed
    
    utility_matrix += np.random.beta(a=1, b=1, size=(len(user_cluster_list), len(item_cluster_list))).round(4)
    utility_matrix[utility_matrix > 1] = 1
    
    return utility_matrix_o, utility_matrix

In [2]:
# Do I write a function for this?
n_user = 100
n_item = 50 # Unused
sample_size = 10
n_user_cluster = 5
n_item_cluster = 5
random_seed = 1

user_id_list = list(range(n_user))
user_list = random_user_list(n_user, sample_size, random_seed)
uc_assignment = random_user_cluster(n_user, n_user_cluster, random_seed)
utility_matrix_o, utility_matrix = random_utility_matrix(n_user_cluster, n_item_cluster, random_seed)

In [3]:
user_list

array([80, 84, 33, 81, 93, 17, 36, 82, 69, 65])

In [4]:
uc_assignment

array([3, 4, 0, 1, 3, 0, 0, 1, 4, 4, 1, 2, 4, 2, 4, 3, 4, 2, 4, 2, 4, 1,
       1, 0, 1, 1, 1, 1, 0, 4, 1, 0, 0, 3, 2, 1, 0, 3, 1, 1, 3, 4, 0, 1,
       3, 4, 2, 4, 0, 3, 1, 2, 0, 4, 1, 2, 2, 1, 0, 1, 3, 4, 3, 1, 3, 0,
       0, 2, 2, 1, 3, 4, 2, 0, 0, 1, 1, 3, 0, 0, 4, 2, 4, 3, 3, 0, 3, 4,
       3, 4, 4, 4, 1, 0, 4, 2, 0, 2, 4, 1])

In [5]:
uc_assignment[user_list]

array([4, 3, 3, 2, 0, 2, 0, 4, 1, 0])

In [6]:
utility_matrix_o

array([[0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [7]:
utility_matrix

array([[0.3732, 0.3739, 1.    , 0.5149, 0.1519],
       [0.0812, 1.    , 0.6708, 0.0519, 0.6888],
       [0.6676, 0.1352, 0.0962, 0.1855, 1.    ],
       [1.    , 0.1501, 0.0326, 0.5469, 0.3875],
       [0.8347, 0.9869, 0.871 , 1.    , 0.8348]])

In [8]:
# Arguments
top_n = 3

In [9]:
def get_rec(utility_matrix, utility_matrix_o, user_list, uc_assignment, top_n):
    
    """Returns the top N recommendations for each user in the user list.
    
            Parameters:
                    utility_matrix (numpy.ndarray): Matrix of utilities for each user-item pairing (assumes that indices correspond to user_cluster_id and item_cluster_id)
                    utility_matrix_o (numpy.ndarray): Original utility matrix, before imputation (i need this so i dont recommend items that have already been "consumed"/"rated")
                    user_list (array-like): List of users
                    uc_assignment (array-like): List containing the cluster assignment of each user (assumes that indices correspond to user_id)
                    top_n (int): Number of item clusters to recommend

            Returns:
                    df_rec (pandas.DataFrame): Table containing the top N recommendations for each user in the user list
                    
    """
    
    # Don't recommend items that are already rated
    utility_matrix[np.where(utility_matrix_o != 0)] = -np.inf
    
    # Get top N per user cluster
    cluster_rec = utility_matrix.argsort()[:, -top_n:]

    # Create recommendation table
    df_rec = pd.DataFrame()
    df_rec['user_id'] = user_list
    
    for i in range(top_n):
        df_rec['rank_'+str(i+1)] = np.zeros(df_rec.shape[0])
        for j in range(df_rec.shape[0]):
            df_rec.iloc[j, i+1] = cluster_rec[uc_assignment[user_list[j]], top_n-i-1]
    
    #df_rec['u_cluster'] = uc_assignment[user_list] # Comment out later!
    return df_rec

In [10]:
df_rec = get_rec(utility_matrix, utility_matrix_o, user_list, uc_assignment, top_n)
df_rec

Unnamed: 0,user_id,rank_1,rank_2,rank_3
0,80,1.0,2.0,4.0
1,84,3.0,4.0,1.0
2,33,3.0,4.0,1.0
3,81,0.0,3.0,1.0
4,93,3.0,1.0,0.0
5,17,0.0,3.0,1.0
6,36,3.0,1.0,0.0
7,82,1.0,2.0,4.0
8,69,4.0,2.0,0.0
9,65,3.0,1.0,0.0


### Unit Tests

In [12]:
import unittest

class TestGetRec(unittest.TestCase):
    
    def test_1(self):
        
        # Set-up
        n_user = 100
        sample_size = 10
        n_user_cluster = 5
        n_item_cluster = 5
        random_seed = 1
        top_n = 3

        user_id_list = list(range(n_user))
        user_list = random_user_list(n_user, sample_size, random_seed)
        uc_assignment = random_user_cluster(n_user, n_user_cluster, random_seed)
        utility_matrix_o, utility_matrix = random_utility_matrix(n_user_cluster, n_item_cluster, random_seed)

        df_rec = get_rec(utility_matrix, utility_matrix_o, user_list, uc_assignment, top_n)
        
        test_case = np.array([
            [80.,  1.,  2.,  4.],
            [84.,  3.,  4.,  1.],
            [33.,  3.,  4.,  1.],
            [81.,  0.,  3.,  1.],
            [93.,  3.,  1.,  0.],
            [17.,  0.,  3.,  1.],
            [36.,  3.,  1.,  0.],
            [82.,  1.,  2.,  4.],
            [69.,  4.,  2.,  0.],
            [65.,  3.,  1.,  0.]
        ])
        
        self.assertEqual(df_rec.to_numpy().tolist(), test_case.tolist())
        
    def test_2(self):
        
        # Set-up
        n_user = 100
        sample_size = 10
        n_user_cluster = 5
        n_item_cluster = 5
        random_seed = 2
        top_n = 3

        user_id_list = list(range(n_user))
        user_list = random_user_list(n_user, sample_size, random_seed)
        uc_assignment = random_user_cluster(n_user, n_user_cluster, random_seed)
        utility_matrix_o, utility_matrix = random_utility_matrix(n_user_cluster, n_item_cluster, random_seed)

        df_rec = get_rec(utility_matrix, utility_matrix_o, user_list, uc_assignment, top_n)
        
        test_case = np.array([
            [80.,  1.,  2.,  4.],
            [84.,  3.,  4.,  1.],
            [33.,  3.,  4.,  1.],
            [81.,  0.,  3.,  1.],
            [93.,  3.,  1.,  0.],
            [17.,  0.,  3.,  1.],
            [36.,  3.,  1.,  0.],
            [82.,  1.,  2.,  4.],
            [69.,  4.,  2.,  0.],
            [65.,  3.,  1.,  0.]
        ])
        
        self.assertEqual(df_rec.to_numpy().tolist(), test_case.tolist())
        
unittest.main(argv=[''], verbosity=2, exit=False)

test_1 (__main__.TestGetRec) ... ok
test_2 (__main__.TestGetRec) ... FAIL

FAIL: test_2 (__main__.TestGetRec)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-12-812f5dbf2343>", line 67, in test_2
    self.assertEqual(df_rec.to_numpy().tolist(), test_case.tolist())
AssertionError: Lists differ: [[83.0, 2.0, 1.0, 4.0], [30.0, 4.0, 1.0, 2.0[180 chars]4.0]] != [[80.0, 1.0, 2.0, 4.0], [84.0, 3.0, 4.0, 1.0[181 chars]0.0]]

First differing element 0:
[83.0, 2.0, 1.0, 4.0]
[80.0, 1.0, 2.0, 4.0]

Diff is 802 characters long. Set self.maxDiff to None to see it.

----------------------------------------------------------------------
Ran 2 tests in 0.022s

FAILED (failures=1)


<unittest.main.TestProgram at 0x1f4169612e0>