In [130]:
import time
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import collections


# The Matcher (for later; not important right now)

In [131]:
# from matching import Player
# 
# players = [
#     Player("jerry"),
#     Player("george"),
#     Player("kramer"),
#     Player("elaine"),
# ]
# jerry, george, elaine, kramer = players
# 
# jerry.set_prefs([george, elaine, kramer])
# george.set_prefs([jerry, kramer, elaine])
# elaine.set_prefs([jerry, kramer, george])
# kramer.set_prefs([elaine, george, jerry])
# from matching.games import StableRoommates
# 
# game = StableRoommates(players)
# game.solve()

# Calculate the similarity of preferences

## Preference Matrix
Each row holds one user's activity ratings.
Each column is an activity.

In [132]:

# Display names; position k in this list labels row k of the matrix below.
users = [
    'Snake',
    'Panda',
    'Tree',
    'Dolphin',
]
# Ratings table: one row per entry in `users`, one column per activity.
preference_matrix = np.array(
    [
        [1, 2, 3, 3, 1, 2],
        [1, 2, 4, 5, 1, 1],
        [5, 5, 3, 0, 5, 1],
        [5, 3, 2, 0, 4, 1],
    ],
    dtype=np.uint8,
)
# Each uint8 entry takes one byte, so this equals rows * columns.
size_in_bytes = preference_matrix.nbytes
print(f"Size of the matrix: {size_in_bytes} bytes")

Size of the matrix: 24 bytes


Create a huge matrix for stress testing

In [133]:
# num_users = 10000
# num_preferences = 10000
# huge_matrix = np.random.randint(0, 256, size=(num_users, num_preferences), dtype=np.uint8)
# 
# preference_matrix = huge_matrix
# 
# size_in_bytes = huge_matrix.nbytes
# size_in_gb = size_in_bytes / (1024**3)
# 
# print(f"Size of the matrix: {size_in_gb:.2f} GB")

## Manual Cosine Similarity Calculation
May come in useful later, when we want to calculate it on a per-user basis.

In [134]:
# def cosine_similarity(matrix):
#     dot_product = np.dot(matrix, matrix.T)
#     #print(dot_product)
#     norms = np.linalg.norm(matrix, axis=1)
#     # print(norms)
#     cosine_sim = dot_product / (np.outer(norms, norms))
#     return cosine_sim
# 
# 
# start_time = time.time()
# cosine_sim_matrix = cosine_similarity(preference_matrix)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"Elapsed time: {elapsed_time} seconds")
# # Print the cosine similarity matrix
# print(cosine_sim_matrix)


## Use sklearn for the Cosine Similarity Calculation

In [135]:
# Time a single cosine_similarity call. perf_counter() is used rather than
# time.time() because it is monotonic and uses the highest-resolution clock
# available, which matters for a call that finishes in about a millisecond.
start_time = time.perf_counter()

# Entry [i, j] is the cosine similarity between rows i and j of preference_matrix.
cosine_sim_matrix = cosine_similarity(preference_matrix)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print(f"Elapsed time: {np.round(elapsed_time,6)} seconds")


print(cosine_sim_matrix)


Elapsed time: 0.001172 seconds
[[1.         0.95470327 0.63543805 0.58609427]
 [0.95470327 1.         0.51663504 0.46709937]
 [0.63543805 0.51663504 1.         0.97990505]
 [0.58609427 0.46709937 0.97990505 1.        ]]


## Interpret the result
With words...

In [136]:
print('Higher Score = Better')

# Walk the similarity matrix row by row; both the row index and the column
# index are looked up in `users` to get the corresponding names.
for row_idx, similarity_row in enumerate(cosine_sim_matrix):
    print(f'User {users[row_idx]} gets along with:')
    for col_idx, similarity in enumerate(similarity_row):
        print(f"> user {users[col_idx]}: {similarity}")
        


Higher Score = Better
User Snake gets along with:
> user Snake: 0.9999999999999999
> user Panda: 0.9547032697824668
> user Tree: 0.6354380478103605
> user Dolphin: 0.5860942701532693
User Panda gets along with:
> user Snake: 0.9547032697824668
> user Panda: 1.0000000000000002
> user Tree: 0.5166350401452484
> user Dolphin: 0.46709936649691386
User Tree gets along with:
> user Snake: 0.6354380478103605
> user Panda: 0.5166350401452484
> user Tree: 0.9999999999999998
> user Dolphin: 0.9799050483303551
User Dolphin gets along with:
> user Snake: 0.5860942701532693
> user Panda: 0.46709936649691386
> user Tree: 0.9799050483303551
> user Dolphin: 0.9999999999999999


In [137]:
# preference_list = {}
# 
# for i, user_pref in enumerate(cosine_sim_matrix):
#     print(f"list for {users[i]}")
#     the_list = {score:users[j] for j, score in enumerate(user_pref) if j != i} # j != i If its not the user itself
#     print(f'unsorted {the_list}')
#     sorted_list = dict(sorted(the_list.items(), reverse=True))
#     print(f'sorted {sorted_list}')
#     final_list = list(sorted_list.values())
#     print(final_list)
#     preference_list[users[i]] = final_list
#     print('======')
# 
# preference_list


## Cleaned Functions
Use these to actually implement it

In [142]:

def get_sorted_list_func(users, user_preferences):
    """Build a function that ranks every other user for a given user.

    Args:
        users: Sequence of user names; ``users[k]`` names row/column ``k``.
        user_preferences: Indexable of score rows, where
            ``user_preferences[a][b]`` is the score of user ``b`` from the
            point of view of user ``a``.

    Returns:
        A function ``get_sorted_list(user_id)`` that returns the names of all
        users other than ``user_id``, ordered from highest to lowest score.
    """
    def get_sorted_list(user_id):
        # Collect name -> score for everyone except the user being ranked.
        scores_by_name = {}
        for partner_id, score in enumerate(user_preferences[user_id]):
            if partner_id == user_id:
                continue
            scores_by_name[users[partner_id]] = score
        # Sort the names by score, best first; the sort is stable, so names
        # with equal scores keep the order in which they were inserted.
        return sorted(scores_by_name, key=scores_by_name.__getitem__, reverse=True)
    return get_sorted_list

def check_input(users, preference_matrix, user_limit=10000):
    """Validate the user list and preference matrix before further processing.

    Args:
        users: Sized collection of hashable user names.
        preference_matrix: Sized collection of rating rows; its length is
            compared against ``len(users)``.
        user_limit: Maximum number of users (and matrix rows) accepted.
            Defaults to 10000, the limit that was previously hard-coded.

    Returns:
        None. The function only raises when a check fails.

    Raises:
        ValueError: If either input is longer than ``user_limit``, if the two
            lengths differ, or if ``users`` contains duplicate entries.
    """
    if len(users) > user_limit or len(preference_matrix) > user_limit:
        raise ValueError(f"The user limit is {user_limit}")
    if len(users) != len(preference_matrix):
        raise ValueError("user list and matrix length do not match")
    # A set drops repeated names, so a shorter set means duplicates exist.
    if len(users) != len(set(users)):
        raise ValueError("Duplicates exist in the users array")


def get_preference_lists(users, preference_matrix):
    """Rank, for every user, all other users by cosine similarity of their ratings.

    Args:
        users: Sequence of user names; ``users[k]`` labels row ``k`` of
            ``preference_matrix``.
        preference_matrix: Ratings with one row per user, passed to
            ``cosine_similarity``.

    Returns:
        A dict mapping each user name to the list of the other user names,
        ordered from highest to lowest similarity score.

    Raises:
        ValueError: Propagated from ``check_input`` when the inputs are invalid.
    """
    check_input(users, preference_matrix)

    # Pairwise similarity between the rating rows.
    similarity = cosine_similarity(preference_matrix)
    rank_partners = get_sorted_list_func(users, similarity)

    ranked = {}
    for user_id, user_name in enumerate(users):
        ranked[user_name] = rank_partners(user_id)
    return ranked





## Use the Functions

In [145]:

# Larger example: seven users, each with a row of six activity ratings.
users = [
    'Snake',
    'Panda',
    'Tree',
    'Dolphin',
    'Water',
    'Basel',
    'Table 16',
]
preference_matrix = np.array(
    [
        [1, 2, 3, 3, 1, 2],
        [1, 2, 4, 5, 1, 1],
        [5, 5, 3, 0, 5, 1],
        [0, 1, 1, 0, 0, 1],
        [5, 5, 0, 0, 1, 1],
        [4, 3, 3, 0, 5, 1],
        [1, 1, 1, 0, 1, 1],
    ],
    dtype=np.uint8,
)
size_in_bytes = preference_matrix.nbytes
print(f"Size of the matrix: {size_in_bytes} bytes")

# Bare last expression so the notebook displays the resulting dict.
get_preference_lists(users, preference_matrix)



Size of the matrix: 42 bytes


{'Snake': ['Panda', 'Dolphin', 'Table 16', 'Tree', 'Basel', 'Water'],
 'Panda': ['Snake', 'Dolphin', 'Table 16', 'Basel', 'Tree', 'Water'],
 'Tree': ['Basel', 'Table 16', 'Water', 'Snake', 'Dolphin', 'Panda'],
 'Dolphin': ['Table 16', 'Snake', 'Panda', 'Tree', 'Basel', 'Water'],
 'Water': ['Tree', 'Table 16', 'Basel', 'Dolphin', 'Snake', 'Panda'],
 'Basel': ['Tree', 'Table 16', 'Water', 'Snake', 'Panda', 'Dolphin'],
 'Table 16': ['Basel', 'Tree', 'Dolphin', 'Snake', 'Water', 'Panda']}