In [5]:
import itertools
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [6]:
# Catalogue of rateable sports, listed alphabetically. A sport's position in
# this list is used as its column index when the rating matrix is filled
# (via `sports.index(sport)`), so the order here fixes the column order.
sports = [
    'badminton',
    'basketball',
    'biking',
    'boxing',
    'fighting',
    'fishing',
    'football',
    'hockey',
    'running',
    'swimming',
    'tabletennis',
    'tennis',
    'volleyball'
]

In [7]:
# Users of the toy data set, listed alphabetically. The position in this list is
# the row index in the rating matrix, and user pairs are generated in this order.
people = ['Barbara', 'Birol', 'Guido', 'Lisa', 'Rudi', 'Suna', 'Sven', 'Yvonne']

In [8]:
# Explicit ratings on a 1-5 scale: person -> {sport -> rating}.
# Sports a person has not rated are simply absent from their inner dict.
ratings = {
    "Barbara": {"football": 3, "basketball": 5, "boxing": 4, "biking": 2, "fighting": 4},
    "Birol": {"boxing": 4, "hockey": 2, "biking": 4, "fighting": 5, "swimming": 5, "tennis": 5},
    "Guido": {"basketball": 2, "tennis": 4, "boxing": 2, "biking": 2, "volleyball": 4, "football": 5},
    "Lisa": {"football": 4, "tabletennis": 3, "running": 4, "volleyball": 5, "swimming": 1},
    "Rudi": {"football": 1, "badminton": 4, "biking": 5, "running": 5, "tabletennis": 1},
    "Suna": {"swimming": 4, "volleyball": 5, "running": 3, "tennis": 5, "tabletennis": 4},
    "Sven": {"swimming": 5, "biking": 4, "running": 4, "fishing": 1, "badminton": 5},
    "Yvonne": {"basketball": 1, "badminton": 3, "tennis": 5, "fighting": 2, "football": 5, "running": 5}
}
# Re-key in alphabetical order of the person names (names are unique, so the
# tuple sort never has to compare the inner dicts).
ratings = dict(sorted(ratings.items()))

In [9]:
# Flatten the nested rating dict into one [person, sport, rating] record per rating.
rows = [
    [person, sport, rating]
    for person, individual_ratings in ratings.items()
    for sport, rating in individual_ratings.items()
]

# Long-format view of the same ratings.
ratings_df = pd.DataFrame(rows, columns=["person", "sport", "rating"])

In [10]:
# Rating-matrix dimensions: m rows (people) by n columns (sports).
m, n = len(people), len(sports)

## Fill the Matrix

In [11]:
# Start from an all-zero (m x n) matrix; 0 stands for "not rated".
rating_matrix = np.zeros((m, n))
# Write each known rating into the cell addressed by the person's row and the
# sport's position in `sports`.
for row, person in enumerate(people):
    for sport, rating in ratings[person].items():
        rating_matrix[row, sports.index(sport)] = rating

In [12]:
rating_matrix

array([[0., 5., 2., 4., 4., 0., 3., 0., 0., 0., 0., 0., 0.],
       [0., 0., 4., 4., 5., 0., 0., 2., 0., 5., 0., 5., 0.],
       [0., 2., 2., 2., 0., 0., 5., 0., 0., 0., 0., 4., 4.],
       [0., 0., 0., 0., 0., 0., 4., 0., 4., 1., 3., 0., 5.],
       [4., 0., 5., 0., 0., 0., 1., 0., 5., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 3., 4., 4., 5., 5.],
       [5., 0., 4., 0., 0., 1., 0., 0., 4., 5., 0., 0., 0.],
       [3., 1., 0., 0., 2., 0., 5., 0., 5., 0., 0., 5., 0.]])

In [13]:
# Share of person/sport cells that hold no rating (cells still equal to 0).
sparsity = np.sum(rating_matrix == 0) / rating_matrix.size
print(f"Sparsity: {sparsity: .2%}")

Sparsity:  58.65%


## Nearest-Neighbor Collaborative Filtering (user-based)
* compute similarities among the users
* perform neighborhood-based collaborative filtering

### User-User Similarities

In [15]:
def get_cosine_sim(a: str, b: str, entity_ratings: dict) -> Tuple[Optional[float], int]:
    """Cosine similarity of two entities, restricted to the keys both have rated.

    Args:
        a: Key of the first entity in ``entity_ratings`` (a person's name in
            this notebook).
        b: Key of the second entity in ``entity_ratings``.
        entity_ratings: Mapping ``entity -> {rated key -> numeric rating}``.

    Returns:
        ``(sim, n_joint_ratings)`` where ``n_joint_ratings`` is the number of
        keys rated by both entities. ``sim`` is ``None`` when fewer than two
        joint ratings exist, or when one of the joint rating vectors has zero
        norm (the cosine is undefined in that case).

    Raises:
        KeyError: If ``a`` or ``b`` is not a key of ``entity_ratings``.
    """
    ratings_a, ratings_b = entity_ratings[a], entity_ratings[b]

    # 1. isolate the keys rated by both entities; walking ratings_a in dict
    #    order (instead of a set) keeps the vector layout reproducible
    shared_keys = [key for key in ratings_a if key in ratings_b]
    n_joint_ratings = len(shared_keys)

    # 2. a single shared rating always yields a cosine of 1, so require two
    if n_joint_ratings < 2:
        return None, n_joint_ratings

    vec_a = np.array([ratings_a[key] for key in shared_keys], dtype=float)
    vec_b = np.array([ratings_b[key] for key in shared_keys], dtype=float)

    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denom == 0:
        # all-zero vector: avoid 0/0 (NaN plus RuntimeWarning)
        return None, n_joint_ratings

    return float(vec_a.dot(vec_b) / denom), n_joint_ratings

In [16]:
# (user, user) -> (cosine similarity or None, number of joint ratings)
user_user_sims = {}
# Materialised as a list because `itertools.combinations` returns a one-shot
# iterator: once consumed, re-running a cell that loops over it would silently
# do nothing. Sorting first makes every pair alphabetically ordered regardless
# of the order of `people`.
user_pairs = list(itertools.combinations(sorted(people), 2))

In [17]:
# Score every unordered pair of users once, keyed by the pair itself.
for first, second in user_pairs:
    user_user_sims[(first, second)] = get_cosine_sim(first, second, ratings)

In [18]:
user_user_sims[("Barbara", "Rudi")]

(0.7071067811865476, 2)

### 1. Nearest Neighbors for a given user

In [19]:
def get_k_nearest_neighbors(user: str, k: int, users: list, user_user_sims: dict) -> List[Tuple[str, float]]:
    """Return the ``k`` users most similar to ``user``.

    Args:
        user: The user whose neighbors are wanted.
        k: Maximum number of neighbors to return.
        users: All candidate users. ``user`` itself and duplicates are skipped.
        user_user_sims: Mapping ``(user_1, user_2) -> (similarity, n_joint)``.
            A pair may be stored in either order.

    Returns:
        Up to ``k`` ``(neighbor, similarity)`` tuples, most similar first.
        Neighbors whose similarity is null (``None``/NaN) are left out. Equal
        similarities keep the relative order of ``users``, so the result is
        reproducible between runs.

    Raises:
        KeyError: If no similarity entry exists for ``user`` and a candidate.
    """
    scored = []
    # dict.fromkeys de-duplicates while keeping the caller's order; a plain
    # set would make tie-breaking depend on string hash randomisation
    for neighbor in dict.fromkeys(users):
        if neighbor == user:
            continue

        # accept the pair in whichever order it was stored
        pair = (user, neighbor)
        if pair not in user_user_sims:
            pair = (neighbor, user)

        sim = user_user_sims[pair][0]
        if pd.notnull(sim):
            scored.append((neighbor, sim))

    # list.sort is stable, also with reverse=True, so ties keep input order
    scored.sort(key=lambda kv: kv[1], reverse=True)

    return scored[:k]

In [20]:
# Worked example: the two users most similar to Barbara.
user_neighbors = get_k_nearest_neighbors(
    user="Barbara", k=2, users=people, user_user_sims=user_user_sims
)

In [21]:
user_neighbors

[('Birol', 0.9713237285143654), ('Guido', 0.8277591347639633)]

### 2. Obtain the Neighborhood Ratings

In [22]:
def get_neighborhood_ratings(user, user_neighbors: list, ratings: dict) -> dict:
    """Gather the neighbors' ratings for items that ``user`` has not rated yet.

    ``user_neighbors`` is a list of ``(neighbor, similarity)`` pairs and
    ``ratings`` maps each user to an ``{item: rating}`` dict. The result maps
    every item rated by at least one neighbor, but not by ``user``, to a list
    of ``{'sim': ..., 'rating': ...}`` entries, one per neighbor who rated it.
    """
    collected = {}
    for neighbor, sim in user_neighbors:
        # one entry per (neighbor, item), grouped by item
        for item, rating in ratings[neighbor].items():
            collected.setdefault(item, []).append({'sim': sim, 'rating': rating})

    # drop everything the user already rated
    already_rated = ratings[user]
    return {
        item: entries
        for item, entries in collected.items()
        if item not in already_rated
    }

In [23]:
# Worked example: what Barbara's neighbors rated that she has not.
neighborhood_ratings = get_neighborhood_ratings(
    user="Barbara", user_neighbors=user_neighbors, ratings=ratings
)

In [24]:
neighborhood_ratings

{'hockey': [{'sim': 0.9713237285143654, 'rating': 2}],
 'swimming': [{'sim': 0.9713237285143654, 'rating': 5}],
 'tennis': [{'sim': 0.9713237285143654, 'rating': 5},
  {'sim': 0.8277591347639633, 'rating': 4}],
 'volleyball': [{'sim': 0.8277591347639633, 'rating': 4}]}

### 3. Compute Rating Predictions from Neighborhood Ratings

In [25]:
def compute_rating_pred(neighborhood_ratings: dict) -> dict:
    """Predict a rating per item as the similarity-weighted mean of neighbor ratings.

    ``neighborhood_ratings`` maps each item to a list of
    ``{'sim': ..., 'rating': ...}`` entries. The result maps each item to
    ``{'pred': weighted mean, 'count': number of entries}``; an item with an
    empty entry list gets ``{'pred': None, 'count': 0}``.
    """
    predictions = dict()
    for item, entries in neighborhood_ratings.items():
        if len(entries) == 0:
            # nothing to average
            predictions[item] = {'pred': None, 'count': 0}
            continue

        weights = np.array([entry['sim'] for entry in entries])
        values = np.array([entry['rating'] for entry in entries])
        predictions[item] = {
            'pred': (weights * values).sum() / weights.sum(),
            'count': len(weights),
        }

    return predictions

In [26]:
rating_preds = compute_rating_pred(neighborhood_ratings)

In [27]:
rating_preds

{'hockey': {'pred': 2.0, 'count': 1},
 'swimming': {'pred': 5.0, 'count': 1},
 'tennis': {'pred': 4.5398993833693675, 'count': 2},
 'volleyball': {'pred': 4.0, 'count': 1}}

### 4. Compute the Top-$N$ Recommendation Items

In [28]:
def compute_top_n(rating_preds: dict, min_count: int, N: int) -> OrderedDict:
    """Pick the ``N`` best-predicted items that have enough neighbor ratings.

    Args:
        rating_preds: Mapping ``item -> {'pred': predicted rating or None,
            'count': number of neighbor ratings behind the prediction}``.
        min_count: Minimum ``count`` an item needs to be considered.
        N: Maximum number of items to return.

    Returns:
        An ``OrderedDict`` of at most ``N`` ``item -> {'pred', 'count'}``
        entries, best first. Items are ranked by ``pred``, ties by ``count``.
        Items without a usable prediction (``None``/NaN) are never returned.
    """
    # A null prediction cannot be ranked: None raises TypeError when compared
    # with a float, and NaN makes the sort order undefined. Drop both here.
    eligible = [
        (item, info) for item, info in rating_preds.items()
        if info['count'] >= min_count and pd.notnull(info['pred'])
    ]
    # assuming more ratings mean higher confidence in the prediction
    eligible.sort(key=lambda kv: (kv[1]['pred'], kv[1]['count']), reverse=True)

    return OrderedDict(eligible[:N])

In [29]:
top_n_recs = compute_top_n(rating_preds, min_count=2, N=1)

In [30]:
top_n_recs

OrderedDict([('tennis', {'pred': 4.5398993833693675, 'count': 2})])

### All steps combined

In [31]:
def get_recommendations(user: str,
                        users: list,
                        user_user_sims: dict,
                        ratings: dict,
                        k: int,
                        C: int,
                        N: int) -> OrderedDict:
    """Run the full user-based pipeline and return top-``N`` recommendations.

    Args:
        user: The user to recommend for (a key of ``ratings``; a person's name
            in this notebook).
        users: All users that may serve as neighbors.
        user_user_sims: Pairwise similarities, passed on to
            ``get_k_nearest_neighbors``.
        ratings: Mapping ``user -> {item -> rating}``.
        k: Number of nearest neighbors to consult.
        C: Minimum number of neighbor ratings an item needs to be recommended
            (passed to ``compute_top_n`` as ``min_count``).
        N: Maximum number of recommendations to return.

    Returns:
        The ``OrderedDict`` produced by ``compute_top_n``: recommended items,
        best first, each with its ``'pred'`` and ``'count'``.
    """
    # 1. neighbors -> 2. their ratings for unseen items -> 3. predictions -> 4. top N
    user_neighbors = get_k_nearest_neighbors(user, k=k, users=users, user_user_sims=user_user_sims)
    neighborhood_ratings = get_neighborhood_ratings(user, user_neighbors, ratings)
    rating_preds = compute_rating_pred(neighborhood_ratings)
    return compute_top_n(rating_preds, min_count=C, N=N)

In [32]:
# Worked example: Barbara's single best recommendation, using 2 neighbors and
# requiring at least 2 neighbor ratings per item.
rec = get_recommendations(
    user="Barbara", users=people, user_user_sims=user_user_sims, ratings=ratings, k=2, C=2, N=1
)

In [33]:
rec

OrderedDict([('tennis', {'pred': 4.5398993833693675, 'count': 2})])

In [34]:
# Print the single best recommendation for every person, or a fallback line
# when no item reaches the required number of neighbor ratings.
for person in people:
    recs = get_recommendations(person, people, user_user_sims, ratings, k=2, C=2, N=1)
    label = person.ljust(7)
    if not recs:
        print(f"{label} --> Nothing for you :(")
        continue
    # N=1, so the first (and only) entry is the recommendation
    sport, top = next(iter(recs.items()))
    pred, count = top['pred'], top['count']
    print(f"{label} --> {sport.ljust(10)} @ {round(pred, 1)} - {count} neighbor ratings")

Barbara --> tennis     @ 4.5 - 2 neighbor ratings
Birol   --> running    @ 3.5 - 2 neighbor ratings
Guido   --> running    @ 4.0 - 2 neighbor ratings
Lisa    --> tennis     @ 4.5 - 2 neighbor ratings
Rudi    --> Nothing for you :(
Suna    --> biking     @ 3.0 - 2 neighbor ratings
Sven    --> tennis     @ 5.0 - 2 neighbor ratings
Yvonne  --> volleyball @ 4.5 - 2 neighbor ratings
