# Collaborative filtering

## Item-to-item collaborative filtering

1.   List item
2.   List item



$$
         w(i, j) = \frac{\sum_{u\in U_i \cap U_j}(r_{u,i}-\bar{r}_u)(r_{u,j}-\bar{r}_u)}{\sqrt{\sum_{u\in U_i \cap U_j} (r_{u,i}-\bar{r}_u)^2}\sqrt{\sum_{u\in U_i \cap U_j} (r_{u,j}-\bar{r}_u)^2}}.
$$

\begin{equation}
 w(c,I_u) = \sum_{i\in I_u} w_{c,i}.
\end{equation}


### Import useful requirements

In [63]:
import os

if not (os.path.exists("recsys.zip") or os.path.exists("recsys")):
    !wget https://github.com/nzhinusoftcm/review-on-collaborative-filtering/raw/master/recsys.zip
    !unzip recsys.zip

### Import requirements

In [64]:
import os
import sys
import typing as tp

import joblib
import numpy as np
import pandas as pd
import tqdm.notebook
from recsys.datasets import ml1m, ml100k
from sklearn.preprocessing import LabelEncoder

### Dataset upload\

In [65]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Will use movielens dataset

In [66]:
ratings, movies = ml100k.load()

In [67]:
ratings.head(100)

Unnamed: 0,userid,itemid,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3
...,...,...,...
95,1,96,5
96,1,97,3
97,1,98,4
98,1,99,3


In [68]:
movies.head()

Unnamed: 0,itemid,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


### Preprocessing

In [69]:
def ids_encoder(ratings):
    users = sorted(ratings["userid"].unique())
    items = sorted(ratings["itemid"].unique())

    # create users and items encoders
    uencoder = LabelEncoder()
    iencoder = LabelEncoder()

    # fit users and items ids to the corresponding encoder
    uencoder.fit(users)
    iencoder.fit(items)

    # encode userids and itemids
    ratings.userid = uencoder.transform(ratings.userid.tolist())
    ratings.itemid = iencoder.transform(ratings.itemid.tolist())

    return ratings, uencoder, iencoder

In [70]:
# create the encoder
ratings, uencoder, iencoder = ids_encoder(ratings)

In [71]:
ratings

Unnamed: 0,userid,itemid,rating
0,0,0,5
1,0,1,3
2,0,2,4
3,0,3,3
4,0,4,3
...,...,...,...
99995,942,1066,2
99996,942,1073,4
99997,942,1187,3
99998,942,1227,3


In [72]:
iencoder

## Implementation

### Part 1. Similarities

\begin{equation}
 w_{i,j}= \frac{\sum_{u\in U}(r_{u,i}-\bar{r}_u)(r_{u,j}-\bar{r}_u)}{\sqrt{\sum_{u\in U} (r_{u,i}-\bar{r}_u)^2}\sqrt{\sum_{u\in U} (r_{u,j}-\bar{r}_u)^2}}.
\end{equation}

In [73]:
def normalize(ratings: pd.DataFrame) -> pd.DataFrame:
    """Normalize ratings by user"""
    # calculate mean for every user
    mean = ratings.groupby(by="userid", as_index=False)["rating"].mean()
    norm_ratings = pd.merge(ratings, mean, suffixes=("", "_mean"), on="userid")

    # normalize each rating by substracting the mean rating of the corresponding user
    norm_ratings["norm_rating"] = norm_ratings["rating"] - norm_ratings["rating_mean"]

    return norm_ratings[ratings.columns.tolist() + ["norm_rating"]]

In [74]:
def test_normalize():
    test_df = pd.DataFrame(
        {
            "userid": [0, 0, 0, 1, 1],
            "itemid": [0, 1, 2, 1, 3],
            "rating": [2, 2, 5, 5, 5],
        }
    )

    expected = pd.DataFrame(
        {
            "userid": [0, 0, 0, 1, 1],
            "itemid": [0, 1, 2, 1, 3],
            "rating": [2, 2, 5, 5, 5],
            "norm_rating": [-1, -1, 2, 0, 0],
        }
    )

    assert (
        test_df.shape[0] == expected.shape[0]
    ), "Number of user-item interactions is different"
    assert test_df.shape[1] + 1 == expected.shape[1], "Number of columns is incorrect"
    assert (normalize(test_df) == expected).all().all(), "Result is incorrect"


test_normalize()

In [75]:
norm_ratings = normalize(ratings)
np_ratings = norm_ratings.to_numpy()
norm_ratings.head()

Unnamed: 0,userid,itemid,rating,norm_rating
0,0,0,5,1.389706
1,0,1,3,-0.610294
2,0,2,4,0.389706
3,0,3,3,-0.610294
4,0,4,3,-0.610294


In [76]:
def cosine(x: np.array, y: np.array) -> float:
    if np.linalg.norm(x) == 0 or np.linalg.norm(y) == 0:
        return 0

    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

In [77]:
from functools import lru_cache


@lru_cache(2000)
def ratings_for_item(i):
    return np_ratings[np_ratings[:, 1] == i]


def calculate_similarity_between_two(np_ratings: np.array, i: int, j: int) -> float:
    """
    np_ratings: array containing: (user_id, item_id, rating, rating_mean, norm_rating)
    i: index of the first item
    j: index of the second item

    Returns:
        pearson correlation between i and j
    """
    if i == j:
        return 1.0

    ratings_i, ratings_j = ratings_for_item(i), ratings_for_item(j)

    common_users = np.intersect1d(ratings_i[:, 0], ratings_j[:, 0])

    if len(common_users) == 0:
        return -1.0  # 0.0

    common_ratings_i = ratings_i[np.isin(ratings_i[:, 0], common_users)]
    common_ratings_j = ratings_j[np.isin(ratings_j[:, 0], common_users)]

    assert sorted(common_ratings_i[:, 0]) == sorted(common_ratings_j[:, 0])

    x = common_ratings_i[:, 3]
    y = common_ratings_j[:, 3]
    return cosine(x, y)

In [78]:
assert np.isclose(calculate_similarity_between_two(np_ratings, 0, 0), 1.0)
assert np.isclose(calculate_similarity_between_two(np_ratings, 1, 2), 0.1069226)
assert np.isclose(calculate_similarity_between_two(np_ratings, 1, 3), 0.0555092)
assert np.isclose(calculate_similarity_between_two(np_ratings, 1, 5), -0.125509)
assert np.isclose(calculate_similarity_between_two(np_ratings, 1, 1431), 1.0)
assert np.isclose(calculate_similarity_between_two(np_ratings, 4, 1123), 0.0)

In [79]:
from tqdm import tqdm
def adjusted_cosine(
    np_ratings: np.array, similarity_between_two
) -> tp.Tuple[np.array, np.array]:
    """Computes correlation for all pairs

    np_ratings: array containing: (user_id, item_id, rating, rating_mean, norm_rating)
    similarity_between_two: function to calculate similarity
    """
    nb_items = np.unique(np_ratings[:, 1]).size
    similarities = np.zeros((nb_items, nb_items), dtype=float)

    np.fill_diagonal(similarities, 1)

    items = sorted(set(map(int, np_ratings[:, 1])))

    with tqdm(total=len(items) * (len(items) - 1) // 2) as pbar:
        for i in range(len(items)):
            for j in range(i + 1, len(items)):
                sim = similarity_between_two(np_ratings, items[i], items[j])
                similarities[items[i], items[j]] = sim
                similarities[items[j], items[i]] = sim
                pbar.update()

    assert np.all(
        similarities.T == similarities
    ), "Similarity matrix should be symmetrical"
    assert np.allclose(
        np.diag(similarities), 1.0
    ), "Similarities of items with themselves should be 1"

    # get neighbors by their neighbors in decreasing order of similarities
    neighbors = np.flip(np.argsort(similarities), axis=1)

    # sort similarities in decreasing order
    similarities = np.flip(np.sort(similarities), axis=1)

    return similarities, neighbors

In [80]:
similarities, neighbors = adjusted_cosine(np_ratings, calculate_similarity_between_two)

100%|██████████| 1413721/1413721 [02:37<00:00, 8964.11it/s] 


In [81]:
np.unique(similarities[100])

array([-1.        , -0.99999998, -0.99810888, ...,  1.        ,
        1.        ,  1.        ])

In [82]:
def neighbours_viz(
    item_id: int,
    movies: pd.DataFrame,
    similarities: np.array,
    neighbours: np.array,
    k=5,
):
    orig_index = iencoder.inverse_transform([item_id])[0]
    film_name = movies[movies.itemid == orig_index].title.values[0]
    similar_films = (
        (
            neighbor_id,
            movies[
                movies.itemid == iencoder.inverse_transform([neighbor_id])[0]
            ].title.values[0],
            similarity,
        )
        for neighbor_id, similarity in zip(
            neighbors[item_id][:k], similarities[item_id][:k]
        )
    )
    display(
        pd.DataFrame(
            dict(zip(("item_id", film_name, "Similarity"), zip(*similar_films)))
        )
    )
    print("\n")

In [83]:
neighbours_viz(49, movies, similarities, neighbors)
neighbours_viz(68, movies, similarities, neighbors)
neighbours_viz(914, movies, similarities, neighbors)
neighbours_viz(319, movies, similarities, neighbors)
neighbours_viz(200, movies, similarities, neighbors)

Unnamed: 0,item_id,Star Wars (1977),Similarity
0,1638,Bitter Sugar (Azucar Amargo) (1996),1.0
1,1326,Captives (1994),1.0
2,1636,Girls Town (1996),1.0
3,1629,"Silence of the Palace, The (Saimt el Qusur) (1...",1.0
4,1242,Night Flier (1997),1.0






Unnamed: 0,item_id,Forrest Gump (1994),Similarity
0,765,Man of the Year (1995),1.0
1,1599,Guantanamera (1994),1.0
2,690,Dark City (1998),1.0
3,991,Head Above Water (1996),1.0
4,1606,Hurricane Streets (1998),1.0






Unnamed: 0,item_id,Primary Colors (1998),Similarity
0,1593,Everest (1998),1.0
1,622,Angels in the Outfield (1994),1.0
2,934,Paradise Road (1997),1.0
3,798,Boys Life (1995),1.0
4,266,unknown,1.0






Unnamed: 0,item_id,Paradise Lost: The Child Murders at Robin Hood Hills (1996),Similarity
0,1193,Once Were Warriors (1994),1.0
1,77,Free Willy (1993),1.0
2,1343,"Story of Xinghua, The (1993)",1.0
3,920,Farewell My Concubine (1993),1.0
4,1402,Caro Diario (Dear Diary) (1994),1.0






Unnamed: 0,item_id,Evil Dead II (1987),Similarity
0,919,Two Bits (1995),1.0
1,1332,Midnight Dancers (Sibak) (1994),1.0
2,1136,Beautiful Thing (1996),1.0
3,888,"Tango Lesson, The (1997)",1.0
4,1552,"Underneath, The (1995)",1.0






In [84]:
def calculate_similarity_between_two_with_threshold(
    np_ratings: np.array, i: int, j: int
) -> float:
    if i == j:
        return 1.0

    ratings_i, ratings_j = ratings_for_item(i), ratings_for_item(j)
    common_users = np.intersect1d(ratings_i[:, 0], ratings_j[:, 0])
    common_ratings_i = ratings_i[np.isin(ratings_i[:, 0], common_users)]
    common_ratings_j = ratings_j[np.isin(ratings_j[:, 0], common_users)]

    if len(common_users) > 20:
        assert sorted(common_ratings_i[:, 0]) == sorted(common_ratings_j[:, 0])
        x = common_ratings_i[:, 3]
        y = common_ratings_j[:, 3]
        return cosine(x, y)
    return -1.0

In [85]:
assert np.isclose(
    calculate_similarity_between_two_with_threshold(np_ratings, 1, 1431), -1.0
)
assert np.isclose(
    calculate_similarity_between_two_with_threshold(np_ratings, 1, 17), -1.0
)
assert np.isclose(
    calculate_similarity_between_two_with_threshold(np_ratings, 4, 1123), -1.0
)
assert np.isclose(
    calculate_similarity_between_two_with_threshold(np_ratings, 914, 1681), -1.0
)

In [86]:
similarities, neighbors = adjusted_cosine(
    np_ratings, calculate_similarity_between_two_with_threshold
)

100%|██████████| 1413721/1413721 [02:03<00:00, 11415.95it/s]


In [87]:
neighbours_viz(49, movies, similarities, neighbors)
neighbours_viz(68, movies, similarities, neighbors)
neighbours_viz(154, movies, similarities, neighbors)
neighbours_viz(200, movies, similarities, neighbors)

Unnamed: 0,item_id,Star Wars (1977),Similarity
0,49,Star Wars (1977),1.0
1,171,"Empire Strikes Back, The (1980)",0.826287
2,180,Return of the Jedi (1983),0.728182
3,173,Raiders of the Lost Ark (1981),0.71425
4,407,"Close Shave, A (1995)",0.659379






Unnamed: 0,item_id,Forrest Gump (1994),Similarity
0,68,Forrest Gump (1994),1.0
1,214,Field of Dreams (1989),0.445981
2,309,"Rainmaker, The (1997)",0.436435
3,21,Braveheart (1995),0.422572
4,965,"Affair to Remember, An (1957)",0.420235






Unnamed: 0,item_id,Dirty Dancing (1987),Similarity
0,154,Dirty Dancing (1987),1.0
1,626,Robin Hood: Prince of Thieves (1991),0.727235
2,568,Wolf (1994),0.716361
3,254,My Best Friend's Wedding (1997),0.708146
4,416,"Parent Trap, The (1961)",0.634029






Unnamed: 0,item_id,Evil Dead II (1987),Similarity
0,200,Evil Dead II (1987),1.0
1,183,Army of Darkness (1993),0.574317
2,23,Rumble in the Bronx (1995),0.536677
3,90,"Nightmare Before Christmas, The (1993)",0.491057
4,557,Heavenly Creatures (1994),0.481768






In [88]:
#np.unique(similarities[8])


## Problem I.1

In [89]:
import numpy as np

def calculate_jaccard_similarity_between_two(np_ratings, i, j):
    if i == j:
        return 1.0

    ratings_i, ratings_j = np_ratings[np_ratings[:, 1] == i], np_ratings[np_ratings[:, 1] == j]

    users_i = set(ratings_i[:, 0])
    users_j = set(ratings_j[:, 0])

    intersection = len(users_i.intersection(users_j))
    union = len(users_i) + len(users_j) - intersection

    if union == 0:
        return 0.0
    else:
        return float(intersection) / union


In [90]:
calculate_jaccard_similarity_between_two(np_ratings, 1, 17)

0.029197080291970802

In [91]:
similarities_jaccard, neighbors_jaccard = adjusted_cosine(
    np_ratings, calculate_jaccard_similarity_between_two
)

100%|██████████| 1413721/1413721 [14:26<00:00, 1630.90it/s]


## Problem I.2

In [92]:
import numpy as np

def calculate_inner_product_similarity(np_ratings, i, j):
    if i == j:
        return 1.0

    ratings_i, ratings_j = np_ratings[np_ratings[:, 1] == i], np_ratings[np_ratings[:, 1] == j]

    users_i = set(ratings_i[:, 0])
    users_j = set(ratings_j[:, 0])

    common_users = list(users_i.intersection(users_j))

    if not common_users:
        return 0.0

    common_ratings_i = ratings_i[np.isin(ratings_i[:, 0], common_users)]
    common_ratings_j = ratings_j[np.isin(ratings_j[:, 0], common_users)]

    inner_product = np.sum(common_ratings_i[:, 2] * common_ratings_j[:, 2])

    return inner_product


In [93]:
similarities_inner_product, neighbors_inner_product = adjusted_cosine(
    np_ratings, calculate_inner_product_similarity
)

100%|██████████| 1413721/1413721 [16:16<00:00, 1447.10it/s]


## Problem I.3

In [94]:
import numpy as np

def calculate_thresholded_pearson_similarity(np_ratings, i, j, threshold=0):
    if i == j:
        return 1.0

    ratings_i, ratings_j = np_ratings[np_ratings[:, 1] == i], np_ratings[np_ratings[:, 1] == j]

    users_i = set(ratings_i[:, 0])
    users_j = set(ratings_j[:, 0])

    common_users = list(users_i.intersection(users_j))

    if not common_users:
        return 0.0

    common_ratings_i = np.array([rating for rating in ratings_i if rating[0] in common_users])
    common_ratings_j = np.array([rating for rating in ratings_j if rating[0] in common_users])

    mean_i = np.mean(common_ratings_i[:, 2])
    mean_j = np.mean(common_ratings_j[:, 2])

    diff_i = common_ratings_i[:, 2] - mean_i
    diff_j = common_ratings_j[:, 2] - mean_j

    numerator = np.sum(diff_i * diff_j)
    denominator_i = np.sqrt(np.sum(diff_i ** 2))
    denominator_j = np.sqrt(np.sum(diff_j ** 2))

    if denominator_i == 0 or denominator_j == 0:
        return 0.0

    correlation = numerator / (denominator_i * denominator_j)

    if correlation < threshold:
        return 0.0
    else:
        return correlation


In [95]:
similarities_pearson, neighbors_pearson = adjusted_cosine(
    np_ratings, calculate_thresholded_pearson_similarity
)

100%|██████████| 1413721/1413721 [17:44<00:00, 1327.55it/s]


### Part 2. Top items for a user

In [96]:
def candidate_items(
    np_ratings: np.array, userid: int, k=-1
) -> tp.Tuple[np.array, np.array]:
    # 1. Finding the set I_u of items already rated by user userid
    I_u = np_ratings[np_ratings[:, 0] == userid]
    I_u = I_u[:, 1].astype("int")

    # 2. Taking the union of similar items for all items in I_u to form the set of candidate items
    c = set()

    for iid in I_u:
        # add the neighbors of item iid in the set of candidate items
        c.update(neighbors[iid, :k])

    c = list(c)
    # 3. exclude from the set C all items in I_u.
    candidates = np.setdiff1d(c, I_u, assume_unique=True)

    return I_u, candidates

In [97]:
i_u, u_candidates = candidate_items(np_ratings, uencoder.transform([3])[0])

print("Films seen by user:", len(i_u))
print("Candidates:", len(u_candidates))

Films seen by user: 54
Candidates: 1628


In [98]:
i_u, u_candidates = candidate_items(np_ratings, uencoder.transform([3])[0], 10)

print("Films seen by user:", len(i_u))
print("Candidates:", len(u_candidates))

Films seen by user: 54
Candidates: 237


In [99]:
def similarity_with_Iu(item_id: int, I_u: np.array) -> float:
    w = 0
    for iid in I_u:
        # get similarity between itemid and c, if c is one of the k nearest neighbors of itemid
        if item_id in neighbors[iid]:
            w = w + similarities[iid, neighbors[iid] == item_id][0]
    return w

In [100]:
def rank_candidates(candidates: np.array, I_u: np.array) -> np.array:
    # list of candidate items mapped to their corresponding similarities to I_u
    sims = [similarity_with_Iu(c, I_u) for c in candidates]
    candidates = iencoder.inverse_transform(candidates)
    mapping = list(zip(candidates, sims))

    ranked_candidates = sorted(mapping, key=lambda couple: couple[1], reverse=True)
    return ranked_candidates

## Problem II.1

In [101]:
def estimate_ratings_simple_avg(candidates: np.array, I_u: np.array, np_ratings: np.array) -> np.array:
    estimated_ratings = []
    for candidate in candidates:
        similar_neighbors = np_ratings[np.isin(np_ratings[:, 1], candidate) & (np_ratings[:, 0] != I_u[0])]

        if len(similar_neighbors) > 0:
            avg_rating = np.mean(similar_neighbors[:, 2])
            estimated_ratings.append((candidate, avg_rating))
        else:
            estimated_ratings.append((candidate, 0))

    return np.array(estimated_ratings)


In [102]:
simple_avg_ratings = estimate_ratings_simple_avg(u_candidates, i_u, np_ratings)



## Problem II.2

In [103]:
def estimate_ratings_mean_adjusted(candidates: np.array, I_u: np.array, np_ratings: np.array) -> np.array:
    estimated_ratings = []
    user_ratings = np_ratings[np.isin(np_ratings[:, 0], I_u)]

    user_avg_rating = np.mean(user_ratings[:, 2])

    for candidate in candidates:
        similar_neighbors = np_ratings[np.isin(np_ratings[:, 1], candidate) & (np_ratings[:, 0] != I_u[0])]

        if len(similar_neighbors) > 0:
            avg_rating = np.mean(similar_neighbors[:, 2])
            mean_adjusted_rating = avg_rating - user_avg_rating
            estimated_ratings.append((candidate, mean_adjusted_rating))
        else:
            estimated_ratings.append((candidate, 0))

    return np.array(estimated_ratings)


In [104]:
mean_adjusted_ratings = estimate_ratings_mean_adjusted(u_candidates, i_u, np_ratings)

## Putting it alltogether

In [105]:
def topn_recommendation(np_ratings: np.array, userid, k=-1, N=30):
    # find candidate items
    I_u, candidates = candidate_items(np_ratings, userid, k)

    # rank candidate items according to their similarities with I_u
    ranked_candidates = rank_candidates(candidates, I_u)

    # get the first N row of ranked_candidates to build the top N recommendation list
    topn = pd.DataFrame(ranked_candidates[:N], columns=["itemid", "similarity_with_Iu"])
    topn = pd.merge(topn, movies, on="itemid", how="inner")
    return topn

In [106]:
topn_recommendation(np_ratings, uencoder.transform([1])[0])

Unnamed: 0,itemid,similarity_with_Iu,title
0,313,-16.94169,Titanic (1997)
1,318,-23.278784,Schindler's List (1993)
2,655,-24.349682,Stand by Me (1986)
3,357,-26.289772,One Flew Over the Cuckoo's Nest (1975)
4,433,-27.140886,Heathers (1989)
5,423,-27.654851,E.T. the Extra-Terrestrial (1982)
6,651,-27.970798,Glory (1989)
7,288,-29.451573,Scream (1996)
8,276,-29.451749,Leaving Las Vegas (1995)
9,527,-29.573808,Gandhi (1982)


In [107]:
test_history = [49, 81, 180, 256, 131, 379]
movies.iloc[test_history]

Unnamed: 0,itemid,title
49,50,Star Wars (1977)
81,82,Jurassic Park (1993)
180,181,Return of the Jedi (1983)
256,257,Men in Black (1997)
131,132,"Wizard of Oz, The (1939)"
379,380,Star Trek: Generations (1994)


In [108]:
def candidate_items_by_user_history(I_u: tp.List[int], k=-1):
    c = set()
    for iid in I_u:
        c.update(neighbors[iid, :k])
    candidates = np.setdiff1d(list(c), I_u, assume_unique=True)

    return candidates


def topn_recommendations_by_user_history(I_u: tp.List[int], k=-1, N=30):
    candidates = candidate_items_by_user_history(I_u, k=k)
    ranked_candidates = rank_candidates(candidates, I_u)
    topn = pd.DataFrame(ranked_candidates[:N], columns=["itemid", "similarity_with_Iu"])
    topn = pd.merge(topn, movies, on="itemid", how="inner")
    return topn


topn_recommendations_by_user_history(test_history)

Unnamed: 0,itemid,similarity_with_Iu,title
0,172,2.628615,"Empire Strikes Back, The (1980)"
1,174,2.14232,Raiders of the Lost Ark (1981)
2,313,2.035859,Titanic (1997)
3,210,1.941583,Indiana Jones and the Last Crusade (1989)
4,651,1.88054,Glory (1989)
5,22,1.773043,Braveheart (1995)
6,64,1.73555,"Shawshank Redemption, The (1994)"
7,79,1.707949,"Fugitive, The (1993)"
8,204,1.686313,Back to the Future (1985)
9,429,1.67672,"Day the Earth Stood Still, The (1951)"


# Problem III

In [114]:
num_movies_to_rate = 30
random_movies = movies.sample(num_movies_to_rate)
ratings_data = {
    'itemid': random_movies['itemid'],
    'title': random_movies['title'],
    'rating': np.random.randint(1, 6, num_movies_to_rate)
}
ratings_df = pd.DataFrame(ratings_data)
print(ratings_df)

      itemid                                              title  rating
1442    1443                                   8 Seconds (1994)       1
1102    1103                                       Trust (1990)       2
1343    1344                       Story of Xinghua, The (1993)       1
551      552                                     Species (1995)       3
222      223                                 Sling Blade (1996)       4
829      830                                    Power 98 (1995)       3
1436    1437                               House Party 3 (1994)       5
1048    1049                                House Arrest (1996)       1
1233    1234                       Chairman of the Board (1998)       2
1564    1565                                       Daens (1992)       4
796      797                                     Timecop (1994)       5
988      989                            Cats Don't Dance (1997)       5
1485    1486                        Girl in the Cadillac (1995) 

In [138]:
def my_normalize(ratings: pd.DataFrame) -> pd.DataFrame:

    mean = ratings['rating'].mean()

    ratings['norm_rating'] = ratings['rating'] - mean

    return ratings

In [139]:
my_norm_ratings = my_normalize(ratings_df)
my_np_ratings = my_norm_ratings.to_numpy()
my_norm_ratings.head()

Unnamed: 0,itemid,title,rating,norm_rating
1442,1443,8 Seconds (1994),1,-1.866667
1102,1103,Trust (1990),2,-0.866667
1343,1344,"Story of Xinghua, The (1993)",1,-1.866667
551,552,Species (1995),3,0.133333
222,223,Sling Blade (1996),4,1.133333


In [140]:
my_np_ratings.shape

(30, 4)

In [142]:
new_my_norm_ratings = pd.DataFrame({
    'userid': [0] * len(my_norm_ratings),  # Set all user IDs to 0
    'itemid': my_norm_ratings['itemid'].values,  # Use the 'itemid' column
    'rating': my_norm_ratings['rating'].values,  # Use the 'rating' column
    'norm_rating': my_norm_ratings['norm_rating'].values  # Use the 'norm_rating' column
})

# print(new_my_norm_ratings)


    userid  itemid  rating  norm_rating
0        0    1443       1    -1.866667
1        0    1103       2    -0.866667
2        0    1344       1    -1.866667
3        0     552       3     0.133333
4        0     223       4     1.133333
5        0     830       3     0.133333
6        0    1437       5     2.133333
7        0    1049       1    -1.866667
8        0    1234       2    -0.866667
9        0    1565       4     1.133333
10       0     797       5     2.133333
11       0     989       5     2.133333
12       0    1486       1    -1.866667
13       0      48       2    -0.866667
14       0     380       3     0.133333
15       0    1514       5     2.133333
16       0    1336       3     0.133333
17       0     771       2    -0.866667
18       0    1072       2    -0.866667
19       0    1211       2    -0.866667
20       0    1642       3     0.133333
21       0    1665       5     2.133333
22       0     670       3     0.133333
23       0     420       2    -0.866667


In [145]:
from tqdm import tqdm

def adjusted_cosine(
    np_ratings: np.array, similarity_between_two
) -> tp.Tuple[np.array, np.array]:
    """Computes correlation for all pairs

    np_ratings: array containing: (user_id, item_id, rating, rating_mean, norm_rating)
    similarity_between_two: function to calculate similarity
    """
    nb_items = np.unique(np_ratings[:, 1]).size
    similarities = np.zeros((nb_items, nb_items), dtype=float)

    np.fill_diagonal(similarities, 1)

    with tqdm(total=nb_items * (nb_items - 1) // 2) as pbar:
        for i in range(nb_items):
            for j in range(i + 1, nb_items):
                item_id_i = np.unique(np_ratings[:, 1])[i]
                item_id_j = np.unique(np_ratings[:, 1])[j]

                sim = similarity_between_two(np_ratings, item_id_i, item_id_j)
                similarities[i, j] = sim
                similarities[j, i] = sim
                pbar.update()

    assert np.all(
        similarities.T == similarities
    ), "Similarity matrix should be symmetrical"
    assert np.allclose(
        np.diag(similarities), 1.0
    ), "Similarities of items with themselves should be 1"

    neighbors = np.flip(np.argsort(similarities), axis=1)

    similarities = np.flip(np.sort(similarities), axis=1)

    return similarities, neighbors


In [146]:
new_my_norm_ratings.reset_index(drop=True, inplace=True)
similarities_cosine, neighbors_cosine = adjusted_cosine(
    new_my_norm_ratings.to_numpy(), calculate_similarity_between_two_with_threshold
)


100%|██████████| 435/435 [00:00<00:00, 3933.18it/s]


In [147]:
similarities_jaccard, neighbors_jaccard = adjusted_cosine(
    new_my_norm_ratings.to_numpy(), calculate_jaccard_similarity_between_two
)

100%|██████████| 435/435 [00:00<00:00, 15566.67it/s]


In [150]:
neighbours_viz(0, movies, similarities_cosine, neighbors_cosine)


Unnamed: 0,item_id,Toy Story (1995),Similarity
0,0,Toy Story (1995),1.0
1,920,Farewell My Concubine (1993),-0.017696
2,499,Fly Away Home (1996),-0.036669
3,922,Raise the Red Lantern (1991),-0.08237
4,488,Notorious (1946),-0.159048






In [151]:
similarities_inner_product, neighbors_inner_product = adjusted_cosine(
    new_my_norm_ratings.to_numpy(), calculate_inner_product_similarity
)

100%|██████████| 435/435 [00:00<00:00, 12871.41it/s]


In [154]:
similarities_pearson, neighbors_pearson = adjusted_cosine(
    new_my_norm_ratings.to_numpy(), calculate_thresholded_pearson_similarity
)

100%|██████████| 435/435 [00:00<00:00, 5335.61it/s]


In [156]:
simple_avg_ratings = estimate_ratings_simple_avg(u_candidates, i_u, new_my_norm_ratings.to_numpy())


In [160]:
mean_adjusted_ratings = estimate_ratings_mean_adjusted(u_candidates, i_u, new_my_norm_ratings.to_numpy())