## Recommending what's popular

#### Data

In [1]:
# Toy dataset: one list of interest strings per user.
# users_interests[i] holds the interests of the user at index i; the notebook's
# comments number users from 1, so index 0 is "user 1".
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [2]:
# popular interests: how many users list each interest
from collections import Counter

# Flatten the per-user lists first (keeping first-seen order), then tally
flattened_interests = [interest
                       for interests in users_interests
                       for interest in interests]
popular_interests = Counter(flattened_interests)
popular_interests

Counter({'Hadoop': 2,
         'Big Data': 3,
         'HBase': 3,
         'Java': 3,
         'Spark': 1,
         'Storm': 1,
         'Cassandra': 2,
         'NoSQL': 1,
         'MongoDB': 2,
         'Postgres': 2,
         'Python': 4,
         'scikit-learn': 2,
         'scipy': 1,
         'numpy': 1,
         'statsmodels': 2,
         'pandas': 2,
         'R': 4,
         'statistics': 3,
         'regression': 3,
         'probability': 3,
         'machine learning': 2,
         'decision trees': 1,
         'libsvm': 2,
         'C++': 2,
         'Haskell': 1,
         'programming languages': 1,
         'mathematics': 1,
         'theory': 1,
         'Mahout': 1,
         'neural networks': 2,
         'deep learning': 2,
         'artificial intelligence': 2,
         'MapReduce': 1,
         'databases': 1,
         'MySQL': 1,
         'support vector machines': 1})

In [3]:
# most popular interests
from scratchlib.recommender import most_popular_new_interests

In [4]:
# Popularity-based suggestions for user 1 (index 0 in users_interests)
most_popular_new_interests(users_interests[0], popular_interests)

[('Python', 4),
 ('R', 4),
 ('statistics', 3),
 ('regression', 3),
 ('probability', 3)]

In [5]:
# Popularity-based suggestions for user 3 (index 2 in users_interests)
most_popular_new_interests(users_interests[2], popular_interests)

[('R', 4), ('Big Data', 3), ('HBase', 3), ('Java', 3), ('statistics', 3)]

## User-based Collaborative Filtering

In [6]:
# unique interests: every distinct interest, in sorted order
distinct_interests = {interest
                      for interests in users_interests
                      for interest in interests}
unique_interests = sorted(distinct_interests)

# Sanity check on the head of the sorted list
assert unique_interests[:6] == [
    'Big Data', 'C++', 'Cassandra',
    'HBase', 'Hadoop', 'Haskell',
]  # ... the remaining interests follow in the same sorted order

In [7]:
# user interest vectors: one vector per user, built against unique_interests
# (the displayed sample suggests 0/1 entries, one per interest — confirm in
# scratchlib.recommender.make_user_interest_vector)
from scratchlib.recommender import make_user_interest_vector

user_interest_vectors = [make_user_interest_vector(user_interests, unique_interests)
                         for user_interests in users_interests]

user_interest_vectors[0][:10] # first 10 entries of user 1's vector (index 0)

[1, 0, 1, 1, 1, 0, 1, 0, 0, 0]

In [8]:
# interest matrix: one row per user, one column per interest
import pandas as pd

# Raise the display cap so the interest columns are shown rather than elided
pd.set_option("display.max_columns", 100)

pd.DataFrame(user_interest_vectors, columns=unique_interests)

Unnamed: 0,Big Data,C++,Cassandra,HBase,Hadoop,Haskell,Java,Mahout,MapReduce,MongoDB,MySQL,NoSQL,Postgres,Python,R,Spark,Storm,artificial intelligence,databases,decision trees,deep learning,libsvm,machine learning,mathematics,neural networks,numpy,pandas,probability,programming languages,regression,scikit-learn,scipy,statistics,statsmodels,support vector machines,theory
0,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0
5,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1
7,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# user similarity
from scratchlib.recommender import cosine_similarity

# Square matrix: entry [i][j] compares user i's interest vector with user j's
user_similarities = [
    [cosine_similarity(vector_i, vector_j) for vector_j in user_interest_vectors]
    for vector_i in user_interest_vectors
]

user_similarities[0]  # similarity of user 1 (index 0) to every user, itself included

[1.0,
 0.3380617018914066,
 0.0,
 0.0,
 0.0,
 0.1543033499620919,
 0.0,
 0.0,
 0.1889822365046136,
 0.5669467095138409,
 0.0,
 0.0,
 0.0,
 0.1690308509457033,
 0.0]

In [10]:
# compare user interests: user 1 (index 0) vs user 10 (index 9)
pd.DataFrame(index = unique_interests,
             data = {
    "user 1":  user_interest_vectors[0],
    "user 10": user_interest_vectors[9]}).T

Unnamed: 0,Big Data,C++,Cassandra,HBase,Hadoop,Haskell,Java,Mahout,MapReduce,MongoDB,MySQL,NoSQL,Postgres,Python,R,Spark,Storm,artificial intelligence,databases,decision trees,deep learning,libsvm,machine learning,mathematics,neural networks,numpy,pandas,probability,programming languages,regression,scikit-learn,scipy,statistics,statsmodels,support vector machines,theory
user 1,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
user 10,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
# find the users most similar to user 1 (index 0)
# (the displayed result is a list of (user index, similarity) pairs)
from scratchlib.recommender import most_similar_users_to

most_similar_users_to(0, user_similarities)

[(9, 0.5669467095138409),
 (1, 0.3380617018914066),
 (8, 0.1889822365046136),
 (13, 0.1690308509457033),
 (5, 0.1543033499620919)]

In [12]:
# user-based suggestions for user 1 (index 0)
# (the displayed result is a list of (interest, score) pairs)
from scratchlib.recommender import user_based_suggestions

user_based_suggestions(0, users_interests, user_similarities)

[('MapReduce', 0.5669467095138409),
 ('MongoDB', 0.50709255283711),
 ('Postgres', 0.50709255283711),
 ('NoSQL', 0.3380617018914066),
 ('neural networks', 0.1889822365046136),
 ('deep learning', 0.1889822365046136),
 ('artificial intelligence', 0.1889822365046136),
 ('databases', 0.1690308509457033),
 ('MySQL', 0.1690308509457033),
 ('Python', 0.1543033499620919),
 ('R', 0.1543033499620919),
 ('C++', 0.1543033499620919),
 ('Haskell', 0.1543033499620919),
 ('programming languages', 0.1543033499620919)]

## Item-based Collaborative Filtering

In [13]:
# interests-user matrix: the transpose of user_interest_vectors, so row j
# collects every user's entry for unique_interests[j]
interest_user_matrix = [
    [vector[j] for vector in user_interest_vectors]
    for j in range(len(unique_interests))
]

pd.DataFrame(interest_user_matrix, index=unique_interests)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Big Data,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0
C++,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
Cassandra,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
HBase,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0
Hadoop,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
Haskell,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
Java,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0
Mahout,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
MapReduce,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
MongoDB,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [14]:
# interest similarities

# Each row of interest_user_matrix describes one interest across all users,
# so entry [i][j] compares interest i with interest j
interest_similarities = [
    [cosine_similarity(interest_row_i, interest_row_j)
     for interest_row_j in interest_user_matrix]
    for interest_row_i in interest_user_matrix
]

In [15]:
# find the interests most similar to unique_interests[0] ('Big Data')
from scratchlib.recommender import most_similar_interests_to

most_similar_interests_to(0, unique_interests, interest_similarities)

[('Hadoop', 0.8164965809277261),
 ('Java', 0.6666666666666666),
 ('MapReduce', 0.5773502691896258),
 ('Spark', 0.5773502691896258),
 ('Storm', 0.5773502691896258),
 ('Cassandra', 0.4082482904638631),
 ('artificial intelligence', 0.4082482904638631),
 ('deep learning', 0.4082482904638631),
 ('neural networks', 0.4082482904638631),
 ('HBase', 0.3333333333333333)]

In [16]:
# item-based suggestions for user 1 (index 0)
# (the displayed result is a list of (interest, score) pairs)
from scratchlib.recommender import item_based_suggestions

item_based_suggestions(0, users_interests, user_interest_vectors, unique_interests, interest_similarities)

[('MapReduce', 1.861807319565799),
 ('MongoDB', 1.3164965809277263),
 ('Postgres', 1.3164965809277263),
 ('NoSQL', 1.2844570503761732),
 ('MySQL', 0.5773502691896258),
 ('databases', 0.5773502691896258),
 ('Haskell', 0.5773502691896258),
 ('programming languages', 0.5773502691896258),
 ('artificial intelligence', 0.4082482904638631),
 ('deep learning', 0.4082482904638631),
 ('neural networks', 0.4082482904638631),
 ('C++', 0.4082482904638631),
 ('Python', 0.2886751345948129),
 ('R', 0.2886751345948129)]

## Matrix Factorization

#### Data

In [17]:
# Locations of the ml-100k data files, relative to the working directory
MOVIES = "./data/ml-100k/u.item"   # pipe-delimited: movie_id|title|...
RATINGS = "./data/ml-100k/u.data"  # tab-delimited: user_id, movie_id, rating, timestamp

In [18]:
# rating class
from typing import NamedTuple

class Rating(NamedTuple):
    """A single rating of one movie by one user."""
    user_id: str    # id kept as the raw string read from the ratings file
    movie_id: str   # raw string id; the same ids are the keys of `movies`
    rating: float   # rating column converted with float() when the file is read

In [19]:
# read movies: build a movie_id -> title lookup
import csv

movies = {}
with open(MOVIES, encoding="iso-8859-1") as f:
    reader = csv.reader(f, delimiter="|")
    # Only the first two fields of each row are used; the rest are ignored
    for movie_id, title, *_ in reader:
        movies[movie_id] = title

dict(list(movies.items())[0:5])

{'1': 'Toy Story (1995)',
 '2': 'GoldenEye (1995)',
 '3': 'Four Rooms (1995)',
 '4': 'Get Shorty (1995)',
 '5': 'Copycat (1995)'}

In [20]:
# read ratings
with open(RATINGS, encoding="iso-8859-1") as f:
    reader = csv.reader(f, delimiter="\t")
    # Every row must have exactly four fields; the last one (the timestamp)
    # is unpacked but not stored
    ratings = [
        Rating(user_id=user_id, movie_id=movie_id, rating=float(score))
        for user_id, movie_id, score, _timestamp in reader
    ]

ratings[0]

Rating(user_id='196', movie_id='242', rating=3.0)

#### Star Wars ratings

In [21]:
import re

# One empty bucket per movie whose title matches the Star Wars pattern;
# the buckets are keyed by movie_id and filled in the next cell
star_wars_ratings = {}
for movie_id, title in movies.items():
    if re.search("Star Wars|Empire Strikes|Jedi", title):
        star_wars_ratings[movie_id] = []

star_wars_ratings  # three movies

{'50': [], '172': [], '181': []}

In [22]:
# Walk every rating once and file the Star Wars ones under their movie id
for rating in ratings:
    collected = star_wars_ratings.get(rating.movie_id)
    if collected is not None:
        collected.append(rating.rating)

star_wars_ratings["50"][:10]

[5.0, 4.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 4.0, 4.0]

In [23]:
# Compute the average rating for each movie.
# The average comes first in each tuple so that sorting the list orders
# the movies by average rating.
avg_ratings = [(sum(title_ratings) / len(title_ratings), movie_id)
               for movie_id, title_ratings in star_wars_ratings.items()]

In [24]:
# And then print them from highest to lowest average, with the title looked up by id
for avg_rating, movie_id in sorted(avg_ratings, reverse=True):
    print(f"{avg_rating:.2f} {movies[movie_id]}")

4.36 Star Wars (1977)
4.20 Empire Strikes Back, The (1980)
4.01 Return of the Jedi (1983)


#### Split Data

In [25]:
import random

# Shuffle a copy and leave `ratings` untouched. Shuffling `ratings` in place
# made this cell non-idempotent: re-running it reshuffled the already-shuffled
# list and silently changed which ratings landed in each split. With a fresh
# copy and a fixed seed, every run produces the same permutation.
random.seed(0)
shuffled_ratings = ratings[:]
random.shuffle(shuffled_ratings)

split1 = int(len(shuffled_ratings) * 0.7)
split2 = int(len(shuffled_ratings) * 0.85)

train = shuffled_ratings[:split1]              # 70% of the data
validation = shuffled_ratings[split1:split2]   # 15% of the data
test = shuffled_ratings[split2:]               # 15% of the data

# The three slices must partition the data: nothing lost, nothing duplicated
assert len(train) + len(validation) + len(test) == len(ratings)

#### Baseline error = average rating

In [26]:
# Baseline predictor: always guess the mean training rating, and score that
# guess by its mean squared error over the test ratings
avg_rating = sum(r.rating for r in train) / len(train)
squared_errors = [(r.rating - avg_rating) ** 2 for r in test]
baseline_error = sum(squared_errors) / len(test)
baseline_error

1.2609526646939684

#### Learning Embeddings

In [27]:
# create vectors: one embedding per user id and one per movie id
from scratchlib.recommender import random_tensor

# Number passed to random_tensor for every embedding
# (presumably the vector length — confirm in scratchlib.recommender)
EMBEDDING_DIM = 2

# Find unique ids across ALL ratings (not just the training split), so that
# every rating has a vector to look up
user_ids = {rating.user_id for rating in ratings}
movie_ids = {rating.movie_id for rating in ratings}

# Then create a random vector per id
user_vectors = {user_id: random_tensor(EMBEDDING_DIM)
                for user_id in user_ids}
movie_vectors = {movie_id: random_tensor(EMBEDDING_DIM)
                 for movie_id in movie_ids}

In [28]:
# optimizer
from typing import List, Optional
import tqdm
from scratchlib.recommender import dot

def loop(dataset: List[Rating],
         learning_rate: Optional[float] = None) -> None:
    """Make one pass over `dataset`, reporting the running mean squared error.

    For every rating, the prediction is the dot product of the user's and the
    movie's embedding, looked up in the module-level `user_vectors` and
    `movie_vectors`. The running average of the squared error is shown in the
    progress-bar description.

    Args:
        dataset: the ratings to iterate over, in order.
        learning_rate: gradient step size. When given, both embeddings of each
            rating are updated in place by one gradient step. When None, the
            embeddings are left untouched and the pass only evaluates.
    """
    training = learning_rate is not None
    with tqdm.tqdm(dataset) as t:
        loss = 0.0
        for seen, rating in enumerate(t, start=1):
            movie_vector = movie_vectors[rating.movie_id]
            user_vector = user_vectors[rating.user_id]
            predicted = dot(user_vector, movie_vector)
            error = predicted - rating.rating
            loss += error ** 2

            if training:
                #     predicted = m_0 * u_0 + ... + m_k * u_k
                # So each u_j enters the output with coefficient m_j
                # and each m_j enters the output with coefficient u_j.
                # Both gradients are computed before either vector changes, so
                # the movie step still uses the pre-update user vector. The
                # constant factor 2 from differentiating error ** 2 is left out
                # (it only rescales the learning rate).
                user_gradient = [error * m_j for m_j in movie_vector]
                movie_gradient = [error * u_j for u_j in user_vector]

                # Take gradient steps, mutating the stored embeddings in place.
                # Iterating over the gradients themselves follows the vectors'
                # actual length instead of relying on a global dimension.
                for j, (user_step, movie_step) in enumerate(zip(user_gradient,
                                                                movie_gradient)):
                    user_vector[j] -= learning_rate * user_step
                    movie_vector[j] -= learning_rate * movie_step

            t.set_description(f"avg loss: {loss / seen}")

In [None]:
# train
learning_rate = 0.05
for epoch in range(2):
    # Decay by 10% at the START of each epoch, so the first epoch already
    # runs at 0.045 rather than at the initial 0.05
    learning_rate *= 0.9
    print(epoch, learning_rate)
    loop(train, learning_rate=learning_rate)   # training pass: updates the embeddings
    loop(validation)                           # no learning_rate: evaluation only
loop(test)                                     # final evaluation on the held-out test split

0 0.045000000000000005


avg loss: 5.397518032434137: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70000/70000 [00:51<00:00, 1371.54it/s]
avg loss: 1.2665781174206434: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [00:10<00:00, 1492.10it/s]


1 0.04050000000000001


avg loss: 1.1151836681387008: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70000/70000 [00:46<00:00, 1498.94it/s]
avg loss: 1.0675642374415708:  16%|█████████████████████████▏                                                                                                                                       | 2346/15000 [00:01<00:08, 1451.44it/s]

In [None]:
# Evaluate on the test split; no learning_rate is passed, so nothing is updated
loop(test)

avg loss: 1.091223521248254:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 14660/15000 [00:11<00:00, 1277.76it/s]

#### Understand Embeddings

In [35]:
# NOTE(review): every other import visible in this notebook comes from
# `scratchlib`; confirm that `scratch.working_with_data` is the intended module.
from scratch.working_with_data import pca, transform
from collections import defaultdict

# Project the learned movie embeddings onto their first 2 principal components
original_vectors = list(movie_vectors.values())
components = pca(original_vectors, 2)

# Group every rating value by movie so each movie's average can be reported
ratings_by_movie = defaultdict(list)
for rating in ratings:
    ratings_by_movie[rating.movie_id].append(rating.rating)

# One (movie_id, average rating, title, transformed vector) tuple per movie.
# movie_vectors.keys() and .values() iterate in the same order, so zipping the
# keys with the transformed values pairs each id with its own vector.
vectors = [
    (movie_id,
     sum(ratings_by_movie[movie_id]) / len(ratings_by_movie[movie_id]),
     movies[movie_id],
     vector)
    for movie_id, vector in zip(movie_vectors.keys(),
                                transform(original_vectors, components))
]

# Print top 25 and bottom 25 by first principal component.
# Sort once and slice both ends instead of sorting the whole list twice.
by_first_component = sorted(vectors, key=lambda v: v[-1][0])
print(by_first_component[:25])
print(by_first_component[-25:])

dv: 4513.117: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 178.05it/s]
dv: 923.379: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 206.34it/s]


[('1603', 3.0, 'Angela (1995)', [-3.4777692829356983, 1.3024085570641923]), ('1500', 5.0, 'Santa with Muscles (1996)', [-2.715039129426547, -0.5872911710899551]), ('1449', 4.625, 'Pather Panchali (1955)', [-2.5679128418900623, 0.22203343150610122]), ('1642', 4.5, "Some Mother's Son (1996)", [-2.5666261615534616, 1.0042868980324502]), ('867', 4.0, 'Whole Wide World, The (1996)', [-2.517393016027836, 1.9079396716978334]), ('1467', 5.0, 'Saint of Fort Washington, The (1993)', [-2.4111319938724787, 0.9585407027217993]), ('1158', 4.0, 'Fille seule, La (A Single Girl) (1995)', [-2.3919926448542173, 0.05131330254314448]), ('850', 4.0, 'Perfect Candidate, A (1996)', [-2.370266324282181, 0.98939613274928]), ('1064', 4.25, 'Crossfire (1947)', [-2.3679444142978427, -0.5256378415208939]), ('169', 4.466101694915254, 'Wrong Trousers, The (1993)', [-2.363691417475967, 0.24354746046776432]), ('1604', 4.0, 'He Walked by Night (1948)', [-2.3336160861067476, -0.7695574741293153]), ('745', 3.875, 'Ruling 