# Importing libraries

In [7]:
import random
from typing import Callable

import numpy as np
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
from umap import UMAP
from datasets import load_dataset

import utils
from protocols import Recommender
from index import Index

# Loading the Dataset

In [2]:
ds = load_dataset("fancyzhx/ag_news", split="all")
texts: list[str] = [ds[i]["text"] for i in range(len(ds))]
len(texts)

127600

# Loading the Models and training them

In [3]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings: np.ndarray = embedder.encode(texts, show_progress_bar=True)

  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 3988/3988 [02:11<00:00, 30.29it/s]


In [4]:
umap_model = UMAP(n_components=50, random_state=42, n_neighbors=15, min_dist=0.1, metric="euclidean", low_memory=False)
umap_embeddings: np.ndarray = umap_model.fit_transform(embeddings)

  warn(


In [5]:
pca_model = PCA(n_components=50, random_state=42)
pca_embeddings: np.ndarray = pca_model.fit_transform(embeddings)

# Indexing

In [6]:
umap_index = Index(umap_embeddings, space="l2", ef=200)
pca_index = Index(pca_embeddings, space="cosine", ef=200)

# Recommender implementation

In [8]:
class EmbeddingRecommender:
    def __init__(
        self,
        search_fn: Callable[[np.ndarray, int], dict[int, float]],
        get_user_embeddings: Callable[[int], np.ndarray],
        k: int,
    ):
        self.search_fn = search_fn
        self.get_user_embeddings = get_user_embeddings
        self.k = k

    def recommend(self, user_id: int) -> dict[int, float]:
        history_embeddings = self.get_user_embeddings(user_id)
        scores = self.search_fn(history_embeddings, self.k)
        print(f"Found {len(scores)} recommendations for {len(history_embeddings)} history items")
        return scores

# Helper functions

In [24]:
def clean_scores(scores: dict[int, float], user_history: list[int], sample_weight: float, k: int):
    scores = utils.skip_ids(scores, user_history)
    if sample_weight > 1:
        scores = utils.take_top_k(scores, k=int(k * sample_weight))
        scores = utils.take_random(scores, k=k)
    else:
        scores = utils.take_top_k(scores, k=k)
    return scores


def show_recommendations(scores: dict[int, float], texts: list[str]):
    indices, scorings = utils.unpack_scores(scores)
    for i, (idx, scoring) in enumerate(zip(indices, scorings)):
        print(f"ID: {str(idx).zfill(6)}, Score: {scoring:.4f}, Text: {texts[idx]}")


def get_user_history_function(embeddings: np.ndarray, user_history: dict[int, list[int]]):
    def get_history_embeddings(user_id: int) -> np.ndarray:
        history_ids = user_history.get(user_id, [])
        if not history_ids:
            return np.empty((0, embeddings.shape[1]))
        return embeddings[history_ids]

    return get_history_embeddings

# Recommendation engine

In [10]:
user_history = {}

umap_recommender = EmbeddingRecommender(
    umap_index.query,
    get_user_history_function(umap_embeddings, user_history),
    k=100,
)
pca_recommender = EmbeddingRecommender(
    pca_index.query,
    get_user_history_function(pca_embeddings, user_history),
    k=100,
)
# pca should be more general and umap more precise

In [11]:
def recommend_and_show(user_id: int, recommender: Recommender, sample_weight: float, k: int):
    scores = recommender.recommend(user_id)
    scores = clean_scores(scores, user_history.get(user_id, []), sample_weight, k)
    show_recommendations(scores, texts)

Testing

In [25]:
user_history[0] = random.sample(range(len(texts)), k=5)
for i in user_history[0]:
    print(f"ID: {str(i).zfill(6)}, Text: {texts[i]}")

ID: 029048, Text: Labor pains April 1980 -- Players strike the last eight days of spring training. Ninety-two exhibition games are canceled. June 1981 -- Players stage first midseason strike in history.
ID: 012482, Text: Pakistan Parliament Elects Prime Minister Pakistan #39;s parliament elected former finance minister Shaukat Aziz, a close ally of President Gen. Pervez Musharraf, as prime minister on Friday after the opposition 
ID: 030480, Text: Afghans Arrest Three for Rocket Attack on Karzai  KABUL, Afghanistan (Reuters) - Three men have been arrested  for trying to kill Afghan President Hamid Karzai by firing a  rocket at his helicopter during his first election campaign  trip outside the capital, officials said on Friday.
ID: 107709, Text: UPDATE 2-Fifth Third balance sheet fix to cut profit Fifth Third Bancorp, the ninth largest US bank, on Thursday said fourth-quarter profit will fall well short of analyst forecasts, as it takes \$340 million of 
ID: 011006, Text: Intel's Centr

#### PCA

More random

In [26]:
recommend_and_show(0, pca_recommender, sample_weight=10, k=10)

Found 500 recommendations for 5 history items
ID: 012513, Score: 0.9103, Text: Musharraf ally elected as new Pakistan PM ISLAMABAD - Pakistan #39;s Parliament elected former Finance Minister Shaukat Aziz as Prime Minister yesterday amid an opposition boycott of the vote.
ID: 072399, Score: 0.8629, Text: Intel to Join in a Project to Extend Wireless Use In an effort to create a global wireless alternative to cable and telephone Internet service, Intel said on Monday that it would collaborate with Clearwire, a wireless 
ID: 011448, Score: 0.8607, Text: Intel Adds Tri-mode (802.11a,b,g) Wireless Module Intel announced a new tri-mode wireless module (supporting IEEE standards 802.11a, b and g) for notebooks based on Intel Centrino mobile technology.
ID: 012157, Score: 0.8483, Text: Pak. parliament elects Aziz as new PM : Opposition lawmakers boycotted a parliamentary vote on Friday that unanimously elected Shaukat Aziz as Pakistan #39;s new prime minister after their candidate, a jailed op

Less random

In [27]:
recommend_and_show(0, pca_recommender, sample_weight=1.2, k=10)

Found 500 recommendations for 5 history items
ID: 030486, Score: 0.9946, Text: Afghans Arrest Three for Rocket Attack on Karzai Three men have been arrested for trying to kill Afghan President Hamid Karzai by firing a rocket at his helicopter during his first election campaign trip outside the capital, officials said on Friday.
ID: 011096, Score: 0.9890, Text: Centrino to connect with all Wi-Fi standards New component will help next generation of Intel's wireless chip technology reach full audience for Wi-Fi.
ID: 030705, Score: 0.9370, Text: Suspects Arrested in Rocket Attack On Afghan President #39;s &lt;b&gt;...&lt;/b&gt; Afghan officials say they have arrested three suspects in connection with a rocket attack on a helicopter carrying President Hamid Karzai.
ID: 011945, Score: 0.9355, Text: Pakistan elects new prime minister Pakistan #39;s National Assembly elected Shaukat Aziz as the nation #39;s new prime minister Friday. Aziz won 191 votes to none for rival candidate Javed Hashmi.

No randomness

In [28]:
recommend_and_show(0, pca_recommender, sample_weight=1, k=10)

Found 500 recommendations for 5 history items
ID: 030486, Score: 0.9946, Text: Afghans Arrest Three for Rocket Attack on Karzai Three men have been arrested for trying to kill Afghan President Hamid Karzai by firing a rocket at his helicopter during his first election campaign trip outside the capital, officials said on Friday.
ID: 011096, Score: 0.9890, Text: Centrino to connect with all Wi-Fi standards New component will help next generation of Intel's wireless chip technology reach full audience for Wi-Fi.
ID: 030705, Score: 0.9370, Text: Suspects Arrested in Rocket Attack On Afghan President #39;s &lt;b&gt;...&lt;/b&gt; Afghan officials say they have arrested three suspects in connection with a rocket attack on a helicopter carrying President Hamid Karzai.
ID: 011945, Score: 0.9355, Text: Pakistan elects new prime minister Pakistan #39;s National Assembly elected Shaukat Aziz as the nation #39;s new prime minister Friday. Aziz won 191 votes to none for rival candidate Javed Hashmi.

Custom input

In [29]:
old_function = pca_recommender.get_user_embeddings
pca_recommender.get_user_embeddings = lambda _: pca_model.transform(embedder.encode(["robots", "gaming"]))

recommend_and_show(0, pca_recommender, sample_weight=1, k=10)

pca_recommender.get_user_embeddings = old_function

Found 200 recommendations for 2 history items
ID: 019907, Score: 0.8296, Text: Games With Get Up and Go  Video gaming has long suffered a reputation for creating sedentary and solitary zombies, pasty and pudgy except for their muscular thumbs. A growing number of games, however, are getting gamers off their couches and on their feet, not only shouting, gyrating, singing and dancing, but sometimes -- gasp! -- playing nicely with others, face-to-face, in groups.
ID: 104134, Score: 0.7992, Text: Video Games Teach More Than Hand-Eye Coordination  WASHINGTON (Reuters) - Video games, often maligned as  having little or no redeeming value, are becoming a way for  firefighters, soldiers, currency traders and college  administrators to hone their skills.
ID: 076572, Score: 0.7835, Text: How computer games grew up THEY USED TO be for kids. Space Invaders, Pong, Doom, some of them were shoot-em-up fun, others were just, well, fun. But computer games have changed.
ID: 099703, Score: 0.7802, Text: 

  return forward_call(*args, **kwargs)


### UMAP

In [30]:
recommend_and_show(0, umap_recommender, sample_weight=1, k=10)

Found 500 recommendations for 5 history items
ID: 010870, Score: 1.0000, Text: Intel Gives Centrino Chip Line a Wireless Upgrade Intel Corp. (INTC.O: Quote, Profile, Research) on Thursday said it has upgraded the wireless networking capabilities of its Centrino line of notebook computer chips to allow broader network access with improved security.
ID: 120645, Score: 0.9999, Text: Intel Gives Centrino Chip Line a Wireless Upgrade (Reuters) Reuters - Intel Corp. (INTC.O) on Thursday\said it has upgraded the wireless networking capabilities of\its Centrino line of notebook computer chips to allow broader\network access with improved security.
ID: 014724, Score: 0.9999, Text: Intel updates Centrino wireless chips, software Intel Corp. unveiled an update to its wireless networking technology Thursday, adding simpler software, improved security and support for a more robust, less interference-prone radio standard.
ID: 011089, Score: 0.9999, Text: Centrino to connect with all Wi-Fi standards 

In [31]:
recommend_and_show(0, umap_recommender, sample_weight=10, k=10)

Found 500 recommendations for 5 history items
ID: 014724, Score: 0.9999, Text: Intel updates Centrino wireless chips, software Intel Corp. unveiled an update to its wireless networking technology Thursday, adding simpler software, improved security and support for a more robust, less interference-prone radio standard.
ID: 012513, Score: 0.9991, Text: Musharraf ally elected as new Pakistan PM ISLAMABAD - Pakistan #39;s Parliament elected former Finance Minister Shaukat Aziz as Prime Minister yesterday amid an opposition boycott of the vote.
ID: 030486, Score: 0.9977, Text: Afghans Arrest Three for Rocket Attack on Karzai Three men have been arrested for trying to kill Afghan President Hamid Karzai by firing a rocket at his helicopter during his first election campaign trip outside the capital, officials said on Friday.
ID: 009553, Score: 0.9973, Text: Intel Eyes Tri-Mode Wi-Fi (PC World) PC World - Upcoming chip set will support 802.11a, b, and g networks.
ID: 004497, Score: 0.9969, Tex

In [32]:
old_function = umap_recommender.get_user_embeddings
umap_recommender.get_user_embeddings = lambda _: umap_model.transform(embedder.encode(["robots", "gaming"]))

recommend_and_show(0, umap_recommender, sample_weight=1, k=10)

umap_recommender.get_user_embeddings = old_function

Found 200 recommendations for 2 history items
ID: 030755, Score: 0.9995, Text: They #39;re robots? Those beasts! NAHANT, Mass.--Joseph Ayers was crouched over a laptop in a cool cinder block shed barely big enough to house a ride-on lawn mower, watching a boxy-shelled black lobster through a rectangular acrylic window.
ID: 006345, Score: 0.9994, Text: Smart little suckers A new generation of robotic vacuums is ready to do battle with dirt, dust and dog hair with more cleaning power and cunning than their ancestors could muster.
ID: 087818, Score: 0.9993, Text: Robot Is Pivotal Member of the Force The youngest, smallest member of the Virginia State Police bomb squad can fearlessly pick up an unidentified bomb, walk into a tense hostage situation and hand a cell phone to a gunman. And he requires no food, water, insurance benefits or sleep.
ID: 100710, Score: 0.9993, Text: Video Game Report Card Helps Parents Goal is to assist moms and dads in buying family-friendly entertainment for the

  return forward_call(*args, **kwargs)
