Import

In [1]:
import random
from collections import defaultdict
from typing import Literal


import numpy as np
import hnswlib
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer
from umap import UMAP
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


Functions

In [None]:
def make_index(
    embeddings: np.ndarray,
    space: Literal["l2", "ip", "cosine"],
    ef=50,
    ef_construction=200,
    m=16,
):
    """Build an in-memory HNSW index over ``embeddings``.

    Args:
        embeddings: 2-D array holding one vector per row. Row ``i`` is stored
            under the integer label ``i``.
        space: Distance space handed to ``hnswlib.Index``.
        ef: Query-time search width applied with ``set_ef``. It should not be
            lower than the ``k`` later requested from ``knn_query``.
        ef_construction: Build-time search width forwarded to ``init_index``.
        m: Forwarded to ``init_index`` as hnswlib's ``M`` argument.

    Returns:
        The populated ``hnswlib.Index``.

    Raises:
        ValueError: If ``embeddings`` is not 2-dimensional.
    """
    if embeddings.ndim != 2:
        raise ValueError(f"embeddings must be 2-D (n_items, dim), got shape {embeddings.shape}")

    n_items, dim = embeddings.shape
    index = hnswlib.Index(space=space, dim=dim)
    index.init_index(max_elements=n_items, ef_construction=ef_construction, M=m)
    # Labels are the row positions, so query results index straight back into `embeddings`.
    index.add_items(embeddings, np.arange(n_items))
    index.set_ef(ef)
    return index


def query_index(index: hnswlib.Index, query_embeddings: np.ndarray, k=5) -> tuple[list[int], list[float]]:
    """Query ``index`` with every row of ``query_embeddings`` and merge the hits.

    Each query contributes its ``k`` nearest neighbours. An item returned for
    several queries is kept once, with the highest score it achieved. The score
    is ``1 - distance`` as reported by ``knn_query``.
    NOTE(review): that is a cosine similarity only for a ``cosine`` index; for
    other spaces it is merely a monotone transform of the distance - confirm
    this is intended.

    Args:
        index: Index to search.
        query_embeddings: Query vectors, passed unchanged to ``knn_query``.
        k: Number of neighbours requested per query vector.

    Returns:
        ``(labels, scores)``: unique item labels ordered by descending score,
        and the matching scores as plain Python floats.
    """
    labels, dists = index.knn_query(query_embeddings, k)

    # Track the best score per item. The first sighting must be stored as-is:
    # seeding with 0.0 would clamp every negative score (distance > 1) to zero.
    best: dict[int, float] = {}
    for label, dist in zip(labels.ravel(), dists.ravel()):
        item = int(label)
        sim = 1.0 - float(dist)
        if item not in best or sim > best[item]:
            best[item] = sim

    # rerank
    ranked = sorted(best.items(), key=lambda pair: pair[1], reverse=True)
    return [item for item, _ in ranked], [sim for _, sim in ranked]


class Recommender:
    """Recommend texts similar to a user's history via a nearest-neighbour index.

    History texts are embedded, projected with ``dim_reduction_model`` and used
    to query ``index``; the merged hits are optionally sub-sampled at random.
    """

    def __init__(
        self,
        dim_reduction_model: PCA,
        index: hnswlib.Index,
        embedder: SentenceTransformer,
        reduced_embeddings: np.ndarray,
        texts: list[str],
    ):
        # Only `.transform` is called on this object (see show_rec).
        self.dim_reduction_model = dim_reduction_model
        self.index = index
        # Only `.encode` is called on this object (see show_rec).
        self.embedder = embedder
        # Indexed by the labels returned from the index (see show_rec).
        self.reduced_embeddings = reduced_embeddings
        # Indexed by the same labels to print the recommended text.
        self.texts = texts

    def recommend(self, history_embeddings: np.ndarray, history: list[int], sample_weight=10.0, k=20) -> tuple[list[int], list[float]]:
        """Return up to ``k`` recommended item labels with their scores.

        Args:
            history_embeddings: Reduced embeddings of the history items; each
                row is used as a separate query.
            history: Labels to exclude from the result. An empty list disables
                the filter.
            sample_weight: Over-fetch factor (must be >= 1). For values above 1,
                ``int(sample_weight * k)`` neighbours are fetched per query and
                ``k`` of the best ``int(sample_weight * k)`` merged hits are
                drawn at random. For exactly 1 the top ``k`` hits are returned.
            k: Maximum number of recommendations.

        Returns:
            ``(labels, scores)`` sorted by descending score. The scores are the
            ``1 - distance`` values produced by ``query_index``. Fewer than
            ``k`` items are returned when fewer candidates remain after
            filtering.
        """
        assert sample_weight >= 1, "sample_weight should be >= 1"

        pool_size = int(sample_weight * k)
        indices, distances = query_index(self.index, history_embeddings, pool_size)
        print(f"Found {len(indices)} recommendations for {len(history_embeddings)} history items")

        if history:
            # Drop items the user has already seen; a set keeps the lookup O(1).
            seen = set(history)
            kept = [(i, s) for i, s in zip(indices, distances) if i not in seen]
            indices, distances = [i for i, _ in kept], [s for _, s in kept]

        if sample_weight > 1:
            # Random subsample of the best `pool_size` candidates. The sample
            # size is capped because random.sample raises ValueError when asked
            # for more elements than the population holds.
            pool = list(zip(indices, distances))[:pool_size]
            picked = random.sample(pool, min(k, len(pool)))
            picked.sort(key=lambda pair: pair[1], reverse=True)
            indices, distances = [i for i, _ in picked], [s for _, s in picked]
        else:
            # take top k
            indices, distances = indices[:k], distances[:k]

        return indices, distances

    def show_rec(self, history_texts: list[str], history: list[int], sample_weight=10.0, k=20):
        """Print recommendations for ``history_texts``.

        Each line shows the rank, the recommendation score, the position in
        ``history_texts`` of the entry closest to the recommendation (by cosine
        distance between reduced embeddings), and the recommended text.

        Args:
            history_texts: Texts to embed and use as queries.
            history: Labels to exclude from the recommendations.
            sample_weight: Forwarded to ``recommend``.
            k: Forwarded to ``recommend``.
        """
        history_embeddings = self.dim_reduction_model.transform(self.embedder.encode(history_texts))
        indices, similarities = self.recommend(history_embeddings, history, sample_weight, k)
        if not indices:
            return

        # One vectorised distance computation for all recommendations; the
        # smallest cosine distance per row is the best-matching history entry.
        rec_vectors = self.reduced_embeddings[np.asarray(indices, dtype=np.intp)]
        closest = cdist(rec_vectors, history_embeddings, metric="cosine").argmin(axis=1)

        for rank, (idx, sim, history_match_idx) in enumerate(zip(indices, similarities, closest), start=1):
            print(f"{rank}. (similarity: {sim:.4f}) (to {history_match_idx}): {self.texts[idx]}")

Load Dataset

In [None]:
ds = load_dataset("fancyzhx/ag_news", split="all")
# Read the whole "text" column in one go instead of fetching and decoding
# every row individually; list() materialises it as a plain list of strings.
texts: list[str] = list(ds["text"])
len(texts)

127600

Load the embedding model and fit the dimensionality reducers

In [None]:
# Sentence-embedding model; the same instance is later handed to both
# Recommender objects so that query texts are embedded consistently.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Embed every article once; the UMAP and PCA reducers are both fitted on this matrix.
embeddings: np.ndarray = embedder.encode(texts, show_progress_bar=True)

  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 3988/3988 [02:10<00:00, 30.50it/s]


In [5]:
# UMAP reduction of the sentence embeddings to 50 components; random_state is
# fixed so the projection can be reproduced.
umap_model = UMAP(n_components=50, random_state=42, n_neighbors=15, min_dist=0.1, metric="euclidean")
# Fit on the full corpus and keep the projected vectors for indexing.
umap_embeddings: np.ndarray = umap_model.fit_transform(embeddings)

  warn(


In [6]:
# PCA reduction to the same 50 components, fitted on the same embeddings, so
# the two recommenders differ only in the reducer and the index space.
pca_model = PCA(n_components=50, random_state=42)
pca_embeddings: np.ndarray = pca_model.fit_transform(embeddings)

Indexing

In [7]:
# One index per reduced space. recommend() requests int(sample_weight * k)
# neighbours per query, so ef=200 is sized for the k / sample_weight values used below.
# NOTE(review): query_index reports `1 - distance` as "similarity" for both
# indexes; for the l2 index that value is not a cosine similarity - confirm
# the two score columns are not meant to be compared directly.
umap_index = make_index(umap_embeddings, space="l2", ef=200)
pca_index = make_index(pca_embeddings, space="cosine", ef=200)

In [8]:
# Two recommenders sharing the embedder and the corpus texts; each is paired
# with its own reducer, index and reduced embeddings.
umap_recommender = Recommender(
    umap_model,
    umap_index,
    embedder,
    umap_embeddings,
    texts,
)
pca_recommender = Recommender(
    pca_model,
    pca_index,
    embedder,
    pca_embeddings,
    texts,
)
# Working hypothesis (not verified in this notebook): PCA should give more
# general recommendations and UMAP more precise ones.

Testing

In [11]:
# Draw the simulated reading history from a dedicated, seeded generator so the
# same five articles are picked on every run (the reducers above are pinned
# with random_state=42 as well). A separate Random instance leaves the global
# `random` state, which Recommender.recommend samples from, untouched.
HISTORY_SEED = 42
history = random.Random(HISTORY_SEED).sample(range(len(texts)), k=5)
history_texts = [texts[i] for i in history]
for i, text in enumerate(history_texts):
    print(f"{i}: {text}")

0: Frances Destroys Sea Turtle Nests (AP) AP - Hurricane Frances destroyed thousands of sea turtle nests as the storm tore through their most important beaches in the middle of the nesting season, biologists said.
1: Kelly: Perennial Winner Southwest In 14.1 Traffic Jump &lt;b&gt;...&lt;/b&gt; Southwest Airlines (nyse: LUV - news - people ) on Friday said its traffic jumped 14.1 in November. The discount carrier carried 9.6 more passengers during the month than in the year-ago period.
2: TD, Banknorth in Talks on Possible Deal  TORONTO (Reuters) - Canada's Toronto Dominion Bank &lt;A HREF="http://www.investor.reuters.com/FullQuote.aspx?ticker=TD.TO target=/stocks/quickinfo/fullquote"&gt;TD.TO&lt;/A&gt;  said on Wednesday that it is in talks with U.S.-based Banknorth  Group &lt;A HREF="http://www.investor.reuters.com/FullQuote.aspx?ticker=BNK.N target=/stocks/quickinfo/fullquote"&gt;BNK.N&lt;/A&gt; about a possible deal.
3: Talks look for new climate effort Climate experts and politicia

More random

In [12]:
# sample_weight=10 with k=10: fetch 100 neighbours per history item, then randomly keep 10 of the best 100.
pca_recommender.show_rec(history_texts, history, sample_weight=10.0, k=10)

Found 500 recommendations for 5 history items
1. (similarity: 0.9118) (to 2): UPDATE 2-TD, Banknorth in talks on possible deal Toronto Dominion Bank (TD.TO: Quote, Profile, Research) said on Wednesday that it is in talks with US-based Banknorth Group (BNK.N: Quote, Profile, Research) about a possible deal, in line with the Canadian bank #39;s push for 
2. (similarity: 0.8244) (to 2): Canada bank, Banknorth talk \$2B deal Canada #39;s Toronto Dominion Bank said Wednesday it is in talks with US-based Banknorth Group about a possible deal, while a published report said the two parties are in advanced negotiations that could lead to a \$2.
3. (similarity: 0.7716) (to 3): Energy proposal pushes debate toward center A nonpartisan group that includes advocates for business, labor, consumers, and the environment gave a significant nudge this week to the effort to craft a national energy policy.
4. (similarity: 0.7573) (to 3): Global Warming Fight to Get Harder from 2012 -U.N (Reuters) Reuters 

  return forward_call(*args, **kwargs)


In [20]:
# Same settings as the PCA run above, using the UMAP reducer and its l2 index.
umap_recommender.show_rec(history_texts, history, sample_weight=10.0, k=10)

Found 500 recommendations for 5 history items
1. (similarity: 0.9991) (to 2): Toronto-Dominion to buy stake in Banknorth Toronto-Dominion Bank of Toronto said Thursday it agreed to acquire a 51 percent stake in New England regional bank Banknorth Group Inc.
2. (similarity: 0.9990) (to 2): Canada bank, Banknorth talk \$2B deal Canada #39;s Toronto Dominion Bank said Wednesday it is in talks with US-based Banknorth Group about a possible deal, while a published report said the two parties are in advanced negotiations that could lead to a \$2.
3. (similarity: 0.9988) (to 2): Canadians Confirm Talks on Bank Deal The Toronto-Dominion Bank confirmed on Wednesday that it was negotiating to buy all or part of the Banknorth Group, which has branches in five New England states and upstate New York.
4. (similarity: 0.9976) (to 3): Kyoto Too Little to Fix Warming - UN Climate Chief (Reuters) Reuters - Although saved last week with Russian\help, the Kyoto pact on global warming offers too little to

  return forward_call(*args, **kwargs)


Less random

In [21]:
# sample_weight=1.2 with k=10: the random draw picks 10 out of only the best 12 candidates.
pca_recommender.show_rec(history_texts, history, sample_weight=1.2, k=10)

Found 60 recommendations for 5 history items
1. (similarity: 0.9125) (to 2): TD, Banknorth in Talks on Possible Deal (Reuters) Reuters - Canada's Toronto Dominion Bank (TD.TO)\said on Wednesday that it is in talks with U.S.-based Banknorth\Group (BNK.N) about a possible deal.
2. (similarity: 0.9118) (to 2): UPDATE 2-TD, Banknorth in talks on possible deal Toronto Dominion Bank (TD.TO: Quote, Profile, Research) said on Wednesday that it is in talks with US-based Banknorth Group (BNK.N: Quote, Profile, Research) about a possible deal, in line with the Canadian bank #39;s push for 
3. (similarity: 0.8854) (to 2): Canada's TD Buys Banknorth for \$3.8 Bln  TORONTO (Reuters) - Toronto-Dominion Bank &lt;A HREF="http://www.investor.reuters.com/FullQuote.aspx?ticker=TD.TO target=/stocks/quickinfo/fullquote"&gt;TD.TO&lt;/A&gt; made a  much-anticipated push into U.S. retail banking on Thursday,  announcing it would acquire a majority stake in Banknorth Group  Inc. &lt;A HREF="http://www.investor.

  return forward_call(*args, **kwargs)


In [22]:
# Same near-deterministic settings, using the UMAP reducer and its l2 index.
umap_recommender.show_rec(history_texts, history, sample_weight=1.2, k=10)

Found 60 recommendations for 5 history items
1. (similarity: 0.9992) (to 2): Banknorth, TD Bank Are in Talks Canada #39;s TD Bank Financial Group is discussing  quot;a possible transaction quot; with the Northeast regional bank Banknorth Group Inc.
2. (similarity: 0.9991) (to 2): Not a big hit everywhere Bill Ryan is spending the last days of the summer traveling across Canada and the United States to pitch big shareholders on the complicated plan to sell 51 percent of his Banknorth Group Inc. to Toronto-Dominion Bank .
3. (similarity: 0.9991) (to 2): Toronto-Dominion to buy stake in Banknorth Toronto-Dominion Bank of Toronto said Thursday it agreed to acquire a 51 percent stake in New England regional bank Banknorth Group Inc.
4. (similarity: 0.9990) (to 2): Canada bank, Banknorth talk \$2B deal Canada #39;s Toronto Dominion Bank said Wednesday it is in talks with US-based Banknorth Group about a possible deal, while a published report said the two parties are in advanced negotiations

  return forward_call(*args, **kwargs)


No randomness

In [40]:
# Free-text queries with an empty history (nothing is filtered out); sample_weight=1 returns the plain top k.
pca_recommender.show_rec(["robots", "gaming"], [], sample_weight=1, k=10)

Found 20 recommendations for 2 history items
1. (similarity: 0.8296) (to 1): Games With Get Up and Go  Video gaming has long suffered a reputation for creating sedentary and solitary zombies, pasty and pudgy except for their muscular thumbs. A growing number of games, however, are getting gamers off their couches and on their feet, not only shouting, gyrating, singing and dancing, but sometimes -- gasp! -- playing nicely with others, face-to-face, in groups.
2. (similarity: 0.7992) (to 1): Video Games Teach More Than Hand-Eye Coordination  WASHINGTON (Reuters) - Video games, often maligned as  having little or no redeeming value, are becoming a way for  firefighters, soldiers, currency traders and college  administrators to hone their skills.
3. (similarity: 0.7835) (to 1): How computer games grew up THEY USED TO be for kids. Space Invaders, Pong, Doom, some of them were shoot-em-up fun, others were just, well, fun. But computer games have changed.
4. (similarity: 0.7802) (to 1): Compu

  return forward_call(*args, **kwargs)


In [39]:
# Same deterministic free-text query, using the UMAP reducer and its l2 index.
umap_recommender.show_rec(["robots", "gaming"], [], sample_weight=1, k=10)

Found 20 recommendations for 2 history items
1. (similarity: 0.9995) (to 0): They #39;re robots? Those beasts! NAHANT, Mass.--Joseph Ayers was crouched over a laptop in a cool cinder block shed barely big enough to house a ride-on lawn mower, watching a boxy-shelled black lobster through a rectangular acrylic window.
2. (similarity: 0.9995) (to 1): Weaned on Video Games Video gamers are getting younger and younger. And parents and the video game industry agree that the implications are likely to be huge.
3. (similarity: 0.9995) (to 0): iRobot readies for war--and the household Whether mopping up a battlefield or living room floor, the robot maker is working on vehicle to get the job done.\
4. (similarity: 0.9995) (to 1): Video games 'good for children' Computer games can promote problem-solving and team-building in children, say games industry experts.
5. (similarity: 0.9994) (to 0): Robot Is Pivotal Member of the Force The youngest, smallest member of the Virginia State Police bomb sq

  return forward_call(*args, **kwargs)
