Import

In [1]:
import random


import numpy as np
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
from umap import UMAP
from datasets import load_dataset

import utils
from models import EmbeddingRecommender, Index

  from .autonotebook import tqdm as notebook_tqdm


Load Dataset

In [2]:
ds = load_dataset("fancyzhx/ag_news", split="all")
texts: list[str] = [ds[i]["text"] for i in range(len(ds))]
len(texts)

127600

Load Models and train them

In [3]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings: np.ndarray = embedder.encode(texts, show_progress_bar=True)

  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 3988/3988 [02:09<00:00, 30.80it/s]


In [4]:
umap_model = UMAP(n_components=50, random_state=42, n_neighbors=15, min_dist=0.1, metric="euclidean", low_memory=False)
umap_embeddings: np.ndarray = umap_model.fit_transform(embeddings)

  warn(


In [5]:
pca_model = PCA(n_components=50, random_state=42)
pca_embeddings: np.ndarray = pca_model.fit_transform(embeddings)

Indexing

In [6]:
umap_index = Index(umap_embeddings, space="l2", ef=200)
pca_index = Index(pca_embeddings, space="cosine", ef=200)

In [7]:
umap_recommender = EmbeddingRecommender(
    umap_index,
    lambda ids: umap_embeddings[ids],
    k=10,
    sample_weight=1.0,
)
pca_recommender = EmbeddingRecommender(
    pca_index,
    lambda ids: pca_embeddings[ids],
    k=10,
    sample_weight=1.0,
)
# pca should be more general and umap more precise

Testing

In [9]:
def show_rec(recommender: EmbeddingRecommender, history_ids: list[int], texts: list[str]):
    scores = recommender.recommend(history_ids)
    scores = utils.skip_history(scores, history_ids)
    if recommender.sample_weight > 1:
        scores = utils.take_top_k(scores, k=int(recommender.k * recommender.sample_weight))
        scores = utils.take_random(scores, k=recommender.k)
    else:
        scores = utils.take_top_k(scores, k=recommender.k)

    for i, d in scores.items():
        print(f"Item ID: {i}, Score: {d:.4f}, Text: {texts[i]}")
        # show history most matching

In [25]:
history = random.sample(range(len(texts)), k=5)
history_texts = [texts[i] for i in history]
for i, text in enumerate(history_texts):
    print(f"{i}: {text}")

0: Lyon, Man U #39;s Ruud put on four-star shows Both Lyon and Manchester United #39;s Ruud van Nistelrooy struck four times in Group D of the Champions League on Wednesday as the French champions downed Fenerbahce 4-2 while the Dutch star #39;s goals saw United to a 4-1 victory over Sparta Prague.
1: Some shareholders accept, others reject Oracle bid SAN FRANCISCO (CBS.MW) -- PeopleSoft #39;s board could be the target of another shareholder lawsuit over the anti-takeover measures it adopted to fend off a hostile bid from rival Oracle, according to a lawyer representing a group of PeopleSoft shareholders.
2: Birds Not Being Killed by Wind Farms -Ecologist (Reuters) Reuters - Two major offshore wind farms in Denmark\are giving the lie to fears that birds are being killed by\flying into the huge vanes of such installations, a conference\heard on Thursday.
3: Stocks Seen Higher; Oil Prices Slipping  NEW YORK (Reuters) - U.S. stocks looked to open higher on  Friday, as the fourth quarter b

More random

In [26]:
umap_recommender.sample_weight = 10
show_rec(umap_recommender, history, texts)

Found 500 recommendations for 5 history items
Item ID: 107111, Score: 0.9953, Text: India rely on spin trick to clinch series win India relied on their spin trumpcard Harbhajan Singh #39;s guiles to clinch their first home series win in two years with a facile eight-wicket victory over South Africa in the second cricket Test here today.
Item ID: 55756, Score: 0.9947, Text: Harbhajan is confident of repeat Indian comeback BANGALORE Off-spinner Harbhajan Singh is confident India will bounce back in the Test series against Australia as they did three years ago.
Item ID: 73341, Score: 0.9933, Text: Martyn, Kartik, Khan set tone for thriller The off-field drama in a Test that has seen speculation rule over fact continued before the first ball was bowled on Tuesday. The first surprise came when Rahul Dravid walked out for the toss.
Item ID: 44074, Score: 0.9935, Text: Dalmiya gets Pawar run out on last ball KOLKATA: Weeks of suspense, a final day packed with drama, and in the best potboiler 

In [27]:
pca_recommender.sample_weight = 10
show_rec(pca_recommender, history, texts)

Found 500 recommendations for 5 history items
Item ID: 49032, Score: 0.9317, Text: Stocks Seen Higher as Oil Prices Ease  NEW YORK (Reuters) - U.S. stocks looked to open higher on  Monday, extending Friday's broad rally, as oil prices fall  below \$50 a barrel on easing tensions in Nigeria.
Item ID: 15639, Score: 0.8958, Text: Stocks Open Higher; Wall Street Cautious US stocks opened slightly higher on Tuesday as oil prices slipped with a lightly staffed Wall Street remaining cautious during the Republican party convention in New York amid heavy security and concerns of possible attacks.
Item ID: 2814, Score: 0.8889, Text: Stocks Gain as Oil Ease from New High  NEW YORK (Reuters) - U.S. stocks were higher on  Wednesday  as investors bought beaten-down shares and oil prices eased  from the new 21-year record high hit earlier in the session.
Item ID: 52069, Score: 0.8960, Text: Stocks Near Flat as Oil Tops \$52  NEW YORK (Reuters) - U.S. stocks barely moved above the  unchanged mark on W

Less random

In [28]:
umap_recommender.sample_weight = 1.2
show_rec(umap_recommender, history, texts)

Found 60 recommendations for 5 history items
Item ID: 14583, Score: 0.9975, Text: Stocks Seen Sliding as Oil Moves Higher  NEW YORK (Reuters) - U.S. stocks are set to open lower on  Monday with oil prices pushing higher and Wall Street on edge  as the Republican National Convention gets underway in New York  amid heightened security concerns.
Item ID: 87850, Score: 0.9975, Text: Oracle's 'best and final' offer rejected (USATODAY.com) USATODAY.com - PeopleSoft's board of directors rejected Oracle's  #36;9.2 billion takeover bid Wednesday, thrusting the business software maker's fate in the hands of its shareholders.
Item ID: 97265, Score: 0.9984, Text: PeopleSoft Board Rejects Oracle Bid  SAN FRANCISCO (Reuters) - Business software maker  PeopleSoft Inc. on Saturday said its board again rejected  Oracle Corp.'s \$9.2 billion tender offer, remaining defiant a  day after most of its shares were tendered to Oracle.
Item ID: 97905, Score: 0.9976, Text: PeopleSoft's Board Rejects Oracle Bid 

In [29]:
pca_recommender.sample_weight = 1.2
show_rec(pca_recommender, history, texts)

Found 60 recommendations for 5 history items
Item ID: 16376, Score: 0.9477, Text: Stocks Open Lower as Oil Climbs  NEW YORK (Reuters) - U.S. stocks opened lower on Wednesday  with oil prices rebounding on fresh supply disruptions and  investors cautious before Friday's jobs figures amid concerns  of possible attacks and disruptions to the Republican party's  convention in New York.
Item ID: 46312, Score: 0.9566, Text: Stocks Seen Up; Oil Holds Below \$50  NEW YORK (Reuters) - Stocks looked to open higher on Friday  with oil prices below \$50 a barrel, while technology shares try  to extend a three-day winning streak of ending positive as the  fourth quarter begins on Wall Street.
Item ID: 110066, Score: 0.9501, Text: Stocks Open Higher, Lower Oil Prices Help  NEW YORK (Reuters) - U.S. stocks opened higher on  Wednesday, a day after a sharp sell-off, as oil tumbled  further, easing concerns about the impact high crude prices  have on corporate profits and consumer spending.
Item ID: 145

No randomness

In [30]:
pca_recommender.sample_weight = 1
show_rec(pca_recommender, history, texts)

Found 50 recommendations for 5 history items
Item ID: 50339, Score: 0.9643, Text: Stocks Seen Higher Despite Oil Price  NEW YORK (Reuters) -  U.S. stocks looked to open slightly  higher on Tuesday, despite oil prices above \$50 a barrel, as  the market readies to extend its streak of gains in four of the  last five sessions.
Item ID: 46312, Score: 0.9566, Text: Stocks Seen Up; Oil Holds Below \$50  NEW YORK (Reuters) - Stocks looked to open higher on Friday  with oil prices below \$50 a barrel, while technology shares try  to extend a three-day winning streak of ending positive as the  fourth quarter begins on Wall Street.
Item ID: 42354, Score: 0.9527, Text: Stocks Seen Higher; Oil Steady at \$50 Bbl  NEW YORK (Reuters) - U.S. stocks looked to open higher on  Tuesday, with beaten down shares tempting investors and oil  producer stocks bolstered by crude oil prices breaking through  the \$50 a barrel mark.
Item ID: 110066, Score: 0.9501, Text: Stocks Open Higher, Lower Oil Prices Help 

In [31]:
pca_recommender.sample_weight = 1
show_rec(pca_recommender, history, texts)

Found 50 recommendations for 5 history items
Item ID: 50339, Score: 0.9643, Text: Stocks Seen Higher Despite Oil Price  NEW YORK (Reuters) -  U.S. stocks looked to open slightly  higher on Tuesday, despite oil prices above \$50 a barrel, as  the market readies to extend its streak of gains in four of the  last five sessions.
Item ID: 46312, Score: 0.9566, Text: Stocks Seen Up; Oil Holds Below \$50  NEW YORK (Reuters) - Stocks looked to open higher on Friday  with oil prices below \$50 a barrel, while technology shares try  to extend a three-day winning streak of ending positive as the  fourth quarter begins on Wall Street.
Item ID: 42354, Score: 0.9527, Text: Stocks Seen Higher; Oil Steady at \$50 Bbl  NEW YORK (Reuters) - U.S. stocks looked to open higher on  Tuesday, with beaten down shares tempting investors and oil  producer stocks bolstered by crude oil prices breaking through  the \$50 a barrel mark.
Item ID: 110066, Score: 0.9501, Text: Stocks Open Higher, Lower Oil Prices Help 

In [32]:
pca_recommender.get_history_embeddings = lambda ids: pca_model.transform(embedder.encode(["robots", "gaming"]))
show_rec(pca_recommender, [], texts)
pca_recommender.get_history_embeddings = lambda ids: pca_embeddings[ids]

Found 20 recommendations for 2 history items
Item ID: 19907, Score: 0.8296, Text: Games With Get Up and Go  Video gaming has long suffered a reputation for creating sedentary and solitary zombies, pasty and pudgy except for their muscular thumbs. A growing number of games, however, are getting gamers off their couches and on their feet, not only shouting, gyrating, singing and dancing, but sometimes -- gasp! -- playing nicely with others, face-to-face, in groups.
Item ID: 104134, Score: 0.7992, Text: Video Games Teach More Than Hand-Eye Coordination  WASHINGTON (Reuters) - Video games, often maligned as  having little or no redeeming value, are becoming a way for  firefighters, soldiers, currency traders and college  administrators to hone their skills.
Item ID: 76572, Score: 0.7835, Text: How computer games grew up THEY USED TO be for kids. Space Invaders, Pong, Doom, some of them were shoot-em-up fun, others were just, well, fun. But computer games have changed.
Item ID: 99703, Score

  return forward_call(*args, **kwargs)


In [33]:
umap_recommender.get_history_embeddings = lambda ids: umap_model.transform(embedder.encode(["robots", "gaming"]))
show_rec(umap_recommender, [], texts)
umap_recommender.get_history_embeddings = lambda ids: umap_embeddings[ids]

Found 24 recommendations for 2 history items
Item ID: 72029, Score: 0.9991, Text: Robots Compete for Your Interest Robonexus Expo, which wrapped up over the weekend in Santa Clara, California, offered the requisite look at commercial robotics, robot parts and accessories, small-market redistributors, and research and experimental robots, but the overall theme and most 
Item ID: 200, Score: 0.9992, Text: Video games 'good for children' Computer games can promote problem-solving and team-building in children, say games industry experts.
Item ID: 67924, Score: 0.9992, Text: Robo-vacuum wins wall-to-wall praise at confab Robot fans cheer news that the "Roomba" has sucked up its 1 millionth buyer--a 'bot breakthrough,' they say.
Item ID: 76572, Score: 0.9990, Text: How computer games grew up THEY USED TO be for kids. Space Invaders, Pong, Doom, some of them were shoot-em-up fun, others were just, well, fun. But computer games have changed.
Item ID: 87818, Score: 0.9993, Text: Robot Is Pivot

  return forward_call(*args, **kwargs)
