Import

In [37]:
import random


import numpy as np
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
from umap import UMAP
from datasets import load_dataset
from scipy.spatial.distance import cdist

import utils
from models import EmbeddingRecommender, Index

Load Dataset

In [2]:
ds = load_dataset("fancyzhx/ag_news", split="all")
texts: list[str] = [ds[i]["text"] for i in range(len(ds))]
len(texts)

127600

Load Models and train them

In [3]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings: np.ndarray = embedder.encode(texts, show_progress_bar=True)

  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 3988/3988 [02:09<00:00, 30.80it/s]


In [4]:
umap_model = UMAP(n_components=50, random_state=42, n_neighbors=15, min_dist=0.1, metric="euclidean", low_memory=False)
umap_embeddings: np.ndarray = umap_model.fit_transform(embeddings)

  warn(


In [5]:
pca_model = PCA(n_components=50, random_state=42)
pca_embeddings: np.ndarray = pca_model.fit_transform(embeddings)

Indexing

In [6]:
umap_index = Index(umap_embeddings, space="l2", ef=200)
pca_index = Index(pca_embeddings, space="cosine", ef=200)

In [7]:
umap_recommender = EmbeddingRecommender(
    umap_index,
    lambda ids: umap_embeddings[ids],
    k=10,
    sample_weight=1.0,
)
pca_recommender = EmbeddingRecommender(
    pca_index,
    lambda ids: pca_embeddings[ids],
    k=10,
    sample_weight=1.0,
)
# pca should be more general and umap more precise

Testing

In [44]:
def show_rec(recommender: EmbeddingRecommender, history_ids: list[int], texts: list[str]):
    scores = recommender.recommend(history_ids)
    scores = utils.skip_history(scores, history_ids)
    if recommender.sample_weight > 1:
        scores = utils.take_top_k(scores, k=int(recommender.k * recommender.sample_weight))
        scores = utils.take_random(scores, k=recommender.k)
    else:
        scores = utils.take_top_k(scores, k=recommender.k)
    indices, scores = utils.unpack_scores(scores)
    cos_sim = cdist(umap_embeddings[indices], recommender.get_history_embeddings(history_ids), metric="cosine")
    print(cos_sim.shape)
    for i, (idx, d) in enumerate(zip(indices, scores)):
        matching_idx = np.argmin(cos_sim[i])
        print(f"Item ID: {idx}, Score: {d:.4f} History index: {matching_idx}, Text: {texts[idx]}")

In [63]:
history = random.sample(range(len(texts)), k=5)
history_texts = [texts[i] for i in history]
for i, text in enumerate(history_texts):
    print(f"{i}: {text}")

0: Tiger looking for his first win in Japan Miyazaki - Tiger Woods will be looking for his first win in Japan when he tees off on Thursday in the Dunlop Phoenix golf tournament.
1: Virgin to offer space flights High-flying Brit entrepreneur Richard Branson today announced the imminent take-off of the  quot;VSS Virgin quot; - a scaled-up version of Burt Rutan #39;s SpacShipOne - which will offer well-to-do wannabe astronauts the chance of zero-grav flights for a mere 100,000 
2: Thai PM Says Will Punish  #39;Wrongdoers #39; Over Deaths Bomb blasts rocked southern Thailand on Friday as Prime Minister Thaksin Shinawatra promised to get to the bottom of the deaths of 78 Muslims in military custody and punish any  quot;wrongdoers.
3: Dollar Steadies, More Weakness Expected (Reuters) Reuters - The U.S. dollar paused near recent\lows Friday but dealers expected selling to resume after a\weekend meeting of G20 finance ministers, while stock investors\sought cover in companies immune to currenc

More random

In [64]:
umap_recommender.sample_weight = 10
show_rec(umap_recommender, history, texts)

Found 500 recommendations for 5 history items
(10, 5)
Item ID: 41002, Score: 0.9997 History index: 1, Text: Branson unveils deal for first commercial space flights The Virgin chief Sir Richard Branson announced his company has signed an agreement potentially worth 14 million to secure the world #39;s first commercial flights to space.
Item ID: 122575, Score: 0.9997 History index: 1, Text: Now Virgin to offer trips to space LONDON, England -- British entrepreneur Richard Branson announced his company has signed a deal to offer the world #39;s first commercial flights to space under the branding  quot;Virgin Galactic.
Item ID: 42219, Score: 0.9992 History index: 1, Text: British tycoon wants to fly you to space Richard Branson, the British tycoon known for daredevil exploits in a speedboat, a hot-air balloon and an amphibious car, wants to take a giant leap into the final frontier -- and to give a lift out of this world to similarly intrepid paying passengers.
Item ID: 41988, Score: 0.99

In [65]:
pca_recommender.sample_weight = 10
show_rec(pca_recommender, history, texts)

Found 500 recommendations for 5 history items
(10, 5)
Item ID: 94903, Score: 0.8728 History index: 0, Text: Golf: Woods off to flying start in bid for first Japan tour win MIYAZAKI, Japan : American superstar Tiger Woods overcame rainy conditions to hit a five-under-par 65 for a three-stroke lead in the first round of the Dunlop Phoenix golf tournament.
Item ID: 11388, Score: 0.8240 History index: 0, Text: Bombs explode in Thailand #39;s south Two explosions rocked Thailand #39;s troubled southern region Thursday, killing one person and injuring as many as two dozen others.
Item ID: 7217, Score: 0.8235 History index: 0, Text: Experts defuse bomb in Thailand #39;s violence-hit south Experts defused a large bomb Monday in a city where three similar devices exploded over the weekend - the latest violence blamed on suspected Islamic militants in Thailand #39;s troubled south.
Item ID: 122650, Score: 0.8186 History index: 0, Text: Virgin to Launch Commercial Space Flights by 2007 SEPTEMBER 

Less random

In [66]:
umap_recommender.sample_weight = 1.2
show_rec(umap_recommender, history, texts)

Found 60 recommendations for 5 history items
(10, 5)
Item ID: 68976, Score: 1.0000 History index: 1, Text: Virgin CEO says thousands are signing up to take flight into space LONDON Ever since two successful flights into space were made from the Mojave Desert, thousands of people are signing up to take the same trip.
Item ID: 41108, Score: 1.0000 History index: 1, Text: Groundwork Laid for Spaceline LONDON-September 27, 2004 - A ticket may soon be all that separates you from a dream voyage to the final frontier. British entrepreneur Richard Branson says Virgin company plans to start offering commercial space flights over the next few years.
Item ID: 41618, Score: 1.0000 History index: 1, Text: Virgin #39;s next destination: space LONDON Richard Branson, the adventurous entrepreneur behind the Virgin Group, said Monday that he would form a commercial space travel company that would start carrying passengers by 2007.
Item ID: 69364, Score: 1.0000 History index: 1, Text: 800M PLEDGED FOR V

In [67]:
pca_recommender.sample_weight = 1.2
show_rec(pca_recommender, history, texts)

Found 60 recommendations for 5 history items
(10, 5)
Item ID: 77308, Score: 0.9314 History index: 0, Text: Thai PM Says Will Punish 'Wrongdoers' Over Deaths  PATTANI, Thailand (Reuters) - Bomb blasts rocked southern  Thailand on Friday as Prime Minister Thaksin Shinawatra  promised to get to the bottom of the deaths of 78 Muslims in  military custody and punish any "wrongdoers."
Item ID: 98610, Score: 0.8993 History index: 0, Text: Golf: Delighted Woods demolishes field for first Japan tour win MIYAZAKI, Japan : Tiger Woods said he was on the way back to his world-beating best after demolishing the field at the Dunlop Phoenix golf tournament for his first title on the Japan tour.
Item ID: 41653, Score: 0.8917 History index: 0, Text: Virgin to Offer Space Flights (Even, Sort of, at Discount) Sir Richard Branson, owner of the Virgin Group, announced on Monday that he would offer travelers the chance to go to the edge of space beginning in 2007, for \$190,000 a ticket.
Item ID: 77153, Sco

No randomness

In [68]:
pca_recommender.sample_weight = 1
show_rec(pca_recommender, history, texts)

Found 50 recommendations for 5 history items
(10, 5)
Item ID: 95909, Score: 0.9330 History index: 0, Text: Dollar Steadies, More Weakness Expected  SINGAPORE (Reuters) - The U.S. dollar paused near recent  lows Friday but dealers expected selling to resume after a  weekend meeting of G20 finance ministers, while stock investors  sought cover in companies immune to currency risks.
Item ID: 77308, Score: 0.9314 History index: 0, Text: Thai PM Says Will Punish 'Wrongdoers' Over Deaths  PATTANI, Thailand (Reuters) - Bomb blasts rocked southern  Thailand on Friday as Prime Minister Thaksin Shinawatra  promised to get to the bottom of the deaths of 78 Muslims in  military custody and punish any "wrongdoers."
Item ID: 98610, Score: 0.8993 History index: 0, Text: Golf: Delighted Woods demolishes field for first Japan tour win MIYAZAKI, Japan : Tiger Woods said he was on the way back to his world-beating best after demolishing the field at the Dunlop Phoenix golf tournament for his first title 

In [69]:
pca_recommender.sample_weight = 1
show_rec(pca_recommender, history, texts)

Found 50 recommendations for 5 history items
(10, 5)
Item ID: 95909, Score: 0.9330 History index: 0, Text: Dollar Steadies, More Weakness Expected  SINGAPORE (Reuters) - The U.S. dollar paused near recent  lows Friday but dealers expected selling to resume after a  weekend meeting of G20 finance ministers, while stock investors  sought cover in companies immune to currency risks.
Item ID: 77308, Score: 0.9314 History index: 0, Text: Thai PM Says Will Punish 'Wrongdoers' Over Deaths  PATTANI, Thailand (Reuters) - Bomb blasts rocked southern  Thailand on Friday as Prime Minister Thaksin Shinawatra  promised to get to the bottom of the deaths of 78 Muslims in  military custody and punish any "wrongdoers."
Item ID: 98610, Score: 0.8993 History index: 0, Text: Golf: Delighted Woods demolishes field for first Japan tour win MIYAZAKI, Japan : Tiger Woods said he was on the way back to his world-beating best after demolishing the field at the Dunlop Phoenix golf tournament for his first title 

In [70]:
pca_recommender.get_history_embeddings = lambda ids: pca_model.transform(embedder.encode(["robots", "gaming"]))
show_rec(pca_recommender, [], texts)
pca_recommender.get_history_embeddings = lambda ids: pca_embeddings[ids]

Found 20 recommendations for 2 history items
(10, 2)
Item ID: 19907, Score: 0.8296 History index: 1, Text: Games With Get Up and Go  Video gaming has long suffered a reputation for creating sedentary and solitary zombies, pasty and pudgy except for their muscular thumbs. A growing number of games, however, are getting gamers off their couches and on their feet, not only shouting, gyrating, singing and dancing, but sometimes -- gasp! -- playing nicely with others, face-to-face, in groups.
Item ID: 104134, Score: 0.7992 History index: 1, Text: Video Games Teach More Than Hand-Eye Coordination  WASHINGTON (Reuters) - Video games, often maligned as  having little or no redeeming value, are becoming a way for  firefighters, soldiers, currency traders and college  administrators to hone their skills.
Item ID: 76572, Score: 0.7835 History index: 1, Text: How computer games grew up THEY USED TO be for kids. Space Invaders, Pong, Doom, some of them were shoot-em-up fun, others were just, well, 

  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


In [71]:
umap_recommender.get_history_embeddings = lambda ids: umap_model.transform(embedder.encode(["robots", "gaming"]))
show_rec(umap_recommender, [], texts)
umap_recommender.get_history_embeddings = lambda ids: umap_embeddings[ids]

Found 24 recommendations for 2 history items
(10, 2)
Item ID: 30755, Score: 0.9995 History index: 0, Text: They #39;re robots? Those beasts! NAHANT, Mass.--Joseph Ayers was crouched over a laptop in a cool cinder block shed barely big enough to house a ride-on lawn mower, watching a boxy-shelled black lobster through a rectangular acrylic window.
Item ID: 6345, Score: 0.9994 History index: 0, Text: Smart little suckers A new generation of robotic vacuums is ready to do battle with dirt, dust and dog hair with more cleaning power and cunning than their ancestors could muster.
Item ID: 87818, Score: 0.9993 History index: 0, Text: Robot Is Pivotal Member of the Force The youngest, smallest member of the Virginia State Police bomb squad can fearlessly pick up an unidentified bomb, walk into a tense hostage situation and hand a cell phone to a gunman. And he requires no food, water, insurance benefits or sleep.
Item ID: 100710, Score: 0.9993 History index: 1, Text: Video Game Report Card He

  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
