Import

In [1]:
import json
import random
from collections import defaultdict


import numpy as np
import hnswlib
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


Functions

In [2]:
def load_dataset(file_path: str, min_length: int = 40, seed: int = 42) -> list[str]:
    """Load short descriptions from a JSON-lines file, filter and shuffle them.

    Each non-blank line of ``file_path`` is parsed as one JSON object. Records
    whose ``'short_description'`` is not longer than ``min_length`` characters
    are dropped, and the number dropped is printed. The remaining records are
    shuffled with a ``random.Random`` seeded with ``seed``, so the order is
    reproducible for a given file.

    Args:
        file_path: Path to a UTF-8 encoded JSON-lines file.
        min_length: Descriptions must be strictly longer than this to be kept.
        seed: Seed for the shuffle.

    Returns:
        The kept ``'short_description'`` strings, in shuffled order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'short_description'`` key.
    """
    dataset = []
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(file_path, encoding='utf-8') as f:
        # Iterate lazily instead of materialising every line with readlines().
        for line in f:
            line = line.strip()
            if not line:
                # Blank (e.g. trailing) lines are not records; json.loads would raise on them.
                continue
            dataset.append(json.loads(line))

    len_before = len(dataset)
    dataset = [d for d in dataset if len(d['short_description']) > min_length]
    num = len_before - len(dataset)
    print(f'removed {num} short samples')

    # A dedicated Random instance keeps the shuffle independent of global RNG state.
    random.Random(seed).shuffle(dataset)
    return [d['short_description'] for d in dataset]


def make_index(embeddings: np.ndarray, ef=50, ef_construction=200, M=16):
    """Build an hnswlib cosine-space index over ``embeddings``.

    Row ``i`` of ``embeddings`` is stored under label ``i``, so labels returned
    by queries can be used directly as row positions.

    Args:
        embeddings: 2-D array with one vector per row; the second axis gives
            the index dimensionality.
        ef: Query-time ``ef`` set on the index before it is returned.
        ef_construction: Passed to ``init_index`` (build-time parameter).
        M: Passed to ``init_index`` (graph connectivity parameter).

    Returns:
        The populated ``hnswlib.Index``.
    """
    n_items = len(embeddings)
    index = hnswlib.Index(space='cosine', dim=embeddings.shape[1])
    index.init_index(max_elements=n_items, ef_construction=ef_construction, M=M)
    index.add_items(embeddings, np.arange(n_items))
    # hnswlib's documentation says ef should be at least the k used in queries.
    index.set_ef(ef)
    return index


def query_index(index: 'hnswlib.Index', query_embeddings: np.ndarray, k=5) -> tuple[list[int], list[float]]:
    """Query the index with several vectors and merge the hits into one ranking.

    Every query vector retrieves its ``k`` nearest neighbours. Each returned
    distance is converted to a similarity as ``1 - distance``. An item returned
    for more than one query keeps its highest similarity.

    Args:
        index: Object exposing ``knn_query(data, k)`` that returns
            ``(labels, distances)`` arrays.
        query_embeddings: Query vectors, one per row.
        k: Number of neighbours requested per query vector.

    Returns:
        ``(indices, similarities)``: the unique item labels ordered by
        descending similarity, and the matching similarity values.
    """
    labels, dists = index.knn_query(query_embeddings, k)

    # Best similarity seen per item. A plain dict is used instead of
    # defaultdict(float): the implicit 0.0 default would clamp every negative
    # similarity (distance > 1) to 0.0.
    best: dict[int, float] = {}
    for label, dist in zip(np.asarray(labels).ravel(), np.asarray(dists).ravel()):
        idx = int(label)
        sim = float(1 - dist)
        if idx not in best or sim > best[idx]:
            best[idx] = sim

    # Rank by similarity, highest first.
    ranked = sorted(best.items(), key=lambda pair: pair[1], reverse=True)
    return [i for i, _ in ranked], [s for _, s in ranked]

def recommend(index: 'hnswlib.Index', history_embeddings: np.ndarray, history: list[int], sample_weight=10.0, k=20, rng: 'random.Random | None' = None) -> tuple[list[int], list[float]]:
    """Recommend up to ``k`` items similar to a user's history.

    A candidate pool is retrieved with ``query_index`` using
    ``int(sample_weight * k)`` neighbours per history vector, and items already
    in ``history`` are removed. With ``sample_weight == 1`` the top ``k``
    candidates are returned. With ``sample_weight > 1``, ``k`` items are drawn
    at random from the best ``int(sample_weight * k)`` candidates, so larger
    weights give more varied results.

    Args:
        index: Index passed through to ``query_index``.
        history_embeddings: One embedding per history item.
        history: Item labels to exclude from the results; may be empty.
        sample_weight: Pool-size multiplier, must be >= 1.
        k: Number of recommendations wanted.
        rng: Optional ``random.Random`` used for sampling; pass a seeded
            instance for reproducible results. Defaults to the global
            ``random`` module.

    Returns:
        ``(indices, similarities)`` ordered by descending similarity. Fewer
        than ``k`` items are returned when fewer candidates are available.

    Raises:
        AssertionError: If ``sample_weight`` is below 1.
    """
    assert sample_weight >= 1, 'sample_weight should be >= 1'

    pool_size = int(sample_weight * k)
    indices, distances = query_index(index, history_embeddings, pool_size)
    print(f'Found {len(indices)} recommendations for {len(history_embeddings)} history items')

    candidates = list(zip(indices, distances))
    if history:
        # Drop items the user has already seen; a set makes each lookup O(1).
        seen = set(history)
        candidates = [(i, v) for i, v in candidates if i not in seen]

    if sample_weight > 1:
        # Random subsample from the best-ranked part of the pool.
        pool = candidates[:pool_size]
        sampler = rng if rng is not None else random
        # Cap the sample size: random.sample raises ValueError when asked for
        # more items than the pool holds (e.g. after history filtering).
        picked = sampler.sample(pool, min(k, len(pool)))
        picked.sort(key=lambda pair: pair[1], reverse=True)
    else:
        # No randomness: take the top k.
        picked = candidates[:k]

    return [i for i, _ in picked], [d for _, d in picked]

Load Dataset

In [3]:
# Load the descriptions from the JSON-lines dump; load_dataset prints how many
# short entries it dropped and returns the remaining texts in shuffled order.
texts = load_dataset('news.json')
# Display how many texts remain.
len(texts)

removed 37721 short samples


171806

Load Models

In [4]:
# Sentence encoder used to embed every text.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# PCA that projects the embeddings down to 50 components; random_state is fixed.
pca_model = PCA(n_components=50, random_state=42)

Train Models

In [5]:
# Embed all texts (showing a progress bar) ...
embeddings: np.ndarray = embedder.encode(texts, show_progress_bar=True)
# ... then fit the PCA on those embeddings and keep the reduced vectors for indexing.
reduced_embeddings: np.ndarray = pca_model.fit_transform(embeddings)

Batches: 100%|██████████| 5369/5369 [01:45<00:00, 50.67it/s]


Indexing

In [6]:
# Build the nearest-neighbour index over the PCA-reduced embeddings;
# row i of reduced_embeddings is stored under label i.
index = make_index(reduced_embeddings)

Testing

In [7]:
# Positions in `texts` of the items used as the example reading history.
history = [432, 123, 456, 789, 1011]
history_texts = [texts[i] for i in history]
# Display the selected history texts.
history_texts

["The USGA should relocate the event because of Trump's comments about women, the senators say.",
 "But like belief in Santa Claus and the Tooth Fairy, this magical sleep potion relies on the power of my son's imagination",
 '"Do you realize I\'m only 25? This embryo and I could have been best friends," the mother says.',
 'Having a policy and enforcing the policy are two totally different and separate things. After all, law without enforcement is no law at all.',
 'The ability to be assertive and say "no" is a communication skill we all learn at a very young age.  If you\'re a parent, you know better than anyone that once this word enters a child\'s vocabulary it\'s used very often.  However, as an adult, "no" is often much more difficult to say.']

More random

In [8]:
# Embed the history texts and project them with the fitted PCA model.
history_embeddings = pca_model.transform(embedder.encode(history_texts))

# sample_weight=10: 20 results are sampled at random from a candidate pool 10x that size.
indices, distances = recommend(index, history_embeddings, history, sample_weight=10, k=20)

# `distances` holds the similarity scores returned by recommend.
for rank, (item_id, similarity) in enumerate(zip(indices, distances), start=1):
    print(f'{rank}. (similarity: {similarity:.4f}): {texts[item_id]}')

Found 998 recommendations for 5 history items
1. (similarity: 0.8597): "It's been such a blessing to have a child. I never thought I would have one."
2. (similarity: 0.7617): “I was in that relationship a long time. But now I’m, like: It’s the best decision," she said. "We raise our kids together
3. (similarity: 0.7539): Because you don't have to live in the U.S. legally to be protected by the law.
4. (similarity: 0.7535): "You're my mother, of course you think I'm great. I need to hear it from someone else," my daughter told me.
5. (similarity: 0.7516): "If suddenly it’s Mother’s Day and you have two dads, you’re jacked."
6. (similarity: 0.7285): "Your son looks just like you and your daughter looks like she could be from a different family? Is she the milkman's baby?"
7. (similarity: 0.7263): “Are you the oldest mom there?” a friend asked me as we walked to the elementary school to pick up our sons. At least, I
8. (similarity: 0.7222): "You’re not going to be the perfect parent, you’

Less random

In [9]:
# Embed the history texts and project them with the fitted PCA model.
history_embeddings = pca_model.transform(embedder.encode(history_texts))
# sample_weight=1.2: 20 results are sampled from a candidate pool only 1.2x that size.
indices, distances = recommend(index, history_embeddings, history, sample_weight=1.2, k=20)

# `distances` holds the similarity scores returned by recommend.
for rank, (item_id, similarity) in enumerate(zip(indices, distances), start=1):
    print(f'{rank}. (similarity: {similarity:.4f}): {texts[item_id]}')

Found 120 recommendations for 5 history items
1. (similarity: 0.8597): "It's been such a blessing to have a child. I never thought I would have one."
2. (similarity: 0.8521): "Mom, would you just concentrate on having the baby!" her teen daughter said.
3. (similarity: 0.8334): Almost every state in the country allows citizens to act as police — with sometimes deadly consequences. Some want these laws abolished.
4. (similarity: 0.8115): "I have three boys of my own, all grown.  And when I dream of them, I dream of them at this age. Enjoy these moments." We smiled at each other, two mothers sharing a moment of understanding.
5. (similarity: 0.8055): "I have a 12-year-old and a 6-year-old now ... after being single for six years," she said. "We all live together and everything
6. (similarity: 0.7991): “I don’t know if I would be a mom today if it weren’t for her."
7. (similarity: 0.7979): "You think you're [insert anything here]? Try having kids!"
8. (similarity: 0.7972): "I don't know ev

No randomness

In [10]:
# Use free-text topics as the "history"; there are no item ids to exclude.
history_embeddings = pca_model.transform(embedder.encode(['gaming', 'sports', 'ai']))
# sample_weight=1 disables sampling: the top 20 matches are returned as ranked.
indices, distances = recommend(index, history_embeddings, [], sample_weight=1, k=20)

# `distances` holds the similarity scores returned by recommend.
for rank, (item_id, similarity) in enumerate(zip(indices, distances), start=1):
    print(f'{rank}. (similarity: {similarity:.4f}): {texts[item_id]}')


Found 58 recommendations for 3 history items
1. (similarity: 0.8418): A sport is "a contest or a game in which people do certain physical activities according to a specific set of rules and compete against each other." This is the definition that has been engraved as the meaning behind the word "sport" for years.  After June 1, that definition changed.
2. (similarity: 0.7553): Football, basketball, hockey, martial arts, European football... all of these athletic events offer plenty of excitement
3. (similarity: 0.7516): Whether we want to admit it or not, youth sports are really about education. Our organization says that sports are the outdoor classroom whereby young people are learning a host of life lesson and they are not always good lessons.
4. (similarity: 0.7362): An elementary schooler got to participate in the biggest sports event in America through Fuel Up to Play 60, a program that uses the star-power of NFL players to get kids excited about exercise.
5. (similarity: 0.7322)