# Imports

In [None]:
from collections import Counter

from pydantic import BaseModel, ConfigDict
import faiss
import networkx as nx
import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from umap import UMAP
from keybert import KeyBERT
from sklearn.metrics.pairwise import cosine_similarity

# Classes

In [26]:
class ContentGraph:
    """Undirected, weighted graph whose nodes are content ids."""

    def __init__(self):
        # Topic similarities at or below this value do not create an edge.
        self.topic_threshold = 0.3
        self.graph = nx.Graph()

    def _bump_edge(self, content_id: str, other_id: str, amount):
        """Create the edge with ``amount`` as weight, or add ``amount`` to it."""
        if self.graph.has_edge(content_id, other_id):
            self.graph[content_id][other_id]['weight'] += amount
        else:
            self.graph.add_edge(content_id, other_id, weight=amount)

    def add_node(self, content_id: str, attributes=None):
        """Insert ``content_id`` as a node, with ``attributes`` as node data."""
        node_data = attributes if attributes else {}
        self.graph.add_node(content_id, **node_data)

    def connect_by_entities(self, content_id: str, other_id: str, weight: float):
        """Link two content pieces by their shared-entity weight (if positive)."""
        if not weight > 0:
            return
        self._bump_edge(content_id, other_id, weight)

    def connect_by_topics(self, content_id: str, other_id: str, topic_similarity: float):
        """Link two content pieces when their topic similarity beats the threshold."""
        if not topic_similarity > self.topic_threshold:
            return
        self._bump_edge(content_id, other_id, topic_similarity)

    def get_neighbors(self, content_id: str):
        """Return the ids adjacent to ``content_id`` (empty list if unknown)."""
        if content_id not in self.graph:
            return []
        return list(self.graph.neighbors(content_id))

    def get_edge_weight(self, content_id: str, other_id: str):
        """Return the weight stored on the edge, or 0 when there is no edge."""
        if not self.graph.has_edge(content_id, other_id):
            return 0
        edge_data = self.graph[content_id][other_id]
        return edge_data.get('weight', 0)

    def get_centrality(self):
        """Return networkx degree centrality for every node, keyed by node id."""
        return nx.degree_centrality(self.graph)


class Index:
    """Thin wrapper around a FAISS HNSW index used for nearest-neighbour lookup."""

    def __init__(self):
        # Placeholder only: the usable index is created in ``add`` once the
        # vector dimensionality is known.
        self.index = faiss.IndexHNSWFlat(1, 1)

    def add(self, embeddings: np.ndarray):
        """Build the index from ``embeddings`` (one vector per row).

        Every call creates a fresh index, so vectors from an earlier call are
        discarded. Row ``i`` of ``embeddings`` gets FAISS id ``i``.
        """
        # FAISS works on C-contiguous float32 data; coerce explicitly so that
        # float64 inputs (e.g. topic distributions) are accepted as well.
        vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
        # Second argument is the HNSW ``M`` parameter in the FAISS API.
        self.index = faiss.IndexHNSWFlat(vectors.shape[1], 32)
        self.index.add(vectors)

    def get_items_by_index(self, query_embeddings: list[np.ndarray], top_k: int) -> np.ndarray:
        """Return the ids of the ``top_k`` nearest stored vectors per query.

        Returns:
            Integer array with one row per query and ``top_k`` columns holding
            row positions of the vectors passed to ``add``.
        """
        queries = np.ascontiguousarray(query_embeddings, dtype=np.float32)
        _distances, indices = self.index.search(queries, top_k)
        return indices

In [27]:
class UserProfile(BaseModel):
    """State kept per user; updated by ``record_interaction``."""

    # NumPy arrays and Counter are not pydantic-native types.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    user_id: str
    # Content ids in the order they were recorded.
    interaction_history: list[str]
    # Weighted sum of the embeddings of recently viewed content.
    embedding_profile: np.ndarray
    # Weighted sum of the topic distributions of recently viewed content.
    topic_interests: np.ndarray
    # Accumulated entity counts over all recorded interactions.
    entity_interests: Counter


class Content(BaseModel):
    """One content item together with the features used for recommendation."""

    # NumPy arrays and Counter are not pydantic-native types.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    content_id: str
    text: str
    # Vector compared with cosine similarity by the scoring functions.
    embedding: np.ndarray
    # Entity text -> number of occurrences in ``text``.
    entities: Counter
    keywords: set[str]
    # Topic distribution vector, also compared with cosine similarity.
    topic_dist: np.ndarray
    # Assigned topic id; -1 is treated as BERTopic's outlier topic.
    topic: int
    # Degree centrality in the content graph; the notebook initialises it to -1.
    centrality: float

# DB functions

In [28]:
def add_article_to_graph(content: 'Content', candidates: list['Content'], content_graph: ContentGraph):
    """Add ``content`` to the graph and connect it to related ``candidates``.

    For every candidate other than ``content`` itself two connections are
    attempted: one weighted by shared entity counts and one weighted by the
    cosine similarity of the topic distributions.

    Args:
        content: The item being inserted.
        candidates: Items to compare against; may include ``content``.
        content_graph: Graph that receives the node and edges.

    Note:
        The graph adds to the weight of an existing edge, so processing the
        same pair from both endpoints accumulates the weight twice.
    """
    content_graph.add_node(content.content_id)
    if not candidates:
        # cosine_similarity rejects empty input; nothing to connect anyway.
        return

    # Shape (len(candidates), 1): one similarity per candidate.
    similarity_matrix = cosine_similarity(
        [c.topic_dist for c in candidates],
        [content.topic_dist],
    )

    for i, other_content in enumerate(candidates):
        other_id = other_content.content_id
        if other_id == content.content_id:
            continue

        # Entity weight: overlap of the two entity multisets.
        shared_entities = other_content.entities.keys() & content.entities.keys()
        weight = sum(
            min(other_content.entities[entity], content.entities[entity])
            for entity in shared_entities
        )
        content_graph.connect_by_entities(content.content_id, other_id, weight)

        # Take the scalar, not the 1-element row, so edge weights stay plain floats.
        topic_similarity = float(similarity_matrix[i, 0])
        content_graph.connect_by_topics(content.content_id, other_id, topic_similarity)


def create_user(user_id: str, user_profiles: dict[str, UserProfile]):
    """Return the profile for ``user_id``, creating an empty one if needed.

    Args:
        user_id: Identifier of the user.
        user_profiles: Registry of profiles; a new profile is stored in it.

    Returns:
        The existing or newly created ``UserProfile``.
    """
    if user_id not in user_profiles:
        user_profiles[user_id] = UserProfile(
            user_id=user_id,
            interaction_history=[],
            # The fields are declared as np.ndarray, so an empty array (not a
            # plain list) is needed to pass validation.
            embedding_profile=np.zeros(0),
            topic_interests=np.zeros(0),
            entity_interests=Counter(),
        )
    return user_profiles[user_id]


def record_interaction(user_profile: UserProfile, content: Content, history: list[Content]):
    """Record that the user interacted with ``content`` and refresh the profile.

    The content id is appended to the interaction history and its entity
    counts are added to the user's entity interests. The embedding and topic
    profiles are then rebuilt from scratch as a recency-weighted sum over the
    last 20 items of ``history``.

    Args:
        user_profile: Profile to update in place.
        content: The item the user just interacted with.
        history: Interacted items, oldest first. Only these items feed the
            vector profiles, so include ``content`` if it should count.
    """
    window = 20  # number of most recent interactions that shape the profile

    user_profile.interaction_history.append(content.content_id)
    user_profile.entity_interests.update(content.entities)

    embedding_profile = np.zeros_like(content.embedding)
    topic_interests = np.zeros_like(content.topic_dist)

    # Walk from newest to oldest so the latest interaction gets weight 1.0 and
    # each older one loses 1/window.
    for age, h_content in enumerate(reversed(history[-window:])):
        weight = (window - age) / window
        embedding_profile += h_content.embedding * weight
        topic_interests += h_content.topic_dist * weight

    user_profile.embedding_profile = embedding_profile
    user_profile.topic_interests = topic_interests

# Similarity functions

In [29]:
# Article - Article
def get_embedding_similarity(content: Content, candidates: list[Content], weight: float) -> dict[str, float]:
    """Score candidates by cosine similarity of their embedding to ``content``.

    Args:
        content: Reference item.
        candidates: Items to score; ``content`` itself is skipped.
        weight: Multiplier applied to every similarity.

    Returns:
        Mapping of candidate content id to ``similarity * weight`` as a float.
    """
    if not candidates:
        # cosine_similarity rejects empty input.
        return {}

    # Shape (len(candidates), 1).
    similarity_matrix = cosine_similarity(
        [c.embedding for c in candidates],
        [content.embedding],
    )

    return {
        other_content.content_id: float(similarity_matrix[i, 0]) * weight
        for i, other_content in enumerate(candidates)
        if other_content.content_id != content.content_id
    }


def get_entity_similarity(neighbors: list[Content], weights: list[float], weight: float) -> dict[str, float]:
    """Turn graph edge weights into entity scores.

    Each edge weight is divided by 5, capped at 1, and multiplied by the
    strategy ``weight``. ``neighbors`` and ``weights`` are paired by position.
    """
    return {
        neighbor.content_id: min(edge_weight / 5, 1) * weight
        for neighbor, edge_weight in zip(neighbors, weights)
    }


def get_topic_similarity(content: Content, candidates: list[Content], weight: float) -> dict[str, float]:
    """Score candidates by cosine similarity of their topic distribution to ``content``.

    Args:
        content: Reference item.
        candidates: Items to score; ``content`` itself is skipped.
        weight: Multiplier applied to every similarity.

    Returns:
        Mapping of candidate content id to ``similarity * weight`` as a float.
    """
    if not candidates:
        # cosine_similarity rejects empty input.
        return {}

    # Shape (len(candidates), 1).
    topic_similarity_matrix = cosine_similarity(
        [c.topic_dist for c in candidates],
        [content.topic_dist],
    )

    return {
        other_content.content_id: float(topic_similarity_matrix[i, 0]) * weight
        for i, other_content in enumerate(candidates)
        if other_content.content_id != content.content_id
    }


def get_popularity_scores(candidates: list[Content], weight: float) -> dict[str, float]:
    """Score each candidate by its graph centrality times ``weight``."""
    return {item.content_id: item.centrality * weight for item in candidates}


# User - Article
def get_embedding_similarity_user(user_profile: UserProfile, candidates: list[Content], weight: float) -> dict[str, float]:
    """Score candidates by cosine similarity to the user's embedding profile.

    Args:
        user_profile: Profile whose ``embedding_profile`` is the reference vector.
        candidates: Items to score.
        weight: Multiplier applied to every similarity.

    Returns:
        Mapping of content id to ``similarity * weight`` as a float. Empty when
        there are no candidates or the user has no embedding profile yet.
    """
    # A freshly created profile holds an empty vector, which cannot be compared.
    if not candidates or np.size(user_profile.embedding_profile) == 0:
        return {}

    # Shape (len(candidates), 1).
    similarity_matrix = cosine_similarity(
        [c.embedding for c in candidates],
        [user_profile.embedding_profile],
    )
    return {
        other_content.content_id: float(similarity_matrix[i, 0]) * weight
        for i, other_content in enumerate(candidates)
    }


def get_topic_similarity_user(user_profile: UserProfile, candidates: list[Content], weight: float) -> dict[str, float]:
    """Score candidates by cosine similarity to the user's topic interests.

    Args:
        user_profile: Profile whose ``topic_interests`` is the reference vector.
        candidates: Items to score.
        weight: Multiplier applied to every similarity.

    Returns:
        Mapping of content id to ``similarity * weight`` as a float. Empty when
        there are no candidates or the user has no topic interests yet.
    """
    # A freshly created profile holds an empty vector, which cannot be compared.
    if not candidates or np.size(user_profile.topic_interests) == 0:
        return {}

    # Shape (len(candidates), 1).
    similarity_matrix = cosine_similarity(
        [c.topic_dist for c in candidates],
        [user_profile.topic_interests],
    )
    return {
        content.content_id: float(similarity_matrix[i, 0]) * weight
        for i, content in enumerate(candidates)
    }


def get_entity_similarity_user(user_profile: UserProfile, candidates: list[Content], weight: float) -> dict[str, float]:
    """Score candidates by how much their entities overlap the user's interests.

    For each shared entity the user's share of interest in it is multiplied by
    the entity's share within the content; the products are summed, scaled by
    5, capped at 1.0 and finally multiplied by ``weight``.

    Args:
        user_profile: Profile providing ``entity_interests``.
        candidates: Items to score.
        weight: Strategy weight; also the maximum score an item can receive.

    Returns:
        Mapping of content id to score, containing only items with overlap.
    """
    scores = {}

    # Invariant across candidates, so compute it once.
    user_total = sum(user_profile.entity_interests.values())
    if user_total <= 0:
        return scores

    for content in candidates:
        # Counter intersection keeps entities with a positive count on both sides.
        shared_entities = user_profile.entity_interests & content.entities
        content_total = sum(content.entities.values())
        if not shared_entities or content_total <= 0:
            continue

        score = sum(
            (user_profile.entity_interests[entity] / user_total)
            * (content.entities[entity] / content_total)
            for entity in shared_entities
        )

        if score > 0:
            # Normalise to [0, 1] first, then apply the strategy weight, the
            # same order get_entity_similarity uses.
            scores[content.content_id] = min(score * 5, 1.0) * weight

    return scores


def get_explore_scores_user(user_profile: UserProfile, candidates: list[Content], weight: float) -> dict[str, float]:
    """Score candidates for discovery: partly novel entities plus popularity.

    Items among the user's last 10 interactions and items without entities
    receive no score.
    """
    known_entities = set(user_profile.entity_interests.keys())
    recently_viewed = user_profile.interaction_history[-10:]

    scores = {}
    for content in candidates:
        if content.content_id in recently_viewed:
            continue

        entity_names = set(content.entities.keys())
        if not entity_names:
            continue

        # Share of the item's entities the user has not engaged with yet.
        novelty = len(entity_names - known_entities) / len(entity_names)

        # Peaks at 70% novelty and falls off linearly on either side.
        discovery_score = 1.0 - abs(0.7 - novelty)

        # Blend novelty (70%) with graph centrality (30%).
        scores[content.content_id] = (discovery_score * 0.7 + content.centrality * 0.3) * weight

    return scores

# Recommendation

In [30]:
def get_candidates_article(content: Content, content_store: dict[str, Content], content_graph: ContentGraph):
    """Collect the inputs needed to recommend articles related to ``content``.

    Args:
        content: The article recommendations are requested for.
        content_store: All known content.
        content_graph: Graph queried for neighbours of ``content``.

    Returns:
        A tuple ``(candidates, neighbors, neighbor_weights)``: every stored
        item, the graph neighbours of ``content`` as ``Content`` objects, and
        the edge weight for each neighbour (same order as ``neighbors``).
    """
    candidates = list(content_store.values())

    # The graph stores ids, while downstream scoring reads ``.content_id`` on
    # each neighbour, so resolve ids to the stored objects. Resolving through
    # content_id works regardless of how content_store itself is keyed.
    content_by_id = {candidate.content_id: candidate for candidate in candidates}

    neighbors = []
    neighbor_weights = []
    for n_id in content_graph.get_neighbors(content.content_id):
        neighbor = content_by_id.get(n_id)
        if neighbor is None:
            # Node exists in the graph but has no stored content; skip it.
            continue
        neighbors.append(neighbor)
        neighbor_weights.append(content_graph.get_edge_weight(content.content_id, n_id))

    return candidates, neighbors, neighbor_weights

def get_recommendations_article(content: Content, strategy_weights: dict[str, float], candidates: list[Content], neighbors: list[Content], neighbor_weights: list[float], num_recommendations=5):
    """Recommend items related to ``content``.

    Four strategies are summed per content id: embedding similarity, entity
    similarity (from graph neighbours), topic similarity and popularity.

    Args:
        content: The article being viewed.
        strategy_weights: Optional weights under the keys ``'embedding'``
            (default 0.4), ``'entity'`` (0.3), ``'topic'`` (0.4) and
            ``'explore'`` (0.1, used for the popularity strategy).
        candidates: Items eligible for recommendation.
        neighbors: Graph neighbours of ``content``.
        neighbor_weights: Edge weight for each entry of ``neighbors``.
        num_recommendations: Maximum number of results.

    Returns:
        List of ``(content_id, score)`` pairs, highest score first.
    """
    strategy_scores = [
        get_embedding_similarity(content, candidates, strategy_weights.get('embedding', 0.4)),
        get_entity_similarity(neighbors, neighbor_weights, strategy_weights.get('entity', 0.3)),
        get_topic_similarity(content, candidates, strategy_weights.get('topic', 0.4)),
        get_popularity_scores(candidates, strategy_weights.get('explore', 0.1)),
    ]

    scores = {}
    for scores_dict in strategy_scores:
        for content_id, score in scores_dict.items():
            scores[content_id] = scores.get(content_id, 0) + score

    # The popularity strategy scores every candidate, including the article
    # itself; never recommend the article the reader is already on.
    scores.pop(content.content_id, None)

    sorted_recommendations = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_recommendations[:num_recommendations]


def explain_recommendation_article(content: Content, rec: Content):
    """Return human-readable reasons why ``rec`` was recommended for ``content``."""
    reasons = []

    # Entities mentioned by both items, most frequent (combined) first.
    common_entities = content.entities.keys() & rec.entities.keys()
    if common_entities:
        ranked = sorted(
            common_entities,
            key=lambda name: content.entities[name] + rec.entities[name],
            reverse=True,
        )
        reasons.append(f"Shares topics: {', '.join(ranked[:3])}")

    # Embedding similarity: report only the strongest matching tier.
    emb_similarity = cosine_similarity([content.embedding], [rec.embedding])[0][0]
    embedding_tiers = (
        (0.7, "Content is semantically similar"),
        (0.5, "Content is somewhat related"),
    )
    for threshold, message in embedding_tiers:
        if emb_similarity > threshold:
            reasons.append(message)
            break

    # Similarity of the topic distributions.
    topic_sim = cosine_similarity([content.topic_dist], [rec.topic_dist])[0][0]
    if topic_sim > 0.8:
        reasons.append("Covers very similar topics")

    # Same dominant topic; -1 is BERTopic's outlier topic and does not count.
    if content.topic != -1 and content.topic == rec.topic:
        reasons.append("Part of the same topic cluster")

    if rec.centrality > 0.7:
        reasons.append("Popular content that connects many topics")

    # Fallback when no rule fired.
    if not reasons:
        reasons.append("No Reason")

    return reasons


def get_recommendations_user(user_profile: UserProfile, strategy_weights: dict[str, float], candidates: list[Content], num_recommendations=10):
    """Rank ``candidates`` for a user's homepage.

    Sums four per-item scores (embedding, entity, topic and explore) using the
    weights in ``strategy_weights`` (defaults 0.4, 0.3, 0.4 and 0.1) and
    returns the ``num_recommendations`` best ``(content_id, score)`` pairs.
    """
    per_strategy = (
        get_embedding_similarity_user(user_profile, candidates, strategy_weights.get('embedding', 0.4)),
        get_entity_similarity_user(user_profile, candidates, strategy_weights.get('entity', 0.3)),
        get_topic_similarity_user(user_profile, candidates, strategy_weights.get('topic', 0.4)),
        get_explore_scores_user(user_profile, candidates, strategy_weights.get('explore', 0.1)),
    )

    totals = {}
    for strategy_scores in per_strategy:
        for content_id, score in strategy_scores.items():
            totals[content_id] = totals.get(content_id, 0) + score

    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:num_recommendations]

    # TODO: ideas not implemented yet: drop the last few viewed items from
    # the result (falling back to the unfiltered scores if too few remain) and
    # mix top picks with a random sample from the mid-range for diversity.


def explain_recommendation_user(user_profile: UserProfile, rec_content: Content, history: list[Content]):
    """Explain why ``rec_content`` was recommended to a user.

    Args:
        user_profile: The user's profile.
        rec_content: The recommended item.
        history: Items the user interacted with; used to name entities shared
            with the recommendation.

    Returns:
        List of distinct explanation strings, never empty.
    """
    explanations = []

    def add_reason(reason: str) -> None:
        # Several history items can yield the same line; list each reason once.
        if reason not in explanations:
            explanations.append(reason)

    def has_vector(vector) -> bool:
        # A new profile holds an empty vector that cannot be compared.
        return vector is not None and np.size(vector) > 0

    # Check for embedding similarity
    if has_vector(user_profile.embedding_profile):
        emb_similarity = cosine_similarity(
            [user_profile.embedding_profile],
            [rec_content.embedding]
        )[0][0]

        if emb_similarity > 0.7:
            add_reason("Based on content you've engaged with")
        elif emb_similarity > 0.5:
            add_reason("Similar to content you've viewed")

    # Check for topic similarity
    if has_vector(user_profile.topic_interests):
        topic_sim = cosine_similarity(
            [user_profile.topic_interests],
            [rec_content.topic_dist]
        )[0][0]

        if topic_sim > 0.7:
            add_reason("Matches topics you're interested in")
            rec_entities = set(rec_content.entities.keys())
            for content in history:
                # Entities this history item shares with the recommendation,
                # most frequent (combined) first.
                shared_entities = set(content.entities.keys()) & rec_entities
                if shared_entities:
                    top_shared = sorted(shared_entities,
                                        key=lambda e: content.entities[e] +
                                                      rec_content.entities[e],
                                        reverse=True)[:3]
                    add_reason(f"Shares topics: {', '.join(top_shared)}")
        elif topic_sim > 0.5:
            add_reason("Related to topics you follow")

    # Entities from the user's accumulated interests that the item mentions
    user_entities = set(user_profile.entity_interests.keys())
    content_entities = set(rec_content.entities.keys())
    shared_entities = user_entities & content_entities

    if shared_entities:
        top_shared = sorted(shared_entities,
                            key=lambda e: user_profile.entity_interests[e],
                            reverse=True)[:2]
        add_reason(f"Mentions {', '.join(top_shared)}")

    # Check if it's popular content
    if rec_content.centrality > 0.6:
        add_reason("Popular in your topic areas")

    # Default explanation
    if not explanations:
        explanations.append("Recommended based on your reading history")

    return explanations


# Model functions

In [38]:
from rake_nltk import Rake

def extract_keywords(texts: list[str]):
    """
    Extract keyword phrases from each text using the RAKE algorithm.

    Args:
        texts: The input text strings.

    Returns:
        One list per input text with the phrases in the order produced by
        ``Rake.get_ranked_phrases``.
    """
    rake = Rake()

    def ranked_phrases(document):
        # The Rake instance keeps the phrases of the last text it processed.
        rake.extract_keywords_from_text(document)
        return rake.get_ranked_phrases()

    return [ranked_phrases(document) for document in texts]

In [None]:
# Entity labels dropped by extract_entities.
NOT_ALLOWED_ENTITIES = {
    'DATE', 'TIME',
    'PERCENT', 'MONEY', 'QUANTITY',
    'ORDINAL', 'CARDINAL',
}


def extract_entities(texts: list[str], nlp: spacy.language.Language) -> list[Counter]:
    """Count the lower-cased named entities of every text.

    Entities whose label is in ``NOT_ALLOWED_ENTITIES`` are ignored. The
    result has one ``Counter`` per input text, in input order.
    """
    return [
        Counter(
            entity.text.lower()
            for entity in doc.ents
            if entity.label_ not in NOT_ALLOWED_ENTITIES
        )
        for doc in nlp.pipe(texts)
    ]


def extract_keywords(texts: list[str], kw_model: KeyBERT) -> list[list[str]]:
    """Placeholder keyword extractor: returns an empty keyword list per text.

    ``kw_model`` is currently unused. NOTE(review): this definition reuses the
    name of the RAKE-based ``extract_keywords(texts)`` defined earlier in the
    file, so whichever is executed last wins.
    """
    placeholders = []
    for _ in texts:
        placeholders.append([])
    return placeholders
    # Intended KeyBERT implementation, currently disabled:
    # keywords = kw_model.extract_keywords(texts, keyphrase_ngram_range=(3, 5), stop_words='english', top_n=5)
    # for i, keywords_i in enumerate(keywords):
    #     keywords[i] = [k[0] for k in keywords_i]
    # return keywords


def train_topic_model(texts: list[str], topic_model: BERTopic, embeddings: np.ndarray) -> list[int]:
    """Fit ``topic_model`` and reassign outlier documents.

    After fitting, documents in the outlier topic (-1) are reassigned in two
    passes, first with the "c-tf-idf" strategy, then with the "embeddings"
    strategy, and the model's topics are updated with the result. When the
    fit produced no outlier topic, the fitted topics are returned unchanged.

    Returns:
        The topic id of every text, in input order.
    """

    def report_outliers(assignments):
        outliers = assignments.count(-1)
        print(f'outlier count: {outliers}, outlier percentage: {outliers * 100 / len(texts)}')

    topics, _probs = topic_model.fit_transform(texts, embeddings)
    report_outliers(topics)

    if -1 not in topic_model.topic_sizes_:
        return topics

    print('reducing topic outliers')
    reassigned = topic_model.reduce_outliers(texts, topics, strategy="c-tf-idf", threshold=0.2)
    report_outliers(reassigned)

    reassigned = topic_model.reduce_outliers(
        texts,
        reassigned,
        strategy="embeddings",
        embeddings=embeddings,
        threshold=0.3,
    )
    report_outliers(reassigned)

    print('updating topics')
    topic_model.update_topics(texts, topics=reassigned)
    return reassigned

# Load dataset

In [None]:
import json
import random

# news.json is read as JSON Lines: one JSON object per line.
dataset = []
with open('news.json', encoding='utf-8') as f:
    for line in f:
        if line.strip():  # tolerate blank lines
            dataset.append(json.loads(line))

# Keep only samples whose description is longer than 40 characters and report
# exactly how many were dropped by that same filter.
total_before = len(dataset)
dataset = [d for d in dataset if len(d['short_description']) > 40]
num = total_before - len(dataset)
print(f'removed {num} short samples')

# Fixed seed so the shuffled order is reproducible.
random.Random(42).shuffle(dataset)

# dataset = dataset[:50_000]
texts = [i['short_description'] for i in dataset]
len(texts)

removed 37038 short samples


171806

# Load models

In [None]:
# Notebook shell escape: download the spaCy model that is loaded below.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     -------------------------------------- 0.0/400.7 MB 435.7 kB/s eta 0:15:20
     -------------------------------------- 0.1/400.7 MB 657.6 kB/s eta 0:10:10
     -------------------------------------- 0.1/400.7 MB 656.4 kB/s eta 0:10:11
     ---------------------------------------- 0.2/400.7 MB 1.1 MB/s eta 0:06:19
     ---------------------------------------- 0.4/400.7 MB 1.4 MB/s eta 0:04:40
     ---------------------------------------- 0.5/400.7 MB 1.7 MB/s eta 0:04:02
     ---------------------------------------- 1.0/400.7 MB 2.9 MB/s eta 0:02:20
     ---------------------------------------- 1.3/400.7 MB 3.4 MB/s eta 0:01:58
     --------------------------------


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# spaCy pipeline used by extract_entities (which reads doc.ents); the listed
# components are disabled. NOTE(review): presumably this leaves only NER active - confirm.
nlp = spacy.load("en_core_web_lg", disable=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])

# Sentence encoder shared by BERTopic and KeyBERT below.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# BERTopic building blocks: dimensionality reduction, clustering, bag-of-words
# vectoriser and class-based TF-IDF. Fixed random_state values for reproducibility.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom',
                        prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer()

# Topic representation model; the text-generation alternative is kept for reference.
representation_model = KeyBERTInspired(random_state=42)
# representation_model = TextGeneration('gpt2')


# Topic model assembled from the components above.
topic_model = BERTopic(
    embedding_model=embedder,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    verbose=True
)
# Keyword model passed to the KeyBERT-based extract_keywords(texts, kw_model).
kw_model = KeyBERT(
    model=embedder
)

In [None]:
# Two separate nearest-neighbour indexes: one filled with sentence embeddings,
# the other with topic distributions.
emb_index = Index()
topic_index = Index()

# Training process

In [None]:
# Encode every description into one embedding row; row i belongs to texts[i].
print(f'Processing batch of {len(texts)} texts')
embeddings: np.ndarray = embedder.encode(texts, show_progress_bar=True)
print(f'Embeddings shape: {embeddings.shape}')

Processing batch of 171806 texts


Batches: 100%|██████████| 5369/5369 [05:01<00:00, 17.83it/s]


Embeddings shape: (171806, 384)


In [None]:
# Build the embedding index (the recorded run of this cell was interrupted).
emb_index.add(embeddings)

KeyboardInterrupt: 

In [None]:
# Fit the topic model; topics[i] is the topic id assigned to texts[i].
topics = train_topic_model(texts, topic_model, embeddings)

topic_labels = topic_model.generate_topic_labels()

# Counts the rows of the topic info table.
num_topics = len(topic_model.get_topic_info())
print(f'Number of topics: {num_topics}')
print(f'Topics: {topic_labels}')

In [None]:
# Per-document topic distribution; row i later becomes Content.topic_dist for texts[i].
topic_dist_matrix, _ = topic_model.approximate_distribution(texts)
print(f'Topic distribution shape: {topic_dist_matrix.shape}')

In [None]:
# Build the topic index from the per-document topic distributions.
topic_index.add(topic_dist_matrix)

In [None]:
# One entity Counter per text (the recorded run of this cell was interrupted).
entities = extract_entities(texts, nlp)

KeyboardInterrupt: 

In [None]:
# Uses the two-argument (KeyBERT) extract_keywords, which currently returns an
# empty keyword list for every text.
keywords_matrix = extract_keywords(texts, kw_model)

In [None]:
# Fetch the NLTK stopword corpus. NOTE(review): presumably required by rake_nltk's Rake - confirm.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [None]:
# NOTE(review): this call needs the single-argument, RAKE-based extract_keywords
# to be the active definition; the KeyBERT variant also requires kw_model.
keywords = extract_keywords(texts)

In [45]:
# Spot check: show one text next to the phrases extracted from it.
texts[3], keywords[3]

('What is the solution to finding more time for what matters while still succeeding at work?  There is no magic formula, but I would like to share three basic ideas for how we can begin to tame our calendars and achieve better balance.',
 ['share three basic ideas',
  'achieve better balance',
  'would like',
  'still succeeding',
  'magic formula',
  'work',
  'time',
  'tame',
  'solution',
  'matters',
  'finding',
  'calendars',
  'begin'])

# Add to db

In [None]:
# Empty relationship graph and the store that will hold every Content object.
content_graph = ContentGraph()
content_store = {}

In [None]:
# Key the store by the same string id that is stored on each Content, so that
# lookups by content_id (graph node ids, recommendation ids, interaction
# history) resolve. Insertion order follows the row order of `embeddings`.
for i, text in enumerate(texts):
    content_id = str(i)
    content_store[content_id] = Content(
        content_id=content_id,
        text=text,
        embedding=embeddings[i],
        entities=entities[i],
        topic_dist=topic_dist_matrix[i],
        topic=topics[i],
        keywords=set(keywords_matrix[i]),
        centrality=-1.0,  # placeholder until graph centrality is computed
    )

In [None]:
# Debugging probe: look an item up by string key.
# NOTE(review): 178234 is larger than the 171806 texts loaded in the recorded
# run, so no item with this id is expected to exist.
content_store['178234']

In [None]:
# Debugging probe: the same lookup with a NumPy integer key.
# NOTE(review): 178234 is larger than the 171806 texts loaded in the recorded run.
content_store[np.int64(178234)]

In [None]:
# FAISS ids are row positions in the indexed matrices, which follow the
# insertion order of content_store; resolve them by position so the lookups do
# not depend on how content_store is keyed.
contents = list(content_store.values())
content_by_id = {c.content_id: c for c in contents}

candidates_per_item = emb_index.get_items_by_index([c.embedding for c in contents], 100)
candidates_per_item_2 = topic_index.get_items_by_index([c.topic_dist for c in contents], 100)
for i, content in enumerate(contents):
    # Union of both neighbour lists. The two results are NumPy arrays, so `+`
    # would add the ids element-wise instead of merging them.
    candidate_ids = {int(row) for row in candidates_per_item[i]}
    candidate_ids.update(int(row) for row in candidates_per_item_2[i])
    candidate_ids.discard(-1)  # FAISS pads missing results with -1
    candidates = [contents[row] for row in sorted(candidate_ids)]
    add_article_to_graph(content, candidates, content_graph)

# calculate centrality for each item (graph node ids are content_id strings)
centrality = content_graph.get_centrality()
for content_id, centrality_score in centrality.items():
    content_by_id[content_id].centrality = centrality_score

# Experimentation

In [None]:
# Show the generated topic labels.
print(topic_labels)

In [None]:
# Print every item assigned to one topic for manual inspection.
cluster_id = 365
cluster = [item for item in content_store.values() if item.topic == cluster_id]

for content in cluster:
    print(content)

In [None]:
user_profiles = {}
user_id = "user1"

# Start from a clean profile when this cell is re-run.
if user_id in user_profiles:
    del user_profiles[user_id]

user = create_user(user_id, user_profiles)
# NOTE(review): these are article URLs, while content ids are created as
# str(row index) when the store is populated - confirm the lookup key.
history = [
    'https://www.huffingtonpost.com/entry/cats-family-babysitter_us_5b9dd60ee4b03a1dcc8d8d54',
    'https://www.huffingtonpost.com/entry/a-wedding-theme-featuring_us_5b9deb0ce4b03a1dcc8eb7ea'
]
viewed = []  # Content objects interacted with so far, oldest first
for h in history:
    c = content_store[h]
    # Include the current item: the profile vectors are rebuilt only from the
    # history list that is passed in.
    viewed.append(c)
    # record_interaction expects the UserProfile object, not the user id.
    record_interaction(user, c, viewed)


In [None]:
# Resolve items through their content_id so the lookups below do not depend on
# how content_store is keyed.
content_by_id = {c.content_id: c for c in content_store.values()}

# NOTE: with every strategy weight set to 0 all scores are 0; raise the
# weights to get a meaningful ranking.
recommendations = get_recommendations_user(
    user,
    strategy_weights={
        'embedding': 0,
        'topic': 0,
        'entity': 0,
        'explore': 0
    },
    candidates=list(content_store.values()),
    num_recommendations=7
)

hist = "\n".join(str(u) for u in user.interaction_history)
print(f'[DEBUG] user history: \n{hist}')
print(f'[DEBUG] user topic interests: {user.topic_interests}')
print(f'[DEBUG] user entity interests: {user.entity_interests}')
print("\nRecommendations:")
history = [content_by_id[i] for i in user.interaction_history]
for rec_id, score in recommendations:
    content = content_by_id[rec_id]
    print(f'\n- id: {rec_id}, score: {score} summary: {content.text}')
    # explain_recommendation_user needs the profile and the Content object,
    # not their ids.
    print(f"  Why: {'; '.join(explain_recommendation_user(user, content, history))}")

In [None]:
# todo: look into keyword extraction using keybert
# todo: add user interests
# # topic reduction
#
# topic_model.reduce_topics(docs, nr_topics=30)
#
# # Access updated topics
# topics = topic_model.topics_