#### Libraries/Imports

In [69]:
import os

import flask
from flask import Flask
from flask_pymongo import PyMongo

import requests

from bson import ObjectId, Binary
from pydantic.json import ENCODERS_BY_TYPE
from pymongo import MongoClient

from pydantic import BaseModel, Field
from typing import List, Optional

import numpy as np
from schedule


import torch
from transformers import AutoTokenizer, AutoModel

##### Helper Models

In [2]:
class PydanticObjectId(ObjectId):
    """
    Object Id field. Compatible with Pydantic.
    """

    @classmethod
    def __get_validators__(cls):
        yield cls.validate

    @classmethod
    def validate(cls, v):
        return PydanticObjectId(v)

    @classmethod
    def __modify_schema__(cls, field_schema: dict):
        field_schema.update(
            type="string",
        )


ENCODERS_BY_TYPE[PydanticObjectId] = str

In [3]:
class UserForum(BaseModel):
    id: Optional[PydanticObjectId] = Field(None, alias="_id")
    following_ids: List[PydanticObjectId]

class Post(BaseModel):
    id: Optional[PydanticObjectId] = Field(None, alias="_id")
    author_id: PydanticObjectId
    title: str
    content: str
    hashtags: List[str]
    response_to_id: Optional[PydanticObjectId]

    def to_dict(self):
        post_dict = {
            "_id": self.id,
            "author_id": self.author_id,
            "title": self.title,
            "content": self.content,
            "hashtags": self.hashtags
        }
        if self.response_to_id is not None:
            post_dict["response_to_id"] = self.response_to_id
        return post_dict

##### API URL

In [4]:
url = 'http://localhost:5000/posts'

In [5]:
try:
    response = requests.get(url)

    if response.status_code == 200:
        print("Response content:")
        response_content = response.json()
        print(response_content)
    else:
        print(f"Failed to retrieve data. Status code: {response.status_code}")

except requests.exceptions.RequestException as e:
    print(f"Error occurred: {e}")

Response content:
{'posts': [{'_id': '65e3204ce64d1e43b6dd7875', 'author_id': '65e1f82be64d1e71f2a9226b', 'content': 'My special thanks to @at0mul @z @abc', 'hashtags': ['hub_page'], 'title': 'Hello, #hub_page'}, {'_id': '65e326d7e64d1eb175d8bd33', 'author_id': '65e3050de64d1eb2ddb0c678', 'content': '', 'hashtags': [], 'response_to_id': '65e3204ce64d1e43b6dd7875', 'title': 'fr, really cool'}, {'_id': '65e32799e64d1ec521b63709', 'author_id': '65e249baf69a9c082c820154', 'content': '', 'hashtags': [], 'response_to_id': '65e3204ce64d1e43b6dd7875', 'title': 'hmmm'}, {'_id': '65e726e8e64d1e32b88e928b', 'author_id': '65e6e573e64d1e32dacf9881', 'content': 'Astazi facem review la shaormeria din spatele blocului.\n\nNu pot pune in cuvinte ce inseamna aceasta locatie pentru mine. Imi aduc cu drag de inima aminte momentul in care maicuta mea m-a dus sa imi ia primul donner din aceasta locatie.\n\nAstazi, privind locatie cu alti ochi, pot spune ca nu impresioneaza in niciun aspect, dar cu siguranta

In [6]:
#all_posts = [Post(**post_dict) for post_dict in response_content.get("posts", [])]
all_posts = [Post(**post_dict) for post_dict in response_content.get("posts", [])]
print("All posts:")
for post in all_posts:
    print(post)

liked_posts = all_posts[:4] + all_posts[-4:]
# liked_posts = []
print("Liked posts:")
for post in liked_posts:
    print(post)

disliked_posts = all_posts[12:16] + [all_posts[31]]
# disliked_posts = all_posts[:4] + all_posts[-4:]
# disliked_posts = []
print("Disliked posts:")
for post in disliked_posts:
    print(post)

current_user_id = PydanticObjectId("65d25cd3c2ef35ebebb785e6")
following_ids = [PydanticObjectId("65eda222e64d1e63721f1b1b"), 
                      PydanticObjectId("65db417ff69a9c1ee871447e"),
                      PydanticObjectId("65e475a5d831837d3a72eac5"),
                      PydanticObjectId("65e6e573e64d1e32dacf9881")]
# following_ids = []

current_user_forum = UserForum(_id=current_user_id, following_ids=following_ids)

print(f"User Data: {current_user_forum}")

All posts:
id=ObjectId('65e3204ce64d1e43b6dd7875') author_id=ObjectId('65e1f82be64d1e71f2a9226b') title='Hello, #hub_page' content='My special thanks to @at0mul @z @abc' hashtags=['hub_page'] response_to_id=None
id=ObjectId('65e326d7e64d1eb175d8bd33') author_id=ObjectId('65e3050de64d1eb2ddb0c678') title='fr, really cool' content='' hashtags=[] response_to_id=ObjectId('65e3204ce64d1e43b6dd7875')
id=ObjectId('65e32799e64d1ec521b63709') author_id=ObjectId('65e249baf69a9c082c820154') title='hmmm' content='' hashtags=[] response_to_id=ObjectId('65e3204ce64d1e43b6dd7875')
id=ObjectId('65e726e8e64d1e32b88e928b') author_id=ObjectId('65e6e573e64d1e32dacf9881') title='Imi place sa mananc aicea' content='Astazi facem review la shaormeria din spatele blocului.\n\nNu pot pune in cuvinte ce inseamna aceasta locatie pentru mine. Imi aduc cu drag de inima aminte momentul in care maicuta mea m-a dus sa imi ia primul donner din aceasta locatie.\n\nAstazi, privind locatie cu alti ochi, pot spune ca nu im

##### Configure Mongo Vectorized DB

In [7]:
app = Flask(__name__)
app.config["MONGO_URI"] = "mongodb+srv://fiicode_dev:7iqxicnfaMxKbzdk@fiicode24.ywnthrv.mongodb.net/fiicode24"
pymongo = PyMongo(app)

embeddings_collection = pymongo.db.embeddings
posts_collection = pymongo.db.posts

##### Configure Model

In [8]:
tokenizer = AutoTokenizer.from_pretrained('Twitter/twhin-bert-base')
model = AutoModel.from_pretrained('Twitter/twhin-bert-base')

Some weights of BertModel were not initialized from the model checkpoint at Twitter/twhin-bert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##### Preprocess Data

In [9]:
def process_post_data(posts):
    post_texts = [post.title + " " + post.content for post in posts]
    inputs = tokenizer(post_texts, return_tensors="pt", padding=True, truncation=True)
    return inputs

##### Optimize Embeddings (Store & Get)

In [44]:
def store_embeddings(post_ids, embeddings):
    for post_id, embedding in zip(post_ids, embeddings):
        post_dict = Post(**posts_collection.find_one({'_id': post_id})).to_dict()

        embedding_np = embedding.detach().numpy()
        embedding_bson = Binary(embedding_np.tobytes())
        
        existing_document = embeddings_collection.find_one({'post_id': post_id})
        if existing_document:
            embeddings_collection.update_one({'post_id': post_id}, {'$set': {'embedding': embedding_bson}})
        else:
            embeddings_collection.insert_one({'post_id': post_id, 'post': post_dict, 'embedding': embedding_bson})

def get_embeddings(posts):
    embeddings = []
    for post in posts:
        document = embeddings_collection.find_one({'post_id': post.id})
        if document:
            embedding_np = np.frombuffer(document['embedding'], dtype=np.float32)
            embeddings.append((post, embedding_np))
    return embeddings

def get_all_embeddings():
    embeddings = []
    for document in embeddings_collection.find():
        post_data = document['post']
        post = Post(**post_data)
        embedding_np = np.frombuffer(document['embedding'], dtype=np.float32)
        embeddings.append((post, embedding_np))
    return embeddings

def update_embeddings():
    all_posts = [Post(**post_dict) for post_dict in posts_collection.find()]
    # all_posts = [Post(**post_dict) for post_dict in posts_collection.find({'$or': [{'response_to_id': None}, {'response_to_id': {'$exists': False}}]})]
    processed_posts = process_post_data(all_posts)
    
    with torch.no_grad():
        all_outputs = model(**processed_posts)
        all_embeddings = all_outputs.last_hidden_state[:, 0, :]

    post_ids = [post.id for post in all_posts]
    
    store_embeddings(post_ids, all_embeddings)

##### Schedule Embeddings

In [70]:
def run_in_background(interval=1):
    cease_continuous_run = threading.Event()

    class ScheduleThread(threading.Thread):
        @classmethod
        def run(cls):
            while not cease_continuous_run.is_set():
                schedule.run_pending()
                time.sleep(interval)

    continuous_thread = ScheduleThread()
    continuous_thread.start()
    return cease_continuous_run

def update_embeddings_job():
    update_embeddings()

schedule.every().day.at("00:00").do(update_embeddings_job)

run_in_background()

KeyboardInterrupt: 

##### Compute Product Similarity

In [62]:
def get_recommendations(liked_posts, disliked_posts, following_ids):
    all_embeddings = get_all_embeddings()
    all_embeddings = [(post_id, embedding) for post_id, embedding in all_embeddings if post_id.response_to_id is None]
    
    liked_embeddings = [(post_id, embedding) for post_id, embedding in all_embeddings if post_id in liked_posts]
    disliked_embeddings = [(post_id, embedding) for post_id, embedding in all_embeddings if post_id in disliked_posts]

    recommendation_scores = torch.zeros(len(all_embeddings))

    for i, (post, post_embedding) in enumerate(all_embeddings):
        if post.response_to_id is not None:
            continue
        
        if post.author_id in following_ids:
            recommendation_scores[i] += 50

        similarity_liked = torch.tensor([torch.dot(torch.tensor(post_embedding), torch.tensor(liked_embedding)) for _, liked_embedding in liked_embeddings])
        similarity_disliked = torch.tensor([torch.dot(torch.tensor(post_embedding), torch.tensor(disliked_embedding)) for _, disliked_embedding in disliked_embeddings])

        recommendation_scores[i] += similarity_liked.mean() - similarity_disliked.mean()

    sorted_indices = torch.argsort(recommendation_scores, descending=True)
    recommended_post = [all_embeddings[i][0] for i in sorted_indices]

    return recommended_post

##### Test Implementation

In [37]:
update_embeddings()

In [45]:
# embeddings_from_db = get_embeddings(all_posts)
embeddings_from_db = get_all_embeddings()
print(f"Embeddings from DB: {embeddings_from_db}")

Embeddings from DB: [(Post(id=ObjectId('65e3204ce64d1e43b6dd7875'), author_id=ObjectId('65e1f82be64d1e71f2a9226b'), title='Hello, #hub_page', content='My special thanks to @at0mul @z @abc', hashtags=['hub_page'], response_to_id=None), array([ 2.65118387e-02,  3.18308204e-01,  4.78979260e-01,  9.23034132e-01,
        1.02453567e-01,  3.89839709e-01, -9.89530310e-02, -3.34935158e-01,
       -2.45452568e-01, -3.37282300e-01, -3.02582562e-01,  1.70583986e-02,
        1.73820198e-01,  3.61140907e-01,  3.31465341e-02,  1.93105459e-01,
       -1.95913436e-03, -3.62483151e-02, -6.63699508e-02, -6.54252946e-01,
        3.31143945e-01, -2.58449078e-01,  5.99804163e-01, -9.73190740e-02,
        3.09990704e-01, -2.39483193e-02,  9.17849302e-01, -1.95653051e-01,
        1.53460562e-01, -4.69180316e-01, -2.94718623e-01,  6.05187155e-02,
        1.23227715e-01, -3.67063172e-02,  2.90166259e-01, -4.14495468e-01,
        3.73869687e-01,  1.35481372e-01,  7.85319149e-01, -3.33261825e-02,
       -2.06048

In [63]:
recommended_posts = get_recommendations(liked_posts, disliked_posts, current_user_forum.following_ids)
print(f"Recommendations size: {len(recommended_posts)}")
print("Recommended posts:")
for post in recommended_posts:
    print(post)

Recommendations size: 22
Recommended posts:
id=ObjectId('65e726e8e64d1e32b88e928b') author_id=ObjectId('65e6e573e64d1e32dacf9881') title='Imi place sa mananc aicea' content='Astazi facem review la shaormeria din spatele blocului.\n\nNu pot pune in cuvinte ce inseamna aceasta locatie pentru mine. Imi aduc cu drag de inima aminte momentul in care maicuta mea m-a dus sa imi ia primul donner din aceasta locatie.\n\nAstazi, privind locatie cu alti ochi, pot spune ca nu impresioneaza in niciun aspect, dar cu siguranta intelege bazele a ceea ce face o shaorma buna. Lucru pe care il vezi tot mai rar la shaormeriile mai pretentioase.\n\nVeniti aici si mancati o shaorma peste medie!' hashtags=[] response_to_id=None
id=ObjectId('65ecf3e9d831834e9e21a97f') author_id=ObjectId('65e475a5d831837d3a72eac5') title='La lautari' content='Unde mergem sa petrecem\nCand plecam din cluburi tari?!\nLa lautari, unde canta lautari\nhttps://www.versuri.ro/w/8jt2\nUnde intra spritu bine\nSi se fac petreceri mari\n

##### OLD BACKUPS

In [None]:
def get_recommendations1(posts, liked_posts, disliked_posts, following_ids):
    all_post_inputs = process_post_data(posts) if posts else None
    liked_post_inputs = process_post_data(liked_posts) if liked_posts else None
    disliked_post_inputs = process_post_data(disliked_posts) if disliked_posts else None


    with torch.no_grad():
        all_outputs = model(**all_post_inputs)
        all_embeddings = all_outputs.last_hidden_state[:, 0, :]

        if liked_post_inputs:
            liked_outputs = model(**liked_post_inputs)
            liked_embeddings = liked_outputs.last_hidden_state[:, 0, :]
        else:
            liked_embeddings = torch.zeros(1, model.config.hidden_size)

        if disliked_post_inputs:
            disliked_outputs = model(**disliked_post_inputs)
            disliked_embeddings = disliked_outputs.last_hidden_state[:, 0, :]
        else:
            disliked_embeddings = torch.zeros(1, model.config.hidden_size)

    similarities_liked = torch.matmul(all_embeddings, liked_embeddings.T)
    similarities_disliked = torch.matmul(all_embeddings, disliked_embeddings.T)

    recommendation_scores = torch.zeros(len(posts))

    for i, post in enumerate(posts):
        if post.author_id in following_ids:
            recommendation_scores[i] += 50

    recommendation_scores += similarities_liked.mean(dim=1) - similarities_disliked.mean(dim=1)

    sorted_indices = torch.argsort(recommendation_scores, descending=True)

    recommended_posts = [posts[i] for i in sorted_indices]
    return recommended_posts

In [None]:
recommended_posts = get_recommendations1(all_posts, liked_posts, disliked_posts, current_user_forum.following_ids)
print("Recommended posts:")
for post in recommended_posts:
    print(post)

Recommended posts:
id=ObjectId('65ecf3e9d831834e9e21a97f') author_id=ObjectId('65e475a5d831837d3a72eac5') title='La lautari' content='Unde mergem sa petrecem\nCand plecam din cluburi tari?!\nLa lautari, unde canta lautari\nhttps://www.versuri.ro/w/8jt2\nUnde intra spritu bine\nSi se fac petreceri mari\nLa lautari, unde canta lautari' hashtags=[] response_to_id=None
id=ObjectId('65ecf558d831834e9e21a982') author_id=ObjectId('65e475a5d831837d3a72eac5') title='Nefiu' content='Eu cu Amtilb stăm în spate, patru bagaboante\nPar puțin fumate, deci sunt sparte, parfumate\nO împart fiindcă sunt dulce cofetar de\nStradă mușc din savarină și biscuiți cu lapte\nEu cu Amtilb stăm în spate, patru bagaboante\nNe conduc mașina așa șofer n-ai văzut frate\nLe duc până la cofetărie-n spate\nIau mini-eclere și le întreb: „parlez-vous français”?' hashtags=[] response_to_id=ObjectId('65ecf3e9d831834e9e21a97f')
id=ObjectId('65e726e8e64d1e32b88e928b') author_id=ObjectId('65e6e573e64d1e32dacf9881') title='Imi 