## Libraries

In [1]:
from pymongo import MongoClient
from bson import ObjectId
import math
from datetime import datetime
import flask
from flask_cors import CORS, cross_origin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import time
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Reshape, Flatten, Input, Dot, Lambda, Dense
from tensorflow.keras.models import Model
import nltk
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

## Building connections and settings constants

In [2]:
# CHANGE the connection string to your MongoDB database instance
#client = MongoClient("mongodb+srv://officialmemeon:nRPOPkW3rRYzlH83@cluster0.vkb7bra.mongodb.net/")  
#db = client["test"] 
#collection = db["posts"]
#collectionUser = db["users"]

#Listposts = list(collection.find())
#Listusers = list(collectionUser.find())


In [3]:
#userIds = []
#postIds = []
#for user in Listusers:
#    userIds.append(str(user["_id"]))
#for post in Listposts:
#    postIds.append(str(post["_id"]))


In [23]:
users = pd.read_csv("users.csv")
posts = pd.read_csv("posts.csv")
ListPosts = posts.values.tolist()

In [5]:
# retrieving user and post data from the database and appending into the lists: "users, posts"
#postIds


In [9]:
#print(Listposts)
PostIds = []
for post in posts["_id"]:
    PostIds.append(post)
PostIds

['64dc7749f42bbd9057cb21a8',
 '64dc7abdf42bbd9057cb22c3',
 '64dc7bc1f42bbd9057cb2360',
 '64dc8123f42bbd9057cb2436',
 '64dcbebdf42bbd9057cb29ce',
 '64dcc0f4f42bbd9057cb2a18',
 '64dcd1bff42bbd9057cb2d0e',
 '64dcd80ef42bbd9057cb2dc5',
 '64dcf47ef42bbd9057cb32e2',
 '64dcf952f42bbd9057cb344c',
 '64dcfd62f42bbd9057cb35a8',
 '64dd02c3f42bbd9057cb3808',
 '64dd053ff42bbd9057cb3892',
 '64dd05bdf42bbd9057cb38a4',
 '64dd22a6f5554054a3467894',
 '64dd375cf5554054a3467cb0',
 '64dd526ff5554054a346809d',
 '64dd729a3713e91686492e81',
 '64dd816e3713e9168649366b',
 '64dddc083713e91686494f6b',
 '64de5d6d3713e91686496acb',
 '64de85213713e916864974c2',
 '64deef0762f43d93943b9b6c',
 '64df1f2c62f43d93943baadf',
 '64df39df62f43d93943bb0d6',
 '64df6aca62f43d93943bb5de',
 '64dfc7f162f43d93943bc915',
 '64e0dcdb62f43d93943c21f2',
 '64e0fda562f43d93943c2a78',
 '64e14fc862f43d93943c3a0c',
 '64e2206962f43d93943c5863',
 '64e22d1862f43d93943c5962',
 '64e51108b0b6c574fef60f2a',
 '64e511a9b0b6c574fef60fd8',
 '64e92d8717cb

## Data Preparation for Embedding

In [10]:
import nltk
tokenized_posts = []

for index, row in posts.iterrows():
    post_id = row['_id']
    description = row['description']
    tags = ' '.join(row['tags']).lower()
    comments = row['comments']
    combined_text = f"{description} {tags} {comments}"
    tokenized_text = nltk.word_tokenize(combined_text)
    tokenized_text.append(post_id)
    tokenized_posts.append(tokenized_text)
    

## Preparing model

In [12]:
# Create Word2Vec model with tokenized posts and post IDs
model = Word2Vec(sentences=tokenized_posts, vector_size=100, window=5, min_count=1, sg=0)

post_id_to_vector = {}
# Iterate through post IDs and get their vectors from the Word2Vec model
for post_id in PostIds:
    if str(post_id) in model.wv:
        post_id_to_vector[post_id] = model.wv[str(post_id)]

# Save the post_id_to_vector dictionary (optional)
import pickle

with open("post_id_to_vector.pkl", "wb") as f:
    pickle.dump(post_id_to_vector, f)
post_id_to_vector

{'64dc7749f42bbd9057cb21a8': array([-0.00562449,  0.00822842, -0.00303315,  0.0060672 , -0.00531338,
        -0.01497616,  0.01541625,  0.01755776, -0.00615541, -0.00150967,
         0.00014103, -0.00833257,  0.00461868, -0.00422929,  0.01002542,
        -0.00437111,  0.01578623,  0.01093747,  0.00041269, -0.01660049,
         0.01206395, -0.0008696 ,  0.00575398, -0.01120839, -0.00621795,
        -0.00568889, -0.00239392,  0.01284672, -0.00858224,  0.00396259,
         0.00486201, -0.0109802 ,  0.00593352, -0.00568426,  0.00804192,
        -0.00078993,  0.00794528,  0.00692814,  0.00196751, -0.00867826,
        -0.00647152,  0.00152735,  0.0029352 ,  0.00335016,  0.00612863,
         0.00282274,  0.00358162,  0.008884  , -0.00453929,  0.01622511,
         0.00474606,  0.00304755, -0.00225905, -0.01064913,  0.00240717,
        -0.00576375, -0.00590083, -0.00540045, -0.00744202,  0.00355403,
         0.00569138,  0.00875881,  0.00142101,  0.0064059 , -0.00549707,
         0.00882958, -0

In [13]:
def get_post_vector(post_id):
    if post_id in PostIds:
        return PostIds.index(post_id)
    else:
        # Handle the case where the post ID doesn't exist in your dataset
        return None  # Or return a default vector or handle the error as per your requirements


## Recommendations using Embedding

In [37]:
from scipy.spatial.distance import cosine

def get_post_recommendations(given_post_id, threshold=0.05):
    similar_posts = []
    given_post_vector = post_id_to_vector.get(given_post_id)
    
    #print("Given Post ID:", given_post_id)
    #print("Given Post Vector:", given_post_vector)
    
    if given_post_vector is not None:
        for post_id, post_vector in post_id_to_vector.items():
            if post_id != given_post_id:
                similarity = 1 - cosine(given_post_vector, post_vector)
#                print(f"Post ID: {post_id}, Similarity: {similarity}")
                if similarity >= threshold:
                    similar_posts.append((post_id, similarity))
        
        # Sort similar posts by similarity score in descending order
        similar_posts.sort(key=lambda x: x[1], reverse=True)
    
    return similar_posts

# Example usage
given_post_id = '6531bc1d0a69ece40ff69c01'  # Replace with the desired post ID
similar_posts = get_post_recommendations(given_post_id)

similar_posts_ids = []
print("Recommended posts for the user:")
for similar_post in similar_posts:
    similar_posts_ids.append(similar_post[0])
    
similar_posts_ids

Recommended posts for the user:


['653018520a69ece40ff623c3',
 '6518c385e81df403dee8e3c3',
 '654a75ad318aa113d74ae125',
 '653801d90a69ece40ff76ac0',
 '6529f6170de3e77117de845c',
 '653108c70a69ece40ff67776',
 '654fe5dc721853536741fa45',
 '65486096318aa113d74ada81',
 '6536a7b90a69ece40ff73fd0',
 '65305dbf0a69ece40ff656c7',
 '653dc2fb3e06c4b204e56683',
 '6535152f0a69ece40ff70616',
 '64df39df62f43d93943bb0d6',
 '652f0b4c0de3e77117defb6b',
 '6530122e0a69ece40ff61839',
 '6536a60e0a69ece40ff73e91',
 '652886b80de3e77117de5da7',
 '6536f27b0a69ece40ff74924',
 '653019c60a69ece40ff62738',
 '653124180a69ece40ff67dfc',
 '64e51108b0b6c574fef60f2a',
 '6530163f0a69ece40ff61f24',
 '64dd02c3f42bbd9057cb3808',
 '653179410a69ece40ff68de0',
 '650b85af81f1d3754141e851',
 '651934b2e81df403dee90f9e',
 '650c22eb81f1d3754142da2e',
 '64deef0762f43d93943b9b6c',
 '64dd053ff42bbd9057cb3892',
 '650e283c81f1d3754145831b',
 '653017100a69ece40ff6213d',
 '654bfb15318aa113d74aec25',
 '653126750a69ece40ff67ecd',
 '65351a000a69ece40ff70740',
 '651b24552f19

## Item based CF

In [41]:
# Import the user-item matrix
df_user_item_matrix = pd.read_csv('user_item_matrix.csv')

# Calculate item similarities using cosine similarity
item_similarities = cosine_similarity(df_user_item_matrix.T)

print("Item Similarities:")
print(item_similarities)

active_user_likes = {'6531bc1d0a69ece40ff69c01'}

Item Similarities:
[[1.         0.40937761 0.30779351 ... 0.         0.         0.        ]
 [0.40937761 1.         0.36273813 ... 0.         0.         0.        ]
 [0.30779351 0.36273813 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [42]:
liked_post_indices = [i for i, post in enumerate(ListPosts) if str(post[0]) in active_user_likes]
average_similarity_scores = np.mean(item_similarities[liked_post_indices], axis=0)
sorted_post_indices = np.argsort(average_similarity_scores)[::-1]

N = 50  # Number of recommended posts
recommended_posts = [ListPosts[i] for i in sorted_post_indices[:N]]

print("Recommended Posts:")
recommended_posts_ids = []
for post in recommended_posts:
    recommended_posts_ids.append(str(post[0]))
recommended_posts_ids

Recommended Posts:


['653176110a69ece40ff68bb5',
 '6531bc1d0a69ece40ff69c01',
 '65315db90a69ece40ff687c6',
 '6536f27b0a69ece40ff74924',
 '6531a99e0a69ece40ff6986f',
 '653781480a69ece40ff757bc',
 '654a75fe318aa113d74ae1c6',
 '655917c672185353674210f1',
 '6559174a72185353674210db',
 '64e9c8f717cb4912c1155a8d',
 '653801a20a69ece40ff76a96',
 '653019c60a69ece40ff62738',
 '654bfa1f318aa113d74aebfa',
 '65310a880a69ece40ff6789b',
 '654a7662318aa113d74ae1ea',
 '65310f250a69ece40ff679ac',
 '654a7681318aa113d74ae1fc',
 '6530163f0a69ece40ff61f24',
 '6536a60e0a69ece40ff73e91',
 '6536a5cd0a69ece40ff73e83',
 '653108c70a69ece40ff67776',
 '6530d1dd0a69ece40ff67104',
 '653515800a69ece40ff70624',
 '653008580de3e77117df2a15',
 '65351e500a69ece40ff7078c',
 '65181fe94aba5b2bd4ff53b2',
 '653517960a69ece40ff706a5',
 '65319f490a69ece40ff696c7',
 '65310ee10a69ece40ff679a0',
 '65305d320a69ece40ff6567c',
 '651397b84aba5b2bd4fad0f9',
 '6531e7b30a69ece40ff6a2db',
 '65319e860a69ece40ff696a0',
 '65305dbf0a69ece40ff656c7',
 '6507d09981f1