<h2>Persist Ventures Assignment - Richa Patel</h2>  
<h4>Video Recommendations</h4>  


**Content and Collaborative Filtering Recommendations**

In [None]:
import requests
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import ast

# # Base URL and headers for API requests
# BASE_URL = "https://api.socialverseapp.com"
# HEADERS = {
#     "Flic-Token": "flic_f42bf01b4d011ceaf602290e39116146a1ed7bbfe73bc949e6b19561a34cf4b8"
# }

# # API Fetch Functions
# def fetch_data(endpoint, params={}):
#     url = f"{BASE_URL}/{endpoint}"
#     response = requests.get(url, headers=HEADERS, params=params)
#     if response.status_code == 200:
#         return response.json()
#     else:
#         print(f"Error: {response.status_code} - {response.text}")
#         return {}

# def get_all_posts(page=1, page_size=1000):
#     return fetch_data("posts/summary/get", {"page": page, "page_size": page_size}).get("posts", [])

# def get_viewed_posts(page=1, page_size=1000):
#     return fetch_data("posts/view", {"page": page, "page_size": page_size}).get("posts", [])

# def get_liked_posts(page=1, page_size=1000):
#     return fetch_data("posts/like", {"page": page, "page_size": page_size}).get("posts", [])

# def get_all_users(page=1, page_size=1000):
#     return fetch_data("users/get_all", {"page": page, "page_size": page_size}).get("users", [])

# # Load data from live APIs
# posts_data = get_all_posts()
# viewed_posts_data = get_viewed_posts()
# liked_posts_data = get_liked_posts()
# users_data = get_all_users()

# # Convert API data to DataFrames
# posts_df = pd.DataFrame(posts_data)
# viewed_posts_df = pd.DataFrame(viewed_posts_data)
# liked_posts_df = pd.DataFrame(liked_posts_data)
# users_df = pd.DataFrame(users_data)



# # Define file paths
# posts_file = "posts_data.csv"
# viewed_posts_file = "viewed_posts_data.csv"
# liked_posts_file = "liked_posts_data.csv"
# users_file = "users_data.csv"

# # Save DataFrames to CSV files
# posts_df.to_csv(posts_file, index=False)
# viewed_posts_df.to_csv(viewed_posts_file, index=False)
# liked_posts_df.to_csv(liked_posts_file, index=False)
# users_df.to_csv(users_file, index=False)

# print("Data saved to CSV files:")
# print(f"- {posts_file}")
# print(f"- {viewed_posts_file}")
# print(f"- {liked_posts_file}")
# print(f"- {users_file}")


# read data from csv
posts_df = pd.read_csv("posts_data.csv")
viewed_posts_df = pd.read_csv("viewed_posts_data.csv")
liked_posts_df = pd.read_csv("liked_posts_data.csv")
users_df = pd.read_csv("users_data.csv")

# Data Preprocessing
# Handle missing values and extract category names
posts_df.fillna('', inplace=True)
if 'category' in posts_df.columns:
    posts_df['category'] = posts_df['category'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    posts_df['category_name'] = posts_df['category'].apply(
        lambda x: x.get('name') if isinstance(x, dict) else None
    )

# Ensure title and post_summary are strings; handle non-string values gracefully
posts_df['title'] = posts_df['title'].apply(lambda x: str(x) if isinstance(x, str) else "")
posts_df['post_summary'] = posts_df['post_summary'].apply(lambda x: str(x) if isinstance(x, str) else "")

# Combine text columns safely
posts_df['combined_text'] = posts_df['title'] + " " + posts_df['post_summary']


# Content-Based Filtering
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(posts_df['combined_text'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def get_content_based_recommendations(username, category=None):
    # Filter posts liked by the user based on `username`
    
    liked_post_ids = liked_posts_df[liked_posts_df['username'] == username]['id'].tolist()
    print(f'liked_posts_ids: {liked_post_ids}')
    if not liked_post_ids:
        # print("HERE")
        # Recommend popular posts for new users
        if category:
            popular_posts = posts_df[posts_df['category_name'] == category]
        popular_posts = popular_posts.sort_values(by='view_count', ascending=False).head(10)

        return popular_posts[['id', 'slug', 'category_name']]
    
    # Recommend similar posts based on the first liked post
    base_post_id = liked_post_ids[0]
    idx = posts_df.index[posts_df['id'] == base_post_id].tolist()[0] 
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]
    # Extract indices of the recommended posts
    recommended_indices = [i[0] for i in sim_scores]
    
    # Select rows from posts_df based on recommended indices
    recommendations = posts_df.iloc[recommended_indices]
    
    if category:
        recommendations = recommendations[recommendations['category_name'] == category]
    return recommendations[['id', 'slug', 'category_name']]


# Check if 'username' can be used as an alternative to 'user_id'
if 'username' in viewed_posts_df.columns:
    user_post_matrix = pd.pivot_table(
        viewed_posts_df, index='username', columns='id', values='view_count', fill_value=0
    )
else:
    raise KeyError("The dataframe does not have a 'user_id' or 'username' column.")


# SVD for Matrix Factorization
svd = TruncatedSVD(n_components=10, random_state=42)
user_post_matrix_svd = svd.fit_transform(user_post_matrix)
predicted_matrix = np.dot(user_post_matrix_svd, svd.components_)

def get_collaborative_filtering_recommendations(username, category=None):
    
    # Handle new users (users not in the interaction matrix)
    if username not in user_post_matrix.index:
        print("New user detected. Recommending 10 popular posts.")
        # Recommend popular posts for new users
        if category:
            # Filter popular posts by category
            popular_posts = posts_df[posts_df['category_name'] == category].sort_values(by='view_count', ascending=False)
        else:
            # Recommend the top popular posts overall
            popular_posts = posts_df.sort_values(by='view_count', ascending=False)
        # Ensure exactly 10 posts are recommended
        return popular_posts[['id', 'slug', 'category_name']].head(10)

    # Generate recommendations for existing users
    # Find the user index in the interaction matrix
    user_idx = user_post_matrix.index.get_loc(username)

    # Get the user's predicted ratings from the collaborative filtering model
    user_predicted_ratings = predicted_matrix[user_idx]

    # Sort items by predicted ratings in descending order
    post_indices = user_predicted_ratings.argsort()[-50:][::-1]  # Take a buffer of 50 to allow for filtering

    # Select posts from the posts_df based on the predicted indices
    recommendations = posts_df.iloc[post_indices]

    # Filter by category if specified
    if category:
        recommendations = recommendations[recommendations['category_name'] == category]

    # Ensure exactly 10 posts are recommended
    return recommendations[['id', 'slug', 'category_name']].head(10)


# Example Usage
# user_id = 5 # old user
user_id = 2 # new user
category_name = 'E/ACC'
username = users_df[users_df['id'] == user_id]['username'].values[0]
print(username)

print("\n================== Content-Based Recommendations ====================")
print(get_content_based_recommendations(username, category=category_name))

print("\n================== Collaborative-Based Recommendations ====================")
print(get_collaborative_filtering_recommendations(username, category=category_name))


kinha

liked_posts_ids: [871, 923, 167, 152, 1208, 1258, 1133, 1104, 1213, 1236, 789, 1246, 1247, 939, 1159, 918, 1160, 1163, 1164, 1161, 788, 714, 996, 775, 377, 779, 173, 698, 84, 898, 702, 619, 662, 663, 664, 82, 151, 549, 527, 370, 371, 449, 216, 207, 177, 189, 190]
Empty DataFrame
Columns: [id, slug, category_name]
Index: []

    id                                      slug category_name
53  80                      don-t-get-distracted         E/ACC
64  93         do-not-conform-to-a-dying-society         E/ACC
66  95     modern-life-is-set-up-to-distract-you         E/ACC
67  97                                 awakening         E/ACC
61  89  it-all-begins-in-your-mind-never-give-up         E/ACC
60  88                  are-you-going-to-make-it         E/ACC
65  94                        the-historical-man         E/ACC
51  78                          choose-your-path         E/ACC


In [50]:
from sklearn.metrics import precision_score, recall_score
import numpy as np

# Compute Diversity
def compute_diversity(recommended_posts):
    unique_recommended = set(recommended_posts)
    total_recommendations = len(recommended_posts)
    diversity = len(unique_recommended) / total_recommendations if total_recommendations > 0 else 0
    return diversity

# Compute Novelty
def compute_novelty(recommended_posts, all_post_popularity):
    # Convert recommended posts into their popularity scores
    popularity_scores = [all_post_popularity.get(post, len(all_post_popularity)) for post in recommended_posts]
    novelty = np.mean(popularity_scores) if len(popularity_scores) > 0 else 0
    return novelty

# Mock popularity data (lower rank means higher popularity)
all_post_popularity = {title: rank for rank, title in enumerate(posts_df['id'].tolist(), start=1)}

# Evaluate Content-Based Recommendations
def evaluate_content_based(username, category=None):
    recommendations = get_content_based_recommendations(username, category)
    recommended_post_ids = recommendations['id'].tolist()

    # Diversity and Novelty
    diversity = compute_diversity(recommended_post_ids)
    novelty = compute_novelty(recommended_post_ids, all_post_popularity)

    return diversity, novelty

# Evaluate Collaborative-Based Recommendations
def evaluate_collaborative(username, category=None):
    recommendations = get_collaborative_filtering_recommendations(username, category)
    recommended_post_ids = recommendations['id'].tolist()

    # Diversity and Novelty
    diversity = compute_diversity(recommended_post_ids)
    novelty = compute_novelty(recommended_post_ids, all_post_popularity)

    return diversity, novelty

# Example Evaluation
print("\n================== Evaluation Metrics ====================")
print("Content-Based Recommendations:")
cb_diversity, cb_novelty = evaluate_content_based(username, category_name)
print(f"Precision: Diversity: {cb_diversity:.2f}, Novelty: {cb_novelty:.2f}")

print("\nCollaborative-Based Recommendations:")
cf_diversity, cf_novelty = evaluate_collaborative(username, category_name)
print(f"Precision: Diversity: {cf_diversity:.2f}, Novelty: {cf_novelty:.2f}")



Content-Based Recommendations:
liked_posts_ids: [30, 11, 58, 29, 61]
Precision: Diversity: 1.00, Novelty: 376.00

Collaborative-Based Recommendations:
Precision: Diversity: 1.00, Novelty: 54.11
