In [6]:
import pandas as pd
import joblib
from sentence_transformers import SentenceTransformer
from gensim.models import Word2Vec
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Clickbait dataset: the base table. Later cells read its 'clickbait' column
# (a value of 1 is counted as clickbait) and use len(df) to sample reference rows.
df = pd.read_csv('clickbait_data.csv')

# Pre-computed TF-IDF artifacts loaded from disk.
# NOTE(review): joblib.load and pd.read_pickle unpickle their input, which can
# execute arbitrary code -- only load files that come from a trusted source.
# tfidf_matrix is indexed by row position in get_tfidf_recommendations, so it is
# presumably row-aligned with df -- TODO confirm.
# tfidf_vectorizer is not referenced by the cells visible in this notebook export.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
tfidf_matrix = joblib.load('tfidf_matrix.pkl')

# Word2Vec: the trained model plus a frame whose 'vector' column holds one
# embedding per row (read by get_word2vec_recommendations).
# word2vec_model itself is not referenced by the visible cells.
word2vec_model = Word2Vec.load('word2vec_model.model')
df_word2vec = pd.read_pickle('clickbait_data_with_vectors.pkl')

# Sentence Transformer: the encoder plus a frame whose 'vector' column holds one
# embedding per row (read by get_sentence_transformer_recommendations).
# NOTE(review): presumably the vectors were produced with this same
# 'all-MiniLM-L6-v2' model -- confirm against the notebook that built the pickle.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
df_sentence_transformers = pd.read_pickle('clickbait_data_with_sentence_vectors.pkl')


  from tqdm.autonotebook import tqdm, trange


In [7]:
def get_tfidf_recommendations(target_index, n_choices=3):
    """Return the row positions most similar to a target row under TF-IDF.

    Similarity is the cosine similarity between row ``target_index`` of the
    module-level ``tfidf_matrix`` and every row of that matrix.

    Args:
        target_index: Row position of the reference item in ``tfidf_matrix``.
        n_choices: Maximum number of recommendations to return.

    Returns:
        A NumPy array of at most ``n_choices`` row positions, ordered from most
        to least similar. The target row itself is never included.
    """
    target_vector = tfidf_matrix[target_index]
    similarities = cosine_similarity(target_vector, tfidf_matrix).flatten()

    # Exclude the target explicitly instead of assuming it sorts last: with
    # duplicate rows (ties at 1.0) or an all-zero target vector (every score 0)
    # the target is not guaranteed to be the top-ranked entry, so dropping
    # "whatever ranks first" could discard a real neighbour and return the
    # target as its own recommendation.
    similarities[target_index] = -np.inf

    ranked = similarities.argsort()[::-1]
    # The masked target now ranks last; slice it off so it cannot leak into the
    # result even when n_choices is at least the number of rows.
    return ranked[:-1][:n_choices]


In [8]:
def get_word2vec_recommendations(target_index, n_choices=3):
    """Return the row positions most similar to a target row under Word2Vec vectors.

    Similarity is the cosine similarity between the ``'vector'`` entry of row
    ``target_index`` in the module-level ``df_word2vec`` frame and the
    ``'vector'`` entry of every row in that frame.

    Args:
        target_index: Row position (``iloc``) of the reference item.
        n_choices: Maximum number of recommendations to return.

    Returns:
        A NumPy array of at most ``n_choices`` row positions, ordered from most
        to least similar. The target row itself is never included.
    """
    target_vector = df_word2vec.iloc[target_index]['vector']
    vectors = np.vstack(df_word2vec['vector'].values)
    similarities = cosine_similarity([target_vector], vectors).flatten()

    # Exclude the target explicitly instead of assuming it sorts last: rows
    # with identical vectors tie at 1.0 and an all-zero vector scores 0 against
    # everything, so the target is not guaranteed to be the top-ranked entry.
    similarities[target_index] = -np.inf

    ranked = similarities.argsort()[::-1]
    # The masked target now ranks last; slice it off so it cannot leak into the
    # result even when n_choices is at least the number of rows.
    return ranked[:-1][:n_choices]


In [9]:
def get_sentence_transformer_recommendations(target_index, n_choices=3):
    """Return the row positions most similar to a target row under sentence embeddings.

    Similarity is the cosine similarity between the ``'vector'`` entry of row
    ``target_index`` in the module-level ``df_sentence_transformers`` frame and
    the ``'vector'`` entry of every row in that frame.

    Args:
        target_index: Row position (``iloc``) of the reference item.
        n_choices: Maximum number of recommendations to return.

    Returns:
        A NumPy array of at most ``n_choices`` row positions, ordered from most
        to least similar. The target row itself is never included.
    """
    target_vector = df_sentence_transformers.iloc[target_index]['vector']
    vectors = np.vstack(df_sentence_transformers['vector'].values)
    similarities = cosine_similarity([target_vector], vectors).flatten()

    # Exclude the target explicitly instead of assuming it sorts last: rows
    # with identical embeddings tie at 1.0, so the target is not guaranteed to
    # be the top-ranked entry and could otherwise be recommended to itself.
    similarities[target_index] = -np.inf

    ranked = similarities.argsort()[::-1]
    # The masked target now ranks last; slice it off so it cannot leak into the
    # result even when n_choices is at least the number of rows.
    return ranked[:-1][:n_choices]


In [15]:
import random


# Seed Python's global `random` generator so the sample below is repeatable.
# simulate_recommendations_random draws from this same global generator through
# random.choice, so its results depend on this seed and on every draw made
# between this cell and that call (re-running cells changes them).
random.seed(42)
# 100 distinct row positions of df, used as the reference (target) items for
# every recommender. Raises ValueError if df has fewer than 100 rows.
# NOTE(review): the same positions are later applied to tfidf_matrix,
# df_word2vec and df_sentence_transformers -- this assumes they all share df's
# row order and length; TODO confirm.
reference_indices = random.sample(range(len(df)), 100)


In [22]:
def simulate_recommendations_closest(data, get_recommendations_func, reference_indices, n_choices=3):
    """Simulate a reader who always clicks the first recommendation offered.

    For each position in ``reference_indices`` the recommender is asked for
    ``n_choices`` items. Every returned item is tallied as "proposed" and the
    first returned item is tallied as "clicked"; both tallies are split by the
    ``'clickbait'`` column of ``data`` (a value equal to 1 counts as clickbait,
    anything else as non-clickbait).

    Args:
        data: DataFrame with a ``'clickbait'`` column; recommendations are
            interpreted as row positions into it.
        get_recommendations_func: Callable ``(target_index, n_choices)``
            returning a sequence (list or NumPy array) of row positions.
        reference_indices: Iterable of target row positions to simulate.
        n_choices: Number of recommendations requested per target.

    Returns:
        Dict with integer counts ``proposed_clickbait``,
        ``proposed_non_clickbait``, ``clicked_clickbait``,
        ``clicked_non_clickbait`` and matching ``*_pct`` strings such as
        ``"66.0%"`` (``"0%"`` when the corresponding total is zero).
    """
    def _pct(count, total):
        # Share of `total` as text; "0%" when nothing was counted.
        return f"{round((count / total) * 100, 2)}%" if total > 0 else "0%"

    # Read the label column once. Going through data.iloc[idx]['clickbait']
    # builds a full row Series (every column) for each single lookup.
    labels = data['clickbait'].to_numpy()

    proposed_clickbait = 0
    proposed_non_clickbait = 0
    clicked_clickbait = 0
    clicked_non_clickbait = 0

    for target_index in reference_indices:
        similar_indices = get_recommendations_func(target_index, n_choices)

        # Nothing proposed means nothing to click. Without this guard the
        # [0] lookup below raises IndexError (e.g. n_choices=0). len() is used
        # rather than truthiness because the result may be a NumPy array.
        if len(similar_indices) == 0:
            continue

        for idx in similar_indices:
            if labels[idx] == 1:
                proposed_clickbait += 1
            else:
                proposed_non_clickbait += 1

        # The recommenders return items best-first, so position 0 is the
        # closest match.
        if labels[similar_indices[0]] == 1:
            clicked_clickbait += 1
        else:
            clicked_non_clickbait += 1

    total_proposed = proposed_clickbait + proposed_non_clickbait
    total_clicked = clicked_clickbait + clicked_non_clickbait

    return {
        'proposed_clickbait': proposed_clickbait,
        'proposed_non_clickbait': proposed_non_clickbait,
        'clicked_clickbait': clicked_clickbait,
        'clicked_non_clickbait': clicked_non_clickbait,
        'proposed_clickbait_pct': _pct(proposed_clickbait, total_proposed),
        'proposed_non_clickbait_pct': _pct(proposed_non_clickbait, total_proposed),
        'clicked_clickbait_pct': _pct(clicked_clickbait, total_clicked),
        'clicked_non_clickbait_pct': _pct(clicked_non_clickbait, total_clicked)
    }

In [17]:
def simulate_recommendations_random(data, get_recommendations_func, reference_indices, n_choices=3):
    """Simulate a reader who clicks one of the offered recommendations at random.

    For each position in ``reference_indices`` the recommender is asked for
    ``n_choices`` items. Every returned item is tallied as "proposed" and one
    item picked with ``random.choice`` is tallied as "clicked"; both tallies
    are split by the ``'clickbait'`` column of ``data`` (a value equal to 1
    counts as clickbait, anything else as non-clickbait).

    The pick uses Python's global ``random`` generator, so results are only
    repeatable if that generator is seeded to a known state before the call.

    Args:
        data: DataFrame with a ``'clickbait'`` column; recommendations are
            interpreted as row positions into it.
        get_recommendations_func: Callable ``(target_index, n_choices)``
            returning a sequence (list or NumPy array) of row positions.
        reference_indices: Iterable of target row positions to simulate.
        n_choices: Number of recommendations requested per target.

    Returns:
        Dict with integer counts ``proposed_clickbait``,
        ``proposed_non_clickbait``, ``clicked_clickbait``,
        ``clicked_non_clickbait`` and matching ``*_pct`` strings such as
        ``"66.0%"`` (``"0%"`` when the corresponding total is zero).
    """
    def _pct(count, total):
        # Share of `total` as text; "0%" when nothing was counted.
        return f"{round((count / total) * 100, 2)}%" if total > 0 else "0%"

    # Read the label column once. Going through data.iloc[idx]['clickbait']
    # builds a full row Series (every column) for each single lookup.
    labels = data['clickbait'].to_numpy()

    proposed_clickbait = 0
    proposed_non_clickbait = 0
    clicked_clickbait = 0
    clicked_non_clickbait = 0

    for target_index in reference_indices:
        similar_indices = get_recommendations_func(target_index, n_choices)

        # Nothing proposed means nothing to click. Without this guard
        # random.choice raises IndexError on an empty sequence (e.g.
        # n_choices=0). len() is used rather than truthiness because the
        # result may be a NumPy array.
        if len(similar_indices) == 0:
            continue

        # Exactly one draw per non-empty target, made before tallying, so the
        # global RNG is consumed in the same order as before.
        chosen_index = random.choice(similar_indices)

        for idx in similar_indices:
            if labels[idx] == 1:
                proposed_clickbait += 1
            else:
                proposed_non_clickbait += 1

        if labels[chosen_index] == 1:
            clicked_clickbait += 1
        else:
            clicked_non_clickbait += 1

    total_proposed = proposed_clickbait + proposed_non_clickbait
    total_clicked = clicked_clickbait + clicked_non_clickbait

    return {
        'proposed_clickbait': proposed_clickbait,
        'proposed_non_clickbait': proposed_non_clickbait,
        'clicked_clickbait': clicked_clickbait,
        'clicked_non_clickbait': clicked_non_clickbait,
        'proposed_clickbait_pct': _pct(proposed_clickbait, total_proposed),
        'proposed_non_clickbait_pct': _pct(proposed_non_clickbait, total_proposed),
        'clicked_clickbait_pct': _pct(clicked_clickbait, total_clicked),
        'clicked_non_clickbait_pct': _pct(clicked_non_clickbait, total_clicked)
    }


In [23]:
# Run both click strategies for each recommender on the same reference items.
# The statement order is significant: the "random" strategy draws from the
# global `random` generator, so reordering these calls changes its results.

# --- TF-IDF ---
results_tfidf_closest = simulate_recommendations_closest(
    data=df,
    get_recommendations_func=get_tfidf_recommendations,
    reference_indices=reference_indices,
)
results_tfidf_random = simulate_recommendations_random(
    data=df,
    get_recommendations_func=get_tfidf_recommendations,
    reference_indices=reference_indices,
)
print("TF-IDF Results (Closest):", results_tfidf_closest)
print("TF-IDF Results (Random):", results_tfidf_random)

# --- Word2Vec ---
results_word2vec_closest = simulate_recommendations_closest(
    data=df_word2vec,
    get_recommendations_func=get_word2vec_recommendations,
    reference_indices=reference_indices,
)
results_word2vec_random = simulate_recommendations_random(
    data=df_word2vec,
    get_recommendations_func=get_word2vec_recommendations,
    reference_indices=reference_indices,
)
print("Word2Vec Results (Closest):", results_word2vec_closest)
print("Word2Vec Results (Random):", results_word2vec_random)

# --- Sentence Transformer ---
results_sentence_transformers_closest = simulate_recommendations_closest(
    data=df_sentence_transformers,
    get_recommendations_func=get_sentence_transformer_recommendations,
    reference_indices=reference_indices,
)
results_sentence_transformers_random = simulate_recommendations_random(
    data=df_sentence_transformers,
    get_recommendations_func=get_sentence_transformer_recommendations,
    reference_indices=reference_indices,
)
print("Sentence Transformer Results (Closest):", results_sentence_transformers_closest)
print("Sentence Transformer Results (Random):", results_sentence_transformers_random)


TF-IDF Results (Closest): {'proposed_clickbait': 198, 'proposed_non_clickbait': 102, 'clicked_clickbait': 63, 'clicked_non_clickbait': 37, 'proposed_clickbait_pct': '66.0%', 'proposed_non_clickbait_pct': '34.0%', 'clicked_clickbait_pct': '63.0%', 'clicked_non_clickbait_pct': '37.0%'}
TF-IDF Results (Random): {'proposed_clickbait': 198, 'proposed_non_clickbait': 102, 'clicked_clickbait': 67, 'clicked_non_clickbait': 33, 'proposed_clickbait_pct': '66.0%', 'proposed_non_clickbait_pct': '34.0%', 'clicked_clickbait_pct': '67.0%', 'clicked_non_clickbait_pct': '33.0%'}
Word2Vec Results (Closest): {'proposed_clickbait': 172, 'proposed_non_clickbait': 128, 'clicked_clickbait': 58, 'clicked_non_clickbait': 42, 'proposed_clickbait_pct': '57.33%', 'proposed_non_clickbait_pct': '42.67%', 'clicked_clickbait_pct': '58.0%', 'clicked_non_clickbait_pct': '42.0%'}
Word2Vec Results (Random): {'proposed_clickbait': 172, 'proposed_non_clickbait': 128, 'clicked_clickbait': 57, 'clicked_non_clickbait': 43, 'p