In [1]:
import os
import sys
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from keybert import KeyBERT
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Resolve the parent of the current working directory (presumably the project
# root — confirm against the repo layout) and put it on the import path so the
# project-local `src` package below can be imported from this notebook.
rpath = os.path.abspath('..')
if rpath not in sys.path:
    # Prepend rather than append so this directory is searched first; the
    # membership check keeps re-running the cell from adding duplicates.
    sys.path.insert(0, rpath)
# Project-local helpers; these imports only resolve after the path tweak above.
from src.loader import NewsDataLoader
from src.utils import preprocess_text

  from tqdm.autonotebook import tqdm, trange


# Initialize Dataloader

In [2]:
# Build the project's loader on the ../data directory (relative to the
# notebook's working directory) and pull three datasets from it.
ndl = NewsDataLoader("../data")
# Article-level table; the similarity cell reads its 'title', 'content' and
# 'source_name' columns.
data = ndl.load_data()
# NOTE(review): presumably per-site traffic data — confirm in src.loader.
traffic = ndl.load_traffic()
# NOTE(review): presumably a domain-to-location mapping — confirm in src.loader.
domain = ndl.load_domain_location()

# How similar are the keywords in the headline/title to the keywords in the news body across sites?

In [3]:
# Title-vs-body keyword similarity, averaged per news source.
#
# Each article's cleaned title and cleaned body are projected into one shared
# TF-IDF space; the cosine similarity between the two vectors is stored per
# article in data['similarity'] and then averaged per 'source_name'.
# Side effects: adds 'cleaned_title', 'cleaned_content' and 'similarity'
# columns to `data`.

# Make sure the NLTK English stop-word corpus is available, then load it as a
# set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Clean both text fields with the project's preprocess_text helper.
data['cleaned_title'] = data['title'].map(lambda text: preprocess_text(text, stop_words))
data['cleaned_content'] = data['content'].map(lambda text: preprocess_text(text, stop_words))

# Fit a single vocabulary / IDF table on titles AND bodies so that both are
# expressed over the same feature columns and are directly comparable.
combined_text = pd.concat([data['cleaned_title'], data['cleaned_content']])

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(combined_text)

title_tfidf = tfidf_vectorizer.transform(data['cleaned_title'])
content_tfidf = tfidf_vectorizer.transform(data['cleaned_content'])

# Row-wise cosine similarity in one vectorised pass instead of one
# cosine_similarity() call per article. TfidfVectorizer() above uses its
# default norm='l2', so every non-empty row already has unit length and the
# cosine of a (title, body) pair is simply their dot product; rows with no
# known terms are all-zero and yield 0.0, the same value the per-row call
# produced. If the vectorizer is ever created with norm=None or 'l1', this
# shortcut no longer holds and the rows must be normalised first.
pairwise_dot = title_tfidf.multiply(content_tfidf).sum(axis=1)
# .sum(axis=1) returns an (n, 1) matrix; flatten to a 1-D array that is
# assigned positionally, exactly like the original list.
data['similarity'] = np.asarray(pairwise_dot).ravel()

# Mean similarity per publisher, with presentation-friendly column names.
site_similarity = data.groupby('source_name')['similarity'].mean().reset_index()
site_similarity.columns = ['Source Name', 'Average Title-Content Similarity']
# Displayed (not stored) view: most title/body-consistent sources first.
site_similarity.sort_values(by='Average Title-Content Similarity', ascending=False)

[nltk_data] Downloading package stopwords to /home/mekbib/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Source Name,Average Title-Content Similarity
26,The Verge,0.773013
11,ETF Daily News,0.622353
9,Deadline,0.413583
16,GlobeNewswire,0.382301
18,Marketscreener.com,0.354881
0,ABC News,0.354594
4,BBC News,0.338615
7,CNA,0.325279
23,The Indian Express,0.294212
24,The Punch,0.291455


# Topic modelling