In [48]:
import numpy as np

from app.models import Session, Headline, Article, Agency, Topic
import pandas as pd
from datetime import datetime as dt, timedelta as td
from app.utils.constants import Constants, Country
from app.utils.config import Config
from sqlalchemy import or_


# Load the last seven days of headlines together with article URL, topic,
# agency name and agency bias. Only agencies whose country equals
# Country.us.value, or whose name is listed in Config.exempted_foreign_media,
# are kept.
# NOTE(review): the 7-day window is applied to Article.first_accessed while the
# 'date' column below holds Headline.first_accessed -- confirm that is intended.
with Session() as s:
    week_ago = dt.now() - td(days=7)
    allowed_agency = or_(
        Agency._country == Country.us.value,
        Agency.name.in_(Config.exempted_foreign_media),
    )
    topic_query = (
        s.query(
            Headline.processed,
            Headline.first_accessed,
            Article.url,
            Topic.name,
            Agency.name,
            Agency._bias,
        )
        .join(Headline.article)
        .join(Article.agency)
        .join(Article.topic)
        .filter(Article.first_accessed > week_ago, allowed_agency)
    )
    headlines = topic_query.all()

# One row per query result; column order mirrors the order of the selected fields.
topic_columns = ['headline', 'date', 'url', 'topic', 'agency', 'bias']
topic_df = pd.DataFrame(headlines, columns=topic_columns)
topic_df.head()

In [49]:
# Same seven-day headline pull as a topic-less query: Article.topic is not
# joined here, and no topic column is selected.
with Session() as s:
    week_ago = dt.now() - td(days=7)
    allowed_agency = or_(
        Agency._country == Country.us.value,
        Agency.name.in_(Config.exempted_foreign_media),
    )
    headline_query = (
        s.query(
            Headline.processed,
            Headline.first_accessed,
            Article.url,
            Agency.name,
            Agency._bias,
        )
        .join(Headline.article)
        .join(Article.agency)
        .filter(Article.first_accessed > week_ago, allowed_agency)
    )
    headlines = headline_query.all()

# Column order mirrors the order of the selected fields.
headline_columns = ['headline', 'date', 'url', 'agency', 'bias']
df = pd.DataFrame(headlines, columns=headline_columns)

In [50]:
# Number of headline rows loaded for the week.
df.shape[0]

In [51]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Shared by preprocess(): a lemmatizer instance and the English stop-word list
# held as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Normalise a headline into a space-separated string of lemmas.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the remainder is tokenised with ``word_tokenize``, tokens found
    in ``stop_words`` are discarded, and the surviving tokens are lemmatised.

    Args:
        text: The raw headline string.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    lemmas = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Normalised headline text (see preprocess) used to build the TF-IDF vocabulary.
df['processed'] = df['headline'].apply(preprocess)
print(len(df))

# Fit the vocabulary on a random sample of at most SAMPLE_SIZE headlines.
# DataFrame.sample raises ValueError when asked for more rows than exist
# (replace=False), so the request is capped at len(df). The seed is fixed so
# the sample, and therefore the fitted vocabulary, is reproducible across runs.
SAMPLE_SIZE = 25000
SAMPLE_SEED = 42
samples = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

# Unigrams and bigrams.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(samples['processed'])


In [70]:
from sklearn.metrics.pairwise import cosine_similarity

# Headlines whose 'date' falls within the last 24 hours; copied so that adding
# columns below does not touch df.
today_df = df[df['date'] > dt.now() - td(days=1)].copy()
today_df['processed'] = today_df['headline'].apply(preprocess)

# Project today's headlines into the vocabulary fitted on the weekly sample.
# The transform is computed once and reused for the similarity matrix.
today_tfidf = vectorizer.transform(today_df['processed'])

# Rows: sampled weekly headlines; columns: today's headlines.
# NOTE(review): the original comment said headlines without any similarity are
# dropped, but nothing is filtered here -- cosine_sim is only computed.
cosine_sim = cosine_similarity(tfidf_matrix, today_tfidf)

print(len(today_df))

In [72]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Densify today's TF-IDF rows and standardise each feature column.
scaler = StandardScaler()
today_dense = today_tfidf.toarray()
scaler.fit(today_dense)
data_scaled = scaler.transform(today_dense)

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance.
pca = PCA(n_components=0.95)
data_pca = pca.fit_transform(data_scaled)

print(f"Original shape: {data_scaled.shape}")
print(f"Reduced shape: {data_pca.shape}")

In [102]:
import hdbscan
# Cluster today's headlines directly on their TF-IDF vectors.
# NOTE(review): data_pca is not used here -- confirm clustering on the raw
# TF-IDF matrix rather than the PCA projection is intended.
today_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=5, cluster_selection_epsilon=0.000001, cluster_selection_method='leaf')
today_labels = today_clusterer.fit_predict(today_tfidf)
today_df['cluster'] = today_labels

# HDBSCAN assigns the label -1 to noise points; that label is not a cluster,
# so it is excluded from the cluster count and reported separately.
is_noise = today_df['cluster'] == -1
print("Number of clusters:", today_df.loc[~is_noise, 'cluster'].nunique())
print("Noise points:", int(is_noise.sum()))

In [103]:
# Headlines assigned to cluster 0: print the member count, then show the rows.
in_cluster = today_df['cluster'].eq(0)
cluster = today_df.loc[in_cluster]
print(cluster.shape[0])
cluster

In [104]:
# Headlines assigned to cluster 1: print the member count, then show the rows.
in_cluster = today_df['cluster'].eq(1)
cluster = today_df.loc[in_cluster]
print(cluster.shape[0])
cluster

In [105]:
# Headlines assigned to cluster 2: print the member count, then show the rows.
in_cluster = today_df['cluster'].eq(2)
cluster = today_df.loc[in_cluster]
print(cluster.shape[0])
cluster

In [99]:
# Headlines assigned to cluster 3: print the member count, then show the rows.
in_cluster = today_df['cluster'].eq(3)
cluster = today_df.loc[in_cluster]
print(cluster.shape[0])
cluster

In [100]:
# Headlines labelled -1 (the label HDBSCAN gives to points it treats as noise):
# print how many there are, then show the rows.
in_cluster = today_df['cluster'].eq(-1)
cluster = today_df.loc[in_cluster]
print(cluster.shape[0])
cluster

In [101]:
# Show the headlines assigned to cluster 100 (empty if no such label exists).
today_df.loc[today_df['cluster'].eq(100)]

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity among today's headlines; with no second argument
# cosine_similarity compares the matrix against itself.
cosine_sim = cosine_similarity(today_tfidf)

In [12]:
import pandas as pd
# Work on a private copy so the diagonal can be zeroed without relying on
# DataFrame.values being a writable view (it is read-only under pandas
# Copy-on-Write) and without mutating cosine_sim itself.
sim_values = np.array(cosine_sim, dtype=float)
np.fill_diagonal(sim_values, 0)  # a headline is trivially identical to itself
sim_df = pd.DataFrame(sim_values, index=today_df.index, columns=today_df.index)

# The matrix is symmetric, so keep only the strict upper triangle: each
# unordered pair of headlines then appears once instead of as (i, j) and (j, i).
row_pos, col_pos = np.triu_indices(sim_values.shape[0], k=1)
pair_index = pd.MultiIndex.from_arrays(
    [today_df.index[row_pos], today_df.index[col_pos]]
)
top_indices = pd.DataFrame(
    {'cosine_sim': sim_values[row_pos, col_pos]}, index=pair_index
)

# Most similar pairs first; discard pairs at or below the 0.05 similarity floor.
# .copy() so later column assignments on pairs do not warn about writing to a slice.
pairs = top_indices.sort_values(by='cosine_sim', ascending=False)
pairs = pairs[pairs['cosine_sim'] > 0.05].copy()

In [13]:
# The two levels of pairs.index hold index *labels* taken from today_df, so
# look the headline text up by label in today_df. The previous positional
# df.iloc lookup was only correct while df had a default 0..n-1 index.
headline_by_label = today_df['headline']
pairs['headline1'] = headline_by_label.reindex(pairs.index.get_level_values(0)).to_numpy()
pairs['headline2'] = headline_by_label.reindex(pairs.index.get_level_values(1)).to_numpy()
pairs

In [14]:
# Number of pairs whose similarity exceeds 0.05.
int(pairs['cosine_sim'].gt(0.05).sum())

In [15]:
# Pairs at the weak end of the kept range: similarity below 0.1.
pairs.loc[pairs['cosine_sim'].lt(0.1)]