In [39]:
import numpy as np

from app.models import Session, Headline, Article, Agency, Topic
import pandas as pd
from datetime import datetime as dt, timedelta as td


with Session() as s:
    headlines = s.query(Headline.processed, Headline.first_accessed, Article.url, Topic.name, Agency.name, Agency._bias).join(Headline.article).join(Article.agency).join(Article.topic).filter(Article.first_accessed > dt.now() - td(days=7)).all()
topic_df = pd.DataFrame(headlines, columns=['headline', 'date', 'url', 'topic', 'agency', 'bias'])
topic_df.head()

OperationalError: (sqlite3.OperationalError) disk I/O error
[SQL: SELECT headline.processed AS headline_processed, headline.first_accessed AS headline_first_accessed, article.url AS article_url, topic.name AS topic_name, agency.name AS agency_name, agency._bias AS agency__bias 
FROM headline JOIN article ON article.id = headline.article_id JOIN agency ON agency.id = article.agency_id JOIN topic ON topic.id = article.topic_id 
WHERE article.first_accessed > ?]
[parameters: ('2024-04-06 10:15:53.840561',)]
(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [None]:
with Session() as s:
    headlines = s.query(Headline.processed, Headline.first_accessed, Article.url, Agency.name, Agency._bias).join(Headline.article).join(Article.agency).filter(Article.first_accessed > dt.now() - td(days=7)).all()
df = pd.DataFrame(headlines, columns=['headline', 'date', 'url', 'agency', 'bias'])

In [40]:
len(df)

71690

In [41]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

df['processed'] = df['headline'].apply(preprocess)
# get the first 3000 samples
samples = df.sample(25000)
vectorizer = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(samples['processed'])

In [43]:
import hdbscan

clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=1sdfasdfasdfasdasdfasdfasdfasdfasdfasdf  , cluster_selection_epsilon=0.1, cluster_selection_method='eom')
cluster_labels = clusterer.fit_predict(tfidf_matrix)

KeyboardInterrupt: 

In [None]:
samples['cluster'] = cluster_labels
samples[samples['cluster'] == 1]

In [19]:
today_df = df[df['date'] > dt.now() - td(days=1)].copy()
today_df['processed'] = today_df['headline'].apply(preprocess)
today_clusterer = hdbscan.HDBSCAN(min_cluster_size=10, min_samples=20, cluster_selection_epsilon=0.01, cluster_selection_method='leaf')
today_tfidf = vectorizer.transform(today_df['processed'])
today_labels = today_clusterer.fit_predict(today_tfidf)
today_df['cluster'] = today_labels
print("Number of clusters:", len(today_df['cluster'].unique()))
today_df[today_df['cluster'] == 1]

Number of clusters: 5


Unnamed: 0,headline,date,url,agency,bias,processed,cluster
71206,Shopping centre go-kart track approved,2024-04-12 15:00:02.191876,http://news.yahoo.com/shopping-centre-kart-tra...,Yahoo News,-1,shopping centre gokart track approved,1
71207,Andrew Harrer/Bloomberg/Getty Images/File,2024-04-12 15:00:02.191876,https://www.cnn.com/2024/04/12/tech/airbnb-ren...,CNN,-2,andrew harrerbloomberggetty imagesfile,1
71251,Someone has revealed the Gab chatbot prompt,2024-04-12 15:00:02.191876,https://www.dailykos.com/stories/2024/4/12/223...,The Daily Kos,-3,someone revealed gab chatbot prompt,1
71256,JK Rowling U-turned on promise to respect pron...,2024-04-12 15:00:02.191876,https://www.independent.co.uk/us/arts-entertai...,The Independent,-1,jk rowling uturned promise respect pronoun,1
71286,Is the Constitution Antiquated and Dysfunctional?,2024-04-12 15:00:02.191876,https://www.theepochtimes.com/opinion/is-the-c...,The Epoch Times,2,constitution antiquated dysfunctional,1
71321,Moroccan Burgers With Cucumber Slices and Toma...,2024-04-12 15:00:02.191876,https://www.theepochtimes.com/bright/moroccan-...,The Epoch Times,2,moroccan burger cucumber slice tomato,1
71333,An Entrancing Fairy Tale About Italian Grave R...,2024-04-12 15:30:01.975088,https://www.theatlantic.com/culture/archive/20...,The Atlantic,-1,entrancing fairy tale italian grave robber,1
71353,'Can you hear me?' Hang up,2024-04-12 15:30:01.975088,https://www.usatoday.com/story/nletter/2024/04...,USA Today,-1,hear hang,1
71364,An Oblique and Beautiful Book,2024-04-12 15:30:01.975088,https://www.theatlantic.com/newsletters/archiv...,The Atlantic,-1,oblique beautiful book,1
71379,Is Greed Causing Shrinkflation?,2024-04-12 15:30:01.975088,https://reason.com/video/2024/04/12/is-greed-c...,Reason,1,greed causing shrinkflation,1


In [20]:
today_df[today_df['cluster'] == 2]

Unnamed: 0,headline,date,url,agency,bias,processed,cluster
67814,Solidarity Forever: Building Movements Amid To...,2024-04-12 10:30:02.657482,https://theintercept.com/2024/04/12/deconstruc...,The Intercept,-2,solidarity forever building movement amid toda...,2
67815,Fatal stabbing in Bordeaux connected to victim...,2024-04-12 10:30:02.657482,https://www.lemonde.fr/en/france/article/2024/...,Le Monde,-1,fatal stabbing bordeaux connected victim alcoh...,2
67816,"Nvidia is in a bubble, stocks will disappoint ...",2024-04-12 10:30:02.657482,https://www.businessinsider.com/stock-market-o...,Business Insider,-1,nvidia bubble stock disappoint decade recessio...,2
67817,Humanitarian aid largely lacking in war-ravage...,2024-04-12 10:30:02.657482,https://www.france24.com/en/en/video/20240412-...,France24,0,humanitarian aid largely lacking warravaged gaza,2
67818,"India Car Sales Jump With Demand for Electric,...",2024-04-12 10:30:02.657482,https://www.bloomberg.com/news/articles/2024-0...,Bloomberg,-1,india car sale jump demand electric suv behind...,2
...,...,...,...,...,...,...,...
71685,Somalia says it will never accept Ethiopian na...,2024-04-12 15:30:01.975088,https://www.reuters.com/world/africa/somalia-s...,Reuters,0,somalia say never accept ethiopian naval base ...,2
71686,US meets Venezuelan officials to express conce...,2024-04-12 15:30:01.975088,https://www.reuters.com/world/americas/us-meet...,Reuters,0,u meet venezuelan official express concern ele...,2
71687,Possible BP bid highlights 'London for sale' s...,2024-04-12 15:30:01.975088,https://www.reuters.com/markets/deals/possible...,Reuters,0,possible bp bid highlight london sale scenario,2
71688,Weekend Long Read: How China Can Avoid the Wor...,2024-04-13 12:53:50.426875,https://www.caixinglobal.com/2024-04-12/weeken...,Caixin Global,1,weekend long read china avoid worst path reviv...,2


In [21]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(today_tfidf, today_tfidf)

In [35]:
import pandas as pd
sim_df = pd.DataFrame(cosine_sim, index=today_df.index, columns=today_df.index)
np.fill_diagonal(sim_df.values, 0)
top_indices = pd.DataFrame(sim_df.unstack(), columns=['cosine_sim'])
pairs = top_indices.sort_values(by='cosine_sim', ascending=False)
pairs = pairs[pairs['cosine_sim'] > 0.05]

In [37]:
pairs['headline1'] = pairs.index.map(lambda x: df.iloc[x[0]]['headline'])
pairs['headline2'] = pairs.index.map(lambda x: df.iloc[x[1]]['headline'])
pairs

Unnamed: 0,Unnamed: 1,cosine_sim,headline1,headline2
69917,69382,1.000000,"03:09Salman Khan targeted by trolls for wearing printed pants at Sohail's Eid bash; internet calls him, 'OG King of Chappri'","3:9Salman Khan targeted by trolls for wearing printed pants at Sohail's Eid bash; internet calls him, 'OG King of Chappri'"
69382,69917,1.000000,"3:9Salman Khan targeted by trolls for wearing printed pants at Sohail's Eid bash; internet calls him, 'OG King of Chappri'","03:09Salman Khan targeted by trolls for wearing printed pants at Sohail's Eid bash; internet calls him, 'OG King of Chappri'"
69405,69613,1.000000,Chicago baseball report: Adbert Alzolay' bounce-back outing for Cubs -- and Michael Kopech leaves White Sox teammates speechless,Chicago baseball report: Adbert Alzolay's bounce-back outing for Cubs -- and Michael Kopech leaves White Sox teammates speechless
69613,69405,1.000000,Chicago baseball report: Adbert Alzolay's bounce-back outing for Cubs -- and Michael Kopech leaves White Sox teammates speechless,Chicago baseball report: Adbert Alzolay' bounce-back outing for Cubs -- and Michael Kopech leaves White Sox teammates speechless
69402,70488,1.000000,2. Biden administration cancels another $7.4 billion in student loans,3. Biden administration cancels another $7.4 billion in student loans
...,...,...,...,...
68997,69364,0.050002,Israel's Embattled UN Envoy Hits Back Hard as the Gaza Crisis Worsens,McDonald's Plan to Offset California Wage Hike? Bring Bagels Back
70508,70471,0.050002,Fire Services Department personnel prepare to enter a blaze-hit building Yau Ma Tei. Photo: Jelly Tse,Jimmy Buffet's career in photos
70471,70508,0.050002,Jimmy Buffet's career in photos,Fire Services Department personnel prepare to enter a blaze-hit building Yau Ma Tei. Photo: Jelly Tse
71069,68666,0.050000,Biden cancels another $7.4B of student debt for 277K more borrowers,"7:22 a. m. Harris to warn another Trump term would mean 'more suffering, less freedom'"


In [34]:
len(pairs[pairs['cosine_sim'] > 0.05])

283260

In [38]:
pairs[pairs['cosine_sim'] < 0.1]

Unnamed: 0,Unnamed: 1,cosine_sim,headline1,headline2
70061,69807,0.100000,Japan's native population declines at record rate as births plunge,"Safe-Haven Demand Fuels Gold to a Record $2,400 High"
69807,70061,0.100000,"Safe-Haven Demand Fuels Gold to a Record $2,400 High",Japan's native population declines at record rate as births plunge
71062,70211,0.099998,U. S. issues travel warning for Israel as Iran expected to attack any time U. S. officials say Iran is expected to launch an attack on the country,"'Do not go to Iran, Israel': MEA's advisory amid rising tensions"
70211,71062,0.099998,"'Do not go to Iran, Israel': MEA's advisory amid rising tensions",U. S. issues travel warning for Israel as Iran expected to attack any time U. S. officials say Iran is expected to launch an attack on the country
70504,71478,0.099997,"Evan Vucci / APBiden announces more than $7B in student debt relief for 277,000 borrowers","Evan Corcoran quits Trump's legal team, could be prosecution witness in classified documents case"
...,...,...,...,...
68997,69364,0.050002,Israel's Embattled UN Envoy Hits Back Hard as the Gaza Crisis Worsens,McDonald's Plan to Offset California Wage Hike? Bring Bagels Back
70508,70471,0.050002,Fire Services Department personnel prepare to enter a blaze-hit building Yau Ma Tei. Photo: Jelly Tse,Jimmy Buffet's career in photos
70471,70508,0.050002,Jimmy Buffet's career in photos,Fire Services Department personnel prepare to enter a blaze-hit building Yau Ma Tei. Photo: Jelly Tse
71069,68666,0.050000,Biden cancels another $7.4B of student debt for 277K more borrowers,"7:22 a. m. Harris to warn another Trump term would mean 'more suffering, less freedom'"
