In [48]:
import numpy as np

from app.models import Session, Headline, Article, Agency, Topic
import pandas as pd
from datetime import datetime as dt, timedelta as td
from app.utils.constants import Constants, Country
from app.utils.config import Config
from sqlalchemy import or_


with Session() as s:
    headlines = s.query(Headline.processed, Headline.first_accessed, Article.url, Topic.name, Agency.name, Agency._bias)\
        .join(Headline.article).join(Article.agency).join(Article.topic)\
        .filter(
            Article.first_accessed > dt.now() - td(days=7),
            or_(Agency._country==Country.us.value, Agency.name.in_(Config.exempted_foreign_media))
        ).all()
topic_df = pd.DataFrame(headlines, columns=['headline', 'date', 'url', 'topic', 'agency', 'bias'])
topic_df.head()

Unnamed: 0,headline,date,url,topic,agency,bias
0,Americans address the struggles with retiring ...,2024-04-06 12:00:01.962134,https://www.foxbusiness.com/personal-finance/a...,Inflation,Fox Business,1
1,Russia unleashed a deadly attack on the city o...,2024-04-06 12:00:01.962134,https://www.nytimes.com/2024/04/06/world/europ...,Ukraine,New York Times,-1
2,Concern over Biden's stance on Israel-Hamas wa...,2024-04-06 12:00:01.962134,http://news.yahoo.com/concern-over-president-b...,War in Gaza,Yahoo News,-1
3,"IDF extracts body of hostage Elad Katzir, murd...",2024-04-06 12:00:01.962134,https://news.google.com/home?hl=en-US&gl=US&ce...,War in Gaza,Google News,-1
4,"Naperville leaders offer sympathies, acknowled...",2024-04-06 12:00:01.962134,https://www.chicagotribune.com/2024/04/06/nape...,War in Gaza,Chicago Tribune,1


In [49]:
with Session() as s:
    headlines = s.query(Headline.processed, Headline.first_accessed, Article.url, Agency.name, Agency._bias)\
        .join(Headline.article).join(Article.agency)\
        .filter(
            Article.first_accessed > dt.now() - td(days=7),
            or_(Agency._country==Country.us.value, Agency.name.in_(Config.exempted_foreign_media))
        ).all()
df = pd.DataFrame(headlines, columns=['headline', 'date', 'url', 'agency', 'bias'])

In [50]:
len(df)

47093

In [51]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

df['processed'] = df['headline'].apply(preprocess)
print(len(df))
# get the first 3000 samples
samples = df.sample(25000)
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(samples['processed'])


47093


In [70]:
today_df = df[df['date'] > dt.now() - td(days=1)].copy()
today_df['processed'] = today_df['headline'].apply(preprocess)
# take cosine similarity and drop all headlines without any similarity
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, vectorizer.transform(today_df['processed']))


today_tfidf = vectorizer.transform(today_df['processed'])
print(len(today_df))

6243


In [72]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_scaled = scaler.fit_transform(today_tfidf.toarray())
pca = PCA(n_components=0.95)
data_pca = pca.fit_transform(data_scaled)
print("Original shape:", data_scaled.shape)
print("Reduced shape:", data_pca.shape)

Original shape: (6243, 164718)
Reduced shape: (6243, 4483)


In [102]:
import hdbscan
today_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=5, cluster_selection_epsilon=0.000001, cluster_selection_method='leaf')
today_labels = today_clusterer.fit_predict(today_tfidf)
today_df['cluster'] = today_labels
print("Number of clusters:", len(today_df['cluster'].unique()))

Number of clusters: 156


In [103]:
cluster = today_df[today_df['cluster'] == 0]
print(len(cluster))
cluster

6


Unnamed: 0,headline,date,url,agency,bias,processed,cluster
44662,Hulu's Cute New Rom-Com Is Honestly Baffling H...,2024-04-12 21:00:02.037011,https://slate.com/culture/2024/04/the-greatest...,Slate,-2,hulus cute new romcom honestly baffling hulus ...,0
44809,Latest CBS News Videos 1 dead after truck rams...,2024-04-12 21:00:02.037011,https://www.cbsnews.com/video/18-wheeler-crash...,CBS News,-1,latest cbs news video 1 dead truck ram texas d...,0
45900,Possible Iran attack on Israel could be immine...,2024-04-13 00:00:01.817439,https://www.cbsnews.com/video/white-house-on-h...,CBS News,-1,possible iran attack israel could imminent u i...,0
46011,Latest CBS News Videos Billy Joel's 100th show...,2024-04-13 00:30:01.966863,https://www.cbsnews.com/video/billy-joel-plays...,CBS News,-1,latest cbs news video billy joel 100th show ma...,0
46124,What's fueling Mexico City's water crisis? Mex...,2024-04-13 01:00:01.794506,https://www.cbsnews.com/video/whats-fueling-me...,CBS News,-1,whats fueling mexico city water crisis mexico ...,0
46160,Billy Joel's 100th show at Madison Square Gard...,2024-04-13 01:00:01.794506,https://www.cbsnews.com/video/billy-joel-plays...,CBS News,-1,billy joel 100th show madison square garden bi...,0


In [104]:
cluster = today_df[today_df['cluster'] == 1]
print(len(cluster))
cluster

10


Unnamed: 0,headline,date,url,agency,bias,processed,cluster
42956,BARK Air launches first luxury airlines for do...,2024-04-12 17:00:02.707870,https://nypost.com/2024/04/12/business/bark-ai...,New York Post,1,bark air launch first luxury airline dog charg...,1
43122,A Gaza teen spent last Eid surrounded by her f...,2024-04-12 17:30:02.607063,https://www.nbcnews.com/news/world/israel-hama...,MSNBC,-2,gaza teen spent last eid surrounded family she...,1
43151,House passes reauthorization of key US surveil...,2024-04-12 17:30:02.607063,https://www.startribune.com/house-will-try-aga...,Star Tribune,-1,house pass reauthorization key u surveillance ...,1
43176,Dow tumbles 500 points as sell-off intensifies...,2024-04-12 17:30:02.607063,https://www.cnbc.com/2024/04/11/stock-market-t...,CNBC,-1,dow tumble 500 point selloff intensifies infla...,1
43198,1:05 p. m. U. S. hikes cost of oil drilling on...,2024-04-12 17:30:02.607063,https://www.washingtonpost.com/elections/2024/...,The Washington Post,-1,105 p u hike cost oil drilling federal land fi...,1
43218,Major Iranian attack on Israel expected soon U...,2024-04-12 17:30:02.607063,https://www.cbsnews.com/video/major-iranian-at...,CBS News,-1,major iranian attack israel expected soon u of...,1
43244,Upgraded seat BARK Air launches first luxury a...,2024-04-12 17:30:02.607063,https://nypost.com/2024/04/12/business/bark-ai...,New York Post,1,upgraded seat bark air launch first luxury air...,1
43269,House Passes Reauthorization of Key US Surveil...,2024-04-12 17:30:02.607063,https://www.newsmax.com/politics/fisa-donald-t...,Newsmax,3,house pass reauthorization key u surveillance ...,1
43361,US Steel Shareholders Approve Nippon Steel's $...,2024-04-12 18:00:02.517388,https://www.bloomberg.com/news/articles/2024-0...,Bloomberg,-1,u steel shareholder approve nippon steel 141 b...,1
43399,Zoetis Sinks Most in 17 Months After Report on...,2024-04-12 18:00:02.517388,https://www.bloomberg.com/news/articles/2024-0...,Bloomberg,-1,zoetis sink 17 month report pet arthritis drug,1


In [105]:
cluster = today_df[today_df['cluster'] == 2]
print(len(cluster))
cluster

7


Unnamed: 0,headline,date,url,agency,bias,processed,cluster
42280,Oil Rises to October High as Israel Prepares f...,2024-04-12 15:30:01.975088,https://www.bloomberg.com/news/articles/2024-0...,Bloomberg,-1,oil rise october high israel prepares iranian ...,2
42377,"[LIVE Q&A 4/12, 1PM ET] The Next Wave of mRNA ...",2024-04-12 15:30:01.975088,https://www.theepochtimes.com/epochtv/live-qa-...,The Epoch Times,2,live qa 412 1pm et next wave mrna vaccine fallout,2
42403,JPMorgan profit beats estimates even as intere...,2024-04-12 15:30:01.975088,https://www.reuters.com/markets/us/jpmorgans-p...,Reuters,0,jpmorgan profit beat estimate even interest in...,2
42531,Remembering O. J. Simpson An Icon of America's...,2024-04-12 16:00:03.316241,https://www.breitbart.com/sports/2024/04/11/oj...,Breitbart,3,remembering j simpson icon america descent,2
42672,A Gaza teen spent last Eid surrounded by famil...,2024-04-12 16:30:02.150997,https://www.nbcnews.com/news/world/israel-hama...,MSNBC,-2,gaza teen spent last eid surrounded family she...,2
42777,20 years in prison for River Falls man who for...,2024-04-12 16:30:02.150997,https://www.startribune.com/20-years-in-prison...,Star Tribune,-1,20 year prison river fall man forced woman str...,2
42786,"After Dobbs, twice as many women sought tubal ...",2024-04-12 16:30:02.150997,http://news.yahoo.com/dobbs-twice-many-women-s...,Yahoo News,-1,dobbs twice many woman sought tubal ligation m...,2


In [99]:
cluster = today_df[today_df['cluster'] == 3]
print(len(cluster))
cluster

9


Unnamed: 0,headline,date,url,agency,bias,processed,cluster
46129,Science What's fueling Mexico City's water cri...,2024-04-13 01:00:01.794506,https://www.cbsnews.com/video/whats-fueling-me...,CBS News,-1,science whats fueling mexico city water crisis...,3
46154,Greenhouse gas emissions record Greenhouse gas...,2024-04-13 01:00:01.794506,https://www.cbsnews.com/video/2023-greenhouse-...,CBS News,-1,greenhouse gas emission record greenhouse gas ...,3
46243,CBS Evening News with Norah O'Donnell Possible...,2024-04-13 01:30:01.941814,https://www.cbsnews.com/video/white-house-on-h...,CBS News,-1,cbs evening news norah odonnell possible iran ...,3
46365,Bryson DeChambeau seen wildly walking at Maste...,2024-04-13 02:30:02.783900,https://nypost.com/2024/04/12/sports/bryson-de...,New York Post,1,bryson dechambeau seen wildly walking master c...,3
46380,"Ryan Feltner gets 1st win of season, Rockies u...",2024-04-13 02:30:02.783900,https://apnews.com/article/rockies-blue-jays-s...,AP,-1,ryan feltner get 1st win season rockies use se...,3
46480,Exclusive Israel Braces for Unprecedented Dire...,2024-04-13 03:30:02.400503,https://www.bloomberg.com/news/articles/2024-0...,Bloomberg,-1,exclusive israel brace unprecedented direct ir...,3
46907,"James Crumbley, Michigan school shooter's fath...",2024-04-13 08:30:02.712025,https://www.foxnews.com/us/james-crumbley-mich...,Fox News,2,james crumbley michigan school shooter father ...,3
46909,"Coachella: Peso Pluma, Chappell Roan, Shakira'...",2024-04-13 08:30:02.712025,https://www.latimes.com/entertainment-arts/mus...,Los Angeles Times,-1,coachella peso pluma chappell roan shakiras su...,3
46986,"5 dead, multiple injured including small child...",2024-04-13 09:00:02.290623,https://nypost.com/2024/04/13/world-news/sydne...,New York Post,1,5 dead multiple injured including small child ...,3


In [100]:
cluster = today_df[today_df['cluster'] == -1]
print(len(cluster))
cluster

4263


Unnamed: 0,headline,date,url,agency,bias,processed,cluster
40850,Chiefs receiver Rashee Rice surrenders to auth...,2024-04-12 12:30:01.869506,https://scrippsnews.com//stories/chiefs-receiv...,Scripps News,-1,chief receiver rashee rice surrender authority...,-1
40851,Chinese Immigrant Wrecks Gun Control Argument ...,2024-04-12 12:30:01.869506,https://pjmedia.com/catherinesalgado/2024/04/1...,Red State,3,chinese immigrant wreck gun control argument o...,-1
40852,Apple Says It Will Fix i Phone 'Bug' That Prom...,2024-04-12 12:30:01.869506,https://www.forbes.com/sites/siladityaray/2024...,Forbes,1,apple say fix phone bug prompt palestinian fla...,-1
40853,How the internet is deleting the past,2024-04-12 12:30:01.869506,https://www.theblaze.com/return/how-the-intern...,The Blaze,3,internet deleting past,-1
40854,3 Trending: Biden Isn't 'Examining' A Border S...,2024-04-12 12:30:01.869506,https://thefederalist.com/2024/04/11/biden-isn...,The Federalist,3,3 trending biden isnt examining border shutdow...,-1
...,...,...,...,...,...,...,...
47086,Gynecologist reveals vagina symptoms you shoul...,2024-04-13 09:30:02.389896,https://www.newsweek.com/gynecologist-reveals-...,Newsweek,1,gynecologist reveals vagina symptom never ignore,-1
47087,Eighty-five-year-old woman saves herself and s...,2024-04-13 09:30:02.389896,https://www.theguardian.com/us/us-news/2024/ap...,The Guardian,-1,eightyfiveyearold woman save son burglar heroi...,-1
47090,Rents Are Still Rising and Pumping Up Inflation,2024-04-13 09:30:02.389896,https://www.wsj.com/real-estate/rising-rents-h...,The Wall Street Journal,1,rent still rising pumping inflation,-1
47091,Small-cap stocks 'challenged' as inflation pus...,2024-04-13 09:30:02.389896,https://www.marketwatch.com/story/small-cap-st...,The Wall Street Journal,1,smallcap stock challenged inflation push fed r...,-1


In [101]:
today_df[today_df['cluster'] == 100]

Unnamed: 0,headline,date,url,agency,bias,processed,cluster
46694,Wild rookie Liam Ohgren lives a hockey player'...,2024-04-13 05:30:02.393400,https://www.startribune.com/wild-rookie-liam-o...,Star Tribune,-1,wild rookie liam ohgren life hockey player dre...,100
46700,"Butler hits home run, winning single to lift A...",2024-04-13 05:30:02.393400,https://apnews.com/article/nationals-athletics...,AP,-1,butler hit home run winning single lift 21 wal...,100
46705,"Kevin Durant scored 28 points, Jusuf Nurkic ma...",2024-04-13 05:30:02.393400,https://apnews.com/article/suns-kings-score-fc...,AP,-1,kevin durant scored 28 point jusuf nurkic made...,100
46706,"Reusse: Back from injury, Towns doesn't have t...",2024-04-13 05:30:02.393400,https://www.startribune.com/back-from-injury-k...,Star Tribune,-1,reusse back injury town doesnt carry load,100
46708,Republican presidential candidate former Presi...,2024-04-13 05:30:02.393400,https://apnews.com/article/trump-chick-fil-a-b...,AP,-1,republican presidential candidate former presi...,100
46711,Lawrence Butler delivered a game-winning RBI s...,2024-04-13 05:30:02.393400,https://apnews.com/article/nationals-athletics...,AP,-1,lawrence butler delivered gamewinning rbi sing...,100
46729,Jackson Merrill provided the go-ahead single i...,2024-04-13 06:00:02.560355,https://apnews.com/article/padres-dodgers-scor...,AP,-1,jackson merrill provided goahead single 11th i...,100
46730,Shaq completely loses it when Charles Barkley ...,2024-04-13 06:00:02.560355,https://nypost.com/2024/04/12/sports/shaq-lose...,New York Post,1,shaq completely loses charles barkley demonstr...,100
46732,'Horrible situation' Body found in burning car...,2024-04-13 06:00:02.560355,https://nypost.com/2024/04/12/us-news/body-fou...,New York Post,1,horrible situation body found burning car beli...,100
46740,"Merrill drives in go-ahead run, Tatis Jr. has ...",2024-04-13 06:00:02.560355,https://apnews.com/article/padres-dodgers-scor...,AP,-1,merrill drive goahead run tati jr 3 hit padre ...,100


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(today_tfidf, today_tfidf)

In [12]:
import pandas as pd
sim_df = pd.DataFrame(cosine_sim, index=today_df.index, columns=today_df.index)
np.fill_diagonal(sim_df.values, 0)
top_indices = pd.DataFrame(sim_df.unstack(), columns=['cosine_sim'])
pairs = top_indices.sort_values(by='cosine_sim', ascending=False)
pairs = pairs[pairs['cosine_sim'] > 0.05]

In [13]:
pairs['headline1'] = pairs.index.map(lambda x: df.iloc[x[0]]['headline'])
pairs['headline2'] = pairs.index.map(lambda x: df.iloc[x[1]]['headline'])
pairs

Unnamed: 0,Unnamed: 1,cosine_sim,headline1,headline2
70650,70264,1.00,Amazon owes $525 million in cloud-storage pate...,Amazon owes $525 million in cloud-storage pate...
73081,73752,1.00,Israeli settlers rampage through a West Bank v...,Israeli Settlers Rampage through a West Bank V...
70435,69464,1.00,Wildfires on the West Coast May Trigger Hailst...,Wildfires on the West Coast may trigger hailst...
73752,73081,1.00,Israeli Settlers Rampage through a West Bank V...,Israeli settlers rampage through a West Bank v...
72252,75686,1.00,Biden administration raises cost of drilling o...,Biden administration raises costs of public la...
...,...,...,...,...
75864,72544,0.05,273-147: House Passes Controversial Spying Bil...,Republicans Troll DNC for Paying Joe Biden's L...
72544,75864,0.05,Republicans Troll DNC for Paying Joe Biden's L...,273-147: House Passes Controversial Spying Bil...
71954,75864,0.05,Republicans troll DNC for paying Joe Biden's l...,273-147: House Passes Controversial Spying Bil...
76622,76792,0.05,Jaishankar said the two countries never had tr...,Ric Grenell to Newsmax: Biden's Weak Policies ...


In [14]:
len(pairs[pairs['cosine_sim'] > 0.05])

2100916

In [15]:
pairs[pairs['cosine_sim'] < 0.1]

Unnamed: 0,Unnamed: 1,cosine_sim,headline1,headline2
72851,69819,0.100000,Oil hits six-month high as Israel braces for I...,"When It Comes to Recruiting, High Tech Needs H..."
69819,72851,0.100000,"When It Comes to Recruiting, High Tech Needs H...",Oil hits six-month high as Israel braces for I...
69808,71351,0.099999,Israel preparing for Iranian attack 'within 24...,White House says it is watching Iranian threat...
71351,69808,0.099999,White House says it is watching Iranian threat...,Israel preparing for Iranian attack 'within 24...
75289,76283,0.099999,Target this player prop for Padres-Dodgers on ...,"Major champions Justin Thomas, Brian Harman, J..."
...,...,...,...,...
75864,72544,0.050000,273-147: House Passes Controversial Spying Bil...,Republicans Troll DNC for Paying Joe Biden's L...
72544,75864,0.050000,Republicans Troll DNC for Paying Joe Biden's L...,273-147: House Passes Controversial Spying Bil...
71954,75864,0.050000,Republicans troll DNC for paying Joe Biden's l...,273-147: House Passes Controversial Spying Bil...
76622,76792,0.050000,Jaishankar said the two countries never had tr...,Ric Grenell to Newsmax: Biden's Weak Policies ...
