In [2]:
from datetime import datetime as dt, timedelta as td

import numpy as np

from app.models import Session, Article, Agency, Headline, Topic
import pytz
from sqlalchemy import or_
from app.utils.config import Config
from app.utils.constants import Country
import pandas as pd

# Load every headline first accessed in the last 24 hours, restricted to US
# agencies plus the foreign outlets listed in Config.exempted_foreign_media.
cutoff = dt.now(tz=pytz.UTC) - td(hours=24)
is_eligible_agency = or_(
    Agency._country == Country.us.value,
    Agency.name.in_(Config.exempted_foreign_media),
)

with Session() as s:
    headlines = (
        s.query(Headline.processed, Article.url, Agency.name, Agency._bias)
        .join(Headline.article)
        .join(Article.agency)
        .filter(Headline.first_accessed > cutoff, is_eligible_agency)
        .all()
    )
    # One row per headline; the column order follows the query's select list.
    df = pd.DataFrame(headlines, columns=['headline', 'url', 'agency', 'bias'])
df.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\malan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\malan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\malan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\malan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\malan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,headline,url,agency,bias
0,Marjorie Taylor Greene Can't Stop Pushing Russ...,https://newrepublic.com/post/180678/marjorie-t...,The New Republic,-2
1,Trump's Hush Money Trial Starts Monday Despite...,https://www.forbes.com/sites/brianbushard/2024...,Forbes,1
2,"University of Texas at Austin, Taking Heat for...",https://redstate.com/wardclark/2024/04/12/univ...,Red State,3
3,'Trump is a rapist and a con man': Actor Alan ...,https://www.theblaze.com/news/alan-ritchson-tr...,The Blaze,3
4,Kristi Noem Banned by Yet Another South Dakota...,https://www.thedailybeast.com/kristi-noem-bann...,The Daily Beast,-2


In [3]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# English stop-word list from the NLTK corpus, stored as a set for the
# membership test performed on every token in preprocess().
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance reused for every token in preprocess().
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise one headline into a space-joined string of lemmas.

    Steps, in order: lower-case the text, delete every character listed in
    ``string.punctuation``, tokenize with NLTK, discard tokens found in
    ``stop_words`` and lemmatize whatever is left.

    Args:
        text: the raw headline string.

    Returns:
        str: the surviving lemmas joined by single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    lemmas = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Vectorise the cleaned headlines with TF-IDF and score every headline against
# every other one.
X = df.copy()
X['processed'] = X['headline'].apply(preprocess)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(X['processed'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Remove self-similarity on the array itself, before wrapping it in a frame.
# Writing through `DataFrame.values` only works while pandas hands back a
# writable view of its data; with Copy-on-Write it is read-only and
# np.fill_diagonal raises.
np.fill_diagonal(cosine_sim, 0)
sim_df = pd.DataFrame(cosine_sim, columns=X.index, index=X.index)

# Long format: one row per ordered (column label, row label) pair, strongest
# first. Every unordered pair therefore appears twice (A,B and B,A).
top_indices = sim_df.unstack().to_frame('similarity').sort_values(by='similarity', ascending=False)

# Filter with a mask taken from the very frame being filtered. The previous
# code re-sorted the frame and then applied a mask built from the first sort;
# the two indexes could disagree on tied scores, which is what produced the
# "Boolean Series key will be reindexed" warning.
top_pairs = top_indices[top_indices['similarity'] > 0]
top_pairs = top_pairs.reset_index().rename(columns={'level_0': 'index1', 'level_1': 'index2'})
def copy_col(fromdf, todf, col):
    """Attach ``col`` from ``fromdf`` to both sides of every pair in ``todf``.

    Adds two columns to ``todf`` in place: ``col + '1'`` holds the value of
    ``fromdf[col]`` at the label stored in ``todf['index1']`` and
    ``col + '2'`` the value at the label stored in ``todf['index2']``.

    The lookup is one vectorised ``.loc`` call per side instead of a Python
    call per row, which matters when ``todf`` has hundreds of thousands of rows.

    Args:
        fromdf: frame to read from; its index labels are what ``index1`` and
            ``index2`` refer to and are expected to be unique.
        todf: pair table with ``index1`` and ``index2`` columns; modified in place.
        col: name of the column in ``fromdf`` to copy.

    Raises:
        KeyError: if ``col`` is missing from ``fromdf`` or a referenced label
            is not in ``fromdf``'s index.
    """
    source = fromdf[col]
    for side in ('1', '2'):
        labels = todf['index' + side].to_numpy()
        # to_numpy() drops the looked-up labels so the values are assigned by
        # position and line up with todf's own rows, whatever todf's index is.
        todf[col + side] = source.loc[labels].to_numpy()
    
# Add the headline and agency of both sides of every pair to the pair table.
for column in ('headline', 'agency'):
    copy_col(X, top_pairs, column)

  top_pairs = top_indices.sort_values(by='similarity', ascending=False)[top_indices['similarity'] > 0]


In [5]:
# Show the pair table, ordered from most to least similar.
top_pairs

Unnamed: 0,index1,index2,similarity,headline1,headline2,agency1,agency2
0,389,868,1.000000,Israeli settlers rampage through a West Bank v...,Israeli Settlers Rampage through a West Bank V...,AP,Newsmax
1,3525,2037,1.000000,Calling up the plays Taylor Swift is intellect...,Calling up the plays Claudia Oshry: Taylor Swi...,New York Post,New York Post
2,868,1413,1.000000,Israeli Settlers Rampage through a West Bank V...,Israeli settlers rampage through West Bank vil...,Newsmax,Los Angeles Times
3,2795,2946,1.000000,"Major champions Justin Thomas, Brian Harman, J...","Major champions Justin Thomas, Brian Harman, J...",AP,AP
4,2946,2795,1.000000,"Major champions Justin Thomas, Brian Harman, J...","Major champions Justin Thomas, Brian Harman, J...",AP,AP
...,...,...,...,...,...,...,...
782715,1221,1349,0.005309,I'm a Scholar of Ben Franklin. I Can't Believe...,the explainer Do Facebook groups about dating ...,Slate,The Week
782716,592,576,0.005214,FILE - Former President Donald Trump awaits th...,I'm a Scholar of Ben Franklin. I Can't Believe...,AP,Slate
782717,576,592,0.005214,I'm a Scholar of Ben Franklin. I Can't Believe...,FILE - Former President Donald Trump awaits th...,Slate,AP
782718,576,1349,0.005178,I'm a Scholar of Ben Franklin. I Can't Believe...,the explainer Do Facebook groups about dating ...,Slate,The Week


In [6]:
# Keep only the pairs whose two headlines come from different agencies.
cross_agency = top_pairs['agency1'].ne(top_pairs['agency2'])
top_pairs = top_pairs.loc[cross_agency]
top_pairs.head()

Unnamed: 0,index1,index2,similarity,headline1,headline2,agency1,agency2
0,389,868,1.0,Israeli settlers rampage through a West Bank v...,Israeli Settlers Rampage through a West Bank V...,AP,Newsmax
2,868,1413,1.0,Israeli Settlers Rampage through a West Bank V...,Israeli settlers rampage through West Bank vil...,Newsmax,Los Angeles Times
8,1413,868,1.0,Israeli settlers rampage through West Bank vil...,Israeli Settlers Rampage through a West Bank V...,Los Angeles Times,Newsmax
9,868,389,1.0,Israeli Settlers Rampage through a West Bank V...,Israeli settlers rampage through a West Bank v...,Newsmax,AP
10,389,1413,1.0,Israeli settlers rampage through a West Bank v...,Israeli settlers rampage through West Bank vil...,AP,Los Angeles Times


In [7]:
# Number of pairs left in the table at this point.
len(top_pairs)

736710

In [8]:
# The ten lowest-scoring pairs that still exceed a similarity of 0.5, i.e. what
# sits right at that cutoff.
top_pairs[top_pairs['similarity']> 0.5].sort_values('similarity', ascending=True).head(10)

Unnamed: 0,index1,index2,similarity,headline1,headline2,agency1,agency2
5321,3293,2143,0.500093,Trump and Mike Johnson to Make Announcement on...,Trump hosts 'election integrity' press confere...,Google News,The Independent
5320,2143,3293,0.500093,Trump hosts 'election integrity' press confere...,Trump and Mike Johnson to Make Announcement on...,The Independent,Google News
5318,274,488,0.500139,Vehicle crashes into Texas public safety offic...,Lawmaker says crash of semitrailer into public...,Scripps News,Star Tribune
5316,488,274,0.500139,Lawmaker says crash of semitrailer into public...,Vehicle crashes into Texas public safety offic...,Star Tribune,Scripps News
5319,712,274,0.500139,Crash of semitrailer into Texas public safety ...,Vehicle crashes into Texas public safety offic...,Star Tribune,Scripps News
5317,274,712,0.500139,Vehicle crashes into Texas public safety offic...,Crash of semitrailer into Texas public safety ...,Scripps News,Star Tribune
5315,1160,844,0.50021,Roberto Cavalli dies,"Roberto Cavalli, Italian designer with creatio...",ABC News,USA Today
5314,844,1160,0.50021,"Roberto Cavalli, Italian designer with creatio...",Roberto Cavalli dies,USA Today,ABC News
5308,2728,3029,0.500289,Kamala Harris campaigns in Arizona Vice Presid...,Kamala Harris blames Trump for abortion ban in...,CBS News,Reuters
5309,2728,2587,0.500289,Kamala Harris campaigns in Arizona Vice Presid...,Arizona abortion ban: Kamala Harris blames Trump,CBS News,Yahoo News


In [9]:
# How many pairs score above 0.5.
len(top_pairs[top_pairs['similarity'] > 0.5])

3554

In [10]:
def get_all_similar(df, id, min_similarity=0.2):
    """Return the pairs that involve headline ``id`` and are similar enough.

    Args:
        df: pair table with ``index1``, ``index2`` and ``similarity`` columns.
        id: headline label to look for on either side of a pair. The name
            shadows the builtin; it is kept so keyword callers keep working.
        min_similarity: exclusive lower bound on ``similarity``. Defaults to
            0.2, the value that used to be hard-coded.

    Returns:
        A copy of the matching rows of ``df``, in their original order, so the
        result can be modified without touching ``df``.
    """
    involves_id = (df['index1'] == id) | (df['index2'] == id)
    similar_enough = df['similarity'] > min_similarity
    return df[involves_id & similar_enough].copy()

In [11]:
# Pull every sufficiently similar pair that involves headline 1880.
# NOTE(review): 1880 is a row label from one particular 24-hour pull, so this
# cell shows different (or no) rows when the notebook is re-run. The variable
# name also does not match the Israel/Iran headlines in the saved output;
# confirm which story was meant.
hezbollah = get_all_similar(top_pairs, 1880)
hezbollah

Unnamed: 0,index1,index2,similarity,headline1,headline2,agency1,agency2
13982,1203,1880,0.327477,Biden says he expects Iranian strike on Israel...,Israel braces amid fears of Iranian strike; U....,MSNBC,The Washington Post
13983,1880,1203,0.327477,Israel braces amid fears of Iranian strike; U....,Biden says he expects Iranian strike on Israel...,The Washington Post,MSNBC
22402,1880,1199,0.259697,Israel braces amid fears of Iranian strike; U....,Dow closes nearly 500 points lower on fears of...,The Washington Post,CNN
22403,1199,1880,0.259697,Dow closes nearly 500 points lower on fears of...,Israel braces amid fears of Iranian strike; U....,CNN,The Washington Post
26568,1880,2521,0.237092,Israel braces amid fears of Iranian strike; U....,Iran may strike Israel very soon,The Washington Post,The Hill
26569,2521,1880,0.237092,Iran may strike Israel very soon,Israel braces amid fears of Iranian strike; U....,The Hill,The Washington Post
30866,1821,1880,0.218699,"France recalls diplomatic families from Iran, ...",Israel braces amid fears of Iranian strike; U....,The Hill,The Washington Post
30867,1880,1821,0.218699,Israel braces amid fears of Iranian strike; U....,"France recalls diplomatic families from Iran, ...",The Washington Post,The Hill
31010,997,1880,0.218121,Biden warns Iran against revenge strike on Isr...,Israel braces amid fears of Iranian strike; U....,USA Today,The Washington Post
31011,1880,997,0.218121,Israel braces amid fears of Iranian strike; U....,Biden warns Iran against revenge strike on Isr...,The Washington Post,USA Today


In [12]:
# TF-IDF vectorisation: start again from the raw frame and keep the cleaned
# headline in a `text` column.
X = df.assign(text=df['headline'].apply(preprocess))

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(X['text'])

# Pairwise cosine similarity between all headlines.
cosine_sim = cosine_similarity(tfidf_matrix)

# Thresholding: anything weaker than `threshold` is treated as "not similar"
# and set to zero in place.
threshold = 0.5
too_weak = cosine_sim < threshold
cosine_sim[too_weak] = 0





In [13]:
from collections import deque
# Function to form clusters
def form_clusters(cosine_sim, min_samples=10, threshold=None):
    """Group documents into connected components of the similarity graph.

    Two documents are linked when their similarity entry passes the cutoff.
    A cluster is a seed row plus every document reachable from it through
    links, skipping documents already placed in an earlier cluster.

    Changes from the previous version:
      * ``min_samples`` is honoured (10 used to be hard-coded).
      * The cutoff is an argument instead of a global variable.
      * The seed row always belongs to its own cluster, so the scan always
        makes progress.
      * The input matrix is only read. Passing a copy is still fine but no
        longer required.
      * Membership is tracked in a boolean mask instead of rescanning and
        zeroing the whole matrix for every cluster.

    Args:
        cosine_sim: square (n, n) array-like of pairwise similarities.
        min_samples: smallest cluster size that is returned.
        threshold: minimum similarity that counts as a link. ``None`` (the
            default) treats every strictly positive entry as a link, which is
            the right reading for a matrix whose weak entries were already
            set to zero.

    Returns:
        list[set[int]]: clusters of row positions, in the order their seed
        rows appear in the matrix. Rows without any link are never clustered.
    """
    sim = np.asarray(cosine_sim)
    linked = sim > 0 if threshold is None else sim >= threshold
    n_docs = linked.shape[0]
    # True while a row has not been placed in any cluster yet.
    unassigned = np.ones(n_docs, dtype=bool)

    clusters = []
    for seed in range(n_docs):
        # A row can seed a cluster only if it is still free and has at least
        # one link (possibly to itself) to a row that is also still free.
        if not unassigned[seed] or not (linked[seed] & unassigned).any():
            continue

        unassigned[seed] = False
        cluster = {seed}
        queue = deque([seed])
        # Breadth-first walk over links, restricted to rows that are still free.
        while queue:
            current = queue.popleft()
            fresh = np.flatnonzero(linked[current] & unassigned)
            unassigned[fresh] = False
            new_members = fresh.tolist()
            cluster.update(new_members)
            queue.extend(new_members)

        if len(cluster) >= min_samples:
            clusters.append(cluster)
            print(f"Clusters formed: {len(clusters)}")

    return clusters

# Form the clusters. A copy is passed so the thresholded matrix in
# `cosine_sim` stays intact for later cells, whatever the clustering does to
# its argument.
clusters = form_clusters(cosine_sim.copy())


Clusters formed: 1
Clusters formed: 2
Clusters formed: 3
Clusters formed: 4
Clusters formed: 5
Clusters formed: 6
Clusters formed: 7
Clusters formed: 8
Clusters formed: 9
Clusters formed: 10
Clusters formed: 11
Clusters formed: 12
Clusters formed: 13
Clusters formed: 14
Clusters formed: 15
Clusters formed: 16
Clusters formed: 17
Clusters formed: 18
Clusters formed: 19
Clusters formed: 20


In [14]:
# Print every cluster as a numbered block of "agency - headline" lines.
for number, members in enumerate(clusters, start=1):
    print(f"Cluster {number}:")
    for position in members:
        # Cluster members are row positions, hence iloc rather than loc.
        agency, headline = X.iloc[position][['agency', 'headline']]
        print(f" {agency} - {headline}")
    print()

Cluster 1:
 The Washington Post - Trump appears with Mike Johnson hours after House passes FISA bill
 ABC News - House passes FISA reauthorization bill
 The Washington Post - Trump to appear with House Speaker Mike Johnson hours after House passes FISA bill
 Politico - Watch: House passes FISA bill
 The Washington Post - Trump appears with House Speaker Mike Johnson hours after House passes FISA bill
 Punchbowl News - FISA passes House, now it's the Senate's turn premium House
 The Washington Post - Trump to appear with Mike Johnson hours after House passes FISA bill
 New York Times - House Passes Surveillance Law Extension
 Punchbowl News - FISA passes House, now it's the Senate's turn The House finally approved a two-year reauthorization of FISA, sending the measure to the Senate for consideration next week. premium House
 Voice of America - US House passes controversial surveillance bill on 4th attempt
 Google News - House passes modified surveillance bill after it failed earlier th

In [15]:
# Map each clustered row position to its cluster number; rows that belong to
# no cluster get the label -1.
label_by_position = {
    position: cluster_id
    for cluster_id, cluster_indices in enumerate(clusters)
    for position in cluster_indices
}
cluster_labels = [label_by_position.get(position, -1) for position in range(len(X))]
X['cluster'] = cluster_labels
X

Unnamed: 0,headline,url,agency,bias,text,cluster
0,Marjorie Taylor Greene Can't Stop Pushing Russ...,https://newrepublic.com/post/180678/marjorie-t...,The New Republic,-2,marjorie taylor greene cant stop pushing russi...,-1
1,Trump's Hush Money Trial Starts Monday Despite...,https://www.forbes.com/sites/brianbushard/2024...,Forbes,1,trump hush money trial start monday despite pr...,-1
2,"University of Texas at Austin, Taking Heat for...",https://redstate.com/wardclark/2024/04/12/univ...,Red State,3,university texas austin taking heat closing de...,-1
3,'Trump is a rapist and a con man': Actor Alan ...,https://www.theblaze.com/news/alan-ritchson-tr...,The Blaze,3,trump rapist con man actor alan ritchson insul...,-1
4,Kristi Noem Banned by Yet Another South Dakota...,https://www.thedailybeast.com/kristi-noem-bann...,The Daily Beast,-2,kristi noem banned yet another south dakota tribe,-1
...,...,...,...,...,...,...
3793,Middle East crisis: reports vessel may have be...,https://www.theguardian.com/us/world/live/2024...,The Guardian,-1,middle east crisis report vessel may boarded n...,-1
3794,Inside Taylor Swift's Surprise Return to Tik Tok,https://www.wsj.com/business/media/taylor-swif...,The Wall Street Journal,1,inside taylor swift surprise return tik tok,-1
3795,Rents Are Still Rising and Pumping Up Inflation,https://www.wsj.com/real-estate/rising-rents-h...,The Wall Street Journal,1,rent still rising pumping inflation,-1
3796,Small-cap stocks 'challenged' as inflation pus...,https://www.marketwatch.com/story/small-cap-st...,The Wall Street Journal,1,smallcap stock challenged inflation push fed r...,-1


In [16]:
# Collect the rows of every cluster label whose members all come from one
# agency. The unclustered rows (label -1) are evaluated as a group as well.
by_cluster = X.groupby('cluster')
clusters_to_drop = by_cluster.filter(lambda members: members['agency'].nunique() == 1)
clusters_to_drop

Unnamed: 0,headline,url,agency,bias,text,cluster


In [17]:
# Labels of the single-agency clusters, then every row that is not in one of them.
drop_cluster_ids = clusters_to_drop['cluster'].unique()
in_dropped_cluster = X['cluster'].isin(drop_cluster_ids)
X_filtered = X.loc[~in_dropped_cluster]
X_filtered

Unnamed: 0,headline,url,agency,bias,text,cluster
0,Marjorie Taylor Greene Can't Stop Pushing Russ...,https://newrepublic.com/post/180678/marjorie-t...,The New Republic,-2,marjorie taylor greene cant stop pushing russi...,-1
1,Trump's Hush Money Trial Starts Monday Despite...,https://www.forbes.com/sites/brianbushard/2024...,Forbes,1,trump hush money trial start monday despite pr...,-1
2,"University of Texas at Austin, Taking Heat for...",https://redstate.com/wardclark/2024/04/12/univ...,Red State,3,university texas austin taking heat closing de...,-1
3,'Trump is a rapist and a con man': Actor Alan ...,https://www.theblaze.com/news/alan-ritchson-tr...,The Blaze,3,trump rapist con man actor alan ritchson insul...,-1
4,Kristi Noem Banned by Yet Another South Dakota...,https://www.thedailybeast.com/kristi-noem-bann...,The Daily Beast,-2,kristi noem banned yet another south dakota tribe,-1
...,...,...,...,...,...,...
3793,Middle East crisis: reports vessel may have be...,https://www.theguardian.com/us/world/live/2024...,The Guardian,-1,middle east crisis report vessel may boarded n...,-1
3794,Inside Taylor Swift's Surprise Return to Tik Tok,https://www.wsj.com/business/media/taylor-swif...,The Wall Street Journal,1,inside taylor swift surprise return tik tok,-1
3795,Rents Are Still Rising and Pumping Up Inflation,https://www.wsj.com/real-estate/rising-rents-h...,The Wall Street Journal,1,rent still rising pumping inflation,-1
3796,Small-cap stocks 'challenged' as inflation pus...,https://www.marketwatch.com/story/small-cap-st...,The Wall Street Journal,1,smallcap stock challenged inflation push fed r...,-1


In [18]:
# Same table ordered by cluster label; the unclustered rows (-1) come first.
X.sort_values('cluster')

Unnamed: 0,headline,url,agency,bias,text,cluster
0,Marjorie Taylor Greene Can't Stop Pushing Russ...,https://newrepublic.com/post/180678/marjorie-t...,The New Republic,-2,marjorie taylor greene cant stop pushing russi...,-1
2476,Agent at Bad Bunny-led firm has certification ...,https://www.theguardian.com/us/sport/2024/apr/...,The Guardian,-1,agent bad bunnyled firm certification revoked ...,-1
2477,Vice-president blames Donald Trump for Arizona...,https://www.theguardian.com/us/us-news/2024/ap...,The Guardian,-1,vicepresident blame donald trump arizona abort...,-1
2480,Urgent Meeting Sought Over $550 Million War Me...,https://www.theepochtimes.com/world/urgent-mee...,The Epoch Times,2,urgent meeting sought 550 million war memorial...,-1
2481,Agent at firm led by music star has certificat...,https://www.theguardian.com/us/sport/2024/apr/...,The Guardian,-1,agent firm led music star certification revoke...,-1
...,...,...,...,...,...,...
3553,Multiple people stabbed and 1 person shot at a...,https://apnews.com/article/sydney-shopping-cen...,AP,-1,multiple people stabbed 1 person shot sydney s...,19
3534,Man shot after stabbings at Sydney mall,https://www.bbc.com/news/world-australia-68805401,BBC,-1,man shot stabbings sydney mall,19
3715,Multiple People Stabbed in Sydney Mall Attack,https://www.newsweek.com/sydney-mall-attack-st...,Newsweek,1,multiple people stabbed sydney mall attack,19
3524,'Critical incident' declared after man shot an...,https://www.nbcnews.com/news/world/sydney-mall...,MSNBC,-2,critical incident declared man shot multiple s...,19


In [19]:
# Keep only the rows that were assigned to a cluster (-1 marks the rest).
# NOTE: from here on X's row positions no longer match the positions stored in
# `clusters`, so the cluster display loop that uses X.iloc must not be re-run
# without rebuilding X first.
is_clustered = X['cluster'].ne(-1)
X = X.loc[is_clustered]
X.sort_values('cluster')

Unnamed: 0,headline,url,agency,bias,text,cluster
30,House Passes Surveillance Bill,https://politicalwire.com/2024/04/12/house-pas...,Political Wire,0,house pass surveillance bill,0
669,US House passes controversial surveillance bil...,https://www.reuters.com/world/us/us-house-pass...,Reuters,0,u house pass controversial surveillance bill f...,0
2204,House Passes Surveillance Law Extension,https://www.nytimes.com/2024/04/12/briefing/ho...,New York Times,-1,house pass surveillance law extension,0
405,US House passes controversial surveillance bil...,https://www.voanews.com/a/house-rejects-adding...,Voice of America,0,u house pass controversial surveillance bill 4...,0
3115,Watch: House passes FISA bill,https://www.politico.com/video/2024/04/12/watc...,Politico,-1,watch house pass fisa bill,0
...,...,...,...,...,...,...
3653,Multiple People Stabbed and 1 Person Shot at a...,https://www.newsmax.com/world/globaltalk/sydne...,Newsmax,3,multiple people stabbed 1 person shot sydney s...,19
3524,'Critical incident' declared after man shot an...,https://www.nbcnews.com/news/world/sydney-mall...,MSNBC,-2,critical incident declared man shot multiple s...,19
3715,Multiple People Stabbed in Sydney Mall Attack,https://www.newsweek.com/sydney-mall-attack-st...,Newsweek,1,multiple people stabbed sydney mall attack,19
3727,'It was insanity' - Eyewitnesses tell of Sydne...,https://www.bbc.com/news/world-australia-68805401,BBC,-1,insanity eyewitness tell sydney mall horror,19
