In [2]:
from datetime import datetime as dt, timedelta as td

import numpy as np

from app.models import Session, Article, Agency, Headline, Topic
import pytz
from sqlalchemy import or_
from app.utils.config import Config
from app.utils.constants import Country
import pandas as pd

# Load the headlines first accessed within the last 24 hours, restricted to
# agencies whose country equals Country.us.value or whose name is listed in
# Config.exempted_foreign_media.
cutoff = dt.now(tz=pytz.UTC) - td(hours=24)
agency_allowed = or_(
    Agency._country == Country.us.value,
    Agency.name.in_(Config.exempted_foreign_media),
)
with Session() as s:
    headlines = (
        s.query(Headline.processed, Article.url, Agency.name, Agency._bias)
        .join(Headline.article)
        .join(Article.agency)
        .filter(Headline.first_accessed > cutoff, agency_allowed)
        .all()
    )
    # One row per headline; column names follow the order of the queried fields.
    df = pd.DataFrame(headlines, columns=['headline', 'url', 'agency', 'bias'])
df.head()

In [3]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Module-level helpers read as globals by preprocess(): the English stop-word
# list turned into a set for membership tests, and one shared lemmatizer
# instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise a headline into a space-separated string of lemmatised tokens.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is tokenised with ``word_tokenize``, tokens found in
    the global ``stop_words`` set are dropped, and the remaining tokens are
    passed through the global ``lemmatizer``.

    Args:
        text: The headline string to clean.

    Returns:
        The surviving lemmatised tokens joined by single spaces.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)
    kept = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Build the pairwise headline-similarity table from TF-IDF vectors.
X = df.copy()
X['processed'] = X['headline'].apply(preprocess)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(X['processed'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# Zero the self-similarity diagonal on the ndarray itself, before wrapping it
# in a DataFrame. Writing through `sim_df.values` depends on pandas handing
# back a writable view, which is not guaranteed once Copy-on-Write is active.
np.fill_diagonal(cosine_sim, 0)
sim_df = pd.DataFrame(cosine_sim, columns=X.index, index=X.index)
# One row per ordered (column label, row label) cell of the matrix, so every
# unordered pair of headlines appears twice: once as (i, j) and once as (j, i).
top_indices = sim_df.unstack().to_frame('similarity').sort_values(by='similarity', ascending=False)
# top_indices is already sorted; filter it with a mask taken from the same frame.
top_pairs = top_indices[top_indices['similarity'] > 0]
top_pairs = top_pairs.reset_index().rename(columns={'level_0': 'index1', 'level_1': 'index2'})
def copy_col(fromdf, todf, col):
    """Add ``<col>1`` and ``<col>2`` columns to ``todf``, looked up from ``fromdf``.

    For every row of ``todf`` the labels stored in its ``index1`` and
    ``index2`` columns are resolved against the index of ``fromdf``, and the
    matching ``fromdf[col]`` values are written to ``todf[col + '1']`` and
    ``todf[col + '2']``.

    Args:
        fromdf: Frame holding the source column; its index is assumed to have
            unique labels.
        todf: Frame with ``index1`` and ``index2`` columns; modified in place.
        col: Name of the column in ``fromdf`` to copy.

    Returns:
        None. ``todf`` is mutated.

    Raises:
        KeyError: If a label in ``index1``/``index2`` is missing from
            ``fromdf.index`` or ``col`` is not a column of ``fromdf``.
    """
    for suffix in ('1', '2'):
        labels = todf['index' + suffix].to_numpy()
        # A single label-based lookup for the whole column instead of one
        # .loc call per row; to_numpy() drops the source index so the values
        # are assigned positionally, row for row.
        todf[col + suffix] = fromdf.loc[labels, col].to_numpy()
    
# Attach the headline text and agency name of both members of each pair
# (adds headline1/headline2 and agency1/agency2 columns to top_pairs).
copy_col(X, top_pairs, 'headline')
copy_col(X, top_pairs, 'agency')

In [5]:
# Display the pair table: one row per ordered pair of headlines with positive similarity.
top_pairs

In [6]:
# Keep only the pairs whose two headlines come from different agencies.
different_agency = top_pairs['agency1'].ne(top_pairs['agency2'])
top_pairs = top_pairs.loc[different_agency]
top_pairs.head()

In [7]:
# Number of rows currently held in top_pairs.
len(top_pairs)

In [8]:
# The ten lowest-scoring pairs among those with similarity above 0.5.
above_half = top_pairs.loc[top_pairs['similarity'].gt(0.5)]
above_half.sort_values('similarity').head(10)

In [9]:
# How many pairs have similarity above 0.5.
int(top_pairs['similarity'].gt(0.5).sum())

In [10]:
def get_all_similar(df, id, min_similarity=0.2):
    """Return the pairs that involve ``id`` and exceed a similarity cut-off.

    Args:
        df: Pair table with ``index1``, ``index2`` and ``similarity`` columns.
        id: Label to look for in either ``index1`` or ``index2``. The name
            shadows the ``id`` builtin inside this function; it is kept so
            existing keyword callers continue to work.
        min_similarity: Exclusive lower bound on ``similarity``. Defaults to
            0.2, the value that was previously hard-coded.

    Returns:
        A copy of the matching rows of ``df`` in their original order, so the
        caller can modify the result without touching ``df``.
    """
    involves_id = (df['index1'] == id) | (df['index2'] == id)
    similar_enough = df['similarity'] > min_similarity
    return df[involves_id & similar_enough].copy()

In [11]:
# Collect every pair that involves the row labelled 1880.
# NOTE(review): 1880 is a row label of X, and X is built from a query over a
# rolling 24-hour window, so the headline behind this label changes between
# runs; the variable name presumably reflects one particular run. Confirm the
# label before relying on this cell.
hezbollah = get_all_similar(top_pairs, 1880)
hezbollah

In [12]:
# Similarity cut-off: entries below it are treated as "not similar".
threshold = 0.5

# Fresh working frame with the cleaned headline text in a 'text' column.
X = df.assign(text=df['headline'].apply(preprocess))

# TF-IDF vectors for the cleaned text, then all-pairs cosine similarity.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(X['text'])
cosine_sim = cosine_similarity(tfidf_matrix)

# Zero out every entry below the cut-off (in place on the dense matrix).
below_threshold = cosine_sim < threshold
cosine_sim[below_threshold] = 0


In [13]:
from collections import deque
# Function to form clusters
def form_clusters(cosine_sim, min_samples=10, threshold=None):
    """Group rows of a similarity matrix into connected components.

    Two rows are linked when their similarity entry counts as an edge; a
    cluster is the set of all rows reachable from a seed row by following
    such links. The matrix is expected to be symmetric, as a cosine
    similarity matrix is.

    Args:
        cosine_sim: Square 2-D array of pairwise similarities. It is only
            read, never modified.
        min_samples: Smallest cluster size that is kept. Smaller components
            are discarded.
        threshold: If given, an entry is an edge when it is ``>= threshold``.
            If ``None`` (default), any non-zero entry is an edge, which suits
            a matrix whose sub-threshold entries were already set to zero.

    Returns:
        A list of sets of integer row positions, one set per retained
        cluster, ordered by the smallest row position they contain.
    """
    sim = np.asarray(cosine_sim)
    # Boolean adjacency lives in a new array, so the caller's matrix stays intact.
    adjacency = sim != 0 if threshold is None else sim >= threshold
    visited = np.zeros(adjacency.shape[0], dtype=bool)
    clusters = []
    # Rows without a single edge never start a cluster.
    for seed in np.flatnonzero(adjacency.any(axis=1)):
        if visited[seed]:
            continue
        # Breadth-first walk over the not-yet-visited neighbours of the seed.
        visited[seed] = True
        cluster = {int(seed)}
        queue = deque([seed])
        while queue:
            current = queue.popleft()
            fresh = np.flatnonzero(adjacency[current] & ~visited)
            visited[fresh] = True
            cluster.update(fresh.tolist())
            queue.extend(fresh)
        if len(cluster) >= min_samples:
            clusters.append(cluster)
            print(f"Clusters formed: {len(clusters)}")

    return clusters

# Form the clusters from the thresholded similarity matrix. A copy is passed
# so the matrix built in the previous cell is left untouched whatever
# form_clusters does to its argument.
clusters = form_clusters(cosine_sim.copy())


In [14]:
# Print every cluster as a numbered block of "agency - headline" lines.
for idx, cluster in enumerate(clusters):
    print(f"Cluster {idx + 1}:")
    # Cluster members are row positions in X, hence the positional lookup.
    members = X.iloc[list(cluster)]
    for _, row in members.iterrows():
        print(f" {row['agency']} - {row['headline']}")
    print()

In [15]:
# Label every row of X with the id of the cluster that contains its row
# position; rows that belong to no cluster keep the label -1.
labels = np.full(len(X), -1)
for cluster_id, cluster_indices in enumerate(clusters):
    labels[list(cluster_indices)] = cluster_id
cluster_labels = labels.tolist()
X['cluster'] = cluster_labels
X

In [16]:
# Rows whose cluster label is shared only by headlines from a single agency.
agencies_in_cluster = X.groupby('cluster')['agency'].transform('nunique')
clusters_to_drop = X[agencies_in_cluster == 1]
clusters_to_drop

In [17]:
# Remove every row whose cluster label appears in clusters_to_drop.
drop_cluster_ids = clusters_to_drop['cluster'].unique()
in_dropped_cluster = X['cluster'].isin(drop_cluster_ids)
X_filtered = X.loc[~in_dropped_cluster]
X_filtered

In [18]:
# View all rows of X ordered by their cluster label.
X.sort_values('cluster')

In [19]:
# Keep only the rows that carry a cluster label other than -1.
# NOTE(review): this rebinds X to a shorter frame, while `clusters` stores row
# positions into the unfiltered X. Re-running the cells that index X by those
# positions (X.iloc[...] / cluster_labels[...]) after this one will misalign
# or fail; consider a separate name for the filtered frame.
X = X.loc[X['cluster'].ne(-1)]
X.sort_values('cluster')