In [10]:
import pandas as pd
import numpy as np
import nltk.data
import itertools
import spacy
from sentence_transformers import SentenceTransformer, util

In [11]:
# Load the raw table from a CSV file in the working directory.
# Later cells read its "lead", "headline", "cluster", "outlet" and "body" columns.
df_clusters_raw = pd.read_csv("good_clusters_1.csv")

In [12]:
# De-duplicate first on 'lead', then on 'headline'. With keep='last', the
# final row of each group of repeats is the one that survives.
df_clusters = (
    df_clusters_raw
    .drop_duplicates(subset=['lead'], keep='last')
    .drop_duplicates(subset=['headline'], keep='last')
)

In [13]:
# Peek at a window of 15 distinct cluster ids to choose one to inspect.
# A set has no guaranteed order, so this slice is an arbitrary sample, not a sorted range.
list(set(df_clusters["cluster"]))[20:35]

[53, 56, 57, 59, 61, 66, 68, 69, 70, 75, 77, 78, 84, 86, 90]

In [14]:
# Cluster id under inspection; df_clust holds only the rows of that cluster.
cnum = 69
in_selected_cluster = df_clusters["cluster"].eq(cnum)
df_clust = df_clusters[in_selected_cluster]

In [15]:
# Show the headlines of the selected cluster.
df_clust["headline"].values

array(['Joe Biden Dismisses Report That He Told False War Story',
       "Joe Biden told moving military story at campaign stop - but it 'never happened,' report says",
       "Joe Biden's War Story -- 2020 Democrat Fabricates Story in New Hampshire",
       'Biden misstated details of war story on the campaign trail: Report',
       'As he campaigns for president, Joe Biden tells a moving but false war story',
       'CNN pundit defends Biden for war story by comparing him to Reagan',
       'Report: Joe Biden Fabricated Emotional Story About Afghanistan War',
       "Mark Steyn: Biden's war story controversy latest in long line of verbal mistakes",
       "Biden pushes back on report refuting disputed war story: 'I don't see what the problem is'",
       'Biden defends his telling of a harrowing war story after report he got facts wrong',
       'Biden denies conflating details of war story',
       "Donna Brazile defends Biden over disputed war story: 'He is heartfelt'",
       'Joe

In [16]:
# Distinct outlets represented in the selected cluster.
set(df_clust["outlet"].values)

{'ABC News',
 'Breitbart',
 'Fox News',
 'NBC News',
 'National Review',
 'The New York Times',
 'The Washington Post',
 'USA Today'}

Some code sourced from: https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1

In [17]:
duplicate_headlines = []
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

# Dot-product score above which two headlines are treated as near-duplicates.
HEADLINE_DUP_THRESHOLD = .85

headlines = df_clust["headline"].to_list()

# With fewer than two headlines there is no pair to compare.
if len(headlines) >= 2:
    # Encode every headline once instead of once per pair, then score all
    # pairs in a single call (entry [a, b] is the dot score of a and b).
    # NOTE(review): batched encoding may differ from one-at-a-time encoding
    # in the last decimals of a score — confirm this is acceptable.
    headline_embeddings = model.encode(headlines)
    headline_scores = util.dot_score(headline_embeddings, headline_embeddings).cpu().numpy()

    # Visit pairs in itertools.combinations order so the printed lines and the
    # recorded headlines appear in the same order as a pair-by-pair loop.
    for first, second in itertools.combinations(range(len(headlines)), 2):
        score = float(headline_scores[first, second])
        if score > HEADLINE_DUP_THRESHOLD:
            print(score, headlines[first], "####", headlines[second])
            # Only the first headline of a matching pair is recorded, so the
            # second one is the copy that survives the later filter.
            duplicate_headlines.append(headlines[first])

0.8824230432510376 Report: Joe Biden Fabricated Emotional Story About Afghanistan War #### Joe Biden Unapologetic for Fabricating Afghan War Story
0.8600894212722778 Biden defends his telling of a harrowing war story after report he got facts wrong #### Biden denies conflating details of war story


In [18]:
# Headlines recorded as near-duplicates (first member of each matching pair).
duplicate_headlines

['Report: Joe Biden Fabricated Emotional Story About Afghanistan War',
 'Biden defends his telling of a harrowing war story after report he got facts wrong']

In [19]:
# (rows, columns) of the selected cluster at this point in the notebook.
df_clust.shape

(13, 18)

In [20]:
# Keep only the rows whose headline was not recorded as a near-duplicate.
is_flagged_headline = df_clust["headline"].isin(duplicate_headlines)
df_clust = df_clust.loc[~is_flagged_headline]

In [23]:
# Load the English Punkt sentence tokenizer from the NLTK data directory.
# NOTE(review): presumably requires the 'punkt' resource to be installed
# (e.g. via nltk.download) and an NLTK version that still ships the pickle — confirm.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [24]:
# Not used in this cell; kept in case another cell still references it.
sentences_all = []

# Split every article body into sentences, tagging each with its outlet.
# Records are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
sentence_records = []
for index, row in df_clust.iterrows():
    for sent in tokenizer.tokenize(row['body']):
        sentence_records.append({"outlet": row['outlet'], "sentence": sent})

# Passing columns= keeps both columns present even when no sentence was produced.
df_sentences = pd.DataFrame(sentence_records, columns=["outlet", "sentence"])

In [25]:
# (rows, columns): one row per tokenized sentence, with its outlet.
df_sentences.shape

(387, 2)

## Remove duplicate sentences

Some code sourced from: https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1

In [None]:
duplicate_sentences = []

# Dot-product score above which two sentences are treated as duplicates.
SENTENCE_DUP_THRESHOLD = .98

sentence_texts = df_sentences["sentence"].to_list()

# With fewer than two sentences there is no pair to compare.
if len(sentence_texts) >= 2:
    # Encode each sentence once instead of twice per pair; with a few hundred
    # sentences the pair count is in the tens of thousands, so per-pair
    # encoding dominates the runtime.
    # NOTE(review): batched encoding may differ from one-at-a-time encoding
    # in the last decimals of a score — confirm this is acceptable.
    sentence_embeddings = model.encode(sentence_texts)
    sentence_scores = util.dot_score(sentence_embeddings, sentence_embeddings).cpu().numpy()

    # Visit pairs in itertools.combinations order so output and recorded
    # sentences appear in the same order as a pair-by-pair loop.
    for first, second in itertools.combinations(range(len(sentence_texts)), 2):
        score = float(sentence_scores[first, second])
        if score > SENTENCE_DUP_THRESHOLD:
            print(score, sentence_texts[first], "####", sentence_texts[second])
            # NOTE(review): the text of the first member is recorded. For two
            # identical sentences that text also matches the second member, so
            # a later filter by text drops every copy — confirm this is intended.
            duplicate_sentences.append(sentence_texts[first])

0.9999998807907104 “We can lose a vice president,” he said, recounting his words to a crowd during an event on Friday. #### “We can lose a vice president,” he said, recounting his words to a crowd during an event on Friday.
0.9999998807907104 “We can lose a vice president,” he said, recounting his words to a crowd during an event on Friday. #### “We can lose a vice president,” he said, recounting his words to a crowd during an event on Friday.
0.9999998807907104 “We can’t lose many more of these kids. #### “We can’t lose many more of these kids.
0.9999998807907104 “We can’t lose many more of these kids. #### “We can’t lose many more of these kids.
0.9999998807907104 “We can’t lose many more of these kids. #### “We can’t lose many more of these kids.
0.9999997019767761 Not a joke.” His story involved the captain dramatically telling Biden he didn't want the medal because his comrade ended up dying. #### Not a joke.” His story involved the captain dramatically telling Biden he didn't wan

0.9999995231628418 And that soldier, Kyle J. #### And that soldier, Kyle J.
0.9999999403953552 White, never had a Silver Star, or any other medal, pinned on him by Biden. #### White, never had a Silver Star, or any other medal, pinned on him by Biden.
1.0000001192092896 At a White House ceremony six years after Biden’s visit, White stood at attention as President Barack Obama placed a Medal of Honor, the nation’s highest award for valor, around his neck. #### At a White House ceremony six years after Biden’s visit, White stood at attention as President Barack Obama placed a Medal of Honor, the nation’s highest award for valor, around his neck.
0.9999997615814209 “He died. #### “He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.99999952

In [None]:
# Drop every row whose sentence text was recorded as a duplicate. Matching is
# by text, so all rows sharing a recorded sentence are removed together.
is_flagged_sentence = df_sentences["sentence"].isin(duplicate_sentences)
df_sentences = df_sentences.loc[~is_flagged_sentence]

In [None]:
# Plain list of the remaining sentences, in frame order, for the embedder.
sentences_all_embed = df_sentences["sentence"].to_list()
# Show a short preview only; displaying the whole list floods the cell
# output without adding information.
sentences_all_embed[:10]

## Convert to sentence embeddings

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer

Some code sourced from: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

In [None]:
# Embed every remaining sentence, then scale each embedding row to unit
# L2 norm before it is handed to the clustering step.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
raw_embeddings = embedder.encode(sentences_all_embed, show_progress_bar=True)
row_norms = np.linalg.norm(raw_embeddings, axis=1, keepdims=True)
corpus_embeddings = raw_embeddings / row_norms

In [None]:
# Number of embedding rows; presumably one per entry of sentences_all_embed.
len(corpus_embeddings)

## Perform clustering

Some code sourced from: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html

In [None]:
# Agglomerative clustering with no preset cluster count; distance_threshold
# decides where merging stops.
clustering_model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=1.05,
    compute_distances=True,
)
cluster_assignment = clustering_model.fit(corpus_embeddings).labels_

# Group sentences by cluster label. Each stored sentence carries the literal
# "777777" suffix.
clustered_sentences = {}
for position, label in enumerate(cluster_assignment):
    clustered_sentences.setdefault(label, []).append(sentences_all_embed[position] + "777777")

In [None]:
# Marker that the clustering cell appends to every stored sentence.
SENTENCE_MARKER = "777777"

# Print each cluster's sentences, one per line, followed by a blank line.
for i, cluster in clustered_sentences.items():
    print("Cluster ", i)
    for article in cluster:
        text = article.split("###")[0]
        # Splitting on "###" alone leaves the "777777" marker attached, so
        # strip it when it is the trailing part of the stored string.
        if text.endswith(SENTENCE_MARKER):
            text = text[:-len(SENTENCE_MARKER)]
        print(text)
    print("")

In [None]:
# Attach the cluster label of each sentence (labels are positional, one per
# row). assign() returns a new frame, which avoids the SettingWithCopyWarning
# raised when a column is set on a frame produced by boolean .loc filtering.
df_sentences = df_sentences.assign(cluster=clustering_model.labels_)

In [None]:
# Preview the first rows of the sentence table.
df_sentences.head()

## Remove clusters with only one outlet

In [None]:
one_outlet_clusters = []
# Visit each cluster id once, in first-appearance order. Iterating over every
# row would repeat the same check (and the same printed line) once per
# sentence in the cluster.
for clust in dict.fromkeys(df_sentences["cluster"].to_list()):
    df_temp = df_sentences.loc[df_sentences["cluster"] == clust]
    outlets = set(df_temp["outlet"])
    # A cluster whose sentences all come from one outlet is recorded for removal.
    if len(outlets) < 2:
        one_outlet_clusters.append(clust)
        print(str(clust), outlets)

In [None]:
# Collapse repeated ids so each single-outlet cluster is listed once
# (order is not preserved by the set round-trip).
one_outlet_clusters = list(set(one_outlet_clusters))

In [None]:
# Keep only sentences whose cluster spans at least two outlets.
in_single_outlet_cluster = df_sentences["cluster"].isin(one_outlet_clusters)
df_sentences_filter = df_sentences[~in_single_outlet_cluster]

In [None]:
# (rows, columns) of the full sentence table, for comparison with the filtered one.
df_sentences.shape

In [None]:
# (rows, columns) after single-outlet clusters are removed.
df_sentences_filter.shape

In [None]:
# Sentences assigned to cluster 0 among the multi-outlet clusters.
df_sentences_filter.loc[df_sentences_filter["cluster"].eq(0), "sentence"].values

## Remove low similarity sentences from clusters

In [370]:
# First 20 distinct cluster ids left after filtering (set order is arbitrary, not sorted).
list(set(df_sentences_filter["cluster"]))[:20]

[1, 2, 3, 4, 133, 6, 7, 8, 5, 11, 12, 14, 15, 16, 19, 20, 21, 22, 23, 24]

In [371]:
# Mean pairwise dot score a cluster must exceed to be reported.
CLUSTER_MEAN_THRESHOLD = .75

# For every remaining cluster, score all sentence pairs and report the
# clusters whose mean pairwise score is above the threshold.
for clust_num in list(set(df_sentences_filter["cluster"])):
    df_sent_clust = df_sentences_filter[df_sentences_filter["cluster"]==clust_num]
    cluster_sentences = df_sent_clust["sentence"].to_list()

    # A cluster with fewer than two sentences has no pairs; skipping it avoids
    # taking the mean of an empty list (NaN plus a RuntimeWarning).
    if len(cluster_sentences) < 2:
        continue

    # Encode each sentence once instead of twice per pair, then score all
    # pairs in one call.
    # NOTE(review): batched encoding may differ from one-at-a-time encoding
    # in the last decimals of a score — confirm this is acceptable.
    cluster_embeddings = model.encode(cluster_sentences)
    pairwise = util.dot_score(cluster_embeddings, cluster_embeddings).cpu().numpy()

    # The strict upper triangle, read row by row, lists each unordered pair
    # once in the same order as itertools.combinations.
    rows, cols = np.triu_indices(len(cluster_sentences), k=1)
    scores_all = pairwise[rows, cols].tolist()

    mean_score = np.mean(scores_all)
    if mean_score > CLUSTER_MEAN_THRESHOLD:
        print(clust_num, mean_score, scores_all)

133 0.8987131118774414 [0.8987131118774414]
32 0.7733260989189148 [0.723791778087616, 0.7031174302101135, 0.8930690884590149]
54 0.7699702978134155 [0.7699702978134155]
84 0.9046187996864319 [0.9046187996864319]
88 0.8552548289299011 [0.8160620927810669, 0.8929188847541809, 0.8567835092544556]
91 0.8186214566230774 [0.8186214566230774]
93 0.9231092154979705 [0.9420449733734131, 0.9573326706886292, 0.9721425771713257, 0.8341306447982788, 0.9673680067062378, 0.956412672996521, 0.9004188179969788, 0.9609381556510925, 0.8598504662513733, 0.8804531693458557]
99 0.9687110185623169 [0.9687110185623169]
108 0.8379121025403341 [0.8017549514770508, 0.7747612595558167, 0.9372200965881348]
115 0.93657386302948 [0.93657386302948]
117 0.9340170621871948 [0.9340170621871948]
119 0.7574264407157898 [0.7574264407157898]
123 0.9175558090209961 [0.9175558090209961]


In [374]:
# Inspect the sentences of one reported cluster by id.
sent_clust_num = 32
belongs_to_cluster = df_sentences_filter["cluster"].eq(sent_clust_num)
df_sent_clust = df_sentences_filter.loc[belongs_to_cluster]
df_sent_clust["sentence"].values

array(['In an interview Thursday with The Post and Courier, a South Carolina newspaper, Biden pushed back on the report, saying that the "central point" of the story he told is accurate.',
       '"I don’t understand what they’re talking about, but the central point is it was absolutely accurate what I said,” Biden told the paper.',
       '“I don’t understand what they’re talking about, but the central point is it was absolutely accurate what I said,” Biden told The Post and Courier when asked about the controversy.'],
      dtype=object)