In [2]:
import re
import coreferee, spacy
import nltk.data
import numpy as np
import itertools
import pandas as pd
from spacy import displacy
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
from rouge_score import rouge_scorer

In [2]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Data 620: Final Project

**by Matthew Lucich**

## Data Preprocessing (code sourced from my master's capstone)

Here article data is being loaded that is already grouped into events (articles covering the same story) via the cluster number. I chose the cluster below due to the high number of outlets covering the story.

In [3]:
df_clusters_raw = pd.read_csv("good_clusters_1.csv")

In [4]:
df_clusters = df_clusters_raw.drop_duplicates(subset=['lead'], keep='last')
df_clusters = df_clusters.drop_duplicates(subset=['headline'], keep='last')

In [384]:
cnum = 69
df_clust = df_clusters.loc[df_clusters["cluster"]==cnum]

In [385]:
df_clust[['date_publish', 'outlet', 'headline', 'lead', 'body']]

Unnamed: 0,date_publish,outlet,headline,lead,body
170,2019-08-29 00:00:00,The New York Times,Joe Biden Dismisses Report That He Told False ...,After The Washington Post reported that Mr. Bi...,"Joseph R. Biden Jr., whose habit of verbal mis..."
171,2019-08-29 14:12:49,Fox News,Joe Biden told moving military story at campai...,Former Vice President Joe Biden's campaign has...,Former Vice President Joe Biden's campaign has...
172,2019-08-29 14:27:19,National Review,Joe Biden's War Story -- 2020 Democrat Fabrica...,During a campaign stop in New Hampshire on Fri...,"Joe Biden speaks in Des Moines, Iowa, August 1..."
173,2019-08-29 15:45:00,ABC News,Biden misstated details of war story on the ca...,Former Vice President Joe Biden is facing ques...,Former Vice President Joe Biden is facing ques...
174,2019-08-29 16:37:57,The Washington Post,"As he campaigns for president, Joe Biden tells...","Biden’s tale of heroism, an emotional highligh...",Joe Biden painted a vivid scene for the 400 pe...
175,2019-08-29 16:50:27,Fox News,CNN pundit defends Biden for war story by comp...,CNN political commentator Paul Begala went to ...,CNN political commentator Paul Begala went to ...
176,2019-08-29 20:25:12,Breitbart,Report: Joe Biden Fabricated Emotional Story A...,Joe Biden is under fire for fabricating an emo...,Former Vice President Joe Biden is under fire ...
177,2019-08-29 21:49:16,Fox News,Mark Steyn: Biden's war story controversy late...,Former Vice President Joe Biden's emotional ye...,Former Vice President Joe Biden's emotional ye...
178,2019-08-29 22:00:03,Fox News,Biden pushes back on report refuting disputed ...,Former Vice President Joe Biden responded Thur...,Former Vice President Joe Biden responded Thur...
180,2019-08-29 22:16:00,NBC News,Biden defends his telling of a harrowing war s...,Former Vice President Joe Biden defended a war...,Biden defends his telling of a harrowing war s...


#### Remove duplicate headlines

In [386]:
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

Here I remove duplicate headlines via a sentence similarity transformer model. If a sentence pair's similarity score is above .95 (heuristic chosen based on manual review), only the second (most recent) instance is kept. Some of the duplicate_headlines function code sourced from [Huggingface](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1).

In [387]:
def duplicate_headlines(df_clust):
    duplicate_headlines = []
    for i in itertools.combinations(df_clust["headline"].to_list(), 2):
        sent_one = model.encode(i[0])
        sent_two = model.encode(i[1])

        # Compute dot score between query and all document embeddings
        scores = util.dot_score(sent_one, sent_two)[0].cpu().tolist()

        sen_tup1 = (i[0],"str")
        sen_tup2 = (i[1], "str")

        # Combine docs & scores
        doc_score_pairs = list(zip(sen_tup1, sen_tup2, scores))

        # Sort by decreasing score
        doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

        # Output passages & scores
        for doc, doc2, score in doc_score_pairs:
            if score > .95:
                print(score, doc, "####", doc2)
                duplicate_headlines.append(doc)
    return duplicate_headlines

In [388]:
duplicate_headlines = duplicate_headlines(df_clust)

In [389]:
df_clust = df_clust.loc[~df_clust["headline"].isin(duplicate_headlines)]

#### Tokenize articles into sentences

In [390]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [391]:
def sentence_df(df_clust):
    df_sentences = pd.DataFrame(columns=["outlet", "sentence"])
    for index, row in df_clust.iterrows():
        sentence_list = tokenizer.tokenize(row['body']) 
        for sent in sentence_list:
            df_sentences = df_sentences.append({"outlet": row['outlet'], "sentence": sent}, ignore_index=True)
    return df_sentences

In [392]:
df_sentences = sentence_df(df_clust)

#### Remove duplicate sentences

Similar to the duplicate headline function, we remove duplicate sentences as determined by a sentence similairity score. This is necessary since some outlets are apart of associations which allow for text to be borrowed by outlets within their networks. Additionally, sometimes an outlet has published multiple stories on the same event and reuse certain sentences. Some of the remove_duplicate_sentences function code sourced from [Huggingface](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1).

In [393]:
def remove_duplicate_sentences(df_sentences):
    duplicate_sentences = []

    for i in itertools.combinations(df_sentences["sentence"].to_list(), 2):
        sent_one = model.encode(i[0])
        sent_two = model.encode(i[1])

        # Compute dot score between query and all document embeddings
        scores = util.dot_score(sent_one, sent_two)[0].cpu().tolist()

        sen_tup1 = (i[0],"str")
        sen_tup2 = (i[1], "str")

        # Combine docs & scores
        doc_score_pairs = list(zip(sen_tup1, sen_tup2, scores))

        # Sort by decreasing score
        doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

        # Output passages & scores
        for doc, doc2, score in doc_score_pairs:
            if score > .98:
                print(score, doc, "####", doc2)
                duplicate_sentences.append(doc)
    return duplicate_sentences

In [394]:
duplicate_sentences = remove_duplicate_sentences(df_sentences)

0.9999998807907104 “We can lose a vice president,” he said, recounting his words to a crowd during an event on Friday. #### “We can lose a vice president,” he said, recounting his words to a crowd during an event on Friday.
0.9999998807907104 “We can lose a vice president,” he said, recounting his words to a crowd during an event on Friday. #### “We can lose a vice president,” he said, recounting his words to a crowd during an event on Friday.
0.9999998807907104 “We can’t lose many more of these kids. #### “We can’t lose many more of these kids.
0.9999998807907104 “We can’t lose many more of these kids. #### “We can’t lose many more of these kids.
0.9999998807907104 “We can’t lose many more of these kids. #### “We can’t lose many more of these kids.
0.9999998807907104 “We can’t lose many more of these kids. #### “We can’t lose many more of these kids.
0.9999997019767761 Not a joke.” His story involved the captain dramatically telling Biden he didn't want the medal because his comrade e

0.9999998807907104 “We can’t lose many more of these kids. #### “We can’t lose many more of these kids.
0.9999998807907104 “We can’t lose many more of these kids. #### “We can’t lose many more of these kids.
0.9999998807907104 “We can’t lose many more of these kids. #### “We can’t lose many more of these kids.
0.999999463558197 “’Do not pin it on me, Sir! #### “’Do not pin it on me, Sir!
1.0000003576278687 Please, Sir. #### Please, Sir.
0.9999999403953552 Do not do that! #### Do not do that!
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
1.0000001192092896 Biden visited Kunar province in 2008 as a U.S. senator, not as vice president. #### Biden visited Kunar pr

0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
0.9999995231628418 He died. #### He died.
1.0000001192092896 “He refused the medal. #### “He refused the medal.
1.0000004768371582 I put it on him, he said, ‘Don’t do that to me, sir. #### I put it on him, he said, ‘Don’t do that to me, sir.
0.9999995231628418 He died. #### He died.


In [395]:
df_sentences = df_sentences.loc[~df_sentences["sentence"].isin(duplicate_sentences)]

In [396]:
sentences_all_embed = df_sentences["sentence"].to_list()


# New Method (code written for Data 620 final)

## Coreference Resolution

Load model and add conreference library to pipeline.

In [397]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('coreferee')

<coreferee.manager.CorefereeBroker at 0x7fe66e785370>

In order to properly apply Coreference Resolution, the article bodies need to first be reassembled.

In [398]:
def reassemble_articles(df_sentences):
    articles = []
    # Generate set, but retain order
    order = df_sentences["outlet"].to_list()
    # Loop outlets and reassemble articles from sentences
    for outlet in sorted(set(order), key=order.index):
        sent_list = df_sentences.loc[df_sentences["outlet"]==outlet]["sentence"].to_list()
        article_text = ". ".join(sent_list)
        articles.append(article_text)
    return articles

In [399]:
article_list_new = reassemble_articles(df_sentences)

Through the entity_list_tuples function Coreference Resolution recommendations are generated, then since SpaCy's NLP object cannot be edited, we must convert back to text, then resolve the coreferences. 

In [568]:
articles_recompiled = []

def article_recompile(doc, token_nums, tuple_dict):
    doc_to_text = []
    # Loop spaCy NLP object and append string sentences to list
    for i, token in enumerate(doc):
        if i in token_nums:
            token = tuple_dict[i]
        doc_to_text.append(token)
    # Join sentences into one article body
    article_recompiled = ' '.join(str(v) for v in doc_to_text)
    articles_recompiled.append(article_recompiled)

In [569]:
def entity_list_tuples(doc):
    entity_list_of_tuples = []
    # Loop coreference chain and extract first instance in chain (assuming it is proper form)
    for entity in doc._.coref_chains.serialize_obj()["__coreferee_chain_holder__"]:
        try:
            main_entity = re.findall(r"(?<!\()\b\w+\b(?![\)])", entity[0][0][1])[0]
        except IndexError:
            continue
        for ref in entity[0]:
            entity_list_of_tuples.append((main_entity, ref[0][0]))
    token_nums = [token_tuple[1] for token_tuple in entity_list_of_tuples]
    tuple_dict = dict((y, x) for x, y in entity_list_of_tuples)
    article_recompile(doc, token_nums, tuple_dict)

In [570]:
for article in article_list_new:
    print("NEW ARTICLE")
    doc = nlp(article)
    articles_recompiled.append(entity_list_tuples(doc))

NEW ARTICLE
NEW ARTICLE
NEW ARTICLE
NEW ARTICLE
NEW ARTICLE
NEW ARTICLE
NEW ARTICLE
NEW ARTICLE


In [403]:
articles_recompiled_clean = [i for i in articles_recompiled if i]

### Compare Article Versions with and without Coreference Resolution

#### With Coreference Resolution

One example of Coreference Resolution seen below is seen in "who says he was then vice president", where "he" is replaced with "Biden". This will potentially help with similarity scores when the sentences are broken out into to clauses and retain the proper name of the subject.

In [566]:
articles_recompiled_clean[3][:1500]

'Former Vice President Joe Biden is facing questions about a harrowing war story Biden tells on the campaign trail after a report in the Washington Post pointed out multiple inaccuracies in Biden account .. Interested in Joe Biden ? . Add Joe Biden as an interest to stay up to date on the latest Joe Biden news , video , and analysis from ABC News .. Add Interest \n The story , which Biden most recently told during a town hall event in Hanover , New Hampshire last week , details the story of a " young Navy captain , " as Biden puts story , stoically trying to reject a military medal from Biden -- who says Biden was then vice president -- after the captain failed to recover a fellow soldier on the battlefield .. As Biden tells story , the Navy captain rappelled down a 60 - foot ravine in the mountains in the Kunar province of Afghanistan to try to retrieve the fellow soldier but was unsuccessful .. Biden says that afterward , a general wanted Biden to fly to Afghanistan to pin a Silver S

#### Without Coreference Resolution (original text)

In [567]:
article_list_new[3][:1500]

'Former Vice President Joe Biden is facing questions about a harrowing war story he tells on the campaign trail after a report in the Washington Post pointed out multiple inaccuracies in his account.. Interested in Joe Biden?. Add Joe Biden as an interest to stay up to date on the latest Joe Biden news, video, and analysis from ABC News.. Add Interest\nThe story, which he most recently told during a town hall event in Hanover, New Hampshire last week, details the story of a "young Navy captain," as he puts it, stoically trying to reject a military medal from Biden -- who says he was then vice president -- after the captain failed to recover a fellow soldier on the battlefield.. As Biden tells it, the Navy captain rappelled down a 60-foot ravine in the mountains in the Kunar province of Afghanistan to try to retrieve the fellow soldier but was unsuccessful.. Biden says that afterward, a general wanted him to fly to Afghanistan to pin a Silver Star on the captain, a medal the captain ref

## Dependency Parsing

The original sentence...

In [551]:
df_sentences.loc[df_sentences.index == 111]["sentence"].values[0]

'Some told him it was too risky, but Biden said he brushed off their concerns.'

...is broken into independent clauses, based around verb. In this example, only the below clause is retained. We can see "he" was replaced with "Biden" due to Conreference Resolution and that the clause is specific aspect of the story that will likely make for more accurate comparisons with aspects from other news articles.

In [552]:
df_sentences_new.loc[df_sentences_new["sentence"].str.contains("brushed off")]["sentence"].values[0]

'Biden brushed off people concerns'

In [494]:
displacy.render(nlp(df_sentences.loc[df_sentences.index == 111]["sentence"].values[0]), style='dep')

In [406]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

### Algorithm: Parse Clauses

The following algorithm, which needs much future improvement, is one where I assume independent clauses center around a verb. Therefore, for each verb detected, it is associated with all its children and its children's children and so on, as determined by SpaCy's dependency parser. The exceptions around what children are chosen to follow is where the core benefit of the algorithm comes in. After research and manual review, the dependencies: ccomp, advcl, punct, appos, relcl, and dep, were determined to be children of verbs that should not be followed. <br>

Note, ccomp stands for clausal complement, advcl for adverbial clause modifier, punct for punctuation, appos for appositional modifier, relcl for relative clause modifier, and dep for unclassified dependent.

In [540]:
word_tuple_list = []

def find_children(verb, clause_num, sentence_idx, article_idx):
    """Loop children of verbs (if not dep in if condition) and then recursively loop through children of their children"""
    for child in verb.children:
        if (child.dep_ == "ccomp") or (child.dep_ == "advcl") or (child.dep_ == "punct") \
        or (child.dep_ == "appos") or (child.dep_ == "relcl") or (child.dep_ == "dep"):
            continue
        else:
            word_tuple_list.append((child.text, child.tag_, child.dep_, child.i, clause_num, sentence_idx, article_idx))
            find_children(child, clause_num, sentence_idx, article_idx)

def find_verbs(doc, sentence_idx, article_idx):
    """Loop through each word in sentence until verb is found, then loop through select children."""
    clause_num = 0
    for token in doc:
            if (token.pos_ == "VERB"):
                word_tuple_list.append((token.text, token.tag_, token.dep_, token.i, clause_num, sentence_idx, article_idx))
                find_children(token, clause_num, sentence_idx, article_idx)
                clause_num += 1

In [541]:
for article_idx, article in enumerate(articles_recompiled_clean):
    sentence_list = tokenizer.tokenize(article)
    for sentence_idx, sentence in enumerate(sentence_list):
        doc = nlp(sentence)
        find_verbs(doc, sentence_idx, article_idx)

#### Recompile clauses from words

Here I put words that make up individual clauses back in order. They were out of order since the previous algorithm looks for verbs, then finds their children in any direction. Note, I also use this opportunity to prune clauses with only three words or less. Finally, I add the clauses to a dataframe that reflects the format of df_sentences.

In [542]:
df_clauses = pd.DataFrame.from_records(word_tuple_list, columns = ["Text", "Tag", "Dep", "Position", \
                                                                   "Clause", "Sentence", "Article"])

In [543]:
def create_sentence_df(df_clauses):
    """Reassemble individual words of clauses into the proper order and then join into single string clause"""
    df_sentences_new = pd.DataFrame(columns=["outlet", "sentence"])
    order = df_clauses["Article"].to_list()
    for article in sorted(set(order), key=order.index):
        order_sent = df_clauses.loc[(df_clauses["Article"]==article)]["Sentence"].to_list()
        for sentence in sorted(set(order_sent), key=order_sent.index):
            order_clause = df_clauses.loc[(df_clauses["Article"]==article) & (df_clauses["Sentence"]==sentence)]["Clause"].to_list()
            for clause in sorted(set(order_clause), key=order_clause.index):
                clause_rows = df_clauses.loc[(df_clauses["Clause"]==clause) 
                                              & (df_clauses["Sentence"]==sentence)
                                              & (df_clauses["Article"]==article)].sort_values(["Position"])
                # Do not include clauses with three or less words
                if len(clause_rows["Text"].to_list()) <= 3:
                    continue
                sentence = " ".join(clause_rows["Text"].to_list())
                df_sentences_new = df_sentences_new.append({"outlet": article, "sentence": sentence}, ignore_index=True)
    return df_sentences_new


In [544]:
df_sentences_new = create_sentence_df(df_clauses)

In [545]:
df_sentences_new = df_sentences_new.loc[df_sentences_new["sentence"]!=""]

Associate clauses with their outlets.

In [546]:
ordered_outlets = df_sentences["outlet"].to_list()
outlet_list = sorted(set(ordered_outlets), key=ordered_outlets.index)
outlet_dict = {v: k for v, k in enumerate(outlet_list)}

In [547]:
df_sentences_new = df_sentences_new.replace({"outlet": outlet_dict})

In [571]:
df_sentences_new.head()

Unnamed: 0,outlet,sentence
0,The New York Times,whose habit of verbal missteps on the 2020 cam...
1,The New York Times,Jr remarks came in response to a Washington Po...
2,The New York Times,that I said wrong
3,The New York Times,that Mr. Biden pinned a medal on a military se...
4,The New York Times,But in the story of military heroism Biden told


In [550]:
df_sentences.head()

Unnamed: 0,outlet,sentence,cluster
0,The New York Times,"Joseph R. Biden Jr., whose habit of verbal mis...",65
1,The New York Times,"In two interviews released on Thursday, Mr. Bi...",13
2,The New York Times,His remarks came in response to a Washington P...,13
3,The New York Times,“I was making the point how courageous these p...,89
4,The New York Times,"“And so, I don’t know what the problem is.",28


# Convert to Sentence Embeddings (code sourced from my master's capstone)

Here I convert the sentence text into sentence-embeddings via the all-MiniLM-L6-v2 model from SBERT.

In [418]:
sentences_all_embed_prev = df_sentences["sentence"].to_list()
sentences_all_embed_new = df_sentences_new["sentence"].to_list()

In [419]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [420]:
def text_to_embeddings(sentences_all_embed):
    corpus_embeddings = embedder.encode(sentences_all_embed, show_progress_bar=True)
    corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
    return corpus_embeddings

In [421]:
method_embeddings_prev = text_to_embeddings(sentences_all_embed_prev)
method_embeddings_new = text_to_embeddings(sentences_all_embed_new)

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

# Perform Clustering (code sourced from my master's capstone)

Agglomerative clustering is used to group similar sentences and clauses (sentence-embeddings). Parameters used were chosen based on thorough testing done during my capstone research. Some code of the agglomerative_clustering function sourced from [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html).

In [422]:
def agglomerative_clustering(corpus_embeddings, sentences_all_embed):
    clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.05, compute_distances=True) 
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_

    clustered_sentences = {}
    for sentence_id, cluster_id in enumerate(cluster_assignment):
        if cluster_id not in clustered_sentences:
            clustered_sentences[cluster_id] = []

        clustered_sentences[cluster_id].append(sentences_all_embed[sentence_id] + "777777")
    return clustered_sentences, clustering_model

In [450]:
def print_clusters(clustered_sentences):
    for i, cluster in clustered_sentences.items():
        # Filter out clusters with only one sentence or clause
        if len(cluster) < 2:
            continue
        else:
            print("Cluster ", i)
            for article in cluster:
                print(article.split("777777")[0])
            print("")

In [424]:
clusters_prev, clustering_model_prev = agglomerative_clustering(method_embeddings_prev, sentences_all_embed_prev)
clusters_new, clustering_model_new = agglomerative_clustering(method_embeddings_new, sentences_all_embed_new)


In [451]:
print_clusters(clusters_prev)

Cluster  65
Joseph R. Biden Jr., whose habit of verbal missteps on the 2020 campaign trail has concerned some Democrats, is again facing scrutiny after a new report that says he misstated multiple details in a war story he told last week.
In recent days, Biden inaccurately stated the decade when Robert F. Kenndy and Martin Luther King Jr. were assassinated, said he preferred "truth over facts," and misidentified New Hampshire as Vermont while campaigning in the early primary state.
Former Vice President Joe Biden is facing questions about a harrowing war story he tells on the campaign trail after a report in the Washington Post pointed out multiple inaccuracies in his account.
USA TODAY
WASHINGTON -- Former Vice President Joe Biden denied Thursday that he conflated details of a gripping war story he has told on the campaign trail after a Washington Post report called into question remarks by Biden last week and in previous years.

Cluster  13
In two interviews released on Thursday, Mr.

In [452]:
print_clusters(clusters_new)

Cluster  2
Jr remarks came in response to a Washington Post article
according to The Washington Post
According to the Washington Post

Cluster  18
that I said wrong
that I said wrong

Cluster  25
that Mr. Biden pinned a medal on a military service member
that tells about a solider trying to reject a medal
Embedded in Biden ’s medal story are the touchstones of candidate long career

Cluster  11
But in the story of military heroism Biden told
Well maybe Biden has seen too much real life heroics and Biden 's conflating and confusing heroics
In an interview with the Post Workman 's version of the story mirrors Biden 's
as Biden told the story
The story and story inaccuracies come to light as Biden ’s campaign
Unfortunately for Biden however the story ’s strong impression also meant
that the 76 year old former Biden had in fact conflated three different stories including one Army Staff Sgt

Cluster  17
Former Vice President Joe Biden 's campaign has been dogged by gaffes but a new error co

In [427]:
df_sentences["cluster"] = clustering_model_prev.labels_
df_sentences_new["cluster"] = clustering_model_new.labels_

In [428]:
df_sentences_new.head()

Unnamed: 0,outlet,sentence,cluster
0,The New York Times,whose habit of verbal missteps on the 2020 cam...,90
1,The New York Times,Jr remarks came in response to a Washington Po...,2
2,The New York Times,that I said wrong,18
3,The New York Times,that Mr. Biden pinned a medal on a military se...,25
4,The New York Times,But in the story of military heroism Biden told,11


### Pruning / Filters

Here I filter out clusters that include sentences or clauses from only one outlet, since our goal is to compare news aspects included in articles across outlets.

In [572]:
def prune_clusters(df_sentences):
    """Returns list of clusters that only contain one outlet"""
    one_outlet_clusters = []
    for clust in df_sentences["cluster"].to_list():
        df_temp = df_sentences.loc[df_sentences["cluster"]==clust]
        if len(set(df_temp["outlet"])) < 2:
            one_outlet_clusters.append(clust)
    return one_outlet_clusters

In [454]:
one_outlet_clusters_prev = list(set(prune_clusters(df_sentences)))
one_outlet_clusters_new = list(set(prune_clusters(df_sentences_new)))

In [455]:
df_sentences_filter_prev = df_sentences.loc[~df_sentences["cluster"].isin(one_outlet_clusters_prev)]
df_sentences_filter_new = df_sentences_new.loc[~df_sentences_new["cluster"].isin(one_outlet_clusters_new)]


# Evaluation (code written for Data 620 final)

Once again, I use the sentence similarity model to compare sentence pairs. This time I am comparing all sentence and clause combinations and retaining only pairs with similarity scores above .7, a heuristic chosen after thorough manual review. The idea is sentence and clause pairs above .7 represent text covering the same story aspect. This will ultimately let us know what news outlets are not including the core aspects of the story. We can see from the results that while the sentence comparisons lead to more successful results (as determined by filtering criteria), the clause comparison provide incremental story aspect matches. For now, including both approaches appears to be the optimal approach. Additionally, improving the clause algorithm, which currently discards a high number of clauses, will likely improve the volume of successful results.

In [456]:
def find_high_quality_aspect_matches(df_sentences_filter):
    for clust_num in list(set(df_sentences_filter["cluster"])):
        scores_all = []
        df_sent_clust = df_sentences_filter[df_sentences_filter["cluster"]==clust_num]
        for i in itertools.combinations(df_sent_clust["sentence"].to_list(), 2):
            sent_one = model.encode(i[0])
            sent_two = model.encode(i[1])

            # Compute dot score between query and all document embeddings
            scores = util.dot_score(sent_one, sent_two)[0].cpu().tolist()

            sen_tup1 = (i[0],"str")
            sen_tup2 = (i[1], "str")

            # Combine docs & scores
            doc_score_pairs = list(zip(sen_tup1, sen_tup2, scores))

            # Sort by decreasing score
            doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

            # Output passages & scores
            for doc, doc2, score in doc_score_pairs:
                scores_all.append(score)
        if np.mean(scores_all) > .7:
            print(clust_num, np.mean(scores_all), scores_all)

In [457]:
find_high_quality_aspect_matches(df_sentences_filter_prev)

130 0.7140808502833048 [0.7027899026870728, 0.694469153881073, 0.7449834942817688]
133 0.7041655977567037 [0.7004562020301819, 0.6947999596595764, 0.7172406315803528]
10 0.7078566630681355 [0.7418990731239319, 0.7039955854415894, 0.6154731512069702, 0.7471650242805481, 0.7409366965293884, 0.7188611030578613, 0.5176450610160828, 0.7137230038642883, 0.6953073143959045, 0.6507924795150757, 0.7638155221939087, 0.7188830971717834, 0.6939267516136169, 0.7385793328285217, 0.8568467497825623]
19 0.7423152526219686 [0.7074116468429565, 0.7488417625427246, 0.7706923484802246]
28 0.7574264407157898 [0.7574264407157898]
31 0.7140036026636759 [0.8055289387702942, 0.6783513426780701, 0.6581305265426636]
34 0.7338991761207581 [0.7046188116073608, 0.8622657656669617, 0.6348129510879517]
36 0.7432963490486145 [0.786224901676178, 0.7259458899497986, 0.7608432173728943, 0.6242493987083435, 0.7492883205413818, 0.8211634755134583, 0.6780215501785278, 0.7931594252586365, 0.656410813331604, 0.837656497955322

In [458]:
find_high_quality_aspect_matches(df_sentences_filter_new)

10 0.8400708436965942 [0.8400708436965942]
12 0.7096285025278727 [0.8252482414245605, 0.6208471655845642, 0.6827901005744934]
13 0.7280360658963522 [0.6171013116836548, 0.9999995827674866, 0.6911301612854004, 0.6171013116836548, 0.7517538666725159, 0.6911301612854004]
17 0.7332401971022288 [0.7421143054962158, 0.7893087267875671, 0.6902952194213867, 0.7710067629814148, 0.7316427826881409, 0.6750733852386475]
18 0.9999995827674866 [0.9999995827674866]
71 0.9461154937744141 [0.9461154937744141]


#### Incremental Aspects

Below we can see the incremental aspects provided by the clause extraction comparisons. Cluster 13 at first glance does not appear to be helpful in providing information, however, considering the importance of determining the source of the original article, it is important to highlight which outlets noted which source(s). Cluster 71 is a more prototypical example of what this analysis is trying to uncover, where the exact same story aspects are included.

**Cluster  12** <br>
Former Vice President Joe Biden responded Thursday to a Washington Post report claiming <br>
Biden responded Thursday to a Washington Post report claiming <br>
The Biden campaign has not responded to USA TODAY 's request for comment


**Cluster  13** <br>
according to the Post <br>
as the Post reported <br>
According to the Post <br>
According to the Post ’s reporting

**Cluster  71** <br>
the Pentagon has no record of a U.S. Army captain receiving a Silver Star during the period <br>
The Pentagon has no record of an Army captain receiving a Silver Star in Iraq during the time period

### Compute ROUGE

ROUGE, Recall-Oriented Understudy for Gisting Evaluation, is a natural language processing metric used primarily for evaluating the quality of generated summary text compared to a reference summary. As mentioned, my process for clause comparisons still needs much improvement, though we can see moderate results are being obtained on a subset of the results.

In [11]:
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

In [25]:
test_list_according = ['according to the Post',
                      'as the Post reported',
                      'According to the Post',
                      'According to the Post ’s reporting']

test_list_pentagon = ['the Pentagon has no record of a U.S. Army captain receiving a Silver Star during the period',
                      'The Pentagon has no record of an Army captain receiving a Silver Star in Iraq during the time period',]

In [26]:
def compute_rouge(test_list):
    all_precision = []
    for i in itertools.combinations(test_list, 2):
                scores = scorer.score(i[0], i[1])
                print(scores['rouge1'])
                all_precision.append(scores['rouge1'].precision)

    print("Avg. Precision of Cluster")
    print((sum(all_precision)/len(all_precision)))

In [27]:
compute_rouge(test_list_according)

Score(precision=0.5, recall=0.5, fmeasure=0.5)
Score(precision=1.0, recall=1.0, fmeasure=1.0)
Score(precision=0.6666666666666666, recall=1.0, fmeasure=0.8)
Score(precision=0.5, recall=0.5, fmeasure=0.5)
Score(precision=0.5, recall=0.75, fmeasure=0.6)
Score(precision=0.6666666666666666, recall=1.0, fmeasure=0.8)
Avg. Precision of Cluster
0.6388888888888888


In [28]:
compute_rouge(test_list_pentagon)

Score(precision=0.7894736842105263, recall=0.8333333333333334, fmeasure=0.8108108108108109)
Avg. Precision of Cluster
0.7894736842105263


# References

spaCy: https://spacy.io/ <br>
SentenceTransformers: https://www.sbert.net/ <br>
Sentence similarity: https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1 <br>
Coreference Resolution: https://github.com/explosion/coreferee <br>
CS 124: From Languages to Information: http://web.stanford.edu/class/cs124/ <br>
scikit-learn, Agglomerative Clustering: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html <br>
The Most Common Evaluation Metrics In NLP, Towards Science: https://towardsdatascience.com/the-most-common-evaluation-metrics-in-nlp-ced6a763ac8b