In [926]:
import pandas as pd
import numpy as np
import re
import nltk
from datetime import datetime
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.cluster import KMeansClusterer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn import cluster
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matthewlucich/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [120]:
#df_polusa_raw = pd.read_csv("2017_unbalanced.csv")
df_polusa_raw = pd.read_csv("2019_unbalanced.csv")

In [121]:
df_polusa_raw.shape

(110193, 10)

In [122]:
df_polusa = df_polusa_raw.dropna()

In [123]:
df_polusa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93610 entries, 0 to 110192
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 93610 non-null  int64 
 1   date_publish       93610 non-null  object
 2   outlet             93610 non-null  object
 3   headline           93610 non-null  object
 4   lead               93610 non-null  object
 5   body               93610 non-null  object
 6   authors            93610 non-null  object
 7   domain             93610 non-null  object
 8   url                93610 non-null  object
 9   political_leaning  93610 non-null  object
dtypes: int64(1), object(9)
memory usage: 7.9+ MB


In [124]:
df_polusa['headline'] = df_polusa['headline'].astype(str)
df_polusa['head_lead'] = df_polusa['headline'] + "###" + df_polusa['lead']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


## Filter out irrelevant articles

In [125]:
df_polusa.head(2)

Unnamed: 0,id,date_publish,outlet,headline,lead,body,authors,domain,url,political_leaning,head_lead
0,52876988,2019-07-01 00:00:00,NPR,"News Brief: Trump-Kim Weekend Meeting, Hong Ko...",President Trump met North Korea's leader at th...,"News Brief: Trump-Kim Weekend Meeting, Hong Ko...",Rachel Martin;Steve Inskeep,www.npr.org,https://www.npr.org/2019/07/01/737535379/morni...,LEFT,"News Brief: Trump-Kim Weekend Meeting, Hong Ko..."
1,4011336,2019-07-01 00:00:00,The New York Times,Al Kelly of Visa on the Reagan White House and...,"Everywhere and nowhere, he works to keep Visa ...",You’re very involved with the Catholic Church....,David Gelles,www.nytimes.com,https://www.nytimes.com/2019/07/01/business/al...,LEFT,Al Kelly of Visa on the Reagan White House and...


In [126]:
df_polusa = df_polusa.loc[~df_polusa["url"].str.contains("opinion")] # Remove opinion
df_polusa = df_polusa.loc[~df_polusa["url"].str.contains("Your Monday Briefing")]
df_polusa = df_polusa.loc[~df_polusa["url"].str.contains("Shots - Health News")]
df_polusa = df_polusa.loc[~df_polusa["url"].str.contains("GLOBAL MARKETS-")]
df_polusa = df_polusa.loc[~df_polusa["url"].str.contains("CANADA STOCKS-")]
df_polusa = df_polusa.loc[~df_polusa["url"].str.contains("Reuters Select")]
df_polusa = df_polusa.loc[~df_polusa["url"].str.contains("From the archive")]
df_polusa = df_polusa.loc[~df_polusa["url"].str.contains("Happy Hour Roundup")]

In [127]:
location_stock = "US STOCKS|UK Stocks|German stocks|Korean stocks|HK stocks|European stocks" + \
                 "China stocks|Hong Kong stocks|Asia Stocks|Global stocks|CANADA STOCKS|MIDEAST STOCKS" + \
                 "Swiss stocks|Latam stocks|Japanese stocks|Benelux stocks|S&P 500"   
df_polusa = df_polusa.loc[~df_polusa["url"].str.contains(location_stock)]

In [128]:
subject_mkt = "stock|bond|futures"
verb_mkt = "yield|rally|gain|extend loss|drop|slip|slide|fall|rise|climb|lower|volatile|edge higer"
df_polusa = df_polusa.loc[(~df_polusa["url"].str.contains(subject_mkt)) & 
                          (~df_polusa["url"].str.contains(verb_mkt))]

In [129]:
#df_polusa.loc[df_polusa["headline"].str.contains("stock")]["headline"]

## Subset data

In [130]:
df_polusa = df_polusa.iloc[-5000:, :]

## Add date to corpus embeddings

In [131]:
temp_list = df_polusa["date_publish"].to_list()

In [132]:
max(df_polusa["date_publish"])

'2019-08-31 23:47:08'

In [133]:
min_date = datetime.strptime(min(df_polusa["date_publish"]), '%Y-%m-%d %H:%M:%S')

In [134]:
def convert_to_days(date_str):
    temp_date = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    #days_since = (temp_date - datetime(1970,1,1)).total_seconds()
    days_since = (temp_date - min_date).total_seconds()
    return days_since

In [135]:
time_list = np.array([convert_to_days(art_date) for art_date in temp_list])

In [136]:
v = time_list
time_list_minmax = (v - v.min()) / (v.max() - v.min())

In [137]:
len(time_list)

5000

In [138]:
date_shaped = time_list_minmax.reshape(len(time_list_minmax), 1)

## Corpus Embeddings

In [139]:
embedder = SentenceTransformer('all-MiniLM-L6-v2') # 'distilbert-base-nli-mean-tokens'

#corpus = df_polusa.iloc[0:15000, :]["head_lead"].to_list()
corpus = df_polusa["head_lead"].to_list()

corpus_embeddings = embedder.encode(corpus, show_progress_bar=True)

# Normalize the embeddings to unit length
corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [140]:
corpus_embeddings_date  = np.hstack((corpus_embeddings, date_shaped))

## Clustering

#### HDBSCAN Clustering

In [1246]:
# import hdbscan

In [1269]:
# cluster_hdbscan = hdbscan.HDBSCAN(min_cluster_size=3,
#                           metric='manhattan',  min_samples=100,                    
#                           cluster_selection_method='eom').fit(corpus_embeddings_date)

In [1270]:
# df_polusa["cluster_hdbs"] = cluster_hdbscan.labels_

In [1271]:
# len(set(df_polusa["cluster_hdbs"]))

4

In [1265]:
# cluster_assignment = cluster_hdbscan.labels_

In [1272]:
# clustered_sentences = {}
# for sentence_id, cluster_id in enumerate(cluster_assignment):
#     if cluster_id not in clustered_sentences:
#         clustered_sentences[cluster_id] = []

#     clustered_sentences[cluster_id].append(corpus[sentence_id])

In [1273]:
# for i, cluster in clustered_sentences.items():
#     print("Cluster ", i)
#     for article in cluster:
#         print(article.split("###")[0])
#     print("")

#### Agglomerative Clustering

In [141]:
# Perform kmean clustering
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.25) # 1.5 #, affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings_date)
cluster_assignment = clustering_model.labels_

clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

In [142]:
for i, cluster in clustered_sentences.items():
    print("Cluster ", i)
    for article in cluster:
        print(article.split("###")[0])
    print("")

Cluster  185
Russia's Polymetal may consider special dividend payout: CFO
Finland's Solidium plans to cut number of holdings over time

Cluster  1366
CORRECTED-Chinese steel maker picks additional 24% stake in Global Switch
Chinese steel maker picks additional 24% stake in Global Switch ahead of IPO

Cluster  1336
UPDATE 1-Erdogan says Turkey wants to continue defence cooperation with Russia
Erdogan says Turkey wants to continue defense cooperation with Russia
Russia, Turkey agree steps to tackle militants in Syria's Idlib: Putin

Cluster  42
UPDATE 1-Zambian court suspends KCM wind-up hearing pending appeal ruling
U.S. appeals court revives aluminum antitrust cases vs Goldman, JPMorgan, Glencore
Appeals court: New Orleans judges have conflict on fines
CVS fights ex-exec's appellate bid to join Amazon's PillPack
11th Circuit revives would-be class action over credit union's overdraft fees

Cluster  1128
Argentina's Macri says inflation rising, central bank props up peso
Argentina sells

Russian opposition figure re-arrested upon release
China police detain instigator of RRR cut rumor: newspaper

Cluster  1322
Gregory Craig, Washington Lawyer on Trial, Says He Never Lied to Investigators
Ex-Obama lawyer Greg Craig takes stand, denies lying about Ukrainian work
‘I did not lie’: Democratic power lawyer Gregory Craig takes stand in foreign lobbying trial

Cluster  75
With Brexit Gambit, Boris Johnson Reveals a Ruthless Side
'Bumbling' Boris Johnson shows his ruthless streak with dramatic step closer to sealing Brexit
Johnson has wrongfooted opponents of no deal. An election surely looms
'His Dark Materials' Author Suggested Boris Should be Publicly Lynched
Why comparisons between Boris Johnson and Charles I aren’t just lazy rhetoric

Cluster  750
Isakson to Resign From Senate Citing Health Reasons
Georgia GOP Sen. Isakson to resign at end of year, amid battle with Parkinson’s
Senator Johnny Isakson to Retire at End of Year
Republican Sen. Johnny Isakson to retire in Decem

Let it burn: U.S. fights wildfires with fire, backed by Trump

Cluster  724
42 wild burros are dead in the Mojave Desert, and there's a $50,000 reward
$58,000 reward offered after more than 40 wild burros found shot dead in the Mojave Desert

Cluster  1206
What would the Earth be like without it?
What would the Earth be like without the Amazon rainforest?

Cluster  1068
Donald Trump Ridicules 'Three Stooges' Republican Primary Challengers
Factbox: Three Republicans, 20 Democrats vie for U.S. presidential nominations

Cluster  352
Iran's middle class feels the brunt of U.S. sanctions
Russia says new U.S. sanctions hurt prospects for bilateral ties: RIA
U.S. sanctions networks it says are connected to Iran's government, military
U.S. sanctions networks it says are connected to Iran's government, military
U.S. Imposes Sanctions on Wandering Iranian Oil Tanker
U.S. imposes sanctions on people, firms it says helped North Korea evade sanctions

Cluster  190
Finance Minister Scholz appoints c

Cluster  51
Immigrants, social media threats, Joe Kennedy, Stacey Abrams.
Ann Coulter: Could I Get That Illegal ‘To Go’?
DHS Releasing More Than 200 Illegal Aliens into the U.S. Every Day
Sex Offenders, Dangerous Criminals Apprehended Crossing into U.S.

Cluster  1315
Prosecutors: Kansas drug network tied to Chicago dismantled
Prosecutors: Kansas drug network tied to Chicago dismantled

Cluster  1212
Coalition to release religious discrimination bill as Labor urges greater scrutiny
Religious discrimination bill: Coalition accused of weakening state human rights law
What is the religious discrimination bill and what will it do?
Religious discrimination bill attacked as 'extraordinary foray in the culture wars'

Cluster  601
A summer day at the beach? For many Gazans, the conflict has put an end to that, too.
War and poverty drive Gazans to seek better life in Europe despite dangers
War and poverty drive Gazans to seek better life in Europe despite dangers

Cluster  111
Lundin Mining inc

Brexit Gives Its Former Colonies a Whiff of Schadenfreude
‘Loud, obsessive, tribal’: the radicalisation of remain - podcast

Cluster  234
'The misdeeds of a few': Nigeria speaks out over $46m fraud case
Sierra Leoneans sue government for alleged environmental failings at diamond mine
Explainer: Nigerian assets at risk worldwide in $9 billion arbitration case

Cluster  341
Pentagon wants Ukraine military aid to continue
Trump considering blocking military aid to Ukraine: CNN
Trump may block $250 million in aid to Ukraine: officials
Despite battle with Congress, Trump administration slow-walking $4 billion in aid, including key funds for Ukraine

Cluster  742
Sibanye-Stillwater confirms first-half loss as gold strike weighs
Sibanye-Stillwater in talks over AngloGold Ashanti's Mponeng mine

Cluster  524
Parents, it’s time to stop talking about tenacity and start talking about character
Here's What Kids Who Bully Often Have In Common

Cluster  850
California housing crisis traps seniors wi


Cluster  310
Former CIA officer calls James Comey's handling of Trump investigation 'horrifying'
Geraldo Rivera: James Comey 'attempted a coup' against Trump in true 'swamp' fashion
Obama-era officials Clapper and Brennan defend Comey in wake of IG report
Devin Nunes: Comey's media defenders sound like 'Baghdad Bob' after IG report

Cluster  1186
'Difference-maker' independent voters in U.S. presidential election crosshairs
Top Trump aide reportedly most scared of Kamala Harris: She's 'the least flawed' of 2020 Dems

Cluster  20
Simon Fiduciaria has registered stake to vote at Mediaset shareholder meeting: source
Vivendi trust presses to vote down Mediaset deal
Italy's court rules in favour of Vivendi in Mediaset restructuring row-sources
Italy's court rules in favor of Vivendi in Mediaset restructuring row: sources
Italy's court rules in favour of Vivendi in Mediaset restructuring row

Cluster  154
Johnson: 'movement under the keel' in Brexit talks - ITV reporter
EU leaders can see '

In [143]:
df_polusa["cluster"] = clustering_model.labels_

In [1005]:
df_cluster = df_polusa.loc[df_polusa["cluster"]==125]
df_cluster["headline"].values

array(['Water Scarcity Is A Growing Problem. We Can Use Satellites To Better Measure It.',
       'Are water shortages driving migration? Researchers dispel myths',
       "Water costs are rising across the U.S. - here's why",
       'More people, less water? Scientists see risks on upper Nile',
       'More people, less water? Scientists see risks on upper Nile'],
      dtype=object)

## c-TF-IDF

In [515]:
# docs_df = pd.DataFrame(data, columns=["Doc"])
# docs_df['Topic'] = cluster.labels_
df_polusa['doc_id'] = range(len(df_polusa))
docs_per_topic = df_polusa.groupby(['cluster'], as_index = False).agg({'head_lead': ' '.join})

In [516]:
docs_per_topic.head()

Unnamed: 0,cluster,head_lead
0,0,An Interview with former U.S. Ambassador to Ru...
1,1,Former Google engineer charged with trade secr...
2,2,Pew: Twitter Dominated by Young Affluent Democ...
3,3,How has Brexit vote affected the UK economy? A...
4,4,Family of slain Honduran activist appeal to US...


In [517]:
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

In [518]:
tf_idf, count = c_tf_idf(docs_per_topic["head_lead"].values, m=len(data))

In [519]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    words = count.get_feature_names()
    labels = list(docs_per_topic["cluster"])
    tf_idf_transposed = tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    return top_n_words

def extract_topic_sizes(df):
    topic_sizes = (df.groupby(['cluster'])
                     .head_lead
                     .count()
                     .reset_index()
                     .rename({"cluster": "cluster", "head_lead": "Size"}, axis='columns')
                     .sort_values("Size", ascending=False))
    return topic_sizes

In [520]:
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(df_polusa); topic_sizes.head(10)

Unnamed: 0,cluster,Size
261,261,22
61,61,20
141,141,19
927,927,19
203,203,18
610,610,17
119,119,17
500,500,17
200,200,16
195,195,16


## Filter out articles with not enough top words

In [962]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [963]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matthewlucich/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [999]:
ps = PorterStemmer()

def clean_headline_lead(text):
    art_words = re.sub(r'[^\w\s]', '', text)
    art_words = word_tokenize(art_words)
    art_words_lower = [x.lower() for x in art_words]
    art_words_no_stop = [word for word in art_words_lower if not word in stopwords.words()]
    art_words_lower_stem = [ps.stem(word) for word in art_words_no_stop]
    return art_words_lower_stem

In [993]:
df_cluster["top_words_article"] = df_cluster["head_lead"].apply(clean_headline_lead)

In [1000]:
df_cluster.iloc[0, 13]

['water',
 'scarciti',
 'grow',
 'problem',
 'use',
 'satellit',
 'better',
 'measur',
 'itth',
 'problem',
 'could',
 'affect',
 'much',
 '80',
 'peopl',
 'africa',
 '2050']

In [1001]:
df_cluster["num_top_words"] = df_cluster["head_lead"].apply(clean_headline_lead)

In [1002]:
len(set(df_polusa["cluster"]))

1420

In [1006]:
df_polusa.columns

Index(['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors',
       'domain', 'url', 'political_leaning', 'head_lead', 'cluster', 'doc_id'],
      dtype='object')

In [1022]:
df_labeled = pd.DataFrame(columns = ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors',
       'domain', 'url', 'political_leaning', 'head_lead', 'cluster', 'doc_id', 'cluster_terms', 'match_terms', 'num_match'])

for cluster in set(df_polusa["cluster"]):
    word_list_cluster = [word[0] for word in top_n_words[cluster][:15]]
    word_list_cluster_lower = [x.lower() for x in word_list_cluster]
    word_list_cluster_no_stop = [word for word in word_list_cluster_lower if not word in stopwords.words()]
    word_list_cluster_stem = [ps.stem(word) for word in word_list_cluster_no_stop]
    word_list_cluster_stem = word_list_cluster_stem[:10]
    df_clus = df_polusa.loc[df_polusa["cluster"]==cluster]
    df_clus["top_words_article"] = df_clus["head_lead"].apply(clean_headline_lead)
    #print("CLUSTER TOP WORDS:", str(cluster))
    #print(word_list_cluster_stem)
    for index, row in df_clus.iterrows():
        df_labeled = df_labeled.append({'id': row["id"], 'date_publish': row["date_publish"], 
                                      'outlet':row["outlet"], 'headline': row["headline"], 
                                      'lead': row["lead"], 'body': row["body"], 'authors': row["authors"],
                                      'domain': row["domain"], 'url': row["url"], 
                                      'political_leaning': row["political_leaning"], 
                                      'head_lead': row["head_lead"], 'cluster': row["cluster"], 
                                      'doc_id': row["doc_id"], 'cluster_terms': word_list_cluster_stem,
                                      'article_terms': row["top_words_article"],
                                      'match_terms': list(set(word_list_cluster_stem) & set(row["top_words_article"])),
                                      'num_match': len(list(set(word_list_cluster_stem) & set(row["top_words_article"])))},
                                       ignore_index = True)

In [1023]:
# Create new dataframe
# Append row if cluster is good

In [1024]:
df_labeled.tail()

Unnamed: 0,id,date_publish,outlet,headline,lead,body,authors,domain,url,political_leaning,head_lead,cluster,doc_id,cluster_terms,match_terms,num_match,article_terms
4995,38996999,2019-08-31 15:56:17,The Guardian,Javid defends settled status scheme in respons...,Campaigners point to rise in grants for pre-se...,Campaigners point to rise in grants for pre-se...,Nadeem Badshah;Marley Morris;Marie Le Conte,www.theguardian.com,https://www.theguardian.com/politics/2019/aug/...,LEFT,Javid defends settled status scheme in respons...,1418,4853,"[javid, fiscal, settl, statu, break, review, s...","[javid, statu, settl, grant]",4,"[javid, defend, settl, statu, scheme, respons,..."
4996,45658582,2019-08-30 08:35:11,The Washington Post,About 100 migrants jump fences into Spanish Af...,Authorities in Ceuta say that around 100 migra...,MADRID — Authorities in Ceuta say that around ...,Associated Press;August At Am,www.washingtonpost.com,https://www.washingtonpost.com/world/europe/ab...,UNDEFINED,About 100 migrants jump fences into Spanish Af...,1419,3663,"[enclav, migrant, spanish, 150, ceuta, fenc, a...","[spanish, african, cross, ceuta, enclav, fenc,...",7,"[100, migrant, jump, fenc, spanish, african, e..."
4997,2824670,2019-08-30 09:21:48,Reuters,Over 150 migrants storm through Spain's enclav...,More than 150 migrants crossed the border fenc...,A migrant leaves the Spanish military ship 'Au...,Reuters Editorial;Min Read,www.reuters.com,https://www.reuters.com/article/us-europe-migr...,CENTER,Over 150 migrants storm through Spain's enclav...,1419,3696,"[enclav, migrant, spanish, 150, ceuta, fenc, a...","[spanish, spain, 150, african, ceuta, cross, e...",9,"[150, migrant, storm, spain, enclav, fencemor,..."
4998,4352555,2019-08-30 16:16:19,Breitbart,"150+ African Migrants Storm Border Fence, Ente...",At least 155 migrants from Sub-Saharan Africa ...,MADRID (AP) – At least 155 migrants from Sub-S...,Breitbart London,www.breitbart.com,https://www.breitbart.com/europe/2019/08/30/15...,RIGHT,"150+ African Migrants Storm Border Fence, Ente...",1419,4147,"[enclav, migrant, spanish, 150, ceuta, fenc, a...","[spanish, 150, african, ceuta, enclav, fenc, m...",7,"[150, african, migrant, storm, border, fenc, e..."
4999,1156862,2019-08-30 16:40:55,Reuters,Over 150 migrants break into Spanish North Afr...,More than 150 migrants crossed the border fenc...,"SAN ROQUE, Spain (Reuters) - More than 150 mig...",Jon Nazca;Min Read,www.reuters.com,https://www.reuters.com/article/us-europe-migr...,CENTER,Over 150 migrants break into Spanish North Afr...,1419,4173,"[enclav, migrant, spanish, 150, ceuta, fenc, a...","[spanish, spain, 150, african, ceuta, cross, e...",9,"[150, migrant, break, spanish, north, african,..."


In [1028]:
df_labeled.shape

(5000, 17)

In [1032]:
df_labeled = df_labeled.loc[df_labeled["num_match"]>=4]

## Filter out low quality clusters (e.g. less than 3 articles, one source)

In [1036]:
low_quality_clusters = []
for cluster in set(df_labeled["cluster"]):
    df_clust = df_labeled.loc[df_labeled["cluster"]==cluster]
    if df_clust.shape[0] < 3:
        low_quality_clusters.append(cluster)
    elif len(set(df_clust["outlet"])) < 2:
        low_quality_clusters.append(cluster)

In [1038]:
len(low_quality_clusters)

968

In [1041]:
df_good = df_labeled.loc[~df_labeled["cluster"].isin(low_quality_clusters)]

In [1043]:
set(df_good["cluster"])

{1,
 3,
 5,
 12,
 13,
 17,
 18,
 22,
 26,
 28,
 30,
 33,
 34,
 35,
 36,
 40,
 42,
 47,
 49,
 50,
 53,
 56,
 57,
 59,
 61,
 66,
 68,
 69,
 70,
 75,
 77,
 78,
 84,
 86,
 90,
 92,
 97,
 101,
 102,
 104,
 113,
 114,
 117,
 119,
 124,
 125,
 135,
 136,
 139,
 141,
 142,
 143,
 150,
 151,
 157,
 161,
 165,
 168,
 170,
 183,
 186,
 188,
 193,
 194,
 195,
 199,
 203,
 204,
 215,
 216,
 218,
 230,
 235,
 239,
 244,
 246,
 252,
 256,
 257,
 260,
 261,
 266,
 267,
 268,
 270,
 277,
 278,
 279,
 283,
 284,
 285,
 291,
 292,
 293,
 295,
 305,
 306,
 307,
 309,
 313,
 314,
 320,
 324,
 329,
 331,
 333,
 341,
 347,
 349,
 350,
 352,
 355,
 356,
 358,
 359,
 361,
 367,
 368,
 369,
 377,
 379,
 387,
 388,
 390,
 391,
 392,
 393,
 394,
 395,
 396,
 402,
 404,
 407,
 410,
 411,
 413,
 417,
 418,
 420,
 422,
 423,
 425,
 427,
 431,
 437,
 440,
 441,
 449,
 456,
 457,
 458,
 459,
 465,
 468,
 470,
 471,
 473,
 474,
 475,
 476,
 482,
 485,
 489,
 500,
 501,
 504,
 507,
 510,
 512,
 514,
 515,
 522,
 525,
 5

In [1120]:
num_clust = 18
df_qa = df_good.loc[df_good["cluster"]==num_clust]
df_qa["headline"].values

array(["Violence Follows Pro-Independence Protests In Indonesia's Papua Region",
       "Explainer: Deepening unrest in Indonesia's Papua",
       'Protesters burn local assembly building in Papua protest',
       "Violent protest erupts in capital of Indonesia's Papua",
       "Violent protest erupts in capital of Indonesia's Papua",
       'Indonesia urges calm in Papua after two weeks of protests'],
      dtype=object)

In [1127]:
df_good.loc[df_good["cluster"]==num_clust]

Unnamed: 0,id,date_publish,outlet,headline,lead,body,authors,domain,url,political_leaning,head_lead,cluster,doc_id,cluster_terms,match_terms,num_match,article_terms
99,52874105,2019-08-28 00:00:00,NPR,Violence Follows Pro-Independence Protests In ...,Tensions between pro-independence supporters a...,Violence Follows Pro-Independence Protests In ...,Ashley Westerman,www.npr.org,https://www.npr.org/2019/08/28/754276641/viole...,LEFT,Violence Follows Pro-Independence Protests In ...,18,503,"[papua, indonesia, easternmost, region, capit,...","[papua, provinc, indonesia, protest]",4,"[violenc, follow, proindepend, protest, indone..."
100,3407626,2019-08-29 05:39:28,Reuters,Explainer: Deepening unrest in Indonesia's Papua,Indonesia's easternmost provinces of Papua and...,(Reuters) - Indonesia’s easternmost provinces ...,Tom Allard;Min Read,www.reuters.com,https://www.reuters.com/article/us-indonesia-p...,CENTER,Explainer: Deepening unrest in Indonesia's Pap...,18,2120,"[papua, indonesia, easternmost, region, capit,...","[easternmost, papua, provinc, indonesia]",4,"[explain, deepen, unrest, indonesia, papuaindo..."
101,45838358,2019-08-29 09:53:32,The Washington Post,Protesters burn local assembly building in Pap...,Police say protesters in Indonesia’s restive P...,A Papuan student with her face painted with th...,Associated Press;August At Am,www.washingtonpost.com,https://www.washingtonpost.com/world/asia_paci...,UNDEFINED,Protesters burn local assembly building in Pap...,18,2305,"[papua, indonesia, easternmost, region, capit,...","[papua, provinc, indonesia, protest]",4,"[protest, burn, local, assembl, build, papua, ..."
102,2598373,2019-08-29 11:36:46,Reuters,Violent protest erupts in capital of Indonesia...,Protesters in Indonesia's easternmost region o...,JAKARTA (Reuters) - Protesters in Indonesia’s ...,Reuters Editorial;Min Read,www.reuters.com,https://www.reuters.com/article/us-indonesia-p...,CENTER,Violent protest erupts in capital of Indonesia...,18,2418,"[papua, indonesia, easternmost, region, capit,...","[indonesia, region, protest, easternmost, viol...",9,"[violent, protest, erupt, capit, indonesia, pa..."
103,1191947,2019-08-29 12:41:22,Reuters,Violent protest erupts in capital of Indonesia...,Protesters in Indonesia's easternmost region o...,JAKARTA (Reuters) - Protesters in Indonesia’s ...,Reuters Editorial;Min Read,www.reuters.com,https://www.reuters.com/article/us-indonesia-p...,CENTER,Violent protest erupts in capital of Indonesia...,18,2503,"[papua, indonesia, easternmost, region, capit,...","[indonesia, region, protest, easternmost, viol...",9,"[violent, protest, erupt, capit, indonesia, pa..."
104,3313117,2019-08-30 11:23:59,Reuters,Indonesia urges calm in Papua after two weeks ...,Indonesia's chief security minister on Friday ...,JAKARTA (Reuters) - Indonesia’s chief security...,Reuters Editorial;Min Read,www.reuters.com,https://www.reuters.com/article/us-indonesia-p...,CENTER,Indonesia urges calm in Papua after two weeks ...,18,3830,"[papua, indonesia, easternmost, region, capit,...","[indonesia, region, protest, easternmost, viol...",6,"[indonesia, urg, calm, papua, two, week, prote..."


## Highlight Event Selection

In [1136]:
import statistics

In [1178]:
df_event_selection = pd.DataFrame(columns = ["cluster", "median_date", "covered", "not_covered", "cluster_terms"])
all_outlets = set(df_polusa["outlet"])
for cluster in set(df_good["cluster"]):
    df_clust = df_good.loc[df_good["cluster"]==cluster]
    clust_outlets = set(df_clust["outlet"])
    did_not_cover = all_outlets - clust_outlets
    date_list = pd.to_datetime(df_clust["date_publish"], infer_datetime_format=True)
    median_date = pd.Timestamp.fromordinal(int(date_list.apply(lambda x: x.toordinal()).median()))
    df_event_selection = df_event_selection.append({'cluster': cluster, 'median_date': median_date, 
                                       'covered': clust_outlets, 'not_covered': did_not_cover, 
                                        'cluster_terms': df_clust["cluster_terms"].iloc[0]},
                                       ignore_index = True)

In [1207]:
df_event_selection.tail(10)

Unnamed: 0,cluster,median_date,covered,not_covered,cluster_terms
435,1364,2019-08-28,"{The Guardian, Reuters, The Washington Post}","{PBS, The Daily Caller, NBC News, Politico, AB...","[davidson, leader, conserv, quit, ruth, scotla..."
436,1367,2019-08-31,"{Reuters, NBC News, Yahoo! News, The Washingto...","{PBS, The Daily Caller, Politico, ABC News, BB...","[sirhan, prison, robert, stab, kennedi, assass..."
437,1376,2019-08-27,"{Chicago Tribune, Reuters, NBC News, Yahoo! Ne...","{CNN, PBS, The Daily Caller, Politico, Nationa...","[missouri, judg, abort, law, feder, ban, block..."
438,1377,2019-08-29,"{CBS News, NBC News}","{PBS, The Daily Caller, Politico, ABC News, BB...","[children, polici, ill, die, deport, sick, tru..."
439,1378,2019-08-29,"{USA Today, Yahoo! News, The Washington Post}","{PBS, The Daily Caller, NBC News, Politico, AB...","[minnesota, omar, threat, fair, hate, democrat..."
440,1386,2019-08-27,"{CNN, Yahoo! News, The Guardian, Fox News, The...","{PBS, Chicago Tribune, Reuters, The Daily Call...","[harvard, friend, freshman, entri, palestinian..."
441,1389,2019-08-30,"{CNN, NBC News}","{PBS, The Daily Caller, Politico, ABC News, BB...","[suicid, philadelphia, weapon, home, arsen, gr..."
442,1391,2019-08-28,"{Breitbart, Yahoo! News, Reuters}","{PBS, The Daily Caller, NBC News, Politico, AB...","[facebook, quit, research, democraci, impact, ..."
443,1402,2019-08-30,"{Reuters, The Washington Post}","{PBS, The Daily Caller, NBC News, Politico, AB...","[swap, prison, ukrain, russia, ukrainian, film..."
444,1419,2019-08-30,"{Breitbart, Reuters, The Washington Post}","{PBS, The Daily Caller, NBC News, Politico, AB...","[enclav, migrant, spanish, 150, ceuta, fenc, a..."


In [1240]:
regex_str = "Ukraine|prison|swap"
outlet = 'Reuters'
df_polusa.loc[(df_polusa["outlet"]==outlet)
             & (df_polusa["head_lead"].str.contains(regex_str))]["headline"].values

array(["Deputy head of presidential office may serve as Ukraine's next premier",
       'At least one killed in gas explosion in Ukraine',
       'Ukraine frees jailed Russian journalist Vyshinsky on bail: TASS',
       'Ukraine frees jailed Russian journalist amid prisoner swap talks',
       'Trump adviser Bolton, in Ukraine, warns of Chinese influence',
       'Trump adviser Bolton, in Ukraine, warns of Chinese influence',
       'Trump adviser Bolton tells Ukraine: Beware Chinese influence',
       "China's yuan struggles to find floor as tariffs loom",
       "South Korea's top court orders review of ex-president Park's graft case",
       'Myanmar jails filmmaker for Facebook posts critical of military',
       'Jailed Ukrainian filmmaker brought to Moscow amid prisoner swap talks: Russian media',
       'Ukraine President nominates new foreign, defense ministers',
       'Ukraine president proposes to appoint Honcharuk as prime minister',
       'Ukraine president proposes polit

In [1239]:
df_polusa_raw.loc[(df_polusa_raw["outlet"]==outlet)
             & (df_polusa_raw["headline"].str.contains(regex_str))]["headline"].values

array(['Judge in census case denies DOJ motion to swap out lawyers',
       'Patronis v. Rubin heats up — Florida Democrats remain cautious on impeachment talk — Bad ballot design cost Nelson — Are prisons ready for hurricane season?',
       'Prosecutors back Pell Grants for prisoners',
       'A gun control swap for immigration reform?',
       'U.S. prisons chief removed after Epstein’s death',
       'Trump fuels outsize expectations that Russia investigators will face prison time',
       'Rick Gates says he told Vin Weber and Tony Podesta that Ukraine controlled think tank',
       'Pentagon wants Ukraine military aid to continue',
       'Vin Weber resigns from lobbying firm after scrutiny over Ukraine work with Manafort'],
      dtype=object)

In [1237]:
#df_polusa_raw.loc[df_polusa_raw["id"]==131771587]

In [1230]:
df_good.loc[df_good["cluster"]==1402]["headline"].values

array(['Jailed Ukrainian filmmaker brought to Moscow amid prisoner swap talks: Russian media',
       'Ukraine official says Ukraine, Russia swapped prisoners, including sailors',
       "Ukraine president's office says no prisoner swap with Russia yet",
       'Ukraine says major prisoner exchange with Russia underway',
       "Ukraine president's office says no prisoner swap with Russia yet",
       'Ukraine denies reports of prisoner swap with Russia'],
      dtype=object)

In [1195]:
#set(df_polusa["outlet"])

In [1196]:
#set(df_qa["outlet"])

In [1197]:
#set(df_polusa["outlet"]) - set(df_qa["outlet"])

In [1198]:
# df_polusa.loc[(df_polusa["outlet"]=='The New York Times')
#              & (df_polusa["head_lead"].str.contains("Citizenship"))]["headline"].values

In [1132]:
# df_polusa_raw.loc[(df_polusa_raw["outlet"]=='The New York Times')
#              & (df_polusa_raw["headline"].str.contains("Uber"))]["headline"].values

In [1079]:
# df_polusa.loc[(df_polusa["outlet"]=='The New York Times') & 
#               (df_polusa["date_publish"] > min(df_polusa["date_publish"])) & (df_polusa["date_publish"] < max(df_polusa["date_publish"]))
#              ]["headline"].values

In [1241]:
#df_event_selection.to_csv("event_selection_1.csv")

In [1242]:
#df_good.to_csv("good_clusters_1.csv")