# Text Mining -  Group Project

---

# Preparing the Data

In [2]:
import random
import re
import string
import timeit
from collections import Counter
from itertools import chain

import gensim
import hdbscan
import nltk
import numpy as np
import pandas as pd
import umap
from nltk.tokenize import regexp_tokenize

## Loading the Data

### Reddit News

In [3]:
# Read the Reddit world-news export; bytes that are not valid UTF-8 are dropped instead of raising.
reddit_path = "data/reddit_worldnews.csv"
reddit_file = pd.read_csv(
    reddit_path,
    encoding="utf-8",
    encoding_errors="ignore",
)
print("reddit_file_clmns:", reddit_file.columns)

# Only the post title is needed; it is stored under the column name shared by both datasets.
reddit = reddit_file["title"].to_frame(name="Headline")
reddit.head()

reddit_file_clmns: Index(['time_created', 'date_created', 'up_votes', 'down_votes', 'title',
       'over_18', 'author', 'subreddit'],
      dtype='object')


Unnamed: 0,Headline
0,Scores killed in Pakistan clashes
1,Japan resumes refuelling mission
2,US presses Egypt on Gaza border
3,Jump-start economy: Give health care to all
4,Council of Europe bashes EU&UN terror blacklist


### Ireland News

In [187]:
# Read the Ireland headlines export; bytes that are not valid UTF-8 are dropped instead of raising.
ireland_path = "data/ireland-news-headlines.csv"
ireland_file = pd.read_csv(
    ireland_path,
    encoding="utf-8",
    encoding_errors="ignore",
)
print("ireland_file_clmns:", ireland_file.columns)

# publish_date is compared as an integer (looks like YYYYMMDD); both bounds are inclusive.
# NOTE(review): the window presumably mirrors the period covered by the Reddit data — confirm.
in_date_window = ireland_file["publish_date"].between(20080125, 20161122)
ireland_filtered_date = ireland_file[in_date_window]

# Only the headline text is needed; it is stored under the column name shared by both datasets.
ireland = ireland_filtered_date["headline_text"].to_frame(name="Headline")
ireland.head()

ireland_file_clmns: Index(['publish_date', 'headline_category', 'headline_text'], dtype='object')


Unnamed: 0,Headline
747865,Egypt moves to close Gaza border breach
747866,Almost two-thirds of voters undecided on EU tr...
747867,Jacob Fruitfield factory to lay off 220 staff
747868,Bono says rich world failing anti-poverty camp...
747869,Government; NRA announce €1.68bn roads plan


## Preprocessing

In [5]:
# Pre-processing
# Verbose regular expression used for tokenization; the alternatives are tried in order.
pattern = r'''(?x)
(?:[A-Z]\.)+            # abbreviations made of capital letters, e.g. U.S.A.
|\w+(?:-\w+)*           # words, optionally with internal hyphens
|\$?\d+(?:\.\d+)?%?     # numbers, optionally with a currency sign, decimals or a percent sign
|\.\.\.                 # ellipsis
|[][.,;"\'?():_`-]      # single punctuation marks (the hyphen is last so it is literal, not a range)
'''

# Lemmatizer used by preprocessing(); created once here and shared by every call.
# lemmatize() is called there with a single argument, i.e. with its default part of speech.
lemmatizer = nltk.WordNetLemmatizer()

def preprocessing(df):
    """Tokenize, filter and lemmatize the headlines of a dataframe.

    Each headline is stripped, lower-cased and tokenized with the module-level
    ``pattern``. Stopwords, punctuation, tokens that are not purely alphabetic
    and tokens of one or two characters are dropped; the remaining tokens are
    lemmatized with the module-level ``lemmatizer``.

    Input: dataframe with a "Headline" column of raw headline strings
    Output: new dataframe, indexed 0..n-1, with a single "Headline" column that
            holds one list of lemmas per input row. The list is empty when no
            token survives the filters or when the headline is missing.
    """
    # Sets give constant-time membership tests; the lookup runs once per token.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    punct = set(string.punctuation)

    lemmatized_lines = []
    for headln in df["Headline"].to_list():
        # A missing headline (None or float NaN) is treated as empty text;
        # str() would otherwise turn it into the literal token "none"/"nan".
        if headln is None or (isinstance(headln, float) and headln != headln):
            lemmatized_lines.append([])
            continue

        line = str(headln).strip().lower()
        tokens = regexp_tokenize(line, pattern)
        lemmatized_lines.append([
            lemmatizer.lemmatize(tok)
            for tok in tokens
            if tok not in stopwords and tok not in punct and tok.isalpha() and len(tok) > 2
        ])

    # Build the result in one go; growing a frame with append is slow and
    # DataFrame.append no longer exists in current pandas.
    return pd.DataFrame({"Headline": lemmatized_lines})

### Reddit News

In [170]:
# Preprocess the Reddit headlines and report how many seconds the run took.
start = timeit.default_timer()
reddit_pp = preprocessing(reddit)
t = timeit.default_timer() - start
print(t)
reddit_pp

Unnamed: 0,Headline
0,"[score, killed, pakistan, clash]"
1,"[japan, resume, refuelling, mission]"
2,"[press, egypt, gaza, border]"
3,"[economy, give, health, care]"
4,"[council, europe, bash, terror, blacklist]"
...,...
509231,"[heil, trump, donald, trump, white, nationalis..."
509232,"[people, speculating, could, madeleine, mccann]"
509233,"[professor, receives, arab, researcher, award]"
509234,"[nigel, farage, attack, response, trump, ambas..."


### Ireland News

In [189]:
# Apply the same preprocessing to the date-filtered Ireland headlines
ireland_pp = preprocessing(ireland)
ireland_pp

Unnamed: 0,Headline
0,"[egypt, move, close, gaza, border, breach]"
1,"[almost, voter, undecided, treaty]"
2,"[jacob, fruitfield, factory, lay, staff]"
3,"[bono, say, rich, world, failing, campaign]"
4,"[government, nra, announce, road, plan]"
...,...
598832,"[vincent, kompany, suffers, injury, eight, yea..."
598833,"[baby, needed, resuscitation, birth, inquiry, ..."
598834,"[spur, whimper, bright, light, monaco]"
598835,"[leicester, continue, contrasting, form, toppi..."


# Sentence Embeddings

We create sentence embeddings by averaging the pre-trained word embeddings from the GoogleNews vectors. These embeddings can be found at https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g. Download the dataset to the directory ```TMproject/data```.

In [8]:
# Import the GoogleNews embeddings
# (binary word2vec file; the resulting `model` is the global that doc_embeddings reads)
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [9]:
def mean_vector(word2vec_model, doc):
    """Average the word vectors of one document/headline.

    Words missing from ``word2vec_model`` are ignored. Returns the element-wise
    mean of the remaining word vectors, or an empty list when none of the words
    is in the model's vocabulary.
    """
    in_vocabulary = [token for token in doc if token in word2vec_model]

    # Nothing to average: signal it with an empty list so callers can test len()
    if len(in_vocabulary) < 1:
        return []

    # Indexing the model with a list yields one vector per word
    word_vectors = word2vec_model[in_vocabulary]
    return sum(word_vectors)/len(in_vocabulary)

def doc_embeddings(dataset, word2vec_model=None):
    """Calculate the mean vector for all documents in a dataset.

    Parameters
    ----------
    dataset : dataframe with a "Headline" column of token lists.
    word2vec_model : optional word-vector model passed on to ``mean_vector``.
        Defaults to the notebook-level ``model``.

    Returns
    -------
    Dataframe indexed 0..k-1 with one row per document that has at least one
    in-vocabulary word. Column 0 holds the document (its token list) and
    columns 1..d hold its d-dimensional mean vector. Documents without any
    in-vocabulary word are skipped; if none remain an empty dataframe is returned.
    """
    if word2vec_model is None:
        word2vec_model = model

    docs = []
    vectors = []
    for doc in dataset["Headline"]:
        vec = mean_vector(word2vec_model, doc)
        # mean_vector returns an empty sequence when no word is in the vocabulary
        if len(vec) > 0:
            docs.append(doc)
            vectors.append(vec)

    if not docs:
        return pd.DataFrame()

    # Assemble the frame once: appending row by row is quadratic, and
    # Series.append / DataFrame.append no longer exist in current pandas.
    embeddings_df = pd.DataFrame(np.vstack(vectors))
    embeddings_df.columns = range(1, embeddings_df.shape[1] + 1)
    # Column 0 carries the token list; downstream code selects the vector with .loc[:, 1:]
    embeddings_df.insert(0, 0, pd.Series(docs, dtype=object))

    return embeddings_df

Using the above defined functions we created headline embeddings for each headline in the two datasets.

In [10]:
# Only the first 10,000 preprocessed headlines are embedded here;
# use all the data when finalizing the process.
# Embedding the Reddit headlines
reddit_embed = doc_embeddings(reddit_pp[:10000])
reddit_embed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,"[score, killed, pakistan, clash]",-0.016129,0.087769,0.174744,0.032501,0.062805,0.018555,0.054245,-0.231934,-0.019043,...,0.011230,-0.065674,-0.117554,0.085938,-0.078674,0.006287,-0.067261,-0.108887,0.037109,0.077209
1,"[japan, resume, refuelling, mission]",-0.159668,0.098999,0.081258,0.101440,-0.027832,0.044149,-0.012207,-0.201497,0.269694,...,0.005534,0.073608,-0.049805,0.068034,-0.031067,-0.114746,-0.174072,0.117676,0.026042,-0.007975
2,"[press, egypt, gaza, border]",-0.023438,0.138458,-0.059265,0.034424,-0.024048,-0.052673,-0.092682,-0.093994,0.023010,...,0.017792,0.044189,-0.009369,-0.029266,0.012695,-0.024078,-0.188660,-0.022461,0.017151,0.098053
3,"[economy, give, health, care]",-0.005249,0.042480,0.000671,0.056671,0.016022,-0.035461,0.098572,-0.119873,0.085754,...,-0.073151,0.017151,0.017548,-0.055221,-0.057190,0.163560,-0.052979,0.052917,0.126587,-0.110962
4,"[council, europe, bash, terror, blacklist]",-0.027258,0.040814,0.126221,0.196533,-0.120996,-0.073096,-0.180078,-0.112744,0.060791,...,0.098340,0.071582,0.000366,0.184985,-0.114307,-0.017651,-0.054053,-0.059961,0.040894,-0.090448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9980,"[president, musharraf, boasted, country, earne...",-0.003353,0.029210,0.000122,0.071635,0.023311,-0.035470,-0.050380,-0.066982,0.129551,...,-0.059130,0.033483,-0.121948,-0.010001,-0.014195,-0.015163,0.002241,-0.087930,0.078821,0.008942
9981,"[china, president, jintao, answer, question, o...",-0.019923,-0.001953,-0.037343,0.116455,-0.068034,0.055949,0.027425,-0.093740,0.148438,...,-0.011963,0.024333,-0.017700,-0.014160,0.042623,-0.095327,0.048096,-0.042562,-0.012482,-0.043264
9982,"[submit, positive, news, roadside, bomb, decli...",0.034933,0.080406,0.004280,0.007933,-0.083014,-0.092378,-0.071119,-0.103663,0.121845,...,-0.111990,-0.019152,-0.035244,0.056843,-0.060734,0.013799,-0.079612,-0.001825,-0.006447,-0.034307
9983,"[ceasefire, broken, palestinian, time, mortar,...",-0.001872,0.037323,-0.045603,0.161987,0.120870,-0.101603,-0.006897,-0.115102,0.110235,...,-0.055517,-0.011678,-0.056234,-0.159831,0.039673,0.132039,-0.146637,-0.031263,0.049835,-0.061503


In [190]:
# Embedding the Ireland headlines (first 10,000 preprocessed headlines only, as for Reddit)
ireland_embed = doc_embeddings(ireland_pp[:10000])
ireland_embed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,"[egypt, move, close, gaza, border, breach]",-0.038417,0.058207,-0.004354,0.068441,-0.083333,-0.014771,-0.117472,-0.073893,0.112671,...,0.008728,0.131022,-0.095276,-0.015930,-0.051456,0.086187,-0.094309,-0.018880,-0.006632,-0.001099
1,"[almost, voter, undecided, treaty]",0.285034,0.014404,0.158081,0.135986,-0.093658,-0.060181,0.168396,-0.071533,0.186218,...,-0.027222,-0.081997,0.080460,-0.266479,0.010132,-0.087280,0.035980,0.070435,0.069702,0.003939
2,"[jacob, fruitfield, factory, lay, staff]",0.031555,-0.072874,0.132446,0.113281,0.062256,0.009979,-0.059769,-0.051270,0.069962,...,-0.145996,0.044922,-0.076172,0.151855,-0.114258,-0.128418,-0.024658,-0.011597,-0.008011,-0.029602
3,"[bono, say, rich, world, failing, campaign]",0.104574,0.018433,0.097127,0.042318,-0.053141,0.034871,0.052979,-0.130880,0.016996,...,-0.003866,-0.061656,-0.110779,0.008993,-0.029704,-0.027995,0.005948,-0.033732,0.038595,0.034668
4,"[government, nra, announce, road, plan]",-0.062500,0.051147,0.052444,0.009888,0.028900,-0.152466,-0.021606,-0.046082,0.153847,...,-0.103516,0.014824,-0.034119,-0.089368,-0.026458,-0.048523,-0.080322,0.008057,0.067474,-0.147369
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9919,"[many, case, high, court, say, judge]",0.023722,0.042114,-0.013835,0.096008,0.085836,0.009277,0.046036,-0.032580,0.134277,...,0.007935,-0.041748,-0.021769,-0.008626,-0.003713,0.034098,0.045205,-0.042562,0.070262,0.033671
9920,"[scientist, eye, way, identify, cow]",0.101562,0.010663,0.023682,0.055762,-0.164941,0.165332,0.041071,-0.076099,0.066644,...,0.022754,0.053894,-0.070990,0.041553,0.005249,0.087744,0.088074,0.038843,0.030566,-0.027686
9921,"[gardaí, investigate, death, man, ejection, pu...",0.031361,-0.040815,0.095276,-0.002999,-0.085746,-0.042376,-0.083112,-0.196629,0.248413,...,0.101789,0.026332,-0.123317,0.074637,-0.081526,-0.017718,0.044936,-0.168841,-0.001134,0.047023
9922,"[english, girl, found, alive, day, going, miss...",0.044782,-0.000303,0.038993,0.005824,-0.055821,0.018747,0.059444,-0.109720,0.093366,...,-0.036482,-0.008527,-0.113142,0.024920,-0.022984,-0.028791,-0.070033,0.000353,0.069624,-0.025356


---

# Clustering

The headline embeddings have 300 dimensions, so as part of clustering the headlines we also reduce their dimensionality. We use the UMAP algorithm together with the HDBSCAN clustering method to form hierarchical density-based clusters from the headline embeddings. 

The functions defined below, used to generate and optimize the clustering, were written by David Borrelli and can be found at [Clustering sentence embeddings to identify intents in short text](https://towardsdatascience.com/clustering-sentence-embeddings-to-identify-intents-in-short-text-48d22d3bf02e).

In [11]:
def generate_clusters(sentence_embeddings,
                      n_neighbors,
                      n_components,
                      min_cluster_size,
                      random_state = None):
    """
    Reduce the embeddings with UMAP (cosine metric) and cluster the reduced
    points with HDBSCAN (euclidean metric, 'eom' cluster selection).

    sentence_embeddings is a dataframe whose columns labelled 1 and upwards
    hold the embedding values. Returns the fitted HDBSCAN object.
    """
    # Changed from the original function to fit the format of our data:
    # only the columns labelled 1.. are passed on as features.
    features = sentence_embeddings.loc[:, 1:].to_numpy()

    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        n_components=n_components,
                        metric='cosine',
                        random_state=random_state)
    reduced = reducer.fit_transform(features)

    clusterer = hdbscan.HDBSCAN(min_cluster_size = min_cluster_size,
                                metric='euclidean',
                                cluster_selection_method='eom')
    return clusterer.fit(reduced)


def score_clusters(clusters, prob_threshold = 0.05):
    """
    Summarise a fitted clustering as (label_count, cost).

    label_count is the number of distinct values in clusters.labels_.
    cost is the fraction of points whose entry in clusters.probabilities_
    lies below prob_threshold.
    """
    labels = clusters.labels_
    n_points = len(labels)
    n_labels = len(np.unique(labels))

    # Points with a membership probability under the threshold count against the clustering
    n_low_confidence = np.count_nonzero(clusters.probabilities_ < prob_threshold)

    return n_labels, n_low_confidence / n_points


def random_search(embeddings, space, num_evals):
    """
    Randomly search the hyperparameter space a limited number of times
    and return a summary of the results.

    Parameters
    ----------
    embeddings : dataframe of sentence embeddings, passed to generate_clusters.
    space : dict with the candidate values under 'n_neighbors', 'n_components'
        and 'min_cluster_size'. An optional 'random_state' entry (default 42)
        seeds both the sampling of configurations and UMAP.
    num_evals : number of configurations to sample and evaluate. The same
        configuration may be drawn more than once.

    Returns
    -------
    Dataframe with one row per run (run_id, the sampled parameters,
    label_count and cost), sorted by ascending cost.
    """
    random_state = space.get('random_state', 42)
    # A private generator keeps the sampled configurations repeatable
    # without depending on, or disturbing, the global random state.
    sampler = random.Random(random_state)

    results = []

    for i in range(num_evals):
        n_neighbors = sampler.choice(space['n_neighbors'])
        n_components = sampler.choice(space['n_components'])
        min_cluster_size = sampler.choice(space['min_cluster_size'])

        clusters = generate_clusters(embeddings,
                                     n_neighbors = n_neighbors,
                                     n_components = n_components,
                                     min_cluster_size = min_cluster_size,
                                     random_state = random_state)

        label_count, cost = score_clusters(clusters, prob_threshold = 0.05)

        results.append([i, n_neighbors, n_components, min_cluster_size,
                        label_count, cost])

    result_df = pd.DataFrame(results, columns=['run_id', 'n_neighbors', 'n_components',
                                               'min_cluster_size', 'label_count', 'cost'])

    return result_df.sort_values(by='cost')

We ran a random parameter search to optimize our clusters for each dataset. We looked for a clustering with a low cost (the share of headlines assigned to a cluster with low confidence) and, at the same time, no more than 150 clusters. The rationale for this limit is discussed in the report.

In [172]:
# Defining the search space for the random search
# Each range lists the candidate values random_search samples from for that parameter.
# NOTE(review): "random_state" is a single value, not a sampled range — confirm that
# random_search actually reads it from this dict.
space = {"n_neighbors": range(5,15),
        "n_components": range(2,7),
        "min_cluster_size": range(2,15),
        "random_state": 42}

In [174]:
# Running the random parameter search on the Reddit news
reddit_param_search = random_search(embeddings=reddit_embed, space=space, num_evals= 100)
reddit_param_search

Unnamed: 0,run_id,n_neighbors,n_components,min_cluster_size,label_count,cost
76,76,11,2,2,1437,0.239359
11,11,7,3,2,1262,0.286630
97,97,14,2,3,728,0.302854
45,45,9,5,2,1177,0.311267
1,1,5,2,5,407,0.316074
...,...,...,...,...,...,...
78,78,13,5,11,108,0.545418
31,31,13,6,11,110,0.546620
48,48,14,4,14,81,0.547021
71,71,13,3,12,108,0.547621


In [191]:
# Running the random parameter search on the Ireland news (100 sampled configurations)
ireland_param_search = random_search(embeddings=ireland_embed, space=space, num_evals= 100)
ireland_param_search

Unnamed: 0,run_id,n_neighbors,n_components,min_cluster_size,label_count,cost
55,55,5,5,2,1302,0.219367
0,0,7,4,2,1209,0.246574
98,98,6,2,3,827,0.248690
65,65,7,4,2,1195,0.250202
36,36,8,6,2,1179,0.252519
...,...,...,...,...,...,...
38,38,14,3,13,122,0.453245
28,28,14,4,12,139,0.454454
29,29,13,6,11,158,0.454857
71,71,10,4,14,113,0.470576


In [54]:
# Generate the optimized cluster model for the Reddit data.
# UMAP is stochastic: the seed is fixed (same value as in random_search) so that
# the cluster labels interpreted below can be reproduced on a re-run.
reddit_clusters = generate_clusters(sentence_embeddings=reddit_embed,
                                    n_neighbors=9,
                                    n_components=4,
                                    min_cluster_size=14,
                                    random_state=42)

In [67]:
# Generate the optimized cluster model for the Ireland data.
# UMAP is stochastic: the seed is fixed (same value as in random_search) so that
# the cluster labels interpreted below can be reproduced on a re-run.
ireland_clusters = generate_clusters(sentence_embeddings=ireland_embed,
                                     n_neighbors=9,
                                     n_components=3,
                                     min_cluster_size=14,
                                     random_state=42)

# Cluster Labeling

When labeling the clusters we looked at the 20 most common words in each cluster. Based on these words we derived the theme or topic of that cluster.

In [46]:
def cluster_words(embedding, clustering, label, top_words = 20):
    """Return the most frequent words of one cluster.

    embedding is the headline-embedding dataframe (column 0 holds each
    headline's word list) and clustering the fitted clustering whose labels_
    line up with its rows. The result is a list of up to top_words
    (word, count) pairs for the headlines labelled ``label``, most frequent first.
    """
    # Pull the word lists out once instead of materialising every row
    words_by_row = embedding[0].tolist() if len(embedding) else []

    cluster_word_lists = [
        words
        for row, words in enumerate(words_by_row)
        if clustering.labels_[row] == label
    ]

    frequencies = Counter(chain.from_iterable(cluster_word_lists))
    return frequencies.most_common(top_words)

def clusters_words(embedding, clustering, top_words = 20):
    """Return the most frequent words for each cluster.

    Given the headline-embedding dataframe and the fitted clustering whose
    labels_ line up with its rows, builds a dataframe with one row per
    distinct cluster label (in ascending order). Column 0 holds the label and
    the following columns hold (word, count) pairs, most frequent first.
    Rows of clusters with fewer distinct words are padded with NaN.
    """
    rows = [
        [label, *cluster_words(embedding = embedding,
                               clustering = clustering,
                               label = label,
                               top_words = top_words)]
        for label in np.unique(clustering.labels_)
    ]

    # Pad explicitly so every row has the same width and missing cells are NaN
    width = max((len(row) for row in rows), default=0)
    padded_rows = [row + [np.nan] * (width - len(row)) for row in rows]

    # Build the frame once; Series.append / DataFrame.append no longer exist in current pandas
    return pd.DataFrame(padded_rows)

In [158]:
# The top 20 most common words for each cluster in the Reddit dataset
# (one row per cluster label: column 0 is the label, the other columns are (word, count) pairs)
reddit_freq = clusters_words(embedding= reddit_embed, clustering=reddit_clusters, top_words = 20)
reddit_freq

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-1.0,"(iraq, 239)","(war, 192)","(say, 171)","(new, 163)","(world, 162)","(china, 155)","(government, 139)","(year, 136)","(israel, 134)",...,"(police, 127)","(iran, 119)","(death, 116)","(attack, 115)","(child, 112)","(woman, 112)","(one, 100)","(bush, 100)","(people, 100)","(military, 99)"
1,0.0,"(european, 12)","(union, 9)","(commission, 2)","(truth, 2)","(relation, 2)","(freedom, 2)","(information, 2)","(take, 1)","(greece, 1)",...,"(environment, 1)","(infringement, 1)","(election, 1)","(observer, 1)","(venezuela, 1)","(aim, 1)","(improve, 1)","(libya, 1)","(sarkozy, 1)","(mediterranean, 1)"
2,1.0,"(spy, 27)","(german, 12)","(spying, 12)","(russia, 8)","(israel, 8)","(charge, 5)","(raid, 5)","(face, 4)","(accused, 4)",...,"(caught, 3)","(scandal, 3)","(government, 3)","(stasi, 3)","(espionage, 3)","(journalist, 3)","(take, 3)","(company, 3)","(deutsche, 3)","(telekom, 3)"
3,2.0,"(ship, 23)","(pirate, 22)","(boat, 14)","(gulf, 13)","(fire, 11)","(coast, 10)","(navy, 10)","(warship, 9)","(iranian, 8)",...,"(somalia, 7)","(yacht, 7)","(somali, 5)","(lebanon, 4)","(say, 4)","(seize, 4)","(luxury, 4)","(persian, 4)","(may, 4)","(sends, 3)"
4,3.0,"(arm, 11)","(ship, 8)","(zimbabwe, 7)","(china, 7)","(chinese, 5)","(weapon, 5)","(shipment, 4)","(south, 3)","(zim, 2)",...,"(africa, 2)","(african, 2)","(return, 2)","(home, 2)","(head, 2)","(recall, 2)","(recalled, 2)","(pentagon, 1)","(admits, 1)","(mistaken, 1)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,81.0,"(torture, 16)","(waterboarding, 9)","(cia, 5)","(bush, 4)","(top, 3)","(white, 2)","(house, 2)","(mccain, 2)","(standing, 2)",...,"(memo, 2)","(aide, 2)","(pushed, 2)","(guantánamo, 2)","(inside, 2)","(would, 2)","(interrogation, 2)","(mastermind, 2)","(defends, 1)","(use, 1)"
83,82.0,"(court, 11)","(crime, 11)","(war, 9)","(trial, 8)","(prosecutor, 8)","(judge, 7)","(evidence, 5)","(guantanamo, 5)","(detainee, 4)",...,"(former, 4)","(tribunal, 4)","(lawyer, 3)","(rendition, 3)","(supreme, 3)","(british, 3)","(clear, 3)","(hearing, 3)","(liable, 3)","(torture, 3)"
84,83.0,"(old, 18)","(year, 13)","(girl, 11)","(pregnant, 3)","(class, 2)","(kid, 2)","(woman, 2)","(boy, 2)","(school, 2)",...,"(shoplifting, 1)","(smoked, 1)","(salmon, 1)","(city, 1)","(dumber, 1)","(suburban, 1)","(friend, 1)","(fourteen, 1)","(kicked, 1)","(modelling, 1)"
85,84.0,"(cellar, 13)","(austrian, 8)","(child, 7)","(year, 7)","(daughter, 7)","(coma, 6)","(dungeon, 6)","(girl, 5)","(kept, 4)",...,"(woman, 4)","(fritzl, 4)","(police, 3)","(find, 3)","(wake, 3)","(inside, 2)","(horror, 2)","(former, 2)","(home, 2)","(six, 2)"


In [175]:
# The top 20 most common words in each cluster in the Ireland dataset
# (one row per cluster label: column 0 is the label, the other columns are (word, count) pairs)
ireland_freq = clusters_words(embedding= ireland_embed, clustering=ireland_clusters, top_words = 20)
ireland_freq

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-1.0,"(new, 131)","(man, 123)","(say, 112)","(plan, 92)","(may, 80)","(year, 72)","(woman, 71)","(talk, 70)","(court, 64)",...,"(attack, 58)","(pay, 57)","(job, 56)","(seek, 56)","(take, 52)","(get, 52)","(face, 50)","(dublin, 49)","(back, 46)","(make, 46)"
1,0.0,"(diary, 56)","(irishman, 52)","(irishwoman, 4)",,,,,,,...,,,,,,,,,,
2,1.0,"(correction, 22)","(clarification, 22)","(oral, 1)","(explanation, 1)",,,,,,...,,,,,,,,,,
3,2.0,"(eye, 11)","(market, 9)","(report, 9)","(europe, 9)","(nature, 7)","(british, 1)","(industry, 1)","(bearish, 1)","(reverts, 1)",...,"(doctrine, 1)","(apple, 1)","(park, 1)",,,,,,,
4,3.0,"(people, 47)","(egan, 1)","(die, 1)","(exposure, 1)","(plain, 1)","(need, 1)","(suffering, 1)","(reduced, 1)","(frightful, 1)",...,"(later, 1)","(work, 1)","(young, 1)",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,126.0,"(party, 19)","(clinton, 4)","(election, 4)","(new, 4)","(meet, 3)","(budget, 3)","(late, 3)","(government, 2)","(may, 2)",...,"(say, 2)","(get, 2)","(proposal, 2)","(date, 2)","(welcome, 2)","(congress, 1)","(leader, 1)","(row, 1)","(prepare, 1)","(possible, 1)"
128,127.0,"(election, 48)","(candidate, 12)","(vote, 5)","(major, 5)","(plan, 5)","(stand, 4)","(new, 3)","(contest, 3)","(spring, 3)",...,"(italian, 3)","(nomination, 2)","(win, 2)","(elect, 2)","(socialist, 2)","(meet, 2)","(move, 2)","(gonzalez, 2)","(find, 2)","(proposal, 2)"
129,128.0,"(peace, 89)","(process, 27)","(rally, 13)","(must, 7)","(say, 7)","(call, 6)","(new, 5)","(major, 4)","(angolan, 4)",...,"(support, 4)","(need, 4)","(hope, 4)","(move, 3)","(bruton, 3)","(violence, 3)","(effort, 3)","(seek, 3)","(set, 3)","(cardinal, 2)"
130,129.0,"(unionist, 16)","(party, 4)","(talk, 4)","(vote, 4)","(major, 3)","(face, 2)","(government, 2)","(china, 2)","(trade, 2)",...,"(leader, 2)","(london, 2)","(ulster, 2)","(today, 2)","(islamic, 1)","(one, 1)","(lifeline, 1)","(rope, 1)","(hang, 1)","(socialist, 1)"


In [183]:
# Some useful functions for exploring the clusters

def cluster_size (clusters, label):
    """Count the points that the clustering assigned to ``label`` (0 when the label does not occur)."""
    return sum(1 for assigned in clusters.labels_ if assigned == label)
    
def get_headlines(clusters, label, embedding, number = 0):
    """Return random preprocessed headlines from one cluster.

    Parameters
    ----------
    clusters : fitted clustering whose labels_ line up with the rows of ``embedding``.
    label : cluster label to draw from.
    embedding : headline-embedding dataframe; column 0 holds each headline's word list.
    number : how many headlines to return. 0 (the default) returns every
        headline of the cluster, in random order.

    Returns
    -------
    List of ``number`` headlines (word lists) sampled without replacement.

    Raises
    ------
    ValueError : if ``number`` exceeds the number of headlines in the cluster
        (raised by random.sample).
    """
    words_by_row = embedding[0] if len(embedding) else []

    headlines = [
        words
        for words, assigned in zip(words_by_row, clusters.labels_)
        if assigned == label
    ]

    # The cluster size is simply the number of headlines collected above
    if number == 0:
        number = len(headlines)

    return random.sample(headlines, number)

---