# Text Mining -  Group Project

---

# Preparing the Data
### Imports

In [109]:
import gensim
import hdbscan
import nltk
import numpy as np
import pandas as pd
import random
import re
import string
import timeit
import umap

from itertools import chain
from collections import Counter
from sklearn import metrics
from nltk.tokenize import regexp_tokenize

## Loading the Data

The Reddit dataset can be retrieved from [here](https://www.kaggle.com/datasets/rootuser/worldnews-on-reddit) and the Irish News dataset can be retrieved from [here](https://www.kaggle.com/datasets/therohk/ireland-historical-news). In order to run the code, the datasets have to be manually downloaded. In the following code it is assumed that the datasets are stored in a directory named """data""" which is located within the main repository. If the data is stored elsewhere, please adapt the path where indicated.

### Reddit News

In [2]:
reddit_path = "data/reddit_worldnews.csv" #Change path if necessary
reddit_file = pd.read_csv(reddit_path, encoding="utf-8", encoding_errors="ignore")
print("reddit_file_clmns:", reddit_file.columns)
reddit = pd.DataFrame(reddit_file["title"]).rename(columns={"title":"Headline"})
reddit.head()

reddit_file_clmns: Index(['time_created', 'date_created', 'up_votes', 'down_votes', 'title',
       'over_18', 'author', 'subreddit'],
      dtype='object')


Unnamed: 0,Headline
0,Scores killed in Pakistan clashes
1,Japan resumes refuelling mission
2,US presses Egypt on Gaza border
3,Jump-start economy: Give health care to all
4,Council of Europe bashes EU&UN terror blacklist


### Irland News

In [208]:
ireland_path = "data/ireland-news-headlines.csv" #Change path if necessary
ireland_file = pd.read_csv(ireland_path, encoding="utf-8", encoding_errors="ignore")
print("ireland_file_clmns:", ireland_file.columns)
ireland_filtered_date = ireland_file[(ireland_file["publish_date"] >= 20080125)& 
                                     (ireland_file["publish_date"] <= 20161122)]
ireland = pd.DataFrame(ireland_filtered_date["headline_text"]).rename(columns={"headline_text":"Headline"})
ireland.head()

ireland_file_clmns: Index(['publish_date', 'headline_category', 'headline_text'], dtype='object')


Unnamed: 0,Headline
747865,Egypt moves to close Gaza border breach
747866,Almost two-thirds of voters undecided on EU tr...
747867,Jacob Fruitfield factory to lay off 220 staff
747868,Bono says rich world failing anti-poverty camp...
747869,Government; NRA announce €1.68bn roads plan


## Pre-processing
Preprocessing includes tokenizing the data, removing stopwords and punctuation ans well as lemmatizing the tokens.

In [4]:
# Pre-processing 
# Regular expression used for tokenization
pattern = r'''(?x)    
(?:[A-Z]\.)+          
|\w+(?:-\w+)*         
|\$?\d+(?:\.\d+)?%?   
|\.\.\.               
|[][.,;"\'?():-_`]  
'''

# Lemmatizer used 
lemmatizer = nltk.WordNetLemmatizer()

def preprocessing(df):
    """Input: dataframe
       Output: preprocessed dataframe"""
    
    # Reduce amount of data for quicker training purposes
    # Headline = df["Headline"].head(100)
        
    # Get the stopwords and punctuation
    stopwords = nltk.corpus.stopwords.words('english')
    punct = list(string.punctuation)
    
    # Initialize tokenized list of headlines
    # Get list of headlines
    headlns_lst = df["Headline"].to_list()
    
    tokenized_lines = []
    for headln in headlns_lst:
        line = str(headln).strip().lower()
        line = regexp_tokenize(line, pattern)
        line = [tok for tok in line if tok not in stopwords and tok not in punct and tok.isalpha() and len(tok)>2]
        tokenized_lines.append(line)
    
    # Initialize lemmatized list of headlines
    pp_df = pd.DataFrame(columns = ["Headline"])
    
    lemmatized_lines = [[lemmatizer.lemmatize(token) for token in headln] for headln in tokenized_lines]
    line_df = pd.DataFrame({"Headline": lemmatized_lines})
    pp_df = pp_df.append(line_df, ignore_index=True)
 
    return pp_df

### Reddit News

In [9]:
#Pre-processing the Reddit dataset
reddit_pp = preprocessing(reddit)
t = timeit.timeit(lambda:reddit_pp)
print(t)

0.07059840800002348


In [270]:
reddit_pp.head()

Unnamed: 0,Headline
0,"[score, killed, pakistan, clash]"
1,"[japan, resume, refuelling, mission]"
2,"[press, egypt, gaza, border]"
3,"[economy, give, health, care]"
4,"[council, europe, bash, terror, blacklist]"


In [276]:
len(reddit_pp)

509236

### Ireland News

In [209]:
# Pre-processing the Irish Times dataset
ireland_pp = preprocessing(ireland)
t = timeit.timeit(lambda:ireland_pp)
print(t)

0.07471210899529979


In [272]:
ireland_pp.head()

Unnamed: 0,Headline
0,"[egypt, move, close, gaza, border, breach]"
1,"[almost, voter, undecided, treaty]"
2,"[jacob, fruitfield, factory, lay, staff]"
3,"[bono, say, rich, world, failing, campaign]"
4,"[government, nra, announce, road, plan]"


In [273]:
len(ireland_pp)

598837

# Sentence Embeddings

We are sentence embeddings by averaging the pre-trained word embeddings from the GoogleNews vectors. These embeddings can be found at https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g. Download the dataset to the directory ```TMproject/data``` as with the previous files.

In [6]:
# Import the GoogleNews embeddings, change path id necessary
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [87]:
def mean_vector(word2vec_model, doc):
    """Calculate the mean vector according to a word2vec model for one document/headline"""
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model]
    if len(doc) >= 1:
        return sum(word2vec_model[doc])/len(doc)
    else:
        return []

def doc_embeddings(dataset):
    """Calculate the mean vector for all documents in a dataset.
    Outputs a dataframe with document and 300 dim vector representation of it."""
    embeddings_df = pd.DataFrame()

    for doc in dataset["Headline"]:
        vec = mean_vector(model, doc)
        if len(vec) > 0:
            vec_df = pd.Series(vec)
            doc_df = pd.Series([doc]).append(vec_df, ignore_index = True)
            embeddings_df = embeddings_df.append(doc_df, ignore_index=True) 

    return embeddings_df

Using the above defined functions we created headline embeddings for each headline in the two datasets.

In [90]:
# Embedding the Reddit headlines
reddit_embed = doc_embeddings(reddit_pp)

In [274]:
reddit_embed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,"[score, killed, pakistan, clash]",-0.016129,0.087769,0.174744,0.032501,0.062805,0.018555,0.054245,-0.231934,-0.019043,...,0.01123,-0.065674,-0.117554,0.085938,-0.078674,0.006287,-0.067261,-0.108887,0.037109,0.077209
1,"[japan, resume, refuelling, mission]",-0.159668,0.098999,0.081258,0.10144,-0.027832,0.044149,-0.012207,-0.201497,0.269694,...,0.005534,0.073608,-0.049805,0.068034,-0.031067,-0.114746,-0.174072,0.117676,0.026042,-0.007975
2,"[press, egypt, gaza, border]",-0.023438,0.138458,-0.059265,0.034424,-0.024048,-0.052673,-0.092682,-0.093994,0.02301,...,0.017792,0.044189,-0.009369,-0.029266,0.012695,-0.024078,-0.18866,-0.022461,0.017151,0.098053
3,"[economy, give, health, care]",-0.005249,0.04248,0.000671,0.056671,0.016022,-0.035461,0.098572,-0.119873,0.085754,...,-0.073151,0.017151,0.017548,-0.055221,-0.05719,0.16356,-0.052979,0.052917,0.126587,-0.110962
4,"[council, europe, bash, terror, blacklist]",-0.027258,0.040814,0.126221,0.196533,-0.120996,-0.073096,-0.180078,-0.112744,0.060791,...,0.09834,0.071582,0.000366,0.184985,-0.114307,-0.017651,-0.054053,-0.059961,0.040894,-0.090448


In [277]:
len(reddit_embed)

508319

In [210]:
# Embedding the Ireland headlines
ireland_embed = doc_embeddings(ireland_pp)

In [278]:
ireland_embed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,"[egypt, move, close, gaza, border, breach]",-0.038417,0.058207,-0.004354,0.068441,-0.083333,-0.014771,-0.117472,-0.073893,0.112671,...,0.008728,0.131022,-0.095276,-0.01593,-0.051456,0.086187,-0.094309,-0.01888,-0.006632,-0.001099
1,"[almost, voter, undecided, treaty]",0.285034,0.014404,0.158081,0.135986,-0.093658,-0.060181,0.168396,-0.071533,0.186218,...,-0.027222,-0.081997,0.08046,-0.266479,0.010132,-0.08728,0.03598,0.070435,0.069702,0.003939
2,"[jacob, fruitfield, factory, lay, staff]",0.031555,-0.072874,0.132446,0.113281,0.062256,0.009979,-0.059769,-0.05127,0.069962,...,-0.145996,0.044922,-0.076172,0.151855,-0.114258,-0.128418,-0.024658,-0.011597,-0.008011,-0.029602
3,"[bono, say, rich, world, failing, campaign]",0.104574,0.018433,0.097127,0.042318,-0.053141,0.034871,0.052979,-0.13088,0.016996,...,-0.003866,-0.061656,-0.110779,0.008993,-0.029704,-0.027995,0.005948,-0.033732,0.038595,0.034668
4,"[government, nra, announce, road, plan]",-0.0625,0.051147,0.052444,0.009888,0.0289,-0.152466,-0.021606,-0.046082,0.153847,...,-0.103516,0.014824,-0.034119,-0.089368,-0.026458,-0.048523,-0.080322,0.008057,0.067474,-0.147369


In [279]:
len(ireland_embed)

597058

---

# Clustering

The headline embeddings are 300 dimensions and so in the process of clustering the headlines we also conduct a dimensionality reduction. We use a UMAP algorithm together with a HDBSCAN clustering method to form hierarchical density based clusters from the word embeddings. 

The functions defined below were originally written by David Borrelli and can be found at [Clustering sentence embeddings to identify intents in short text](https://towardsdatascience.com/clustering-sentence-embeddings-to-identify-intents-in-short-text-48d22d3bf02e). The function ```random_search``` has been slightly modified to also generate Sihouette scores for each clustering.

In [260]:
def score_clusters(clusters, prob_threshold = 0.05):
    """
    Returns the label count and cost of a given cluster supplied from running hdbscan
    """
    
    cluster_labels = clusters.labels_
    label_count = len(np.unique(cluster_labels))
    total_num = len(clusters.labels_)
    cost = (np.count_nonzero(clusters.probabilities_ < prob_threshold)/total_num)
    
    return label_count, cost


def random_search(embeddings, space, num_evals):
    """
    Randomly search hyperparameter space and limited number of times 
    and return a summary of the results
    """
    
    results = []
    
    for i in range(num_evals):

        n_neighbors = random.choice(space['n_neighbors'])
        n_components = random.choice(space['n_components'])
        min_cluster_size = random.choice(space['min_cluster_size'])
        
        # Dimensionality reduction using UMAP
        umap_embedding = umap.UMAP(n_neighbors=n_neighbors, 
                                n_components=n_components, 
                                metric='cosine', 
                                random_state= 42).fit_transform(embeddings.loc[:, 1:].to_numpy())

        # Clustering the UMAP reduced word embedding
        clusters = hdbscan.HDBSCAN(min_cluster_size = min_cluster_size,
                           metric='euclidean', 
                           cluster_selection_method='eom').fit(umap_embedding)
        
        # Filtering the noise for the silhouette score
        embedding = []
        labels = []
        for n in range(len(umap_embedding)):
            if clusters.labels_[n] != -1:
                embedding.append(umap_embedding[n])
                labels.append(clusters.labels_[n])
        
        silhouette_score = metrics.silhouette_score(embedding, labels)
        
        label_count, cost = score_clusters(clusters, prob_threshold = 0.05)
                
        results.append([i, n_neighbors, n_components, min_cluster_size, 
                        label_count, cost, silhouette_score])
    
    result_df = pd.DataFrame(results, columns=['run_id', 'n_neighbors', 'n_components', 
                                               'min_cluster_size', 'label_count', 'cost', 'silhouette'])
    
    return result_df.sort_values(by='cost')

In [104]:
# Defining the search space for the random search
space = {"n_neighbors": range(8,20),
        "n_components": range(2,7),
        "min_cluster_size": range(10,30)}

### Reddit News

In [262]:
#Running the random parameter search on the Reddit news
reddit_param_search = random_search(embeddings=reddit_embed, space=space, num_evals= 100)
reddit_param_search

Unnamed: 0,run_id,n_neighbors,n_components,min_cluster_size,label_count,cost,silhouette
27,27,14,3,20,3,0.000000,0.744467
77,77,15,4,22,3,0.000000,0.746574
25,25,13,6,15,3,0.000000,0.752363
16,16,18,4,18,3,0.000000,0.775728
48,48,13,6,24,3,0.000000,0.752363
...,...,...,...,...,...,...,...
29,29,19,5,14,175,0.585103,0.554993
50,50,14,5,13,215,0.585270,0.559129
2,2,12,4,19,138,0.585437,0.563620
51,51,12,5,18,150,0.585771,0.571402


In [295]:
# Cluster that was decided upon. Note that label count_uncludes "unlabeled" data as a label. 
# We remove this label from the label count in our presentation of results in the report.
reddit_param_search.iloc[65]

run_id               35.000000
n_neighbors           8.000000
n_components          3.000000
min_cluster_size     20.000000
label_count         155.000000
cost                  0.428971
silhouette            0.534276
Name: 0, dtype: float64

In [308]:
# Generate the optimized cluster model for the Reddit data
reddit_umap_embedding = umap.UMAP(n_neighbors=8, 
                                n_components=3, 
                                metric='cosine', 
                                # Changed original function to fit the format of our data.
                                random_state=42).fit_transform(reddit_embed.loc[:, 1:].to_numpy())

reddit_clusters = hdbscan.HDBSCAN(min_cluster_size = 20,
                           metric='euclidean', 
                           cluster_selection_method='eom').fit(reddit_umap_embedding)

In [280]:
# Running the random parameter search on the Ireland news
ireland_param_search = random_search(embeddings=ireland_embed, space=space, num_evals= 100)
ireland_param_search

failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


Unnamed: 0,run_id,n_neighbors,n_components,min_cluster_size,label_count,cost,silhouette
35,35,13,2,28,11,0.013849,0.312246
16,16,18,6,26,10,0.015511,0.401176
41,41,19,6,29,11,0.015712,0.351361
90,90,8,2,13,283,0.387672,0.594165
11,11,12,2,10,317,0.392456,0.562624
...,...,...,...,...,...,...,...
73,73,12,5,27,95,0.567759,0.618530
29,29,19,6,23,109,0.570277,0.624632
18,18,19,3,25,101,0.575263,0.596092
93,93,15,6,29,82,0.580299,0.588972


In [299]:
# Cluster that was decided upon. Note that label_count uncludes "unlabeled" data as a label. 
# We remove this label from the label count in our presentation of results in the report.
ireland_param_search.iloc[17]

run_id               68.000000
n_neighbors          12.000000
n_components          2.000000
min_cluster_size     23.000000
label_count         132.000000
cost                  0.460341
silhouette            0.558982
Name: 68, dtype: float64

In [302]:
ireland_umap_embedding = umap.UMAP(n_neighbors=12, 
                                n_components=2, 
                                metric='cosine', 
                                # Changed original function to fit the format of our data.
                                random_state=42).fit_transform(ireland_embed.loc[:, 1:].to_numpy())

ireland_clusters = hdbscan.HDBSCAN(min_cluster_size = 23,
                           metric='euclidean', 
                           cluster_selection_method='eom').fit(ireland_umap_embedding)

# Cluster Labeling

When labeling the clusters we looked at the 20 most common words in each cluster as well as 15 randomly sampled headlines from the cluster. Based on these words we derived the theme or topic of that cluster as outlined in the report. Below, are the functions used in the labeling process and a summary of the 20 most common words for each cluster from each dataset. We have also provided an example of 15 sample sentences from one cluster from each dataset.

In [25]:
def cluster_words(embedding, clustering, label, top_words = 20):
    """Given a list of sentence embeddings and correspoding cluster labels, returns the most frequent words
   for a given cluster."""
    
    word_list = []

    for n in range(len(embedding)):
        if clustering.labels_[n] == label:
            word_list.append(embedding.iloc[n][0])

    word_counter = Counter(chain.from_iterable(word_list))

    return word_counter.most_common(top_words)

def clusters_words(embedding, clustering, top_words = 20):
    """Given a list of sentence embeddings and corresponding cluster labels, returns the most frequent words
    for each cluster."""
    
    word_freq_df = pd.DataFrame()
    
    for n in np.unique(clustering.labels_):
        words_df = pd.Series(cluster_words(embedding = embedding, 
                                          clustering = clustering, 
                                          label = n, 
                                          top_words = top_words))
        cluster_df = pd.Series(n).append(words_df, ignore_index = True)
        word_freq_df = word_freq_df.append(cluster_df, ignore_index=True)
        
    return word_freq_df

def get_headlines(clusters, label, embedding, number = 0):
    """Given a clustering, a cluster label, and the corresponding headline embeddings,
    return a given number of random preprocessed headlines from the cluster. If the number of headlines
    is left unspecified, return all headlines in the cluster"""
    
    if number == 0:
        number = len(cluster_size(clusters = clusters, label = label))
        
    headlines = []
    
    for n in range(len(embedding)):
        if clusters.labels_[n] == label:
            headlines.append(embedding.iloc[n][0])
    
    return random.sample(headlines, number)

def cluster_size (clusters, label):
    """Given a clustering and a cluster label, return the size of that cluster"""
    return Counter(clusters.labels_)[label]

In [309]:
# The top 20 most common words for each cluster in the Reddit dataset
reddit_freq = clusters_words(embedding= reddit_embed, clustering=reddit_clusters, top_words = 20)

In [310]:
reddit_freq

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-1.0,"(israel, 816)","(gaza, 724)","(israeli, 582)","(say, 567)","(world, 566)","(war, 539)","(year, 494)","(new, 490)","(iraq, 472)",...,"(attack, 414)","(government, 403)","(people, 393)","(police, 370)","(video, 368)","(child, 361)","(pakistan, 354)","(one, 338)","(woman, 327)","(iran, 323)"
1,0.0,"(evening, 42)","(summary, 42)","(news, 21)","(flash, 21)","(quick, 21)","(story, 21)","(link, 21)","(associated, 21)","(article, 21)",...,"(quote, 4)","(day, 4)","(february, 3)",,,,,,,
2,1.0,"(summary, 63)","(evening, 62)","(story, 32)","(news, 31)","(flash, 31)","(quick, 31)","(link, 31)","(associated, 31)","(article, 31)",...,"(october, 5)","(target, 1)","(blogger, 1)","(excerpt, 1)","(backgrounder, 1)","(basic, 1)","(fact, 1)","(abkhazia, 1)","(chronicle, 1)","(injustice, 1)"
3,2.0,"(jimmy, 17)","(carter, 16)","(hadron, 5)","(collider, 5)","(hamas, 4)","(large, 4)","(meet, 2)","(terrorist, 2)","(leader, 2)",...,"(barred, 2)","(zimbabwe, 2)","(shame, 1)","(news, 1)","(madlibs, 1)","(defends, 1)","(meeting, 1)","(going, 1)","(israeli, 1)","(snub, 1)"
4,3.0,"(bin, 82)","(laden, 78)","(osama, 24)","(urge, 8)","(son, 7)","(jihad, 7)","(dead, 6)","(tape, 6)","(israel, 6)",...,"(driver, 5)","(guantanamo, 4)","(still, 4)","(bush, 4)","(call, 4)","(yemen, 4)","(obama, 4)","(trial, 4)","(breaking, 3)","(video, 3)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,149.0,"(prison, 64)","(year, 37)","(jail, 28)","(sentenced, 26)","(sentence, 25)","(court, 20)","(jailed, 19)","(murder, 19)","(death, 14)",...,"(journalist, 13)","(guilty, 12)","(jury, 12)","(four, 9)","(convicted, 9)","(man, 9)","(face, 9)","(released, 8)","(judge, 8)","(two, 8)"
151,150.0,"(detainee, 35)","(guantanamo, 14)","(court, 10)","(gitmo, 9)","(enemy, 5)","(combatant, 5)","(say, 5)","(prisoner, 5)","(judge, 4)",...,"(release, 4)","(abu, 4)","(tortured, 4)","(case, 4)","(supreme, 3)","(ghraib, 3)","(former, 3)","(inmate, 3)","(sue, 3)","(denied, 3)"
152,151.0,"(war, 20)","(crime, 20)","(court, 16)","(probe, 11)","(warrant, 11)","(prosecutor, 10)","(president, 10)","(international, 10)","(criminal, 9)",...,"(sudan, 7)","(former, 6)","(tribunal, 6)","(fraud, 5)","(call, 5)","(israeli, 5)","(darfur, 5)","(karadzic, 5)","(face, 4)","(leader, 4)"
153,152.0,"(trial, 14)","(detention, 7)","(guantanamo, 5)","(american, 5)","(former, 4)","(prosecutor, 3)","(genocide, 3)","(start, 3)","(terror, 2)",...,"(world, 2)","(limit, 2)","(day, 2)","(jail, 2)","(terrorism, 2)","(suspect, 2)","(torture, 2)","(charge, 2)","(terrorist, 2)","(citizen, 2)"


In [316]:
get_headlines(clusters=reddit_clusters, label=3, embedding=reddit_embed, number = 15)

[['sept',
  'victim',
  'seek',
  'tie',
  'osama',
  'bin',
  'laden',
  'family',
  'business',
  'share',
  'vast',
  'fortune'],
 ['islamic',
  'jihadist',
  'group',
  'gspc',
  'claim',
  'potent',
  'osama',
  'bin',
  'laden',
  'affiliate'],
 ['target', 'bin', 'laden'],
 ['come', 'gaza', 'jihad', 'spearheaded', 'old', 'friend', 'bin', 'laden'],
 ['bin', 'laden', 'slam', 'prophet', 'cartoon'],
 ['obama', 'bin', 'laden', 'still', 'free', 'gop'],
 ['bin', 'laden', 'videomaker', 'face', 'life', 'gitmo', 'trial'],
 ['osama', 'bin', 'laden', 'writing', 'memoir', 'report'],
 ['love', 'love', 'sheikh', 'osama', 'bin', 'laden', 'kill', 'america'],
 ['white', 'house', 'blocking', 'search', 'bin', 'laden'],
 ['bin', 'laden', 'happy', 'september', 'toll', 'war', 'court', 'told'],
 ['cia',
  'transfer',
  'suspected',
  'qaeda',
  'member',
  'close',
  'tie',
  'osama',
  'bin',
  'laden',
  'guantanamo'],
 ['omar', 'bin', 'laden', 'made', 'copy', 'father'],
 ['osama', 'bin', 'laden', 'fo

In [306]:
# The top 20 most common words in each cluster in the Ireland dataset
ireland_freq = clusters_words(embedding= ireland_embed, clustering=ireland_clusters, top_words = 20)

In [311]:
ireland_freq

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-1.0,"(new, 262)","(say, 233)","(irish, 190)","(man, 188)","(take, 146)","(call, 143)","(ahern, 134)","(plan, 132)","(get, 131)",...,"(set, 129)","(back, 126)","(court, 120)","(home, 114)","(talk, 108)","(report, 106)","(case, 105)","(year, 105)","(make, 104)","(time, 99)"
1,0.0,"(diary, 93)","(irishman, 79)","(irishwoman, 7)","(dead, 1)","(arminta, 1)","(wallace, 1)","(walk, 1)","(may, 1)",,...,,,,,,,,,,
2,1.0,"(short, 220)","(mossbank, 1)","(come, 1)","(long, 1)","(lane, 1)","(hop, 1)","(sharp, 1)","(chest, 1)","(shock, 1)",...,"(sight, 1)","(drive, 1)","(ulster, 1)",,,,,,,
3,2.0,"(paperback, 12)","(noticeboard, 11)","(leaflet, 2)","(ban, 1)","(advert, 1)","(dublin, 1)","(city, 1)","(centre, 1)","(concern, 1)",...,"(lisbon, 1)",,,,,,,,,
4,3.0,"(movie, 15)","(weekly, 14)","(quiz, 14)","(lifeline, 14)","(telly, 1)","(version, 1)","(better, 1)","(throwing, 1)","(waterford, 1)",...,"(make, 1)","(sense, 1)",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,126.0,"(green, 18)","(revolution, 5)","(light, 4)","(rub, 3)","(recovery, 3)","(ahern, 2)","(party, 2)","(shade, 2)","(leader, 1)",...,"(hoping, 1)","(spec, 1)","(promise, 1)","(new, 1)","(insight, 1)","(lady, 1)","(red, 1)","(shoot, 1)","(reappear, 1)","(tipperary, 1)"
128,127.0,"(red, 18)","(hot, 11)","(blue, 9)","(light, 9)","(black, 5)","(card, 5)","(make, 5)","(cold, 3)","(chip, 3)",...,"(get, 3)","(diamond, 3)","(still, 3)","(dublin, 3)","(see, 3)","(success, 3)","(side, 3)","(brown, 3)","(time, 2)","(ability, 2)"
129,128.0,"(keep, 18)","(record, 14)","(point, 12)","(arsenal, 10)","(clear, 8)","(alive, 8)","(move, 7)","(winning, 7)","(strong, 6)",...,"(hope, 5)","(canning, 5)","(track, 5)","(prove, 5)","(luck, 4)","(pole, 4)","(position, 4)","(brace, 4)","(formula, 4)","(put, 4)"
130,129.0,"(win, 7)","(united, 6)","(patrick, 6)","(back, 4)","(defeat, 4)","(fight, 4)","(keep, 3)","(title, 3)","(singh, 2)",...,"(front, 2)","(alive, 2)","(game, 2)","(say, 2)","(run, 2)","(walsh, 2)","(award, 2)","(sullivan, 2)","(clear, 2)","(russell, 2)"


In [314]:
get_headlines(clusters=ireland_clusters, label=128, embedding=ireland_embed, number = 15)

[['galway', 'give', 'canning', 'senior', 'bow'],
 ['ride'],
 ['loughlin', 'put', 'frame', 'olympic', 'place'],
 ['drogheda', 'strong'],
 ['connell', 'dominant', 'form', 'munster', 'prevail'],
 ['laois', 'left', 'ruing', 'luck'],
 ['brennan', 'brace', 'keep', 'bohs', 'touch'],
 ['mcfadden', 'stop', 'arsenal', 'track'],
 ['record', 'shot', 'winning', 'margin', 'oosthuizen'],
 ['curragh', 'run', 'oxx', 'kargali'],
 ['arsenal', 'refocused', 'big', 'stage'],
 ['calzaghe', 'keep', 'cool'],
 ['galway', 'salute', 'second', 'canning'],
 ['sligo', 'dominate', 'fail', 'make', 'breakthrough'],
 ['record']]

---