# Text Mining -  Group Project

---

# Preparing the Data

In [116]:
import gensim
import hdbscan
import nltk
import numpy as np
import pandas as pd
import random
import re
import string
import umap

from sklearn.utils import shuffle
from nltk.tokenize import regexp_tokenize
from sklearn.model_selection import train_test_split

## Loading the Data

### Reddit News

In [2]:
reddit_path = "data/reddit_worldnews.csv"
reddit_file = pd.read_csv(reddit_path, encoding="utf-8", encoding_errors="ignore")
print("reddit_file_clmns:", reddit_file.columns)
reddit = pd.DataFrame(reddit_file["title"]).rename(columns={"title":"Headline"})
reddit.head()

reddit_file_clmns: Index(['time_created', 'date_created', 'up_votes', 'down_votes', 'title',
       'over_18', 'author', 'subreddit'],
      dtype='object')


Unnamed: 0,Headline
0,Scores killed in Pakistan clashes
1,Japan resumes refuelling mission
2,US presses Egypt on Gaza border
3,Jump-start economy: Give health care to all
4,Council of Europe bashes EU&UN terror blacklist


### Irland News

In [146]:
ireland_path = "data/ireland-news-headlines.csv"
ireland_file = pd.read_csv(ireland_path, encoding="utf-8", encoding_errors="ignore")
print("ireland_file_clmns:", ireland_file.columns)
ireland_filtered_date = ireland_file[(ireland_file["publish_date"] >= 20080125) & 
                                     (ireland_file["publish_date"] <= 20161122)]
ireland = pd.DataFrame(ireland_file["headline_text"]).rename(columns={"headline_text":"Headline"})
ireland.head()

ireland_file_clmns: Index(['publish_date', 'headline_category', 'headline_text'], dtype='object')


Unnamed: 0,Headline
0,UUP sees possibility of voting Major out
1,Pubs targeted as curbs on smoking are extended
2,Papers reveal secret links with O'Neill cabinet
3,Domestic chaos as Italy takes EU presidency
4,Learning about the star to which we owe life


In [147]:
type(ireland_file)

pandas.core.frame.DataFrame

## Preprocessing

In [4]:
# Pre-processing 
# Regular expression used for tokenization
pattern = r'''(?x)    
(?:[A-Z]\.)+          
|\w+(?:-\w+)*         
|\$?\d+(?:\.\d+)?%?   
|\.\.\.               
|[][.,;"\'?():-_`]  
'''

# Lemmatizer used 
lemmatizer = nltk.WordNetLemmatizer()

def preprocessing(df):
    """Input: dataframe
       Output: preprocessed dataframe"""
    
    # Reduce amount of data for quicker training purposes
    # Headline = df["Headline"].head(100)
        
    # Get the stopwords and punctuation
    stopwords = nltk.corpus.stopwords.words('english')
    punct = list(string.punctuation)
    
    # Initialize tokenized list of headlines
    # Get list of headlines
    headlns_lst = df["Headline"].to_list()
    
    tokenized_lines = []
    for headln in headlns_lst:
        line = str(headln).strip().lower()
        line = regexp_tokenize(line, pattern)
        line = [tok for tok in line if tok not in stopwords and tok not in punct and tok.isalpha() and len(tok)>2]
        tokenized_lines.append(line)
    
    # Initialize lemmatized list of headlines
    pp_df = pd.DataFrame(columns = ["Headline"])
    
    lemmatized_lines = [[lemmatizer.lemmatize(token) for token in headln] for headln in tokenized_lines]
    line_df = pd.DataFrame({"Headline": lemmatized_lines})
    pp_df = pp_df.append(line_df, ignore_index=True)
 
    return pp_df

### Reddit News

In [5]:
reddit_pp = preprocessing(reddit)
reddit_pp
t = timeit.timeit(lambda:reddit_pp)
print(t)
print(reddit_pp)

Unnamed: 0,Headline
0,"[score, killed, pakistan, clash]"
1,"[japan, resume, refuelling, mission]"
2,"[press, egypt, gaza, border]"
3,"[economy, give, health, care]"
4,"[council, europe, bash, terror, blacklist]"
...,...
509231,"[heil, trump, donald, trump, white, nationalis..."
509232,"[people, speculating, could, madeleine, mccann]"
509233,"[professor, receives, arab, researcher, award]"
509234,"[nigel, farage, attack, response, trump, ambas..."


### Irland News

In [6]:
ireland_pp = preprocessing(ireland)
ireland_pp

Unnamed: 0,Headline
0,"[uup, see, possibility, voting, major]"
1,"[pub, targeted, curb, smoking, extended]"
2,"[paper, reveal, secret, link, neill, cabinet]"
3,"[domestic, chaos, italy, take, presidency]"
4,"[learning, star, owe, life]"
...,...
1611490,"[reserve, member, defence, force, allowed, ser..."
1611491,"[maureen, dowd, joe, biden, crazy, irish, plan..."
1611492,"[andy, murray, roll, back, year, centre, court]"
1611493,"[delta, variant, could, significant, damage, p..."


In [None]:
# Need to use pre-trained embeddings since headlines not enough
# Should we cut out randomly some data of the ireland dataset so that we have equal amount or do we want to normalize in the
# end the amount of headlines per theme for the amount of data ?

# Sentence Embeddings

We are sentence embeddings by averaging the pre-trained word embeddings from the GoogleNews vectors. These embeddings can be found at https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g.

In [9]:
# The goal is to be able to read the GoogleNews from its URL but haven't found a simple way to read a 
# bin.gz file from URL yet. 
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [64]:
def mean_vector(word2vec_model, doc):
    """Calculate the mean vector according to a word2vec model for one document/headline"""
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model]
    if len(doc) >= 1:
        return sum(word2vec_model[doc])/len(doc)
    else:
        return []

def doc_embeddings(dataset):
    """Calculate the mean vector for all documents in a dataset.
    Outputs a dataframe with document and 300 dim vector representation of it."""
    embeddings_df = pd.DataFrame()

    for doc in dataset["Headline"]:
        vec = mean_vector(model, doc)
        if len(vec) > 0:
            vec_df = pd.Series(vec)
            doc_df = pd.Series([doc]).append(vec_df, ignore_index = True)
            embeddings_df = embeddings_df.append(doc_df, ignore_index=True) 
    
    return embeddings_df

In [None]:
# Use all the data, not just first 1000 headlines when finalizing process.
reddit_embed = doc_embeddings(reddit_pp)
reddit_embed

In [66]:
ireland_embed = doc_embeddings(ireland_pp)
ireland_embed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,"[uup, see, possibility, voting, major]",-0.019440,0.096291,0.004242,0.111938,-0.122192,0.011169,0.063950,-0.051147,0.126831,...,-0.105103,-0.063385,-0.040253,-0.077667,0.060120,-0.015381,-0.030247,-0.026241,0.083755,-0.034698
1,"[pub, targeted, curb, smoking, extended]",-0.083624,0.037817,-0.006555,0.198828,-0.109766,-0.000049,-0.002713,-0.003613,0.247559,...,0.015234,0.042723,-0.034412,0.052075,-0.091187,0.059448,0.062207,0.046216,0.077539,-0.031396
2,"[paper, reveal, secret, link, neill, cabinet]",-0.109766,0.024805,-0.097681,-0.043976,-0.031348,0.067725,0.056592,-0.013770,0.245703,...,-0.072803,0.006732,-0.129858,0.046387,-0.097412,-0.167065,0.065753,-0.116599,-0.008594,0.048206
3,"[domestic, chaos, italy, take, presidency]",-0.069678,0.009131,0.003613,0.110767,-0.011865,-0.110742,0.000977,-0.161548,0.070337,...,-0.104053,-0.004004,-0.069653,0.019873,-0.081329,0.116309,-0.025882,-0.066553,0.028613,0.098199
4,"[learning, star, owe, life]",0.045166,0.118469,0.081787,0.070648,0.093658,0.063385,0.142822,-0.122314,0.066040,...,-0.059326,0.094330,-0.176895,-0.117706,-0.069595,-0.030823,0.028152,0.034851,-0.002686,0.005997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,"[decisive, loss]",0.072021,0.074402,0.037495,-0.089233,0.029663,-0.036621,-0.067627,-0.306641,0.217285,...,0.001862,-0.014404,0.019531,-0.077393,-0.056885,0.110840,-0.101562,-0.075928,-0.063599,0.069092
94,"[meaty, food]",-0.113281,0.130859,-0.164062,0.199097,-0.008545,0.028320,0.063965,-0.101440,-0.188965,...,-0.204590,-0.239502,-0.059875,0.279297,0.058777,0.008423,0.044922,0.039307,0.031982,0.100235
95,"[terminal, opportunism]",0.250977,0.082642,-0.138916,0.208008,-0.098938,-0.235352,-0.096924,-0.029846,0.259766,...,-0.155945,-0.179688,-0.023926,0.000244,-0.083679,-0.063477,0.019531,0.001221,-0.126709,0.212280
96,"[surprised, scribe]",0.082886,0.087036,0.039307,-0.024536,0.002625,0.089111,0.089844,0.116455,0.133461,...,0.100708,-0.163574,0.015137,0.006470,-0.064697,-0.154785,-0.206543,-0.067993,-0.052246,0.055420


In [71]:
ireland_embed.loc[:, 1:].to_numpy()

array([[-0.0194397 ,  0.09629059,  0.00424194, ..., -0.0262413 ,
         0.08375549, -0.03469849],
       [-0.08362427,  0.03781738, -0.00655518, ...,  0.04621582,
         0.07753906, -0.03139649],
       [-0.10976563,  0.02480469, -0.09768067, ..., -0.11659851,
        -0.00859375,  0.04820557],
       ...,
       [ 0.25097656,  0.0826416 , -0.13891602, ...,  0.0012207 ,
        -0.12670898,  0.21228027],
       [ 0.08288574,  0.08703613,  0.03930664, ..., -0.06799316,
        -0.05224609,  0.05541992],
       [ 0.35546875,  0.18359375,  0.14941406, ...,  0.04980469,
        -0.22265625,  0.00405884]])

---

# Clustering

In [131]:
def generate_clusters(sentence_embeddings,
                      n_neighbors,
                      n_components, 
                      min_cluster_size,
                      random_state = None):
    """
    Generate HDBSCAN cluster object after reducing embedding dimensionality with UMAP
    """
    
    umap_embeddings = umap.UMAP(n_neighbors=n_neighbors, 
                                n_components=n_components, 
                                metric='cosine', 
                                random_state=random_state).fit_transform(sentence_embeddings.loc[:, 1:].to_numpy())

    clusters = hdbscan.HDBSCAN(min_cluster_size = min_cluster_size,
                               metric='euclidean', 
                               cluster_selection_method='eom').fit(umap_embeddings)
    

    return clusters


def score_clusters(clusters, prob_threshold = 0.05):
    """
    Returns the label count and cost of a given cluster supplied from running hdbscan
    """
    
    cluster_labels = clusters.labels_
    label_count = len(np.unique(cluster_labels))
    total_num = len(clusters.labels_)
    cost = (np.count_nonzero(clusters.probabilities_ < prob_threshold)/total_num)
    
    return label_count, cost


def random_search(embeddings, space, num_evals):
    """
    Randomly search hyperparameter space and limited number of times 
    and return a summary of the results
    """
    
    results = []
    
    for i in range(num_evals):
        n_neighbors = random.choice(space['n_neighbors'])
        n_components = random.choice(space['n_components'])
        min_cluster_size = random.choice(space['min_cluster_size'])
        
        clusters = generate_clusters(embeddings, 
                                     n_neighbors = n_neighbors, 
                                     n_components = n_components, 
                                     min_cluster_size = min_cluster_size, 
                                     random_state = 42)
    
        label_count, cost = score_clusters(clusters, prob_threshold = 0.05)
                
        results.append([i, n_neighbors, n_components, min_cluster_size, 
                        label_count, cost])
    
    result_df = pd.DataFrame(results, columns=['run_id', 'n_neighbors', 'n_components', 
                                               'min_cluster_size', 'label_count', 'cost'])
    
    return result_df.sort_values(by='cost')

In [132]:
space = {"n_neighbors": range(5,15),
        "n_components": range(2,7),
        "min_cluster_size": range(2,15),
        "random_state": 42}

random_search(embeddings=reddit_embed, space=space, num_evals= 100)

Unnamed: 0,run_id,n_neighbors,n_components,min_cluster_size,label_count,cost
78,78,7,3,9,2,0.000000
45,45,7,4,10,3,0.001001
22,22,7,4,11,3,0.001001
16,16,7,2,10,3,0.003003
83,83,6,4,8,3,0.008008
...,...,...,...,...,...,...
35,35,9,4,13,15,0.574575
63,63,7,3,14,14,0.583584
10,10,11,3,13,15,0.605606
57,57,13,3,14,12,0.627628


In [126]:
reddit_clusters = generate_clusters(sentence_embeddings= reddit_embed, n_neighbors= 3, n_components=3, min_cluster_size=3, random_state=42)

## Splitting data into a train and a test set 
80% for training and 20% for testing.
Data is shuffled.

In [108]:
reddit_df = shuffle(reddit_pp, random_state=42)
ireland_df = shuffle(ireland_pp, random_state=42)

In [109]:
# Train Test Split
train_reddit, test_reddit = train_test_split(reddit_df, test_size=0.20, random_state=42)
train_ireland, test_ireland = train_test_split(ireland_df, test_size=0.20, random_state=42)

---