In [1]:
import pickle
import pandas as pd
import numpy as np
import nltk
import re

In [2]:
with open('cleaned_content.pkl','rb') as file:
    content = pickle.load(file)

In [44]:
news_df = pd.read_pickle("./cleaned_news.pkl")

In [3]:
len(content)

13461

In [4]:
content_df = pd.read_pickle('cleaned_content.pkl')

In [5]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [6]:
#tokenize and stem
def tokenize_and_stem(text):
    #tokenize by sentence, then by word to ensure punctuation is caught
    tokens = [ word for sentence in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sentence)]
    filtered_tokens =[] 
    # filter tokens not containing letters
    for token in tokens:
        if re.search('[a-zA-Z]',token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_no_stem(text):
    #tokenize by sentence, then by word to ensure punctuation is caught
    tokens = [ word for sentence in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sentence)]
    filtered_tokens =[]
    # filter tokens not containing letters
    for token in tokens:
        if re.search('[a-zA-Z]',token):
            filtered_tokens.append(token)
    return filtered_tokens

#### Document Term Matrix
To get a Tf-idf matrix, first count word occurrences by document. This is transformed into a document-term matrix (dtm). This is also just called a term frequency matrix
Then apply the term frequency-inverse document frequency weighting: words that occur frequently within a document but not frequently within the corpus receive a higher weighting as these words are assumed to contain more meaning in relation to the document.
##### parameters:
max_df: this is the maximum frequency within the documents a given feature can have to be used in the tfi-idf matrix. If the term is in greater than 80% of the documents it probably carries little meanining.
min_idf: this could be an integer (e.g. 5) and the term would have to be in at least 5 of the documents to be considered. Here I pass 5 ie: the term must be in at least 5 of the documents. 

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=5, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(content) #fit the vectorizer to content

print(tfidf_matrix.shape)

(13461, 84304)


In [61]:
type(tfidf_matrix)

scipy.sparse.csr.csr_matrix

In [54]:
tfidf_vectorizer.vocabulary_.items()

dict_items([('veteran', 79356), ('salut', 64655), ('worcest', 82036), ("'s", 71), ('breakfast', 9990), ('club', 13959), ('won', 81954), ('heart', 32249), ('mind', 45803), ('belli', 8556), ('hm', 33127), ('forc', 27913), ('met', 45317), ('postal', 55686), ('order', 51388), ('street', 71764), ('10am', 1481), ('saturday', 64831), ('design', 20019), ('allow', 3967), ('place', 54131), ('meet', 44951), ('socialis', 69081), ('eat', 22698), ('drink', 22005), ('hunger', 34028), ('loneli', 41782), ('march', 43759), ('father-of-two', 26311), ('dave', 18763), ('carney', 11699), ('age', 3454), ('hill', 32964), ('set', 67211), ('inspir', 36280), ('similar', 68278), ('countri', 17150), ('said', 63712), ('pictur', 54001), ('good', 30201), ('respons', 62042), ('attend', 6593), ('saw', 64927), ('articl', 6066), ('newspap', 49042), ('turn', 77024), ('old', 50450), ('chap', 12709), ('travel', 76474), ('late', 39464), ('parad', 52203), ('hour', 33676), ('generat', 29566), ('lot', 42177), ('estim', 24374), 

In [41]:
#terms is just a list of the features used in the tf-idf matrix
terms = tfidf_vectorizer.get_feature_names()
#type(terms)#list

In [38]:
len(terms)

200000

In [42]:
terms[:25]

["'a",
 "'d",
 "'d alway",
 "'d come",
 "'d just",
 "'d like",
 "'d love",
 "'d say",
 "'d think",
 "'d tri",
 "'he",
 "'i",
 "'i 'm",
 "'i know",
 "'i love",
 "'i n't",
 "'i think",
 "'i ve",
 "'it",
 "'it 's",
 "'m",
 "'m 'm",
 "'m abl",
 "'m afraid",
 "'m alway"]

dist is defined as 1 - the cosine similarity of each document.
Cosine similarity is measured against the tf-idf matrix and can be used to generate a measure of similarity between each document and the other documents in the corpus 
Note that with dist it is possible to evaluate the similarity of any two or more documents

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)
type(dist)

numpy.ndarray

CLUSTERING

In [43]:
from sklearn.cluster import KMeans
num_clusters = 5
km = KMeans(n_clusters=num_clusters)
%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

CPU times: user 11min 46s, sys: 5.7 s, total: 11min 51s
Wall time: 12min


In [46]:
km.labels_

array([0, 4, 1, ..., 0, 1, 2], dtype=int32)

In [49]:
from sklearn.metrics import silhouette_score

SSEs = []
Sil_coefs = []
for k in range(2,20):
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(tfidf_matrix)
    labels = km.labels_
    Sil_coefs.append(silhouette_score(tfidf_matrix, labels, metric='euclidean'))
    SSEs.append(km.inertia_) 

KeyboardInterrupt: 

The silhouette value is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters.

TOPIC MODELLING WITH GENSIM

In [52]:
# gensim
from gensim import corpora, models, similarities, matutils

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Convert to gensim
We need to convert our sparse scipy matrix to a gensim-friendly object called a Corpus. This is also essential for term-document matrices that are larger than local memory.

In [51]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(tfidf_matrix)

Map matrix rows to words (tokens)
We need to save a mapping (dict) of row id to word (token) for later use by gensim:

In [55]:
id2word = { identifier: word for word, identifier in tfidf_vectorizer.vocabulary_.items()}

In [56]:
len(id2word)

84304

In [57]:
#LDA: At this point, we can create an LDA model. It requires our corpus of word counts, mapping of row ids to words, and our selection of num_topics.

In [58]:
lda = models.LdaModel(corpus=corpus, num_topics=5, minimum_probability=0.03, id2word=id2word, passes=10)

2018-08-21 17:46:08,572 : INFO : using symmetric alpha at 0.2
2018-08-21 17:46:08,578 : INFO : using symmetric eta at 0.2
2018-08-21 17:46:08,642 : INFO : using serial LDA version on this node
2018-08-21 17:46:08,706 : INFO : running online (multi-pass) LDA training, 5 topics, 10 passes over the supplied corpus of 84304 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2018-08-21 17:46:08,742 : INFO : PROGRESS: pass 0, at document #2000/84304
2018-08-21 17:46:09,516 : INFO : merging changes from 2000 documents into a model of 84304 documents
2018-08-21 17:46:09,576 : INFO : topic #0 (0.200): 0.001*"allow consum" + 0.001*"car driven" + 0.001*"chatham" + 0.001*"acorn" + 0.001*"55-year-old" + 0.001*"blood alcohol" + 0.001*"'s site" + 0.001*"ashraf" + 0.001*"appoint minist" + 0.001*"befor invest aci"
2018-08-21 17:46:09,580 : INFO : topic #1 (0.200): 0.001*"bbc 's" + 0.001*"avail www.sec.

In [59]:
#Let's take a look at what happened. Here are the 5 most important words for each topic we found:
lda.print_topics()


2018-08-21 17:57:18,535 : INFO : topic #0 (0.200): 0.002*"angel angel" + 0.002*"befor bed" + 0.002*"bran" + 0.002*"'s consum" + 0.002*"celebr win" + 0.001*"arriv hungari" + 0.001*"baazi hot" + 0.001*"breen" + 0.001*"a.m. p.m." + 0.001*"avenu new"
2018-08-21 17:57:18,540 : INFO : topic #1 (0.200): 0.000*"barack" + 0.000*"base larg" + 0.000*"add del.icio.us tweet" + 0.000*"audio video file" + 0.000*"becaus simpli" + 0.000*"boston globe" + 0.000*"ballist" + 0.000*"award recipi" + 0.000*"busi confid" + 0.000*"care educ"
2018-08-21 17:57:18,544 : INFO : topic #2 (0.200): 0.001*"buy food" + 0.001*"averag hour" + 0.001*"billi vunipola" + 0.001*"7.30pm" + 0.001*"buster" + 0.001*"busi acumen" + 0.001*"child parent" + 0.001*"ad anoth" + 0.001*"accord statist" + 0.001*"chanc surviv"
2018-08-21 17:57:18,548 : INFO : topic #3 (0.200): 0.001*"albert" + 0.001*"afford health" + 0.001*"accept respons" + 0.001*"afternoon polic" + 0.001*"british food" + 0.001*"alma mater" + 0.000*"associ american" + 0.00

[(0,
  '0.002*"angel angel" + 0.002*"befor bed" + 0.002*"bran" + 0.002*"\'s consum" + 0.002*"celebr win" + 0.001*"arriv hungari" + 0.001*"baazi hot" + 0.001*"breen" + 0.001*"a.m. p.m." + 0.001*"avenu new"'),
 (1,
  '0.000*"barack" + 0.000*"base larg" + 0.000*"add del.icio.us tweet" + 0.000*"audio video file" + 0.000*"becaus simpli" + 0.000*"boston globe" + 0.000*"ballist" + 0.000*"award recipi" + 0.000*"busi confid" + 0.000*"care educ"'),
 (2,
  '0.001*"buy food" + 0.001*"averag hour" + 0.001*"billi vunipola" + 0.001*"7.30pm" + 0.001*"buster" + 0.001*"busi acumen" + 0.001*"child parent" + 0.001*"ad anoth" + 0.001*"accord statist" + 0.001*"chanc surviv"'),
 (3,
  '0.001*"albert" + 0.001*"afford health" + 0.001*"accept respons" + 0.001*"afternoon polic" + 0.001*"british food" + 0.001*"alma mater" + 0.000*"associ american" + 0.000*"announc open" + 0.000*"bank provid" + 0.000*"beach fl sbwire"'),
 (4,
  '0.001*"bit time" + 0.001*"analyt capabl" + 0.001*"blog applic" + 0.001*"burkina faso" 

In [60]:
# we can quantify the 'fit' of our model, to compare with other corpora, etc.
lda.log_perplexity(corpus)

2018-08-21 17:58:44,940 : INFO : -10.906 per-word bound, 1919.0 perplexity estimate based on a held-out corpus of 84304 documents with 127381 words


-10.906127153073841