In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import igraph as ig
import os
import pdb
import nltk
import string
import json
import random
import pickle
import pandas as pd
import seaborn as sns
import scipy

from IPython.display import clear_output
from os import makedirs
from os.path import join, exists
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import logging
# Send INFO-and-above records through the root logger with a timestamped format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Set the root level directly as well: basicConfig does nothing when the root
# logger already has handlers installed.
logging.root.level = logging.INFO

In [3]:
# Directory the per-document JSON files are read from.
INPUT_DIR = join('data', 'parsed')
# Scratch directory, created here if missing.
# NOTE(review): later cells write to the literal paths 'tmp/...' and 'results/...'
# rather than OUTPUT_DIR, and nothing in this notebook creates 'results'.
OUTPUT_DIR = join('tmp')
makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
# Seed NumPy's global RNG; data_to_folds draws from it via np.random.permutation.
np.random.seed(1234)

In [5]:
# Select matplotlib's 'classic' style sheet for figures.
plt.style.use('classic')

In [6]:
from nltk.stem.porter import PorterStemmer

def tokenize(text):
    """Split ``text`` into NLTK word tokens and return their Porter stems.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one stemmed string per token, in token order.
    """
    # Build the stemmer once per call instead of once per token.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]

In [7]:
import re

# Pre-compiled patterns: `emoji` matches any character outside the listed Latin
# ranges; `multiple` matches a run of two or more of the same character.
emoji = re.compile(u'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]',re.UNICODE)
multiple = re.compile(r"(.)\1{1,}", re.DOTALL)

def text_format(text):
    """Normalise raw text to space-separated ASCII letters, digits and apostrophes.

    Steps, in order: drop characters matched by ``emoji``, drop URLs, drop HTML
    entities, turn a few punctuation characters into spaces, collapse
    whitespace, shorten runs of a repeated character to two, replace every
    remaining character other than ``a-zA-Z0-9|'`` with a space, and collapse
    whitespace again.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, without leading/trailing or repeated whitespace.
    """
    #strip emoji
    stripped = emoji.sub('', text)

    #strip URLs
    stripped = re.sub(r'http[s]?[^\s]+', '', stripped)

    #strip html '&amp;', '&lt;', '&#39;', etc.
    # Match one entity at a time. The previous greedy pattern '[\&].*;' removed
    # everything between the first '&' and the last ';' in the whole text.
    stripped = re.sub(r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);', '', stripped)

    #strip punctuation
    stripped = re.sub(r'[#|\!|\-|\+|:|//]', " ", stripped)

    #strip whitespace down to one.
    stripped = re.sub(r'\s+', ' ', stripped).strip()

    #collapse runs of a repeated character down to two ("sooo" -> "soo")
    stripped = multiple.sub(r"\1\1", stripped)

    #strip all non-latin characters
    stripped = re.sub(r"[^a-zA-Z0-9|']", " ", stripped)

    # The previous step can introduce new runs of spaces, so collapse once more.
    return re.sub(r'\s+', ' ', stripped).strip()

In [12]:
from nltk.tag import pos_tag

print("Building vocabulary...")

docs = {}
doc_tokens = {}
count = 0

# Loop-invariant helpers: a translation table that deletes ASCII punctuation and
# the Penn Treebank noun tags kept for the topic vocabulary.
table = str.maketrans({key: None for key in string.punctuation})
noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}

for fname in os.listdir(INPUT_DIR):
    if fname != '.DS_Store':
        # JSON is read as UTF-8 explicitly rather than with the platform default.
        with open(join(INPUT_DIR, fname), 'r', encoding='utf-8') as f:
            doc = json.load(f)
        docs[doc['id']] = doc
        text = doc['body'].lower().translate(table)
        text = text_format(text)
        tagged = pos_tag(text.split()) #use NLTK's part of speech tagger
        # Keep nouns only; the joined string is what the vectorizers consume later.
        doc["text"] = " ".join(word for word, pos in tagged if pos in noun_tags)
        clear_output(wait=True)
        count += 1
        print(count)

9095


# TF-IDF Vectorizer

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

print("Processing TF-IDF")

# NOTE(review): token_values is only assigned in a later cell (the one that sorts
# docs), so this cell raises NameError on a fresh top-to-bottom run.
# TF-IDF over the noun-only document texts, tokenised with the notebook's
# stemming tokenize() and L2-normalised per row.
vectorizer_tfidf = TfidfVectorizer(tokenizer=tokenize, norm='l2', stop_words='english')
X_tfidf = vectorizer_tfidf.fit_transform(token_values)

print(X_tfidf.shape)

(9095, 69281)


In [None]:
# Cache the TF-IDF matrix; the context manager closes (and flushes) the file.
with open('results/X_tfidf.pickle', 'wb') as f:
    pickle.dump(X_tfidf, f)

In [18]:
# Reload the cached TF-IDF matrix. Only unpickle files this notebook wrote:
# loading an untrusted pickle can execute arbitrary code.
with open('results/X_tfidf.pickle', 'rb') as f:
    X_tfidf = pickle.load(f)

print(X_tfidf.shape)

(9095, 69281)


# Add Keywords to docs

In [14]:
def top_tfidf_feats(row, features, top_n=25):
    """Return the ``top_n`` largest values of ``row`` with their feature names.

    Args:
        row: 1-D array of scores, indexed like ``features``.
        features: Sequence mapping a column index to its feature name.
        top_n: Maximum number of entries to return.

    Returns:
        A DataFrame with columns ``feature`` and ``tfidf``, sorted by
        descending score. It is empty (but keeps both columns) when nothing
        is selected.
    """
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor: assigning df.columns afterwards
    # raises ValueError when top_feats is empty.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

In [15]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    """Return the highest-scoring features of one row of a sparse matrix.

    Row ``row_id`` of ``Xtr`` is densified and squeezed, then ranked by
    ``top_tfidf_feats``.
    """
    dense_row = Xtr[row_id].toarray()
    scores = np.squeeze(dense_row)
    return top_tfidf_feats(scores, features, top_n)

In [None]:
# NOTE(review): needs vectorizer_tfidf from the fitting cell and token_keys from
# the cell that sorts docs; neither exists after only loading the pickled matrix.
# Fetch the vocabulary once: the original rebuilt the full feature list on every
# iteration.
feature_names = vectorizer_tfidf.get_feature_names()
total = len(token_keys)
for i, key in enumerate(token_keys):
    print('{0}\r'.format(i/total))
    clear_output(wait=True)
    # Store the 50 highest TF-IDF terms of document i on its record.
    docs[key]['top_tfidf'] = top_feats_in_doc(X_tfidf, feature_names, i, 50)

print(docs['2016-05-20-28']['top_tfidf'])

# Add Vector to docs

In [47]:
# Store each document's TF-IDF row on its record; row order follows token_keys.
for row_idx, doc_id in enumerate(token_keys):
    docs[doc_id]['tfidf'] = X_tfidf[row_idx]

docs['2016-05-20-28']['tfidf']

<1x69281 sparse matrix of type '<class 'numpy.float64'>'
	with 196 stored elements in Compressed Sparse Row format>

# Save & Load Docs

In [69]:
# Cache the enriched document records; the context manager closes the file.
with open('results/docs.pickle', 'wb') as f:
    pickle.dump(docs, f)

In [16]:
# Reload the cached document records. Only unpickle files this notebook wrote:
# loading an untrusted pickle can execute arbitrary code.
with open('results/docs.pickle', 'rb') as f:
    docs = pickle.load(f)

docs['2016-05-20-28'].keys()

dict_keys(['authors', 'section_id', 'id', 'top_tfidf', 'mod_topic', 'webUrl', 'tags', 'webTitle', 'keyword', 'guardianId', 'apiUrl', 'webPublicationDate', 'sectionId', 'body', 'tfidf', 'text'])

In [17]:
# Fix a row order for every matrix built from the documents: ids sorted descending.
token_keys = sorted(docs, reverse=True)
sorted_docs = [(doc_id, docs[doc_id]) for doc_id in token_keys]
# The noun-only text of each document, aligned with token_keys.
token_values = [record["text"] for _, record in sorted_docs]
print(len(token_values))

9095


# Network-based Approach - Build the graph from scratch

In [None]:
# One vertex per document, named by document id. A single batch call replaces
# the original loop of one add_vertices call per document.
g = ig.Graph()
g.add_vertices(token_keys)

print(g.vcount())

In [None]:
# Start from an edgeless graph, then connect every ordered pair of documents
# (self-pairs and both directions included). Edges are appended source-major,
# which is the order the weight-assignment cell relies on.
g.delete_edges(g.es)
edges = []
total = len(token_keys)
for i, source in enumerate(token_keys):
    clear_output(wait=True)
    print('{0}\r'.format(i/total))
    edges.extend((source, target) for target in token_keys)

g.add_edges(edges)
edges = []  # release the pair list
print(g.ecount())

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all TF-IDF rows (n_docs x n_docs).
cos_sims = cosine_similarity(X_tfidf)

# Edges were appended source-major (every target for source 0, then source 1,
# ...), so edge index i * n + j carries cos_sims[i, j]. Refuse to assign when
# that layout no longer holds, e.g. if the graph has already been simplified.
if g.ecount() != cos_sims.size:
    raise ValueError(
        'expected {0} edges (one per ordered document pair), found {1}'.format(
            cos_sims.size, g.ecount()))

# One bulk attribute assignment instead of one Python-level edge lookup per pair.
g.es["weight"] = cos_sims.ravel().tolist()

print(g.ecount())

In [None]:
# Merge parallel edges and remove self-loops; merged edges keep the maximum of
# each attribute (here: the larger weight).
g.simplify(multiple=True, loops=True, combine_edges="max")
print(g.ecount())

In [None]:
# Spot-check: show the weight stored on the edge with index 1.
g.es[1]['weight']

# Apply Modularity

Create a new subgraph with the cells below, or skip ahead to "Load from Data" to read a previously saved one from disk.

In [None]:
# Keep only edges with weight >= 0.2 and build the subgraph those edges span.
sub_g = g.es.select(weight_ge=0.2).subgraph()

In [None]:
# Save the thresholded graph in GML format so it can be reloaded without rebuilding.
sub_g.write_gml('results/graph_02.gml')

Load from Data

In [8]:
# Load the thresholded graph from the GML file written by the save cell.
sub_g = ig.Graph.Read_GML('results/graph_02.gml')

In [9]:
def network_properties(g, community):
    """Print summary statistics for graph ``g`` and clustering ``community``.

    One line is printed per metric, in a fixed order; each metric is computed
    just before its line is printed.
    """
    metrics = (
        ("Nodes {}", lambda: g.vcount()),
        ("Edges {}", lambda: g.ecount()),
        ("Diameter {}", lambda: g.diameter()),
        # Vertex count of the largest connected component.
        ("LCC {}", lambda: g.clusters().giant().vcount()),
        ("APL {}", lambda: g.average_path_length()),
        # Mean vertex degree.
        ("AD {}", lambda: sum(g.degree())/len(g.degree())),
        ("Giant Size {}", lambda: len(community.giant().vs())),
        # Membership ids start at 0, so the largest id + 1 is the count.
        ("NO of Communities {}", lambda: max(community.membership) + 1),
        ("Modularity {}", lambda: g.modularity(community.membership)),
    )
    for template, compute in metrics:
        print(template.format(compute()))

In [10]:
# Apply the 0.2 weight threshold to the loaded graph and work on the result.
sg = sub_g.es.select(weight_ge=0.2).subgraph()
# Multilevel community detection on edge weights, keeping every hierarchy level.
communities = sg.community_multilevel(weights='weight', return_levels=True)
# Number of communities at each level.
for com in communities:
    print(max(com.membership) + 1)
# The last level is used for the rest of the analysis.
community = communities[-1]
summary = (
    ("Nodes {}", sg.vcount()),
    ("Edges {}", sg.ecount()),
    ("LCC {}", sg.clusters().giant().vcount()),
    ("NO of Communities {}", max(community.membership) + 1),
    ("Modularity {}", sg.modularity(community.membership)),
)
for template, value in summary:
    print(template.format(value))

586
166
140
139
Nodes 8213
Edges 132243
LCC 7950
NO of Communities 139
Modularity 0.7987487477056004


# Network Topic Distribution

In [11]:
def network_to_topics(com):
    """Group vertex names by community and rank the communities by size.

    Args:
        com: A clustering exposing ``membership`` and ``subgraph(i)``, whose
            vertices carry a ``name`` attribute.

    Returns:
        A dict mapping rank (1 = largest community) to the list of vertex
        names in that community. Communities without vertices are left out.
    """
    names_by_cluster = {}
    for cluster_id in range(max(com.membership) + 1):
        names = [vertex['name'] for vertex in com.subgraph(cluster_id).vs()]
        if names:
            names_by_cluster[cluster_id] = names

    # Stable sort: equally sized communities keep their cluster-id order.
    ranked = sorted(names_by_cluster.values(), key=len, reverse=True)
    return {rank: names for rank, names in enumerate(ranked, start=1)}

In [12]:
# Document ids per community of the final clustering level, ranked by community size.
net_topics = network_to_topics(community)
len(net_topics)

139

## Network Topic Words

In [13]:
import itertools
from collections import Counter

def keywords_to_keyword_table(topics, n):
    """Summarise each topic by its ``n`` most frequently occurring keywords.

    Args:
        topics: Iterable of ``(topic_id, tables)`` entries, where each table
            exposes its keywords under ``table['feature']``.
        n: Number of keywords to keep per topic.

    Returns:
        A dict mapping ``topic_id`` to ``(number_of_tables, keywords)``, with
        keywords ordered by descending occurrence count (ties keep
        first-seen order).
    """
    table = {}
    for topic in topics:
        topic_id, frames = topic[0], topic[1]
        tally = Counter(itertools.chain.from_iterable(frame['feature'] for frame in frames))
        ranked = [keyword for keyword, _ in tally.most_common()]
        table[topic_id] = (len(frames), ranked[:n])
    return table

In [14]:
def topics_to_keywords(topics):
    """Collect the ``top_tfidf`` table of every document in every topic.

    Reads the notebook-level ``docs`` mapping.

    Args:
        topics: Dict mapping a topic id to a list of document ids.

    Returns:
        A list of ``(topic_id, tables)`` pairs sorted by descending number of
        documents.
    """
    # A per-document cutoff (tfidf >= 0.02) was tried here and left disabled.
    tables_by_topic = {
        topic_id: [docs[doc_id]['top_tfidf'] for doc_id in doc_ids]
        for topic_id, doc_ids in topics.items()
    }
    return sorted(tables_by_topic.items(), key=lambda item: len(item[1]), reverse=True)

In [18]:
# Per network topic: (document count, 10 keywords that appear most often across
# the documents' top_tfidf tables), keyed by topic rank.
net_keywords = keywords_to_keyword_table(topics_to_keywords(net_topics), 10)

In [19]:
import csv

# Topics ordered by descending document count.
net_s_k = sorted(net_keywords.items(), key=lambda x: x[1][0], reverse=True)

length = len(net_s_k)
# One row per topic: position in the ordering, document count, keyword list.
rows = [[position, size, keywords]
        for position, (_, (size, keywords)) in enumerate(net_s_k, start=1)]

# newline='' is what the csv module requires of its file object (prevents blank
# lines on Windows); a single writer handles all rows.
with open('tmp/keywords.csv', 'w', newline='') as file:
    csv.writer(file, delimiter=';').writerows(rows)

# Visualization

In [20]:
# New graph for the keyword visualisation: one vertex per topic rank "1".."30".
# Rebinding g discards the document-similarity graph.
g = ig.Graph()
g.add_vertices([str(rank) for rank in range(1, 31)])

print(g.vcount())

30


In [21]:
# Comma-joined first five keywords of each of the 30 largest topics.
labels = [",".join(net_keywords[rank][1][:5]) for rank in range(1, 31)]
len(labels)

30

In [22]:
# Connect two topics when their keyword lists overlap; the edge weight is the
# number of shared keywords. Every ordered pair is visited, so each link is
# added in both directions.
g.delete_edges(g.es)
edges = []
keyword_sets = {rank: set(net_keywords[rank][1]) for rank in range(1, 31)}
for src_rank in range(1, 31):
    for dst_rank in range(1, 31):
        if src_rank == dst_rank:
            continue
        shared = keyword_sets[src_rank] & keyword_sets[dst_rank]
        if shared:
            g.add_edge(str(src_rank), str(dst_rank), weight=len(shared))

print(g.ecount())

170


In [23]:
# Merge the duplicate edge of each topic pair (added once per direction),
# keeping the maximum weight.
g.simplify(multiple=True, loops=True, combine_edges="max")
print(g.ecount())

85


In [24]:
# Export the topic keyword-overlap graph in GML format.
g.write_gml('tmp/words.gml')

# LDA

In [25]:
from gensim import matutils
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers import LdaVowpalWabbit

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

print("Processing Count")

# Raw term counts over the same texts and tokenizer as the TF-IDF matrix;
# this is the input to LDA.
vectorizer = CountVectorizer(tokenizer=tokenize, stop_words='english')
X_count = vectorizer.fit_transform(token_values)
# Column index -> term, used below to build id2word for the LDA model.
vocab = vectorizer.get_feature_names()

print(X_count.shape)

(9095, 69281)


In [None]:
# Cache the count matrix and its vocabulary; the context managers close the files.
with open('results/X_count.pickle', 'wb') as f:
    pickle.dump(X_count, f)
with open('results/X_count_vocab.pickle', 'wb') as f:
    pickle.dump(vocab, f)

In [27]:
# Reload the cached count matrix and vocabulary. Only unpickle files this
# notebook wrote: loading an untrusted pickle can execute arbitrary code.
with open('results/X_count.pickle', 'rb') as f:
    X_count = pickle.load(f)
with open('results/X_count_vocab.pickle', 'rb') as f:
    vocab = pickle.load(f)

print(X_count.shape)

(9095, 69281)


# Search for the best number of topics K

In [68]:
def data_to_folds(data, k):
    """Deal the rows of ``data`` into ``k`` folds and shuffle each fold.

    Row ``j`` goes to fold ``j % k``. Each fold is then passed through
    ``np.random.permutation``, which draws from NumPy's global RNG.

    Args:
        data: Row-indexable object with a ``shape`` attribute.
        k: Number of folds.

    Returns:
        A list of ``k`` shuffled folds.
    """
    n_rows = data.shape[0]
    folds = []
    for offset in range(k):
        members = [data[row] for row in range(offset, n_rows, k)]
        folds.append(np.random.permutation(members))
    return folds

# Split the rows of the count matrix into 10 shuffled folds.
folds = data_to_folds(X_count, 10)

In [69]:
def split_folds(folds, i):
    """Partition ``folds`` into a training matrix and a test matrix.

    Args:
        folds: List of folds, each an iterable of sparse row blocks.
        i: Index of the fold to hold out.

    Returns:
        ``(train, test)``: the rows of all other folds stacked vertically in
        fold order, and the rows of fold ``i`` stacked vertically.
    """
    training_rows = []
    for fold in folds[:i] + folds[i + 1:]:
        training_rows.extend(fold)
    train = scipy.sparse.vstack(training_rows)
    test = scipy.sparse.vstack(folds[i])
    return train, test

In [224]:
# Hold out fold 9 (the last of the 10) as the test set; train on the other nine.
X_count_train, X_count_test = split_folds(folds, 9)

In [237]:
# Candidate topic count K for the held-out perplexity check below.
num_topics = 10

In [238]:
# Fit LDA on the training folds through the Vowpal Wabbit wrapper.
# NOTE(review): the vw binary path is machine-specific (Homebrew Cellar).
lda_model = LdaVowpalWabbit(
    '/usr/local/Cellar/vowpal-wabbit/8.1.1/bin/vw',
    # Sparse2Corpus reads documents from columns, hence the transpose.
    corpus=matutils.Sparse2Corpus(X_count_train.T),
    num_topics=num_topics,
    # Symmetric priors of 1/K for both distributions.
    alpha=(1/num_topics),
    eta=(1/num_topics),
    passes=50,
    id2word=dict(enumerate(vocab)),
)

In [239]:
# Score the training folds and the held-out fold with the fitted model.
train_log_prep_gensim = lda_model.log_perplexity(matutils.Sparse2Corpus(X_count_train.T))
test_log_prep_gensim = lda_model.log_perplexity(matutils.Sparse2Corpus(X_count_test.T))
scores = (train_log_prep_gensim, test_log_prep_gensim)
# Raw values first (train, test), then exp(-value) for each.
# NOTE(review): confirm the log base of log_perplexity before reading
# exp(-value) as a perplexity.
for score in scores:
    print(score)
for score in scores:
    print(np.exp(-score))

-8.258062
-8.605109
3858.60886565
5459.48094936


# Train on all data

In [29]:
# Topic count K for the model trained on all documents.
num_topics = 40

In [None]:
# Fit LDA on the full count matrix through the Vowpal Wabbit wrapper.
# NOTE(review): the vw binary path is machine-specific (Homebrew Cellar).
lda_model = LdaVowpalWabbit(
    '/usr/local/Cellar/vowpal-wabbit/8.1.1/bin/vw',
    # Sparse2Corpus reads documents from columns, hence the transpose.
    corpus=matutils.Sparse2Corpus(X_count.T),
    num_topics=num_topics,
    # Symmetric priors of 1/K for both distributions.
    alpha=(1/num_topics),
    eta=(1/num_topics),
    passes=100,
    id2word=dict(enumerate(vocab)),
)

In [116]:
# Save the trained model; the file name carries the topic count (e.g. results/lda_40).
lda_model.save('results/lda_' + str(num_topics))

In [38]:
# Reload the saved model for the current num_topics.
lda_model = LdaVowpalWabbit.load('results/lda_' + str(num_topics))

# LDA Topic Words

In [35]:
# Ten highest-weighted entries for every topic of the model.
topics_matrix = lda_model.show_topics(formatted=False, num_words=10, num_topics=lda_model.num_topics)
# Topics are numbered from 1; element [1] of each entry is stored as a string.
# NOTE(review): assumes each entry is a (weight, word) pair; confirm for the
# installed gensim version.
lda_topic_keywords = {
    topic_no: [str(entry[1]) for entry in topic]
    for topic_no, topic in enumerate(topics_matrix, start=1)
}

lda_topic_keywords.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40])

In [36]:
import csv

# Topics ordered by their first keyword, in reverse alphabetical order.
lda_s_k = sorted(lda_topic_keywords.items(), key=lambda x: x[1][0], reverse=True)

length = len(lda_s_k)
# One row per topic: position in the ordering, topic number, keyword list.
rows = [[position, topic_no, keywords]
        for position, (topic_no, keywords) in enumerate(lda_s_k, start=1)]

# NOTE(review): this is the same path the network-keyword cell writes to, so
# running both cells leaves only the LDA table on disk.
# newline='' is what the csv module requires of its file object (prevents blank
# lines on Windows); a single writer handles all rows.
with open('tmp/keywords.csv', 'w', newline='') as file:
    csv.writer(file, delimiter=';').writerows(rows)

# Topic Coherence Measures

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.parsing.preprocessing import STOPWORDS

In [None]:
def tokenize(text):
    """Split ``text`` into NLTK word tokens and return their Porter stems.

    This redefines the ``tokenize`` declared near the top of the notebook.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one stemmed string per token, in token order.
    """
    # Build the stemmer once per call instead of once per token.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]

In [None]:
# Stemmed token list for every document, in token_values order, with a running
# counter shown as progress.
texts = []
count = 0
for count, value in enumerate(token_values, start=1):
    texts.append(tokenize(value))
    clear_output(wait=True)
    print(count)

In [None]:
# Token <-> id mapping built from the stemmed documents.
dictionary = Dictionary(texts)
# Bag-of-words representation of each document under that mapping.
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
# Hand-entered example topics (five stemmed keywords each) to score for coherence.
topic_words = [['yorkshir', 'detent', 'applic', 'brain', 'ali'],
['year', 'govern', 'servic', 'plan', 'council'],
['year', 'time', 'photograph', 'day', 'thing']]

In [None]:
# UMass coherence model for the example topics, evaluated against the
# notebook's own bag-of-words corpus and dictionary.
cm = CoherenceModel(topics=topic_words, 
                    corpus=corpus, 
                    dictionary=dictionary, 
                    coherence='u_mass')

In [None]:
# Display the coherence score computed by the model.
cm.get_coherence()