In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import igraph as ig
import lda
import os
import pdb
import nltk
import string
import json
import random

from IPython.display import clear_output
from os import makedirs
from os.path import join, exists

In [2]:
INPUT_DIR = join('tmp', 'parsed')
OUTPUT_DIR = join('tmp')
makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
from nltk.stem.porter import PorterStemmer

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

In [4]:
print("Building vocabulary...")

tokens = []
docs = {}
for fname in os.listdir(INPUT_DIR):
    if fname != '.DS_Store':
        with open(join(INPUT_DIR, fname), 'r') as f:
            doc = json.load(f)
            docs[doc['id']] = doc
            text = doc['body']
            table = str.maketrans({key: None for key in string.punctuation})
            tokens.append((doc['id'], text.lower().translate(table)))

token_keys = [x[0] for x in tokens]
token_values = [x[1] for x in tokens]
print(len(tokens))

Building vocabulary...
9086


# Build the graph from scratch

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

print("Processing TF-IDF")

vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
X = vectorizer.fit_transform(token_values)

print(X.shape)

Processing TF-IDF
(9086, 141273)


In [6]:
g = ig.Graph()
for i, key in enumerate(token_keys):
    g.add_vertices(key)
    
print(g.vcount())

9086


In [10]:
g.delete_edges(g.es)
edges = []
for i, key in enumerate(token_keys):
    print('{0}\r'.format(i/len(token_keys)))
    clear_output(wait=True)
    for i_2, key_2 in enumerate(token_keys):
        edges.append((key, key_2))
            
g.add_edges(edges)  
edges = [] 
print(g.ecount())

82555396


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

count = 0
weights = []
for i, key in enumerate(token_keys):
    similarities = cosine_similarity(X[i], X)[0]
    print('{0}\r'.format(i/len(token_keys)))
    clear_output(wait=True)
    for i_2, key_2 in enumerate(token_keys):            
        g.es[count]["weight"] = similarities[i_2]
        count += 1

print(g.ecount())

82555396


Remove duplicated and self-loop edges

In [13]:
g.simplify(multiple=True, loops=True, combine_edges="max")
print(g.ecount())

41273155


In [14]:
g.es[1]['weight']

0.027366185195034684

In [15]:
g.write_gml('tmp/igraph.gml')

In [65]:
g = ig.Graph.Read_GML('tmp/igraph.gml')

FileNotFoundError: [Errno 2] No such file or directory: 'tmp/igraph_01.gml'

In [16]:
g.es[1].tuple

(0, 2)

In [17]:
g.vs[0]

igraph.Vertex(<igraph.Graph object at 0x10e1f0d68>, 0, {'name': '2016-05-01-1'})

In [73]:
sub_g = g.es.select(weight_ge=0).subgraph()

In [75]:
print("Nodes {}".format(sub_g.vcount()))
print("Edges {}".format(sub_g.ecount()))
# print("Diameter {}".format(sub_g.diameter()))
# print("LCC {}".format(sub_g.clusters().giant().vcount()))
# print("APL {}".format(sub_g.average_path_length()))
print("AD {}".format(sum(sub_g.degree())/len(sub_g.degree())))

Nodes 9086
Edges 41273155
AD 9085.0


In [26]:
sub_g.es()[0].tuple

(0, 1734)

In [23]:
print(sub_g.vs[1])
print(sub_g.vs[3])

igraph.Vertex(<igraph.Graph object at 0x10e1f0c78>, 0, {'name': '2016-05-01-122'})
igraph.Vertex(<igraph.Graph object at 0x10e1f0c78>, 3, {'name': '2016-05-01-81'})


In [39]:
sub_g.write_gml('tmp/subgraph_01.gml')

# Community Detection

In [21]:
g = ig.Graph.Read_GML('tmp/subgraph_01.gml')

In [22]:
sg = g.es.select(weight_ge=0.2).subgraph()

In [23]:
# community = sg.community_infomap(edge_weights='weight')
community = sg.community_multilevel(weights='weight')
# community = sg.community_leading_eigenvector(weights='weight')

In [25]:
print(len(community.giant().vs()))
print(max(community.membership))

612
226


In [None]:
for i in range(max(community.membership)):
    for v in community.subgraph(i).vs():
        print("Topic:{} File:{} Title:{}".format(i, v['name'], docs[v['name']]['webTitle']))

# LDA

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=tokenize, stop_words='english')
X = vectorizer.fit_transform(token_values)

print(X.shape)

(9086, 141273)


In [None]:
from gensim import matutils
from gensim.models.ldamodel import LdaModel
import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO 

vocab = vectorizer.get_feature_names()
%time model = LdaModel(matutils.Sparse2Corpus(X), num_topics=226, passes=50, id2word=dict([(i, s) for i, s in enumerate(vocab)]))

In [18]:
random.seed(1234)

import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO 

model = lda.LDA(n_topics=226, n_iter=500, random_state=1)
model.fit(X) 

<lda.lda.LDA at 0x10ff71e10>

In [19]:
model.loglikelihoods_

[-62315246.82184576,
 -41876225.91660596,
 -39445246.49574409,
 -38541915.86386203,
 -38055713.18078848,
 -37751968.54957015,
 -37539842.882761106,
 -37384775.09410583,
 -37267310.24872986,
 -37177068.978081815,
 -37106785.53718555,
 -37038307.50484783,
 -36999428.71756243,
 -36952980.10411127,
 -36913235.96321809,
 -36881035.44178128,
 -36850973.83775497,
 -36829097.432930805,
 -36802845.03211314,
 -36784860.64174221,
 -36769968.29460513,
 -36747377.910839446,
 -36739656.531647734,
 -36723341.92872516,
 -36709627.093720585,
 -36693643.63736573,
 -36684448.38815882,
 -36675038.16955687,
 -36666628.57038627,
 -36656209.678821094,
 -36649861.68358176,
 -36645132.77845753,
 -36629221.07325693,
 -36631857.58642031,
 -36618503.877287865,
 -36605720.26917494,
 -36605773.08857173,
 -36605324.48081115,
 -36587709.57708163,
 -36584063.27008106,
 -36575889.01301607,
 -36572751.49003264,
 -36566194.53408205,
 -36563200.21564513,
 -36561874.521596454,
 -36547381.94235644,
 -36546164.56168147,
 -36

In [20]:
import pickle

pickle.dump(model, open('tmp/lda.pickle', 'wb'))

In [21]:
model2 = pickle.load(open('tmp/lda.pickle', 'rb'))

In [18]:
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))

type(doc_topic): <class 'numpy.ndarray'>
shape: (9086, 226)


In [None]:
for n in range(len(doc_topic)):
    topic_most_pr = doc_topic[n].argmax()
    docs[token_keys[n]]['topic_most_pr'] = topic_most_pr

sorted_items = sorted(docs.items(), key=lambda x: x[1]['topic_most_pr'])

for item in sorted_items:
    print("Topic:{} File:{} Title:{}...".format(item[1]['topic_most_pr'], item[0], item[1]['webTitle'][:100]))

# Compare two similar groups from two approaches