In [1]:
# visualize the document clustering output using matplotlib and mpld3 (a matplotlib wrapper for D3.js

In [6]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [7]:
import pandas as pd
from urlparse import urlparse
import tldextract
from collections import Counter
import numpy as np

In [8]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [10]:
df = pd.read_csv('../data/merged/huizhou-v0_1.csv')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169086 entries, 0 to 169085
Data columns (total 6 columns):
url            169086 non-null object
title          169086 non-null object
summary        169083 non-null object
content        169086 non-null object
content_seg    169086 non-null object
s_vec          169079 non-null object
dtypes: object(6)
memory usage: 7.7+ MB


In [5]:
df = df.dropna(subset =['content'])

In [6]:
data_samples = df['content']

In [7]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')

Extracting tf-idf features for NMF...


In [8]:
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

done in 46.252s.


In [9]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')

Extracting tf features for LDA...


In [10]:
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

done in 47.535s.


In [11]:
# Fit the NMF model
print("Fitting the NMF model with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))


Fitting the NMF model with tf-idf features, n_samples=2000 and n_features=1000...
done in 30.355s.


In [12]:
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)



Topics in NMF model:
Topic #0:
data science central big sign analyticbridge comments dsc analytics hadoop visualization rss likes scientist need like tim add added webinar
Topic #1:
services solutions support management business software data security cloud products customer enterprise technology product information company partners customers marketing service
Topic #2:
verified account tweets twitter english close tweet likes remove photos query search users uk reply copy location videos language password
Topic #3:
analytics data thinkers social best management world business cloud workforce media policy predictive login followers intelligence crm following marketing infographics
Topic #4:
kdnuggets data learning machine mining news stories science jan deep analytics 2015 dec tags big vs scientist nov python 20
Topic #5:
students university student campus research faculty school programs program graduate college information science alumni career academic course undergraduate internat

In [17]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...
done in 176.333s.


In [18]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0:
data big analytics science predictive hadoop sign visualization learning scientist machine central interview blog business 2016 series need comments python
Topic #1:
time like new people just make work need way comments best post good use know year years number used different
Topic #2:
2015 learning kdnuggets machine data news 2014 mining 2013 2016 deep tags 10 jan 2012 stories 20 dec 15 nov
Topic #3:
com http www class field div https org content href image html wordpress img post images blog style website feeds
Topic #4:
use information data using code web text site search support free user software sql page file terms tools content available
Topic #5:
2016 twitter account search 2015 news new home english video view sign google close 12 verified tweet videos tweets 01
Topic #6:
analytics data business ibm management big blog marketing customer social solutions technology contact services media company content customers use products
Topic #7:
students 

In [None]:
#Multidimensional scaling

In [13]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

In [14]:
MDS()

MDS(dissimilarity='euclidean', eps=0.001, max_iter=300, metric=True,
  n_components=2, n_init=4, n_jobs=1, random_state=None, verbose=0)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf)
print
print

In [None]:
# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_components, n_samples)

xs, ys = pos[:, 0], pos[:, 1]
print()
print()

In [None]:
#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

#set up cluster names using a dict
cluster_names = {0: 'Big data', 
                 1: 'Posts', 
                 2: 'Machine Learning', 
                 3: 'Web content', 
                 4: 'Search', 
                 3: 'Web content', 
                 3: 'Web content', 

                 3: 'Web content', 

                 4: 'Killed, soldiers, captain'}