In [35]:
#Non-negative Matrix Factorization and Latent Dirichlet Allocation on a corpus of documents and extract additive 
#models of the topic structure of the corpus. The output is a list of topics, each represented as a list of terms 
#(weights are not shown).

In [1]:
import numpy as np
from collections import Counter

In [2]:
import pandas as pd
import json

In [8]:
import tldextract

In [4]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [6]:
#before classification
df = pd.read_csv('../data/raw/solr-dsi-domain_only-04-05-2017-cleaned.csv')

#after classificaiton
#df = pd.read_csv('../data/clean/Solr-dsi-final_v1.csv')

In [9]:
df['domain']  = df.url.map(lambda x:tldextract.extract(x).domain + str('.') + tldextract.extract(x).suffix)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292147 entries, 0 to 292146
Data columns (total 12 columns):
_version_    292147 non-null int64
cache        3956 non-null object
segment      292147 non-null int64
digest       292147 non-null object
tstamp       292147 non-null object
url          292147 non-null object
anchor       31946 non-null object
content      292147 non-null object
id           292147 non-null object
title        292147 non-null object
boost        292147 non-null float64
domain       292147 non-null object
dtypes: float64(1), int64(2), object(9)
memory usage: 26.7+ MB


In [11]:
df.head()

Unnamed: 0,_version_,cache,segment,digest,tstamp,url,anchor,content,id,title,boost,domain
0,1563673324559532032,,20150728142316,1f8e9667e32a5ddfdd57e5371916449f,2015-07-28T16:30:11.372Z,http://www.analyticbridge.com/xn/detail/200429...,"Big Data Salaries Top BI\, Data Warehousing","Big Data Salaries Top BI, Data Warehousing - A...",http://www.analyticbridge.com/group/salary-tre...,"Big Data Salaries Top BI, Data Warehousing - A...",0.000318,analyticbridge.com
1,1563673325253689344,,20160118174958,a46f9b548075790ba0ff9949432964dc,2016-01-18T20:39:53.457Z,http://www.analyticbridge.com/profile/WilliamK...,,William Kyniston's Page - AnalyticBridge Searc...,http://www.analyticbridge.com/profile/WilliamK...,William Kyniston's Page - AnalyticBridge,0.001015,analyticbridge.com
2,1563673325272563712,,20150728142316,7b130e6daf011f507b3a50c0df243a68,2015-07-28T14:27:57.694Z,http://www.analyticbridge.com/profile/krishnak...,,krishna kant bairagi's Page - AnalyticBridge S...,http://www.analyticbridge.com/profile/krishnak...,krishna kant bairagi's Page - AnalyticBridge,0.002638,analyticbridge.com
3,1563673326429143040,,20160101163526,c23f6957022ca7bf274bc6650f62dc49,2015-07-28T14:23:45.73Z,http://www.analyticsindiasummit.com/speaker/am...,,Amit Khanna | Cypher 2015 Presented by Analyti...,http://www.analyticsindiasummit.com/speaker/am...,Amit Khanna | Cypher 2015,0.003828,analyticsindiasummit.com
4,1563673326431240193,,20160118174958,8b5962c25007458e694657e4708d7a38,2016-01-18T17:50:17.154Z,http://www.analyticsindiasummit.com/speaker/an...,,Ankita Gupta | Cypher 2015 Presented by Analyt...,http://www.analyticsindiasummit.com/speaker/an...,Ankita Gupta | Cypher 2015,0.002474,analyticsindiasummit.com


In [12]:
df_title = df.dropna(subset =['title'])

In [13]:
df_title['domain'].nunique()

8256

In [14]:
count = Counter(df_title['domain'])

In [15]:
top_200 = count.most_common(200)

In [16]:
for t in top_200:
    print t[0], t[1]

kdnuggets.com 14167
twitter.com 12274
ibmbigdatahub.com 11875
wikipedia.org 9330
smartdatacollective.com 8209
datasciencecentral.com 7915
insidebigdata.com 4767
analyticbridge.com 3657
pixabay.com 3626
meetup.com 2778
iaria.org 1927
ibm.com 1860
r-project.org 1853
blogspot.com 1849
drdobbs.com 1819
gabormelli.com 1798
networkcomputing.com 1758
reuters.com 1625
apache.org 1588
techtarget.com 1465
okfn.org 1420
businessinsider.com 1292
aaai.org 1123
flowingdata.com 1123
typepad.com 1121
darkreading.com 1073
slideshare.net 1053
analytictalent.com 1028
predictiveanalyticsworld.com 966
wsj.com 913
oreilly.com 900
wiley.com 897
hbr.org 894
politico.com 893
vimeo.com 872
youtube.com 850
re-work.co 831
ventanaresearch.com 830
iit.edu 820
nbcnews.com 802
okstate.edu 796
datainnovation.org 795
google.com 792
experian.com 786
cnn.com 782
stanford.edu 775
cofc.edu 749
kaggle.com 730
columbia.edu 722
congress.gov 710
datareview.info 708
videolectures.net 707
comscore.com 703
forbes.com 684
techcrun

In [17]:
df_title.to_csv('tmp.csv')
df_title = pd.read_csv('tmp.csv')

In [18]:
df_title.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292147 entries, 0 to 292146
Data columns (total 13 columns):
Unnamed: 0    292147 non-null int64
_version_     292147 non-null int64
cache         3956 non-null object
segment       292147 non-null int64
digest        292147 non-null object
tstamp        292147 non-null object
url           292147 non-null object
anchor        31946 non-null object
content       292147 non-null object
id            292147 non-null object
title         292147 non-null object
boost         292147 non-null float64
domain        292147 non-null object
dtypes: float64(1), int64(3), object(9)
memory usage: 29.0+ MB


In [19]:
train_articles = df_title['title']

In [20]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

In [21]:
vectorizer = TfidfVectorizer(stop_words = 'english')
document_term_mat = vectorizer.fit_transform(train_articles)
words = vectorizer.get_feature_names()

In [22]:
print document_term_mat.shape

(292147, 95663)


In [23]:
nmf = NMF(n_components = n_topics)
nmf_W = nmf.fit_transform(document_term_mat)
nmf_H = nmf.components_

In [25]:
def print_top_words(model, feature_names, n_top_words, df):
    for topic_idx, topic in enumerate(model.components_):
        tmp = df[df['topic'] == topic_idx]
        print("Topic #%d:" % topic_idx, "total = %d" % len(tmp))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        #print()

In [26]:
def main_topic_per_doc(W):
    '''
    Output a list that holds the top topic in each document
    '''
    main_topic = []
    for doc_num, topic_row in enumerate(W):
        main_topic.append(topic_row.argsort()[::-1][0])
        #print("Article %d:" % doc_num, "Main Topic: %d" % topic_row.argsort()[::-1][0])
    return main_topic

In [27]:
def describe_nmf_results_W(W, n_top_topics = 10):
    '''
    Output the top topics in each document
    '''
    for doc_num, topic_row in enumerate(W[:10]): # print out the first 10 documents
        print("Document %d:" % doc_num)
        print topic_row.argsort()[::-1][:n_top_topics]
    return


In [28]:
def describe_results_H(H, n_top_words = 20):
    '''
    Output the top words in each topic
    '''
    for topic_num, topic in enumerate(H):
        print("Topic %d:" % topic_num)
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    return

In [29]:
def topic_group_get(W, given_topic):
    '''
    Output a list that holds the top topic in each document 
    '''
    main_topic = []
    for doc_num, topic_row in enumerate(W):
        if topic_row.argsort()[::-1][0] == given_topic:
            main_topic.append(doc_num)
            #print("Article %d:" % doc_num, "Main Topic: %d" % topic_row.argsort()[::-1][0])
    return main_topic

topic_group_get(nmf_W, 0)

In [30]:
df_title['topic'] = main_topic_per_doc(nmf_W)

In [31]:
print_top_words(nmf, words, n_top_words, df_title)

('Topic #0:', 'total = 19378')
ibm analytics hub big data blogs predictive customer insight watson management hadoop cognitive fraud things cloud healthcare financial internet real
('Topic #1:', 'total = 8096')
collective smartdata thinkers best posts world infographics articles cloud bi change things use internet ways future big hadoop customer marketing
('Topic #2:', 'total = 19108')
science central data page computer friends institute analyticbridge weekly digest analytics iot learning department 101 london master nyu python machine
('Topic #3:', 'total = 12780')
twitter tweets media help replies center developers https http blogs status ibm ads facebook issue david new previous chris policy
('Topic #4:', 'total = 14363')
wikipedia category template help software talk theory programming style manual file game wikiproject language computer portal management articles list research
('Topic #5:', 'total = 44836')
data mining big scientist conference open visualization center innovation 

In [32]:
describe_nmf_results_W(nmf_W)

Document 0:
[5 0 2 6 1 9 8 7 4 3]
Document 1:
[2 9 6 7 4 3 8 5 1 0]
Document 2:
[2 9 6 7 3 8 5 4 1 0]
Document 3:
[5 6 9 7 8 3 4 2 1 0]
Document 4:
[5 6 9 7 8 3 4 2 1 0]
Document 5:
[5 6 9 7 8 3 4 2 1 0]
Document 6:
[5 6 9 7 8 3 4 2 1 0]
Document 7:
[5 6 9 7 8 3 4 2 1 0]
Document 8:
[5 6 9 7 8 3 4 2 1 0]
Document 9:
[5 6 9 7 8 3 4 2 1 0]


In [33]:
#with normalized vector
normalized_vectorizer = TfidfVectorizer(stop_words = 'english', norm=None)
normalized_document_term_mat = normalized_vectorizer.fit_transform(train_articles)
words = normalized_vectorizer.get_feature_names()

In [34]:
nmf = NMF(n_components = n_topics)
nmf_W = nmf.fit_transform(normalized_document_term_mat)
nmf_H = nmf.components_

In [35]:
df_title['topic'] = main_topic_per_doc(nmf_W)

In [36]:
print_top_words(nmf, words, n_top_words, df_title)

('Topic #0:', 'total = 21864')
analytics ibm big hub data predictive customer blogs insight watson things cognitive hadoop world management fraud internet real cloud healthcare
('Topic #1:', 'total = 18675')
science central data page computer friends institute analyticbridge digest weekly nyu iot department python analytics master engineering iit applied 101
('Topic #2:', 'total = 9099')
collective smartdata posts infographics thinkers best world articles cloud bi change big intelligence ways use data things hadoop customer internet
('Topic #3:', 'total = 34396')
data mining big open visualization scientist blog social text census analysis conference center media innovation comments software knowledge insidebigdata management
('Topic #4:', 'total = 13287')
free pixabay image photo illustration online minecraft hunch gratis software images vector graphic download web footage foto woman leaf green
('Topic #5:', 'total = 81560')
business university school intelligence analytics college in