In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from os import listdir
from os.path import isfile, join
import numpy as np
import logging
from elasticsearch import Elasticsearch

In [2]:
es = Elasticsearch()
logging.getLogger("elasticsearch").setLevel(logging.ERROR)
es.indices.get_alias('*')

{'p2d_duc_index': {'aliases': {}},
 'duc_index': {'aliases': {}},
 'p2a_news_index': {'aliases': {}},
 '.kibana': {'aliases': {}},
 'p2b_duc_index': {'aliases': {}},
 'p2c_news_index': {'aliases': {}},
 'news_index': {'aliases': {}}}

In [3]:
news_dataset = fetch_20newsgroups(shuffle=True, random_state=1, data_home ='/Users/sarthak/Backup/Data',
                             remove=('headers', 'footers', 'quotes'))

In [4]:
mypath ='/Users/sarthak/Backup/Data/DUC2001/'
datafiles = [(mypath+f) for f in listdir(mypath) if isfile(join(mypath, f))]
datalist = []
for item in datafiles:
    f = open(item, 'r')
    filename = str.lower(item[item.rfind('/')+1:]) + '.txt'
    try:
        content = f.read()
        if content.find('[Text]') > -1:
            content = content[content.find('[Text]')+6:]
            content = content[:content.find('<')]
        else:
            content = content[content.find('<TEXT>')+6:]
            content = content[:content.find('<')]
        datalist.append(content)
    except Exception:
        pass
duc_data = np.array(datalist)

In [16]:
def print_top_words(model, feature_names, n_top_words, flag=0):
    topiclist = []
    wordlist =[]
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d:" % topic_idx
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            message += " (" + (feature_names[i])
            message += "-" + "{:.2f}".format(topic[i]) + ")"
        #print(message, "\n")
        topiclist.append(topic_idx)
        wordlist.append(message)
    #print("\n\n\n")
    return topiclist, wordlist

In [6]:
def push_to_elastic(topiclist, wordlist, index_name, k):
    for i in range(len(topiclist)):
        topic_id = topiclist[i]
        topic_id = str(k) + "-" + str(topic_id)
        top_words = wordlist[i]
        data_dict = {
                                "topic_id": topic_id,
                                "topic_words": top_words,
                                "no_of_topics" : k
                    }
        es.index(index=index_name, doc_type='_doc', body=data_dict, id=topic_id, op_type="create")
        es.indices.refresh(index=index_name)

In [17]:
def vectorize(dataset):
    tfidf = TfidfVectorizer(stop_words='english')
    nmf_dataset = tfidf.fit_transform(dataset)
    
    count = CountVectorizer(stop_words='english')
    lda_dataset = count.fit_transform(dataset)
    
    return nmf_dataset, tfidf, lda_dataset, count

def calc_LDA_NMF(nmf_dataset, tfidf, lda_dataset, count, n_topics, n_top_words=20,
                 max_df=0.95, min_df=2, stop_words='english', flag=0):
    #print('For', n_topics, 'topics:-')    
    nmf = NMF(n_components=n_topics, 
              random_state=1,
              alpha=.1, 
              l1_ratio=.5)
    nmf = nmf.fit(nmf_dataset)
    
    
    lda = LatentDirichletAllocation(n_components=n_topics, 
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda = lda.fit(lda_dataset)
    
    #print("NMF:")
    _1, _2 = print_top_words(nmf, tfidf.get_feature_names(), n_top_words, flag)
    del _1, _2
    
    #print("LDA:")
    topiclist, wordlist = print_top_words(lda, count.get_feature_names(), n_top_words, flag)
    return topiclist, wordlist
    

In [8]:
request_body = {
                'mappings': {
                    '_doc': {
                        'properties': {
                            'topic_id': {'type': 'text'},
                            'topic_words': {'type': 'text'},
                            'no_of_topics': {'type': 'text'} 
                            }}}
                }
if es.indices.exists_alias('p2a*'):
    print("deleting old 'p2a_news_index' index...")
    es.indices.delete('p2a*')
print("creating 'p2a_news_index' index...")
es.indices.create(index = 'p2a_news_index', body = request_body)

deleting old 'p2a_news_index' index...
creating 'p2a_news_index' index...


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'p2a_news_index'}

In [9]:
request_body = {
                'mappings': {
                    '_doc': {
                        'properties': {
                            'topic_id': {'type': 'text'},
                            'topic_words': {'type': 'text'},
                            'no_of_topics': {'type': 'text'}
                            }}}
                }

if es.indices.exists_alias('p2b*'):
    print("deleting old 'p2b_duc_index' index...")
    es.indices.delete('p2b*')
print("creating 'p2b_duc_index' index...")
es.indices.create(index = 'p2b_duc_index', body = request_body)

deleting old 'p2b_duc_index' index...
creating 'p2b_duc_index' index...


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'p2b_duc_index'}

# 20 NG

In [10]:
nmf_dataset, tfidf, lda_dataset, count= vectorize(news_dataset.data)
topiclist, wordlist = calc_LDA_NMF(nmf_dataset, tfidf, lda_dataset, count, 10)
push_to_elastic(topiclist, wordlist, 'p2a_news_index', 10)

For 10 topics:-
NMF:
Topic #0: (people-1.04) (don-1.03) (just-0.98) (like-0.84) (think-0.83) (know-0.68) (time-0.60) (good-0.60) (right-0.51) (ve-0.50) (make-0.48) (say-0.48) (want-0.46) (did-0.46) (really-0.45) (way-0.44) (new-0.40) (use-0.40) (going-0.39) (said-0.38) 

Topic #1: (windows-2.15) (file-1.03) (dos-1.00) (files-0.77) (window-0.67) (program-0.56) (use-0.45) (running-0.40) (ms-0.38) (version-0.38) (using-0.36) (problem-0.32) (server-0.32) (pc-0.30) (screen-0.29) (ftp-0.29) (run-0.29) (os-0.28) (application-0.27) (software-0.27) 

Topic #2: (god-2.57) (jesus-1.03) (bible-0.58) (christ-0.46) (faith-0.46) (believe-0.41) (christians-0.40) (christian-0.39) (heaven-0.27) (hell-0.25) (sin-0.25) (life-0.24) (truth-0.24) (church-0.23) (lord-0.23) (belief-0.22) (say-0.22) (does-0.21) (eternal-0.20) (existence-0.20) 

Topic #3: (geb-0.75) (dsl-0.74) (chastity-0.74) (n3jxp-0.74) (cadre-0.74) (shameful-0.74) (pitt-0.74) (intellect-0.74) (skepticism-0.73) (surrender-0.72) (gordon-0.71) (

In [11]:
topiclist, wordlist = calc_LDA_NMF(nmf_dataset, tfidf, lda_dataset, count, 20)
push_to_elastic(topiclist, wordlist, 'p2a_news_index', 20)

For 20 topics:-
NMF:
Topic #0: (don-1.08) (just-1.06) (like-0.92) (think-0.84) (time-0.66) (good-0.65) (ve-0.57) (know-0.56) (make-0.50) (want-0.50) (say-0.49) (really-0.48) (right-0.48) (way-0.47) (did-0.47) (use-0.46) (new-0.42) (going-0.41) (ll-0.40) (things-0.38) 

Topic #1: (windows-2.83) (dos-1.29) (ms-0.48) (running-0.38) (os-0.35) (version-0.35) (microsoft-0.33) (nt-0.29) (using-0.26) (mouse-0.25) (drivers-0.24) (driver-0.23) (run-0.23) (use-0.22) (pc-0.20) (software-0.20) (memory-0.19) (problem-0.19) (screen-0.19) (printer-0.19) 

Topic #2: (god-2.57) (jesus-1.04) (bible-0.58) (christ-0.46) (faith-0.46) (believe-0.41) (christians-0.40) (christian-0.39) (heaven-0.27) (hell-0.25) (sin-0.25) (truth-0.24) (church-0.24) (life-0.24) (lord-0.23) (say-0.22) (belief-0.22) (eternal-0.20) (existence-0.20) (man-0.19) 

Topic #3: (geb-0.75) (dsl-0.75) (chastity-0.75) (n3jxp-0.75) (cadre-0.75) (shameful-0.74) (pitt-0.74) (intellect-0.74) (skepticism-0.74) (surrender-0.73) (gordon-0.72) (ban

In [12]:
topiclist, wordlist = calc_LDA_NMF(nmf_dataset, tfidf, lda_dataset, count, 50)
push_to_elastic(topiclist, wordlist, 'p2a_news_index', 50)

For 50 topics:-
NMF:
Topic #0: (don-1.00) (think-0.96) (like-0.92) (time-0.71) (say-0.53) (make-0.51) (did-0.51) (way-0.50) (really-0.49) (want-0.48) (going-0.43) (ll-0.41) (said-0.41) (things-0.40) (new-0.39) (sure-0.37) (believe-0.36) (thing-0.36) (point-0.36) (didn-0.36) 

Topic #1: (windows-3.20) (ms-0.37) (nt-0.34) (microsoft-0.32) (running-0.29) (os-0.27) (run-0.20) (program-0.20) (drivers-0.19) (driver-0.19) (memory-0.18) (comp-0.17) (using-0.16) (version-0.16) (printer-0.16) (font-0.15) (files-0.14) (ini-0.14) (pc-0.13) (graphics-0.12) 

Topic #2: (god-3.05) (believe-0.42) (bible-0.42) (faith-0.37) (truth-0.26) (existence-0.26) (hell-0.25) (belief-0.25) (heaven-0.23) (christians-0.22) (christ-0.21) (atheism-0.20) (lord-0.20) (life-0.19) (atheists-0.18) (exist-0.18) (eternal-0.17) (christian-0.17) (satan-0.17) (say-0.16) 

Topic #3: (geb-0.76) (dsl-0.75) (n3jxp-0.75) (chastity-0.75) (cadre-0.75) (shameful-0.75) (pitt-0.74) (intellect-0.74) (skepticism-0.74) (surrender-0.73) (gor

Topic #0: (db-504.07) (cx-388.61) (w7-320.07) (chz-285.59) (6um-264.63) (lk-244.53) (ah-233.27) (uw-232.04) (mv-213.29) (t7-210.22) (bh-187.70) (ck-182.45) (hz-181.00) (34u-172.49) (w1-169.28) (c8v-161.42) (c_-154.64) (7u-152.78) (17-152.13) (pl-147.48) 

Topic #1: (op_cols-29.90) (scsiha-19.44) (iici-14.15) (repaired-11.65) (cones-11.58) (disabling-11.06) (refund-10.30) (hernia-9.96) (decreasing-9.21) (prozac-8.58) (ac-8.18) (tpa-7.00) (huji-6.95) (corelscsi-6.79) (xterms-6.45) (vms-6.29) (potent-6.28) (compass6-6.22) (compass7-6.22) (tomography-6.13) 

Topic #2: (de7-25.76) (uccxkvb-21.73) (qumran-11.66) (u5x-7.34) (holler-7.21) (thedm-6.97) (matchups-6.42) (probabilistic-6.31) (compass2-6.22) (pnmtops-4.48) (bjz-4.41) (tbh1-4.22) (otago-3.79) (_ww-3.73) (duplicated-3.57) (lucy-3.48) (yorker-3.16) (2fvpmantel-3.12) (rocker-3.06) (zbx05x-3.02) 

Topic #3: (enviroleague-19.33) (rcs-11.15) (00100010b-11.04) (madonna-10.95) (mov-9.07) (wingert-8.20) (wraps-7.94) (immaculate-7.48) (whoah-

Topic #44: (falklands-29.91) (anania-21.27) (festival-16.97) (shirak-16.53) (argentine-13.55) (muslimzade-13.19) (orbeli-11.92) (chkdsk-10.78) (carcinogenic-10.57) (coating-9.97) (flavoring-7.90) (kin-7.72) (meade-7.39) (hay-7.37) (lick-7.34) (winmisc-7.17) (tyukhik-7.16) (monk-7.13) (introns-7.00) (v386-6.80) 

Topic #45: (argumentum-25.54) (oxalic-17.88) (decoding-16.83) (locutus-16.15) (lankford-12.83) (rib-10.64) (whiten-10.04) (torre-9.36) (gilkey-8.84) (bede-8.79) (tcpview-7.73) (koc-7.42) (paseo-6.96) (alicea-6.72) (scian-5.68) (xmol-5.68) (stratavision-5.56) (welty-5.41) (1gig-5.28) (nwo-5.06) 

Topic #46: (10-769.13) (15-609.77) (16-576.60) (12-573.61) (20-534.34) (14-523.91) (18-461.76) (13-458.04) (17-452.07) (30-383.97) (21-383.35) (24-362.16) (26-357.20) (23-356.32) (55-355.72) (22-321.11) (27-311.94) (la-299.83) (28-290.51) (vs-281.18) 

Topic #47: (people-3359.84) (don-2953.67) (just-2682.57) (like-2551.14) (think-2471.25) (know-2193.11) (time-2178.30) (new-1677.55) (goo

# DUC Data

In [13]:
nmf_dataset, tfidf, lda_dataset, count= vectorize(duc_data)
topiclist, wordlist = calc_LDA_NMF(nmf_dataset, tfidf, lda_dataset, count, 10)
push_to_elastic(topiclist, wordlist, 'p2b_duc_index', 10)

For 10 topics:-
NMF:
Topic #0: (police-0.83) (gates-0.36) (officers-0.34) (commission-0.23) (said-0.23) (brutality-0.23) (department-0.18) (angeles-0.18) (los-0.18) (chief-0.16) (mr-0.13) (city-0.13) (officer-0.11) (report-0.11) (complaints-0.10) (black-0.10) (force-0.10) (racism-0.08) (mayor-0.08) (king-0.08) 

Topic #1: (hurricane-0.91) (hurricanes-0.32) (sheets-0.28) (said-0.22) (storms-0.21) (mph-0.20) (storm-0.20) (atlantic-0.18) (winds-0.18) (florida-0.17) (gilbert-0.16) (tropical-0.15) (hugo-0.14) (season-0.14) (gray-0.13) (center-0.12) (forecasters-0.12) (miami-0.10) (damage-0.09) (weather-0.08) 

Topic #2: (forest-0.56) (fires-0.36) (acres-0.34) (said-0.28) (firefighters-0.24) (park-0.20) (national-0.16) (service-0.14) (blaze-0.12) (yellowstone-0.12) (rain-0.12) (burned-0.11) (contained-0.10) (wilderness-0.10) (monday-0.10) (lightning-0.09) (california-0.08) (burn-0.08) (area-0.08) (wyoming-0.07) 

Topic #3: (diamond-0.65) (beers-0.53) (diamonds-0.45) (cso-0.19) (market-0.15) 

In [14]:
topiclist, wordlist = calc_LDA_NMF(nmf_dataset, tfidf, lda_dataset, count, 20)
push_to_elastic(topiclist, wordlist, 'p2b_duc_index', 20)

For 20 topics:-
NMF:
Topic #0: (thomas-0.81) (court-0.27) (clarence-0.16) (black-0.14) (rights-0.14) (supreme-0.14) (senate-0.13) (nomination-0.12) (law-0.11) (civil-0.10) (judge-0.09) (affirmative-0.09) (said-0.08) (confirmation-0.08) (sen-0.08) (box-0.08) (abortion-0.08) (yale-0.08) (nominee-0.07) (danforth-0.07) 

Topic #1: (hurricane-0.91) (hurricanes-0.32) (sheets-0.28) (said-0.21) (storms-0.21) (mph-0.20) (storm-0.20) (atlantic-0.18) (winds-0.18) (florida-0.17) (gilbert-0.16) (tropical-0.15) (hugo-0.14) (season-0.14) (gray-0.13) (center-0.12) (forecasters-0.12) (miami-0.10) (damage-0.09) (weather-0.08) 

Topic #2: (forest-0.56) (fires-0.36) (acres-0.34) (said-0.26) (firefighters-0.24) (park-0.21) (national-0.16) (service-0.14) (blaze-0.12) (yellowstone-0.12) (rain-0.12) (burned-0.11) (contained-0.10) (wilderness-0.10) (monday-0.09) (lightning-0.09) (burn-0.08) (california-0.08) (wyoming-0.07) (area-0.07) 

Topic #3: (diamond-0.65) (beers-0.53) (diamonds-0.46) (cso-0.19) (market-0

In [15]:
topiclist, wordlist = calc_LDA_NMF(nmf_dataset, tfidf, lda_dataset, count, 50)
push_to_elastic(topiclist, wordlist, 'p2b_duc_index', 50)

For 50 topics:-
NMF:
Topic #0: (zucker-0.00) (entail-0.00) (entire-0.00) (enticing-0.00) (enthusiasts-0.00) (enthusiastic-0.00) (enthusiasm-0.00) (entertainer-0.00) (enterprises-0.00) (enterprise-0.00) (entering-0.00) (entered-0.00) (enter-0.00) (entails-0.00) (ensuring-0.00) (encouraged-0.00) (ensures-0.00) (ensured-0.00) (ensure-0.00) (ensuing-0.00) 

Topic #1: (hurricane-0.85) (sheets-0.35) (mph-0.22) (hurricanes-0.21) (gilbert-0.20) (said-0.18) (winds-0.18) (storm-0.15) (tropical-0.15) (storms-0.15) (atlantic-0.14) (center-0.13) (hugo-0.13) (forecasters-0.12) (season-0.11) (category-0.10) (florida-0.09) (pressure-0.08) (miami-0.08) (caribbean-0.07) 

Topic #2: (forest-0.54) (fires-0.35) (acres-0.34) (said-0.25) (firefighters-0.22) (park-0.21) (national-0.16) (service-0.13) (blaze-0.13) (yellowstone-0.12) (rain-0.12) (burned-0.11) (contained-0.10) (wilderness-0.10) (monday-0.09) (lightning-0.09) (burn-0.08) (wyoming-0.07) (california-0.07) (acre-0.07) 

Topic #3: (diamond-0.65) (bee

In [19]:
request_body = {
                'mappings': {
                    '_doc': {
                        'properties': {
                            'doc_id': {'type': 'text'},
                            'topic_words': {'type': 'text'},
                            }}}
                }
if es.indices.exists_alias('p2c*'):
    print("deleting old 'p2c_news_index' index...")
    es.indices.delete('p2c*')
print("creating 'p2c_news_index' index...")
es.indices.create(index = 'p2c_news_index', body = request_body)
i = 0
for doc in news_dataset.data:
    if i%1000==0:
        print(i)
    try:
        nmf_dataset, tfidf, lda_dataset, count= vectorize([doc])
        topiclist, wordlist = calc_LDA_NMF(nmf_dataset, tfidf, lda_dataset, count, 1, n_top_words=5, flag=1)
        doc_id = news_dataset.filenames[i][39:]
        data_dict = {
                                "doc_id": doc,
                                "topic_words": wordlist[0][10:]
                    }
        es.index(index='p2c_news_index', doc_type='_doc', body=data_dict, id=doc_id, op_type="create")
        es.indices.refresh(index='p2c_news_index')
    except Exception:
        pass
    i+=1

deleting old 'p2c_news_index' index...
creating 'p2c_news_index' index...
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000


In [18]:
request_body = {
                'mappings': {
                    '_doc': {
                        'properties': {
                            'doc_id': {'type': 'text'},
                            'topic_words': {'type': 'text'},
                            }}}
                }
if es.indices.exists_alias('p2d*'):
    print("deleting old 'p2d_duc_index' index...")
    es.indices.delete('p2d*')
print("creating 'p2d_duc_index' index...")
es.indices.create(index = 'p2d_duc_index', body = request_body)
i = 0
for doc in duc_data:
    if i%100==0:
        print(i)
        print(item)
    try:
        nmf_dataset, tfidf, lda_dataset, count= vectorize([doc])
        topiclist, wordlist = calc_LDA_NMF(nmf_dataset, tfidf, lda_dataset, count, 1, n_top_words=5, flag=1)
        item = datafiles[i]
        doc_id = str.lower(item[item.rfind('/')+1:]) + '.txt'
        data_dict = {
                                "doc_id": doc_id,
                                "topic_words": wordlist[0][10:]
                    }
        es.index(index='p2d_duc_index', doc_type='_doc', body=data_dict, id=doc_id, op_type="create")
        es.indices.refresh(index='p2d_duc_index')
    except Exception:
        pass
    i+=1

deleting old 'p2d_duc_index' index...
creating 'p2d_duc_index' index...
0
/Users/sarthak/Backup/Data/DUC2001/FT931-11394
100
/Users/sarthak/Backup/Data/DUC2001/FBIS3-11919
200
/Users/sarthak/Backup/Data/DUC2001/FT931-341
300
/Users/sarthak/Backup/Data/DUC2001/AP901231-0012
