In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from os import listdir
from os.path import isfile, join
import numpy as np
import logging
import math
from elasticsearch import Elasticsearch

In [2]:
# Create the Elasticsearch client used by the export functions below.
# No arguments are passed, so the client's default connection settings apply.
es = Elasticsearch()
# Only show ERROR-and-above records from the "elasticsearch" logger.
logging.getLogger("elasticsearch").setLevel(logging.ERROR)
# Last expression of the cell: displays the alias listing for every index.
es.indices.get_alias('*')

{'.kibana': {'aliases': {}},
 'news_index': {'aliases': {}},
 'p2c_news_index': {'aliases': {}},
 'p2b_duc_index': {'aliases': {}},
 'p2a_news_index': {'aliases': {}},
 'duc_index': {'aliases': {}},
 'p2d_duc_index': {'aliases': {}}}

In [3]:
# Load the 20 Newsgroups corpus with headers, footers and quoted replies removed.
# shuffle=True with random_state=1 fixes the document order.
# NOTE(review): data_home is an absolute, machine-specific path -- consider a configurable DATA_DIR.
news_dataset = fetch_20newsgroups(shuffle=True, random_state=1, data_home ='/Users/sarthak/Backup/Data',
                             remove=('headers', 'footers', 'quotes'))

In [47]:
# Locations of the DUC 2001 documents and of their gold summaries.
# NOTE(review): absolute, machine-specific paths -- consider a configurable DATA_DIR.
mypath ='/Users/sarthak/Backup/Data/DUC2001/'
gpath = '/Users/sarthak/Backup/Data/DUC2001/Summaries/'


def _extract_duc_text(raw):
    """Return the article body from the raw contents of one DUC file.

    The body starts right after the first '[Text]' marker (or, when that is
    absent, the first '<TEXT>' marker) and ends just before the next '<'.
    A missing start marker keeps the text from the beginning, and a missing
    '<' keeps the text to the end, instead of silently chopping characters
    off because ``str.find`` returned -1.
    """
    start = raw.find('[Text]')
    if start == -1:
        start = raw.find('<TEXT>')
    # Both markers are six characters long.
    body = raw[start + 6:] if start > -1 else raw
    end = body.find('<')
    return body[:end] if end > -1 else body


def _extract_gold_summary(raw):
    """Strip everything up to and including the 'Abstract:' / 'Introduction:' labels."""
    pos = raw.find('Abstract:')
    if pos > -1:
        # +10 skips the 9-character label plus the single character that follows it.
        raw = raw[pos + 10:]
    pos = raw.find('Introduction:')
    if pos > -1:
        raw = raw[pos + 13:]
    return raw


# listdir() returns names in arbitrary order; sort so the document order
# (and therefore every index-aligned list below) is reproducible.
docnames = sorted(f for f in listdir(mypath) if isfile(join(mypath, f)))
datafiles = [join(mypath, f) for f in docnames]
datalist = []
summarylist = []
flist = []
for docname, item in zip(docnames, datafiles):
    # Gold summaries are stored under the lower-cased document name + '.txt'.
    filename = str.lower(docname) + '.txt'
    try:
        with open(item, 'r') as f:
            content = _extract_duc_text(f.read())
        summary_path = join(gpath, filename)
        if isfile(summary_path):
            with open(summary_path, 'r') as gf:
                gcontent = _extract_gold_summary(gf.read())
        else:
            gcontent = ''
    except (OSError, UnicodeError) as err:
        # Best effort: skip unreadable files, but say so instead of hiding it.
        print('Skipping', item, '-', err)
        continue
    # Append to all three lists together so they can never get out of step:
    # datalist[i], summarylist[i] and flist[i] always describe the same file.
    datalist.append(content)
    summarylist.append(gcontent)
    flist.append(filename)
duc_data = np.array(datalist)
duc_sdata = np.array(summarylist)

In [5]:
def kl_divergence(summary_freq, doc_freq):
    """Score how far a candidate summary's word weights are from the document's.

    For every word of ``summary_freq`` that also has a non-zero weight in
    ``doc_freq`` this adds ``doc * log(doc / summary)``. The weights are used
    as given (no normalisation), and words missing from either mapping
    contribute nothing.

    Parameters
    ----------
    summary_freq : dict
        Word -> weight for the candidate summary.
    doc_freq : dict
        Word -> weight for the full document.

    Returns
    -------
    number
        The accumulated score; ``0`` when no word contributes.
    """
    total = 0
    for word, summary_weight in summary_freq.items():
        doc_weight = doc_freq.get(word)
        if not doc_weight:
            # Word absent from the document (or weight 0): no contribution.
            continue
        total += doc_weight * math.log(doc_weight / summary_weight)
    return total

In [100]:
def init(doc):
    """Fit a bag-of-words model and a one-topic LDA model on a single document.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    count : CountVectorizer
        Vectorizer (English stop words removed) fitted on ``doc``.
    lda : LatentDirichletAllocation
        One-component model fitted on the document's count vector.
    kl_word_dict : dict
        Vocabulary word -> number of occurrences in ``doc``.
    lda_word_dict : dict
        Vocabulary word -> weight of that word in the single LDA component.

    Notes
    -----
    Exceptions raised by the vectorizer or by LDA propagate to the caller;
    ``main_function`` treats ``ValueError`` as "could not parse".
    """
    count = CountVectorizer(stop_words='english')
    indexes = count.fit_transform([doc]).toarray().ravel()
    lda = LatentDirichletAllocation(n_components=1,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit([indexes])
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer get_feature_names_out() and fall back for older installations.
    get_names = getattr(count, 'get_feature_names_out', None) or count.get_feature_names
    words = get_names()
    kl_word_dict = dict(zip(words, indexes))
    lda_word_dict = dict(zip(words, lda.components_.ravel()))
    return count, lda, kl_word_dict, lda_word_dict

def _feature_names(vectorizer):
    """Return the fitted vocabulary of ``vectorizer`` in column order.

    scikit-learn 1.0 introduced ``get_feature_names_out`` and 1.2 removed
    ``get_feature_names``; use the new name when it exists, else the old one.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return getter()


def _greedy_summary(doc, count, k, divergence_of):
    """Greedily pick the sentences that keep the summary closest to the document.

    ``doc`` is split on '.'; in each round every remaining sentence is tried
    as the next addition, the extended summary is re-vectorised with ``count``
    and scored by ``divergence_of(words, wordcount)``. The lowest-scoring
    sentence is appended (followed by a space) and removed from the pool.

    The loop stops after the round in which the 0-based round index equals
    ``k``, so at most ``k + 1`` sentences are selected.
    """
    summary = ""
    sentance_list = doc.split('.')
    for i in range(len(sentance_list)):
        # Keep each score paired with the position of the sentence it belongs
        # to: sentences that cannot be vectorised are skipped, so the score
        # list may be shorter than sentance_list and positions would otherwise
        # point at the wrong sentence.
        candidate_idx = []
        divergences = []
        for idx, sentance in enumerate(sentance_list):
            current = summary + sentance
            try:
                wordcount = count.fit_transform([current]).toarray().ravel()
            except ValueError:
                # The vectorizer rejected this text (typically: no vocabulary
                # left after stop-word removal) -- not a usable candidate.
                continue
            candidate_idx.append(idx)
            divergences.append(divergence_of(_feature_names(count), wordcount))
        if not divergences:
            # Nothing left that can be scored; return what we have.
            break
        small = candidate_idx[np.array(divergences).argsort()[0]]
        summary += sentance_list.pop(small) + " "
        if i == k:
            break
    return summary


def kl_summary_calc(doc, count, kl_word_dict, k):
    """Build an extractive summary of ``doc`` scored on raw word counts.

    Parameters
    ----------
    doc : str
        Document text; sentences are obtained by splitting on '.'.
    count : CountVectorizer
        Vectorizer to use; it is re-fitted on every candidate summary.
    kl_word_dict : dict
        Word -> count for the whole document (as returned by ``init``).
    k : int
        Round index after which selection stops (at most ``k + 1`` sentences).

    Returns
    -------
    str
        The selected sentences, each followed by a single space.
    """
    def divergence_of(words, wordcount):
        return kl_divergence(dict(zip(words, wordcount)), kl_word_dict)

    return _greedy_summary(doc, count, k, divergence_of)


def lda_summary_calc(doc, count, lda, kl_word_dict, lda_word_dict, k):
    """Build an extractive summary of ``doc`` scored on LDA topic-word weights.

    Works like ``kl_summary_calc`` but, for every candidate summary, re-fits
    ``lda`` on the candidate's count vector and compares the resulting
    component weights with ``lda_word_dict``.

    Parameters
    ----------
    doc : str
        Document text; sentences are obtained by splitting on '.'.
    count : CountVectorizer
        Vectorizer to use; it is re-fitted on every candidate summary.
    lda : LatentDirichletAllocation
        Model to use; it is re-fitted on every candidate summary.
    kl_word_dict : dict
        Unused; kept so existing callers keep working.
    lda_word_dict : dict
        Word -> LDA component weight for the whole document (from ``init``).
    k : int
        Round index after which selection stops (at most ``k + 1`` sentences).

    Returns
    -------
    str
        The selected sentences, each followed by a single space.
    """
    def divergence_of(words, wordcount):
        lda.fit([wordcount])
        topic_weights = lda.components_.ravel()
        return kl_divergence(dict(zip(words, topic_weights)), lda_word_dict)

    return _greedy_summary(doc, count, k, divergence_of)

def main_function(dataset, k=5):
    """Summarise every document of ``dataset`` with both methods.

    Parameters
    ----------
    dataset : iterable of str
        The documents to summarise.
    k : int, optional
        Passed through to ``kl_summary_calc`` and ``lda_summary_calc``.

    Returns
    -------
    (list of str, list of str)
        KL summaries and LDA summaries, index-aligned with ``dataset``.
        A document that raises ``ValueError`` gets an empty string in both
        lists and a 'Could not parse' message is printed.
    """
    kl_summaries, lda_summaries = [], []
    for doc_no, doc in enumerate(dataset, start=1):
        # Progress marker every 1000 documents (1-based).
        if doc_no % 1000 == 0:
            print(doc_no)
        try:
            count, lda, kl_word_dict, lda_word_dict = init(doc)
            kl_text = kl_summary_calc(doc, count, kl_word_dict, k)
            lda_text = lda_summary_calc(doc, count, lda, kl_word_dict, lda_word_dict, k)
        except ValueError:
            # Keep the output lists aligned with the input by storing blanks.
            kl_text = lda_text = ""
            print('Could not parse document no - ', doc_no)
        kl_summaries.append(kl_text)
        lda_summaries.append(lda_text)
    return kl_summaries, lda_summaries

In [94]:
def export_es_duc(duc_data, duc_sdata, filenames, klist, ldalist):
    """Recreate the 'p3a_duc_index' index and load the DUC documents into it.

    Uses the module-level Elasticsearch client ``es``. Any existing index
    matching 'p3a*' is deleted first, then one document is created per entry
    of ``duc_data`` with the file name as its id.

    Parameters
    ----------
    duc_data : sequence of str
        Document texts; its length decides how many documents are indexed.
    duc_sdata : sequence of str
        Gold summaries, index-aligned with ``duc_data``.
    filenames : sequence of str
        Document ids, index-aligned with ``duc_data``.
    klist, ldalist : sequence of str
        Generated KL / LDA summaries, index-aligned with ``duc_data``.
    """
    text_field = {'type': 'text'}
    # NOTE(review): the '_doc' mapping type / doc_type argument is only
    # accepted by some Elasticsearch versions -- confirm against the cluster.
    request_body = {
        'mappings': {
            '_doc': {
                'properties': {
                    'doc_id': dict(text_field),
                    'doc_text': dict(text_field),
                    'gold_summary': dict(text_field),
                    'kl_summary': dict(text_field),
                    'lda_summary': dict(text_field),
                }
            }
        }
    }
    # NOTE(review): exists_alias is called positionally; which parameter that
    # binds to (index or alias name) looks client-version dependent -- confirm.
    if es.indices.exists_alias('p3a*'):
        print("deleting old 'p3a_duc_index' index...")
        es.indices.delete('p3a*')
    print("creating 'p3a_duc_index' index...")
    es.indices.create(index='p3a_duc_index', body=request_body)
    for i in range(len(duc_data)):
        filename = filenames[i]
        data_dict = {
            "doc_id": filename,
            "doc_text": duc_data[i],
            "gold_summary": duc_sdata[i],
            "kl_summary": klist[i],
            "lda_summary": ldalist[i],
        }
        es.index(index='p3a_duc_index', doc_type='_doc', body=data_dict, id=filename, op_type="create")
    # Refresh once after the bulk of writes instead of after every document:
    # the end state is the same, without one forced refresh per document.
    es.indices.refresh(index='p3a_duc_index')

In [104]:
def export_es_news(news_dataset, klist, ldalist):
    """Recreate the 'p3b_news_index' index and load the newsgroup posts into it.

    Uses the module-level Elasticsearch client ``es``. Any existing index
    matching 'p3b*' is deleted first, then one document is created per entry
    of ``news_dataset.data``.

    Parameters
    ----------
    news_dataset : object with ``data`` and ``filenames`` sequences
        The loaded corpus; ``data[i]`` is the text stored for ``filenames[i]``.
    klist, ldalist : sequence of str
        Generated KL / LDA summaries, index-aligned with ``news_dataset.data``.
    """
    text_field = {'type': 'text'}
    # NOTE(review): the '_doc' mapping type / doc_type argument is only
    # accepted by some Elasticsearch versions -- confirm against the cluster.
    request_body = {
        'mappings': {
            '_doc': {
                'properties': {
                    'doc_id': dict(text_field),
                    'doc_text': dict(text_field),
                    'kl_summary': dict(text_field),
                    'lda_summary': dict(text_field),
                }
            }
        }
    }
    # NOTE(review): exists_alias is called positionally; which parameter that
    # binds to (index or alias name) looks client-version dependent -- confirm.
    if es.indices.exists_alias('p3b*'):
        print("deleting old 'p3b_news_index' index...")
        es.indices.delete('p3b*')
    print("creating 'p3b_news_index' index...")
    es.indices.create(index='p3b_news_index', body=request_body)
    for i in range(len(news_dataset.data)):
        # Progress marker every 1000 documents (0-based).
        if i % 1000 == 0:
            print(i)
        # Drops the first 39 characters of the stored path; presumably the
        # local data directory prefix -- verify if data_home ever changes.
        filename = news_dataset.filenames[i][39:]
        data_dict = {
            "doc_id": filename,
            "doc_text": news_dataset.data[i],
            "kl_summary": klist[i],
            "lda_summary": ldalist[i],
        }
        es.index(index='p3b_news_index', doc_type='_doc', body=data_dict, id=filename, op_type="create")
    # Refresh once after all writes instead of after every document:
    # the end state is the same, without one forced refresh per document.
    es.indices.refresh(index='p3b_news_index')

In [95]:
# Build the KL and LDA summaries for every DUC document (default k=5).
klist, ldalist = main_function(duc_data)

6 B 1
8 B 2
13 B 3
16 B 4
19 B 5
21 B 6
28 B 7
33 B 8
36 B 9
39 B 10
42 B 11
45 B 12
52 B 13
55 B 14
66 B 15
71 B 16
79 B 17
89 B 18
91 B 19
92 B 20
100 B 21
105 B 22
106 B 23
107 B 24
111 B 25
112 B 26
114 B 27
118 B 28
119 B 29
136 B 30
137 B 31
138 B 32
141 B 33
143 B 34
164 B 35
168 B 36
183 B 37
208 B 38
213 B 39
217 B 40
219 B 41
222 B 42
235 B 43
236 B 44
262 B 45
266 B 46
270 B 47
272 B 48
280 B 49
282 B 50
284 B 51
285 B 52
287 B 53
292 B 54
296 B 55
297 B 56
302 B 57


In [98]:
# Index the DUC texts, gold summaries and both generated summaries in Elasticsearch.
export_es_duc(duc_data, duc_sdata, flist, klist, ldalist)

deleting old 'p3a_duc_index' index...
creating 'p3a_duc_index' index...


In [101]:
# Summarise the 20 Newsgroups posts. This rebinds klist and ldalist,
# so the DUC summaries computed earlier are no longer held in these names.
klist, ldalist = main_function(news_dataset.data)

Could not parse document no -  52
Could not parse document no -  90
Could not parse document no -  151
Could not parse document no -  256
Could not parse document no -  312
Could not parse document no -  339
Could not parse document no -  360
Could not parse document no -  362
Could not parse document no -  403
Could not parse document no -  409
Could not parse document no -  433
Could not parse document no -  440
Could not parse document no -  441
Could not parse document no -  483
Could not parse document no -  484
Could not parse document no -  501
Could not parse document no -  564
Could not parse document no -  599
Could not parse document no -  665
Could not parse document no -  733
Could not parse document no -  765
Could not parse document no -  810
Could not parse document no -  867
Could not parse document no -  906
Could not parse document no -  913
Could not parse document no -  915
Could not parse document no -  976
1000
Could not parse document no -  1015
Could not parse 

Could not parse document no -  8330
Could not parse document no -  8391
Could not parse document no -  8407
Could not parse document no -  8437
Could not parse document no -  8461
Could not parse document no -  8575
Could not parse document no -  8634
Could not parse document no -  8651
Could not parse document no -  8711
Could not parse document no -  8727
Could not parse document no -  8737
Could not parse document no -  8773
Could not parse document no -  8854
Could not parse document no -  8934
Could not parse document no -  8955
Could not parse document no -  8977
Could not parse document no -  8978
9000
Could not parse document no -  9009
Could not parse document no -  9033
Could not parse document no -  9072
Could not parse document no -  9095
Could not parse document no -  9109
Could not parse document no -  9210
Could not parse document no -  9223
Could not parse document no -  9235
Could not parse document no -  9238
Could not parse document no -  9257
Could not parse documen

In [106]:
# Index the newsgroup posts together with their generated summaries in Elasticsearch.
export_es_news(news_dataset, klist, ldalist)

deleting old 'p3b_news_index' index...
creating 'p3b_news_index' index...
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
