In [1]:
import logging
from sklearn.datasets import fetch_20newsgroups
from elasticsearch import Elasticsearch
from os import listdir
from os.path import isfile, join

In [2]:
# Quieten the Elasticsearch client's logger so only errors are reported.
es_logger = logging.getLogger("elasticsearch")
es_logger.setLevel(logging.ERROR)

# Client built with the library's default connection settings.
es = Elasticsearch()

# Show the alias information the cluster returns for the '*' pattern.
es.indices.get_alias('*')

{'news_index': {'aliases': {}},
 'p3a_duc_index': {'aliases': {}},
 'duc_index': {'aliases': {}},
 'p2a_news_index': {'aliases': {}},
 'p2b_duc_index': {'aliases': {}},
 'p2c_news_index': {'aliases': {}},
 'p3b_news_index': {'aliases': {}},
 '.kibana': {'aliases': {}},
 'p2d_duc_index': {'aliases': {}}}

# 20 Newsgroups

Load the 20 Newsgroups posts and index them into Elasticsearch.

In [3]:
# Fetch the 20 Newsgroups posts with headers, footers and quoted replies removed.
# data_home is where the downloaded data is stored; the shuffle is seeded so the
# document order is repeatable.
news_dataset = fetch_20newsgroups(
    data_home='/Users/sarthak/Backup/Data',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

In [4]:
# Mapping for the newsgroup index: every document carries its id and its raw
# text, both declared as `text` fields under the `_doc` mapping type.
request_body = {
    'mappings': {
        '_doc': {
            'properties': {
                'doc_id': {'type': 'text'},
                'doc_text': {'type': 'text'},
            }
        }
    }
}

print("creating 'news_index' index...")
# Make the cell re-runnable: drop the previous copy of this index, if any.
# The check must be on the index itself (indices.exists) -- exists_alias only
# looks at aliases -- and the delete names the exact index so that unrelated
# indices sharing the same first letter are never removed.
if es.indices.exists(index='news_index'):
    es.indices.delete(index='news_index')
es.indices.create(index='news_index', body=request_body)

creating 'news_index' index...


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'news_index'}

In [5]:
# Index every newsgroup post into 'news_index', one Elasticsearch document per post.
for filename, data in zip(news_dataset.filenames, news_dataset.data):
    # Drop the first 39 characters of the file path to form the document id.
    # NOTE(review): presumably this strips the local data_home/cache prefix, so
    # the offset is tied to the data_home path used when fetching -- confirm.
    doc_id = filename[39:]
    data_dict = {
        "doc_id": doc_id,
        "doc_text": data,
    }
    # op_type="create" refuses to overwrite a document that already has this id.
    es.index(index='news_index', doc_type='_doc', body=data_dict, id=doc_id, op_type="create")

# Refresh once after all documents are written instead of after every single
# insert; one refresh is enough to make the whole batch searchable.
es.indices.refresh(index='news_index')

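# DUC 2001

Index the DUC 2001 documents, together with their gold summaries where available, into Elasticsearch.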

In [6]:
# Root folder of the DUC 2001 documents and the folder holding their summaries.
mypath ='/Users/sarthak/Backup/Data/DUC2001/'
gpath = '/Users/sarthak/Backup/Data/DUC2001/Summaries/'

# Full paths of the regular files located directly inside the root folder;
# directory entries are filtered out by the isfile() check.
datafiles = [
    mypath + entry
    for entry in listdir(mypath)
    if isfile(join(mypath, entry))
]

In [7]:
# Mapping for the DUC index: document id, document text and the optional gold
# summary, all declared as `text` fields under the `_doc` mapping type.
request_body = {
    'mappings': {
        '_doc': {
            'properties': {
                'doc_id': {'type': 'text'},
                'doc_text': {'type': 'text'},
                'gold_summary': {'type': 'text'},
            }
        }
    }
}

print("creating 'duc_index' index...")
# Make the cell re-runnable: drop the previous copy of this index, if any.
# The check must be on the index itself (indices.exists) -- exists_alias only
# looks at aliases -- and the delete names the exact index so that unrelated
# indices sharing the same first letter are never removed.
if es.indices.exists(index='duc_index'):
    es.indices.delete(index='duc_index')
es.indices.create(index='duc_index', body=request_body)

creating 'duc_index' index...


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'duc_index'}

In [8]:
# Index every DUC document into 'duc_index', attaching the gold summary when a
# matching summary file exists. Files that cannot be decoded as text are
# skipped and reported at the end instead of being silently ignored.
skipped_files = []
for item in datafiles:
    # Document id: lower-cased base name of the source file plus '.txt'.
    # The same name is looked up in the summaries folder.
    filename = str.lower(item[item.rfind('/')+1:]) + '.txt'
    try:
        # Context managers guarantee the file handles are closed.
        with open(item, 'r') as f:
            content = f.read()

        # The body starts right after a '[Text]' marker when present,
        # otherwise right after the '<TEXT>' tag (both markers are 6 chars).
        # NOTE(review): when neither marker is present find() returns -1 and
        # the slice starts at offset 5 -- confirm whether such files occur.
        if content.find('[Text]') > -1:
            content = content[content.find('[Text]')+6:]
        else:
            content = content[content.find('<TEXT>')+6:]
        # Keep everything up to the next '<'.
        content = content[:content.find('<')]

        data_dict = {
            "doc_id": filename,
            "doc_text": content,
        }

        summary_path = join(gpath, filename)
        if isfile(summary_path):
            with open(summary_path, 'r') as gf:
                gcontent = gf.read()
            # Skip past the 'Abstract:' / 'Introduction:' headings when present.
            if gcontent.find('Abstract:') > -1:
                gcontent = gcontent[gcontent.find('Abstract:')+10:]
            if gcontent.find('Introduction:') > -1:
                gcontent = gcontent[gcontent.find('Introduction:')+13:]
            data_dict["gold_summary"] = gcontent

        # op_type="create" refuses to overwrite a document that already has this id.
        es.index(index='duc_index', doc_type='_doc', body=data_dict, id=filename, op_type="create")
    except UnicodeDecodeError:
        # Best effort: remember the undecodable file and carry on with the rest.
        skipped_files.append(item)

# Refresh once after all documents are written instead of after every insert.
es.indices.refresh(index='duc_index')

if skipped_files:
    print("skipped %d file(s) that could not be decoded:" % len(skipped_files))
    for skipped in skipped_files:
        print("  " + skipped)