In [1]:
from elasticsearch import Elasticsearch
# Elasticsearch client pointed at localhost:9200; reused by the cells below.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

In [9]:
# Ask the cluster for its health summary; the returned dict is the cell output.
es.cluster.health()

{'active_primary_shards': 5,
 'active_shards': 5,
 'active_shards_percent_as_number': 100.0,
 'cluster_name': 'smartpub',
 'delayed_unassigned_shards': 0,
 'initializing_shards': 0,
 'number_of_data_nodes': 1,
 'number_of_in_flight_fetch': 0,
 'number_of_nodes': 1,
 'number_of_pending_tasks': 0,
 'relocating_shards': 0,
 'status': 'green',
 'task_max_waiting_in_queue_millis': 0,
 'timed_out': False,
 'unassigned_shards': 0}

In [None]:
# Fetch node-level statistics; the returned value is the cell output.
es.nodes.stats()

# 2 Full Index

In [4]:
from pymongo import MongoClient
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import nltk
import config as cfg

# Load the pickled English Punkt tokenizer from the NLTK data directory.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

In [5]:
# MongoDB: the `pub` database and its `publications` collection.
client = MongoClient('localhost:{}'.format(cfg.mongoDB_Port))
db = client['pub']
pub = db['publications']

# Elasticsearch client with a 30 s timeout that retries timed-out requests.
es = Elasticsearch(
    [{'host': 'localhost', 'port': 9200}],
    timeout=30,
    max_retries=10,
    retry_on_timeout=True,
)

# Wait (at most 1 s) for the cluster to reach at least 'yellow'; the health
# dict is the cell output.
es.cluster.health(wait_for_status='yellow', request_timeout=1)

{'active_primary_shards': 0,
 'active_shards': 0,
 'active_shards_percent_as_number': 100.0,
 'cluster_name': 'smartpub',
 'delayed_unassigned_shards': 0,
 'initializing_shards': 0,
 'number_of_data_nodes': 1,
 'number_of_in_flight_fetch': 0,
 'number_of_nodes': 1,
 'number_of_pending_tasks': 0,
 'relocating_shards': 0,
 'status': 'green',
 'task_max_waiting_in_queue_millis': 0,
 'timed_out': False,
 'unassigned_shards': 0}

In [61]:
def return_chapters(mongo_string_search, db):
    """Fetch publications matching a MongoDB filter and flatten them for indexing.

    Parameters
    ----------
    mongo_string_search : dict
        Filter passed unchanged to ``db.publications.find``.
    db : object
        Database handle exposing a ``publications`` collection with ``find``.

    Returns
    -------
    list of dict
        One dict per matching record with the keys ``_id``, ``title``,
        ``content``, ``journal`` and ``year``:

        * ``content`` is ``record['content']['fulltext']``, or ``""`` when
          that path does not exist.
        * ``journal`` is the record's ``journal`` if present, otherwise its
          ``booktitle``, otherwise ``""``.
        * ``year`` is the record's ``year``, or ``""`` when absent.

    Raises
    ------
    KeyError
        If a record has no ``_id`` or no ``title``.
    """
    list_of_docs = []

    for record in db.publications.find(mongo_string_search):
        try:
            fulltext = record['content']['fulltext']
        except (KeyError, TypeError):
            # 'content' is missing, is not a mapping, or has no 'fulltext'.
            fulltext = ""

        # A fresh dict per record; nothing is shared between list entries.
        list_of_docs.append({
            "_id": record['_id'],
            "title": record['title'],
            "content": fulltext,
            # 'journal' takes precedence over 'booktitle' when both exist.
            "journal": record.get('journal', record.get('booktitle', "")),
            "year": record.get('year', ""),
        })

    return list_of_docs

In [62]:
# Venue identifiers to pull from MongoDB; each one is matched against both the
# `booktitle` and the `journal` field.
filter_conference = [
    "WWW", "ICSE", "VLDB", "JCDL", "TREC", "SIGIR", "ICWSM", "ECDL", "ESWC", "TPDL",
    'pbio', 'pgen', 'bmcgen', 'bmcbiot', 'bmcneur', 'bmceb', 'genbio', 'bcr',
]

# One list of publications per venue, in the order of `filter_conference`.
list_of_pubs = []

for booktitle in filter_conference:
    # (booktitle == venue OR journal == venue) AND the record has full text.
    mongo_string_search = {
        '$or': [
            {'$and': [{field: booktitle}, {'content.fulltext': {'$exists': True}}]}
            for field in ('booktitle', 'journal')
        ]
    }
    list_of_pubs.append(return_chapters(mongo_string_search, db))

In [64]:
# Number of per-venue result lists collected (one per entry of filter_conference).
len(list_of_pubs)

18

In [None]:
# Bulk-index the fetched publications into the "ir" index, one bulk request per
# element of `list_of_pubs`.
for pubs in list_of_pubs:
    actions = []

    for cur in pubs:
        print(cur['_id'])
        print(cur['journal'])

        # When an action carries `_source`, helpers.bulk sends only that dict
        # as the document body; other non-metadata top-level keys are dropped.
        # `journal` and `year` therefore have to live inside `_source`.
        source = {
            "text": cur["content"],
            "title": cur["title"],
        }
        for field in ("journal", "year"):
            # Leave out empty placeholders instead of indexing "" for them.
            if cur[field] not in ("", None):
                source[field] = cur[field]

        actions.append({
            "_index": "ir",
            "_type": "publications",
            "_id": cur['_id'],
            "_source": source,
        })

    # Nothing to send for this group.
    if not actions:
        continue

    res = helpers.bulk(es, actions)
    print(res)

# 3 Twosent Index

In [95]:
from pymongo import MongoClient
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import math
import requests
import nltk
import _pickle as cPickle
import config as cfg
import logging

###############################
# Pickled English Punkt sentence tokenizer from the NLTK data directory.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

# MongoDB: the `pub` database and its `publications` collection.
client = MongoClient('localhost:{}'.format(cfg.mongoDB_Port))
db = client['pub']
pub = db['publications']

# Elasticsearch client with a 30 s timeout that retries timed-out requests.
es = Elasticsearch(
    [{'host': 'localhost', 'port': 9200}],
    timeout=30,
    max_retries=10,
    retry_on_timeout=True,
)

# Wait (at most 1 s) for at least 'yellow' status; the result is the cell output.
es.cluster.health(wait_for_status='yellow', request_timeout=1)

{'active_primary_shards': 10,
 'active_shards': 10,
 'active_shards_percent_as_number': 100.0,
 'cluster_name': 'smartpub',
 'delayed_unassigned_shards': 0,
 'initializing_shards': 0,
 'number_of_data_nodes': 1,
 'number_of_in_flight_fetch': 0,
 'number_of_nodes': 1,
 'number_of_pending_tasks': 0,
 'relocating_shards': 0,
 'status': 'green',
 'task_max_waiting_in_queue_millis': 0,
 'timed_out': False,
 'unassigned_shards': 0}

In [99]:
def return_chapters(mongo_string_search, db):
    """Fetch publications matching a MongoDB filter and collect their paragraphs.

    NOTE: this redefines (and therefore shadows) the ``return_chapters``
    defined earlier in this notebook, which returns full text instead of
    paragraphs.

    Parameters
    ----------
    mongo_string_search : dict
        Filter passed unchanged to ``db.publications.find``.
    db : object
        Database handle exposing a ``publications`` collection with ``find``.

    Returns
    -------
    list of dict
        One dict per usable record with the keys ``_id``, ``paragraphs`` and
        ``title``. ``paragraphs`` concatenates ``chapter['paragraphs']`` over
        ``record['content']['chapters']``, skipping chapters and paragraphs
        equal to ``{}``. Records for which that structure cannot be read are
        logged with their traceback and left out of the result.

    Raises
    ------
    KeyError
        If a record has no ``_id`` or no ``title``.
    """
    list_of_docs = []

    for record in db.publications.find(mongo_string_search):
        doc_id = record['_id']
        title = record['title']

        try:
            paragraphs = [
                paragraph
                for chapter in record['content']['chapters']
                if chapter != {}
                for paragraph in chapter['paragraphs']
                if paragraph != {}
            ]
        except (KeyError, TypeError):
            # Missing or malformed content/chapters/paragraphs: skip the whole
            # record. %-style arguments keep this working for non-string ids.
            logging.exception('No paragraphs in %s', doc_id)
            continue

        list_of_docs.append({
            "_id": doc_id,
            "paragraphs": paragraphs,
            "title": title,
        })

    return list_of_docs

In [101]:
# Venue identifiers to pull from MongoDB; each one is matched against both the
# `booktitle` and the `journal` field.
filter_conference = [
    "WWW", "ICSE", "VLDB", "JCDL", "TREC", "SIGIR", "ICWSM", "ECDL", "ESWC", "TPDL",
    'pbio', 'pgen', 'bmcgen', 'bmcbiot', 'bmcneur', 'bmceb', 'genbio', 'bcr',
]

# One list of publications per venue, in the order of `filter_conference`.
list_of_pubs = []

for booktitle in filter_conference:
    # (booktitle == venue OR journal == venue) AND the record has full text.
    mongo_string_search = {
        '$or': [
            {'$and': [{field: booktitle}, {'content.fulltext': {'$exists': True}}]}
            for field in ('booktitle', 'journal')
        ]
    }
    list_of_pubs.append(return_chapters(mongo_string_search, db))

print("Total journals:", len(list_of_pubs))

ERROR:root:No text in conf_vldb_ChenCLR05
Traceback (most recent call last):
  File "<ipython-input-99-60f98a35fd22>", line 25, in return_chapters
    for chapter in r['content']['chapters']:
KeyError: 'chapters'
ERROR:root:No text in conf_vldb_ChanI99
Traceback (most recent call last):
  File "<ipython-input-99-60f98a35fd22>", line 25, in return_chapters
    for chapter in r['content']['chapters']:
KeyError: 'chapters'
ERROR:root:No text in conf_vldb_PardonA00
Traceback (most recent call last):
  File "<ipython-input-99-60f98a35fd22>", line 25, in return_chapters
    for chapter in r['content']['chapters']:
KeyError: 'chapters'
ERROR:root:No text in conf_vldb_Novaretti01
Traceback (most recent call last):
  File "<ipython-input-99-60f98a35fd22>", line 25, in return_chapters
    for chapter in r['content']['chapters']:
KeyError: 'chapters'
ERROR:root:No text in conf_vldb_KoudasIM00
Traceback (most recent call last):
  File "<ipython-input-99-60f98a35fd22>", line 25, in return_chapters


Total journals: 18


In [None]:
# Build two-sentence snippets for every paper and turn them into bulk actions
# for the "twosent" index. The upload itself is disabled (see the commented
# lines at the end of the loop body).
for pubs in list_of_pubs:
    for paper in pubs:
        datasetsent = []

        for paragraph in paper['paragraphs']:
            if paragraph == {}:
                continue
            lines = sent_detector.tokenize(paragraph.strip())

            # Paragraphs with fewer than three sentences are ignored.
            if len(lines) < 3:
                continue

            for i, line in enumerate(lines):
                words = nltk.word_tokenize(line)
                if not words:
                    # Nothing to average over; avoids a ZeroDivisionError.
                    continue

                # Drop sentences whose mean token length is below 4 characters.
                average = sum(len(word) for word in words) / len(words)
                if average < 4:
                    continue

                # Pair the sentence with the one before it. The first sentence
                # has no predecessor, so it is paired with the following one;
                # `lines[i - 1]` at i == 0 would silently wrap around to the
                # LAST sentence of the paragraph instead of raising.
                neighbor = lines[i - 1] if i > 0 else lines[i + 1]
                datasetsent.append(line + ' ' + neighbor)

        actions = [
            {
                "_index": "twosent",
                "_type": "twosentnorules",
                # Document id is the paper id with the snippet number appended.
                "_id": paper['_id'] + str(num),
                "_source": {
                    "title": paper['title'],
                    "content.chapter.sentpositive": parag,
                    "paper_id": paper['_id'],
                },
            }
            for num, parag in enumerate(datasetsent)
        ]

        if not actions:
            continue

#         res = helpers.bulk(es, actions)
#         print(res)

# 4 Using ES 

In [19]:
# Smoke test of the "ir" index: fetch three arbitrary documents and list titles.
res = es.search(
    index="ir",
    body={"query": {"match_all": {}}},
    size=3,
)
print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    # Title, then a ten-character separator on its own line.
    print(hit['_source']['title'], '_'*10, sep='\n')

Got 55 Hits:
Relationship of patients' age to histopathological features of breast tumours in  BRCA1  and  BRCA2  and mutation-negative breast cancer families
__________
Gene Structure Evolution of the Na + -Ca 2+  Exchanger ( NCX ) Family
__________
Erratum to: The UBC-40 Urothelial Bladder Cancer Cell Line Index: a genomic resource for functional studies
__________


In [34]:
# Sample 100 documents from the "twosent" index and show the snippets that
# contain the substring 'the'; a separator is printed for every hit regardless.
res = es.search(
    index="twosent",
    body={"query": {"match_all": {}}},
    size=100,
)
print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    pair_text = hit['_source']['content.chapter.sentpositive']
    if 'the' in pair_text:
        print(pair_text)
    print('_'*10)

Got 9658 Hits:
Whole exome DNA sequencing (WES) or whole genome DNA sequencing (WGS) allows identification of single nucleotide mutations or polymorphisms in all exonic regions or the entire human genome, respectively, while messenger RNA sequencing (RNA-Seq) enables quantitative analysis of gene expression. Recent advances in sequencing technology enable investigation of entire genomes at increasingly fine resolution.
__________
This approach allows the investigator to uncover the instances of complete or near allele silencing, which would be impossible using only RNA-Seq data. The expression state of the heterozygous loci detected in WES or WGS assays can be investigated in a matched RNA-Seq sample from the same individual, leading to a detailed map of the ASE activity.
__________
In addition, data from multiple heterozygous single nucleotide variants (SNVs) in the same gene must be integrated, and the large number of tested genes requires appropriate statistical treatment of the mul

In [14]:
# Look a single document up by its id in the "ir" index.
# NOTE(review): ids printed by the other queries in this notebook look like
# "PMC_1851393" (with an underscore) - confirm the id format used here.
res = es.search(
    index="ir",
    body={"query": {"match": {"_id": "PMC4303232"}}},
    size=200,
)

print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit["_id"])

Got 0 Hits:


In [24]:
# Match query on the title field; show up to five hits with id, title and score.
res = es.search(
    index="ir",
    body={"query": {"match": {"title": "cancer"}}},
    size=5,
)

print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit["_id"], hit['_source']['title'], hit['_score'])

Got 7 Hits:
PMC_1851393 Risk of second primary cancer in men with breast cancer 3.4148667
PMC_4165366 MBASED: allele-specific expression detection in cancer tissues and cell lines 2.2526138
PMC_4681452 Erratum to: The UBC-40 Urothelial Bladder Cancer Cell Line Index: a genomic resource for functional studies 1.5718848
PMC_1175056 Relationship of patients' age to histopathological features of breast tumours in  BRCA1  and  BRCA2  and mutation-negative breast cancer families 1.4536651
PMC_5387009 Prognosis of residual axillary disease after neoadjuvant chemotherapy in clinically node-positive breast cancer patients: isolated tumor cells and micrometastases carry a better prognosis than macrometastases 1.3517907


In [17]:
# Match query on the title field; show up to five hits with id and title.
res = es.search(
    index="ir",
    body={"query": {"match": {"title": "genetic"}}},
    size=5,
)

print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_source']['title'])

Got 1 Hits:
PMC_3492105 Extreme genetic diversity in the lizard  Atlantolacerta andreanskyi  (Werner, 1929): A montane cryptic species complex


In [18]:
# query_string search with leading and trailing wildcards, restricted to the
# title field; show up to twenty hits.
res = es.search(
    index="ir",
    body={
        "query": {
            'query_string': {
                'query': '*onstruct*',
                'fields': ['title'],
            }
        }
    },
    size=20,
)

print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_source']['title'])
    print('_'*100)

Got 1 Hits:
PMC_3078863 Rapid obtention of stable, bioluminescent tumor cell lines using a tCD2-luciferase chimeric construct
____________________________________________________________________________________________________


# 5 Doc2vec Indexing

In [33]:
from pymongo import MongoClient
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import nltk
from nltk import tokenize
import config as cfg

# Pickled English Punkt sentence tokenizer from the NLTK data directory.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

# MongoDB: the `pub` database and its `publications` collection.
client = MongoClient('localhost:{}'.format(cfg.mongoDB_Port))
db = client['pub']
pub = db['publications']

# Elasticsearch client with a 30 s timeout that retries timed-out requests.
es = Elasticsearch(
    [{'host': 'localhost', 'port': 9200}],
    timeout=30,
    max_retries=10,
    retry_on_timeout=True,
)

# es.cluster.health(wait_for_status='yellow', request_timeout=1)

In [None]:
papernames = []
###############################
# Read the whole corpus file; `with` closes the handle even if reading fails.
with open(cfg.ROOTPATH + '/data/allcorpus_papers.txt', 'r') as corpus_file:
    text = corpus_file.read()

sentences = tokenize.sent_tokenize(text)
last_index = len(sentences) - 1
docLabels = []
actions = []

# One document per sentence, stored together with a neighbouring sentence.
for count, sent in enumerate(sentences):
    # Use the following sentence as neighbour; the last sentence has none, so
    # it falls back to the preceding one.
    if count < last_index:
        neighbor_count = count + 1
    else:
        neighbor_count = count - 1
    neighbors = sentences[neighbor_count]

    docLabels.append(count)

    actions.append({
        "_index": "devtwosentnew",
        "_type": "devtwosentnorulesnew",
        "_id": count,

        "_source": {
            "content.chapter.sentpositive": sent,
            "content.chapter.sentnegtive": neighbors,
            "neighborcount": neighbor_count
        }})

print(len(sentences))
print(len(docLabels))

res = helpers.bulk(es, actions)
print(res)

# 6 Preparing doc2vec

In [112]:
from pymongo import MongoClient
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import math
import nltk
import string
import config as cfg
# Pickled English Punkt sentence tokenizer from the NLTK data directory.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

###############################

# MongoDB: the `pub` database and its `publications` collection.
client = MongoClient('localhost:{}'.format(cfg.mongoDB_Port))
db = client['pub']
pub = db['publications']

# Elasticsearch client with a 30 s timeout that retries timed-out requests.
es = Elasticsearch(
    [{'host': 'localhost', 'port': 9200}],
    timeout=30,
    max_retries=10,
    retry_on_timeout=True,
)
es.cluster.health(wait_for_status='yellow', request_timeout=1)

# Filled by the cells below with one list of {"_id": ...} dicts per query.
list_of_pubs = []

def returnnames(mongo_string_search, db):
    """Return the ids of all publications matching a MongoDB filter.

    Parameters
    ----------
    mongo_string_search : dict
        Filter passed unchanged to ``db.publications.find``.
    db : object
        Database handle exposing a ``publications`` collection with ``find``.

    Returns
    -------
    list of dict
        One ``{"_id": <record id>}`` dict per matching record, in cursor order.
    """
    matches = db.publications.find(mongo_string_search)
    return [{"_id": record['_id']} for record in matches]

In [113]:
# Conference identifiers whose publication ids are collected, one list each.
filter_conference = ["WWW", "ICSE", "VLDB", "PVLDB", "JCDL", "TREC",  "SIGIR", "ICWSM", "ECDL", "ESWC"]

# Start from an empty list inside this cell so that re-running it does not
# append the same conferences a second time (the other venue-collection cells
# in this notebook reset their list the same way).
list_of_pubs = []

for booktitle in filter_conference:
    # Records of this conference that have extracted full text.
    mongo_string_search = {'$and': [{'booktitle': booktitle}, {'content.fulltext': {'$exists': True}}]}
    list_of_pubs.append(returnnames(mongo_string_search, db))

In [114]:
# Add the ids of all 'Biomedical'-domain records with full text as one more group.
mongo_string_search = {
    '$and': [
        {'domain': 'Biomedical'},
        {'content.fulltext': {'$exists': True}},
    ]
}
list_of_pubs.append(returnnames(mongo_string_search, db))

In [None]:
papersText = []

# The translation table that deletes ASCII punctuation never changes, so it is
# built once instead of once per document.
strip_punctuation = str.maketrans('', '', string.punctuation)

# For every collected publication id, fetch its document from the "ir" index
# and keep the lower-cased, punctuation-free full text.
for pubs in list_of_pubs:

    for cur in pubs:

        query = {"query":
            {"match": {
                "_id": {
                    "query": cur['_id'],
                    "operator": "and"
                }
            }
            }
        }

        res = es.search(index="ir", body=query, size=200)

        for doc in res['hits']['hits']:
            fulltext = doc["_source"]["text"].translate(strip_punctuation)
            papersText.append(fulltext.lower())

# Join all documents into one space-separated string.
papersText = " ".join(papersText)

# `with` closes the file even if the write fails.
with open("data/word2vecData.txt", "w") as f1:
    f1.write(papersText)