In [26]:
from elasticsearch import Elasticsearch
from rake_nltk import Rake

In [27]:
es = Elasticsearch([{
    'host': 'nyuvis-web.poly.edu',
    'port': 80,
    'url_prefix': 'es'
},])

In [64]:
#get keywords list from document titles

title_list = []
tot_num_docs = es.count(index = 'nips_papers')['count']

res = es.search(index = 'nips_papers', body = {
    "size": tot_num_docs,
    "_source": ['id', 'year', 'title']
})

for entry in res['hits']['hits']:
    rake = Rake(min_length = 1, max_length = 3)
    rake.extract_keywords_from_text(entry['_source']['title'])
    keywords = rake.get_ranked_phrases_with_scores()
    for word in keywords:
        title_list.append({
            'doc_id': entry['_source']['id'],
            'year': entry['_source']['year'],
            'keyword': word[1],
            'RAKE_score_title': word[0]
        })

title_list

[{'RAKE_score_title': 9.0,
  'doc_id': '1001',
  'keyword': 'neural network ensembles',
  'year': '1994'},
 {'RAKE_score_title': 4.0,
  'doc_id': '1001',
  'keyword': 'cross validation',
  'year': '1994'},
 {'RAKE_score_title': 4.0,
  'doc_id': '1001',
  'keyword': 'active learning',
  'year': '1994'},
 {'RAKE_score_title': 9.0,
  'doc_id': '1006',
  'keyword': 'volatile analogue amorphous',
  'year': '1994'},
 {'RAKE_score_title': 4.0,
  'doc_id': '1006',
  'keyword': 'silicon memories',
  'year': '1994'},
 {'RAKE_score_title': 4.0,
  'doc_id': '1006',
  'keyword': 'pulsestream synapses',
  'year': '1994'},
 {'RAKE_score_title': 1.0, 'doc_id': '1006', 'keyword': 'non', 'year': '1994'},
 {'RAKE_score_title': 1.0,
  'doc_id': '1007',
  'keyword': 'play',
  'year': '1994'},
 {'RAKE_score_title': 1.0,
  'doc_id': '1007',
  'keyword': 'learning',
  'year': '1994'},
 {'RAKE_score_title': 1.0,
  'doc_id': '1007',
  'keyword': 'game',
  'year': '1994'},
 {'RAKE_score_title': 1.0,
  'doc_id': 

In [65]:
#get keywords list from document abstracts

abstract_list = []
tot_num_docs = es.count(index = 'nips_papers', body = {
    'query': {
        'bool': {
            'must_not': {
                'match': { 'abstract': 'Abstract Missing' }
            }
        }
    }
})['count']

res = es.search(index = 'nips_papers', body = {
    '_source': ['id', 'year', 'abstract'],
    'size': tot_num_docs,
    'query': {
        'bool': {
            'must_not': {
                'match': {
                    'abstract': 'Abstract Missing'
                }
            }
        }
    }
})

for entry in res['hits']['hits']:
    rake = Rake(min_length = 1, max_length = 3)
    rake.extract_keywords_from_text(entry['_source']['abstract'])
    keywords = rake.get_ranked_phrases_with_scores()
    for word in keywords:
        abstract_list.append({
            'doc_id': entry['_source']['id'],
            'year': entry['_source']['year'],
            'keyword': word[1],
            'RAKE_score_abstract': word[0]
        })
        
abstract_list

[{'RAKE_score_abstract': 9.0,
  'doc_id': '3168',
  'keyword': 'theoretically founded method',
  'year': '2007'},
 {'RAKE_score_abstract': 9.0,
  'doc_id': '3168',
  'keyword': 'information bottleneck optimization',
  'year': '2007'},
 {'RAKE_score_abstract': 9.0,
  'doc_id': '3168',
  'keyword': 'convergence properties feasible',
  'year': '2007'},
 {'RAKE_score_abstract': 9.0,
  'doc_id': '3168',
  'keyword': 'common benchmark tasks',
  'year': '2007'},
 {'RAKE_score_abstract': 8.0,
  'doc_id': '3168',
  'keyword': 'rather complex rule',
  'year': '2007'},
 {'RAKE_score_abstract': 8.0,
  'doc_id': '3168',
  'keyword': 'different principal components',
  'year': '2007'},
 {'RAKE_score_abstract': 8.0,
  'doc_id': '3168',
  'keyword': 'additional target signal',
  'year': '2007'},
 {'RAKE_score_abstract': 5.0,
  'doc_id': '3168',
  'keyword': 'target signal',
  'year': '2007'},
 {'RAKE_score_abstract': 5.0,
  'doc_id': '3168',
  'keyword': 'principal components',
  'year': '2007'},
 {'R

In [66]:
#add both keywords list to a combo dictionary

combo = {}

for entry in title_list:
    if entry['keyword'] not in combo:
        combo[entry['keyword']] = {
            entry['year']: { 
                entry['doc_id']: (entry['RAKE_score_title'], 't')
            }
        }
    else:
        combo[entry['keyword']].update({
            entry['year']: {
                entry['doc_id']: (entry['RAKE_score_title'], 't')
            }
        })

for entry in abstract_list:
    if entry['keyword'] not in combo:
        combo[entry['keyword']] = {
            entry['year']: {
                entry['doc_id']: (entry['RAKE_score_abstract'], 'a')
            }
        }
    else:
        if entry['year'] not in combo[entry['keyword']]:
            combo[entry['keyword']].update({
                entry['year']: {
                    entry['doc_id']: (entry['RAKE_score_abstract'], 'a')
                }
            })
        else:
            combo[entry['keyword']][entry['year']].update({
                entry['doc_id']: (entry['RAKE_score_abstract'], 'a')
            })

combo

{'many algorithms': {'2016': {'6308': (4.0, 'a')}},
 'probabilistic integrators': {'2015': {'5749': (4.5, 'a')}},
 'subset ranking problems': {'2013': {'4906': (9.0, 'a')}},
 'new direction': {'2008': {'3460': (4.0, 'a')},
  '2014': {'5451': (4.0, 'a')},
  '2017': {'6883': (4.0, 'a'), '7161': (4.0, 'a')}},
 'visualizing group structure': {'1998': {'1552': (9.0, 't')}},
 'vocabulary size instead': {'2012': {'4613': (9.0, 'a')}},
 'submodular objective functions': {'2010': {'4106': (7.666666666666666,
    'a')}},
 'clustering models': {'2012': {'4493': (3.5, 'a')}},
 'adjacent vertices': {'2012': {'4731': (3.666666666666667, 'a')}},
 'visual genome dataset': {'2017': {'6812': (9.0, 'a')}},
 'second order information': {'2010': {'4017': (9.0, 'a')},
  '2016': {'6054': (9.0, 'a')}},
 'retrieving': {'2010': {'4088': (1.0, 'a')},
  '2013': {'5058': (1.0, 'a')},
  '2016': {'6410': (1.0, 'a')},
  '2017': {'6981': (1.0, 'a')}},
 'existing al heuristics': {'2017': {'7010': (8.666666666666666, 'a

In [79]:
#transform the nested combo dictionary into a flattened list (to transfer to csv)

flattened_combo = []

for word, years in combo.items():
    for year, doc in years.items():
        final_score = 0
        for doc_id, score in doc.items():
            if score[1] == 'a':
                final_score += score[0]
            elif score[1] == 't':
                final_score += score[0] * 15
        flattened_combo.append({
            'keyword': word,
            'year': year,
            'score': final_score
        })

flattened_combo

[{'keyword': 'many algorithms', 'score': 4.0, 'year': '2016'},
 {'keyword': 'probabilistic integrators', 'score': 4.5, 'year': '2015'},
 {'keyword': 'subset ranking problems', 'score': 9.0, 'year': '2013'},
 {'keyword': 'new direction', 'score': 4.0, 'year': '2014'},
 {'keyword': 'new direction', 'score': 4.0, 'year': '2008'},
 {'keyword': 'new direction', 'score': 8.0, 'year': '2017'},
 {'keyword': 'visualizing group structure', 'score': 135.0, 'year': '1998'},
 {'keyword': 'vocabulary size instead', 'score': 9.0, 'year': '2012'},
 {'keyword': 'submodular objective functions',
  'score': 7.666666666666666,
  'year': '2010'},
 {'keyword': 'clustering models', 'score': 3.5, 'year': '2012'},
 {'keyword': 'adjacent vertices', 'score': 3.666666666666667, 'year': '2012'},
 {'keyword': 'visual genome dataset', 'score': 9.0, 'year': '2017'},
 {'keyword': 'second order information', 'score': 9.0, 'year': '2010'},
 {'keyword': 'second order information', 'score': 9.0, 'year': '2016'},
 {'keywor

In [80]:
#transforming scores to having max 1

max_score = 0.0

for entry in flattened_combo:
    if entry['score'] > max_score:
        max_score = entry['score']

for entry in flattened_combo:
    entry['score'] = entry['score']/max_score

In [81]:
flattened_combo

[{'keyword': 'many algorithms', 'score': 0.01646090534979424, 'year': '2016'},
 {'keyword': 'probabilistic integrators',
  'score': 0.018518518518518517,
  'year': '2015'},
 {'keyword': 'subset ranking problems',
  'score': 0.037037037037037035,
  'year': '2013'},
 {'keyword': 'new direction', 'score': 0.01646090534979424, 'year': '2014'},
 {'keyword': 'new direction', 'score': 0.01646090534979424, 'year': '2008'},
 {'keyword': 'new direction', 'score': 0.03292181069958848, 'year': '2017'},
 {'keyword': 'visualizing group structure',
  'score': 0.5555555555555556,
  'year': '1998'},
 {'keyword': 'vocabulary size instead',
  'score': 0.037037037037037035,
  'year': '2012'},
 {'keyword': 'submodular objective functions',
  'score': 0.03155006858710562,
  'year': '2010'},
 {'keyword': 'clustering models',
  'score': 0.01440329218106996,
  'year': '2012'},
 {'keyword': 'adjacent vertices',
  'score': 0.015089163237311387,
  'year': '2012'},
 {'keyword': 'visual genome dataset',
  'score': 

In [82]:
import csv

with open('RAKE_lc_9_1.csv', 'w', newline='') as f:
    header_present = False
    for word in flattened_combo:
        if not header_present:
            w = csv.DictWriter(f, word.keys())
            w.writeheader()
            header_present = True
        w.writerow(word)