In [1]:
from elasticsearch import Elasticsearch
import csv

es = Elasticsearch([{'host': 'nyuvis-web.poly.edu', 'port': 80, 'url_prefix': 'es'}])

In [39]:
#get 20 signifiant terms from each year

sig_terms_res = es.search(index = 'nips_papers', body = {
    'size': 0,
    'aggs': {
        'keywords_by_year': {
            'terms': {
                'field': 'year',
                'size': 50
            },
            'aggs': {
                'keywords': {
                    'significant_terms': {
                        'field': 'title',
                        'size': 20
                    }
                }
            }
        }
    }
})

sig_terms_res['aggregations']['keywords_by_year']['buckets']

[{'doc_count': 679,
  'key': '2017',
  'keywords': {'bg_count': 7241,
   'buckets': [{'bg_count': 153,
     'doc_count': 55,
     'key': 'deep',
     'score': 0.22952093651433012},
    {'bg_count': 34,
     'doc_count': 22,
     'key': 'adversarial',
     'score': 0.19117554557161373},
    {'bg_count': 16,
     'doc_count': 14,
     'key': 'gan',
     'score': 0.17177702200021255},
    {'bg_count': 23,
     'doc_count': 12,
     'key': 'generative',
     'score': 0.08065867573976962},
    {'bg_count': 16,
     'doc_count': 9,
     'key': 'scalable',
     'score': 0.06625563127791237},
    {'bg_count': 4,
     'doc_count': 4,
     'key': 'fairness',
     'score': 0.05693202990623394},
    {'bg_count': 5,
     'doc_count': 4,
     'key': 'gated',
     'score': 0.04436742068492824},
    {'bg_count': 3,
     'doc_count': 3,
     'key': 'predicting',
     'score': 0.042699022429675444},
    {'bg_count': 3,
     'doc_count': 3,
     'key': 'acceleration',
     'score': 0.042699022429675444},

In [40]:
#put all the keywords into a set (uniqueness)

sig_terms_set = set()

for year in sig_terms_res['aggregations']['keywords_by_year']['buckets']:
    for word in year['keywords']['buckets']:
        sig_terms_set.add(word['key'])

sig_terms_set

{'2',
 '3d',
 'acceleration',
 'action',
 'active',
 'activity',
 'adaptation',
 'adaptive',
 'additive',
 'adversarial',
 'agent',
 'aggregation',
 'algorithm',
 'analog',
 'analysi',
 'annealing',
 'application',
 'approach',
 'approximate',
 'approximation',
 'architecture',
 'armed',
 'array',
 'artificial',
 'associative',
 'assumption',
 'asymmetric',
 'asynchronous',
 'attention',
 'attribute',
 'auditory',
 'backpropagation',
 'balanced',
 'bandit',
 'based',
 'basi',
 'bayesian',
 'behavior',
 'binary',
 'binding',
 'boosting',
 'bound',
 'budget',
 'cell',
 'central',
 'chain',
 'channel',
 'character',
 'chip',
 'circuit',
 'class',
 'classification',
 'classifier',
 'clustering',
 'code',
 'coding',
 'collaborative',
 'color',
 'competition',
 'competitive',
 'composition',
 'compressed',
 'computation',
 'computational',
 'computer',
 'conjugate',
 'connection',
 'connectionist',
 'context',
 'contour',
 'control',
 'convex',
 'convolution',
 'convolutional',
 'coordinate'

In [41]:
from rake_nltk import Rake

title_list = []
tot_num_docs = es.count(index = 'nips_papers')['count']

res = es.search(index = 'nips_papers', body = {
    "size": tot_num_docs,
    "_source": ['id', 'year', 'title']
})

for entry in res['hits']['hits']:
    rake = Rake(min_length = 1, max_length = 3)
    rake.extract_keywords_from_text(entry['_source']['title'])
    keywords = rake.get_ranked_phrases_with_scores()
    for word in keywords:
        title_list.append({
            'doc_id': entry['_source']['id'],
            'year': entry['_source']['year'],
            'keyword': word[1],
            'RAKE_score_title': word[0]
        })
        
abstract_list = []
tot_num_docs = es.count(index = 'nips_papers', body = {
    'query': {
        'bool': {
            'must_not': {
                'match': { 'abstract': 'Abstract Missing' }
            }
        }
    }
})['count']

res = es.search(index = 'nips_papers', body = {
    '_source': ['id', 'year', 'abstract'],
    'size': tot_num_docs,
    'query': {
        'bool': {
            'must_not': {
                'match': {
                    'abstract': 'Abstract Missing'
                }
            }
        }
    }
})

for entry in res['hits']['hits']:
    rake = Rake(min_length = 1, max_length = 3)
    rake.extract_keywords_from_text(entry['_source']['abstract'])
    keywords = rake.get_ranked_phrases_with_scores()
    for word in keywords:
        abstract_list.append({
            'doc_id': entry['_source']['id'],
            'year': entry['_source']['year'],
            'keyword': word[1],
            'RAKE_score_abstract': word[0]
        })

In [42]:
combo = {}

for entry in title_list:
    if entry['keyword'] not in combo:
        combo[entry['keyword']] = {
            entry['year']: { 
                entry['doc_id']: (entry['RAKE_score_title'], 't')
            }
        }
    else:
        combo[entry['keyword']].update({
            entry['year']: {
                entry['doc_id']: (entry['RAKE_score_title'], 't')
            }
        })

for entry in abstract_list:
    if entry['keyword'] not in combo:
        combo[entry['keyword']] = {
            entry['year']: {
                entry['doc_id']: (entry['RAKE_score_abstract'], 'a')
            }
        }
    else:
        if entry['year'] not in combo[entry['keyword']]:
            combo[entry['keyword']].update({
                entry['year']: {
                    entry['doc_id']: (entry['RAKE_score_abstract'], 'a')
                }
            })
        else:
            combo[entry['keyword']][entry['year']].update({
                entry['doc_id']: (entry['RAKE_score_abstract'], 'a')
            })
            
combo

{'tailed': {'2017': {'7065': (1.0, 'a'),
   '7190': (1.0, 'a'),
   '7208': (1.0, 'a'),
   '7278': (1.5, 'a')}},
 'solving expensive black': {'2017': {'6780': (9.0, 'a')}},
 'strong loss bounds': {'2011': {'4388': (7.333333333333334, 'a')}},
 'histogram estimation': {'2017': {'6948': (4.0, 'a')}},
 'illuminants': {'2015': {'5864': (1.5, 'a')}},
 'batch learning algorithm': {'2008': {'3514': (8.666666666666666, 'a')}},
 'parallel mixture': {'2001': {'1949': (4.0, 't')}},
 'algorithm': {'1997': {'5220': (1.0, 'a')},
  '2001': {'2092': (1.0, 't')},
  '2004': {'2616': (1.0, 't')},
  '2007': {'3164': (1.0, 'a'),
   '3176': (1.0, 'a'),
   '3178': (1.0, 'a'),
   '3186': (1.5, 'a'),
   '3190': (1.0, 'a'),
   '3200': (1.0, 'a'),
   '3233': (1.0, 'a'),
   '3249': (1.5, 'a'),
   '3265': (1.5, 'a'),
   '3289': (1.0, 'a'),
   '3303': (1.0, 'a'),
   '3305': (1.0, 'a'),
   '3329': (2.0, 'a'),
   '3348': (2.0, 'a'),
   '3352': (1.6666666666666667, 'a')},
  '2008': {'3389': (1.0, 'a'),
   '3393': (1.5, 

In [43]:
filtered_combo = {}

for sig_term in sig_terms_set:
    for item in combo.items():
        if sig_term in item[0]:
            filtered_combo[item[0]] = item[1]
            
filtered_combo

{'strong loss bounds': {'2011': {'4388': (7.333333333333334, 'a')}},
 'batch learning algorithm': {'2008': {'3514': (8.666666666666666, 'a')}},
 'parallel mixture': {'2001': {'1949': (4.0, 't')}},
 'algorithm': {'1997': {'5220': (1.0, 'a')},
  '2001': {'2092': (1.0, 't')},
  '2004': {'2616': (1.0, 't')},
  '2007': {'3164': (1.0, 'a'),
   '3176': (1.0, 'a'),
   '3178': (1.0, 'a'),
   '3186': (1.5, 'a'),
   '3190': (1.0, 'a'),
   '3200': (1.0, 'a'),
   '3233': (1.0, 'a'),
   '3249': (1.5, 'a'),
   '3265': (1.5, 'a'),
   '3289': (1.0, 'a'),
   '3303': (1.0, 'a'),
   '3305': (1.0, 'a'),
   '3329': (2.0, 'a'),
   '3348': (2.0, 'a'),
   '3352': (1.6666666666666667, 'a')},
  '2008': {'3389': (1.0, 'a'),
   '3393': (1.5, 'a'),
   '3414': (1.0, 'a'),
   '3420': (1.0, 'a'),
   '3423': (1.0, 'a'),
   '3426': (1.0, 'a'),
   '3428': (1.0, 'a'),
   '3431': (1.5, 'a'),
   '3444': (1.0, 'a'),
   '3446': (1.0, 'a'),
   '3478': (2.0, 'a'),
   '3479': (1.0, 'a'),
   '3485': (1.0, 'a'),
   '3486': (2.0, '

In [44]:
flattened_combo = []

for word, years in filtered_combo.items():
    for year, doc in years.items():
        final_score = 0
        for doc_id, score in doc.items():
            if score[1] == 'a':
                final_score += score[0]
            elif score[1] == 't':
                final_score += score[0] * 9
        flattened_combo.append({
            'keyword': word,
            'year': year,
            'score': final_score
        })
        
flattened_combo

[{'keyword': 'strong loss bounds', 'score': 7.333333333333334, 'year': '2011'},
 {'keyword': 'batch learning algorithm',
  'score': 8.666666666666666,
  'year': '2008'},
 {'keyword': 'parallel mixture', 'score': 36.0, 'year': '2001'},
 {'keyword': 'algorithm', 'score': 62.666666666666664, 'year': '2010'},
 {'keyword': 'algorithm', 'score': 65.53333333333333, 'year': '2011'},
 {'keyword': 'algorithm', 'score': 9.0, 'year': '2001'},
 {'keyword': 'algorithm', 'score': 67.91666666666667, 'year': '2013'},
 {'keyword': 'algorithm', 'score': 51.666666666666664, 'year': '2009'},
 {'keyword': 'algorithm', 'score': 86.66666666666667, 'year': '2015'},
 {'keyword': 'algorithm', 'score': 116.41666666666667, 'year': '2016'},
 {'keyword': 'algorithm', 'score': 43.416666666666664, 'year': '2012'},
 {'keyword': 'algorithm', 'score': 35.91666666666667, 'year': '2008'},
 {'keyword': 'algorithm', 'score': 1.0, 'year': '1997'},
 {'keyword': 'algorithm', 'score': 91.36666666666669, 'year': '2014'},
 {'keywo

In [45]:
max_score = 0.0

for entry in flattened_combo:
    if entry['score'] > max_score:
        max_score = entry['score']

for entry in flattened_combo:
    entry['score'] = entry['score']/max_score
    
flattened_combo

[{'keyword': 'strong loss bounds',
  'score': 0.03841117415975556,
  'year': '2011'},
 {'keyword': 'batch learning algorithm',
  'score': 0.04539502400698384,
  'year': '2008'},
 {'keyword': 'parallel mixture', 'score': 0.18856394587516367, 'year': '2001'},
 {'keyword': 'algorithm', 'score': 0.3282409428197293, 'year': '2010'},
 {'keyword': 'algorithm', 'score': 0.34325621999127015, 'year': '2011'},
 {'keyword': 'algorithm', 'score': 0.04714098646879092, 'year': '2001'},
 {'keyword': 'algorithm', 'score': 0.3557398515931907, 'year': '2013'},
 {'keyword': 'algorithm', 'score': 0.270624181580096, 'year': '2009'},
 {'keyword': 'algorithm', 'score': 0.4539502400698385, 'year': '2015'},
 {'keyword': 'algorithm', 'score': 0.6097773897861196, 'year': '2016'},
 {'keyword': 'algorithm', 'score': 0.227411610650371, 'year': '2012'},
 {'keyword': 'algorithm', 'score': 0.18812745525971192, 'year': '2008'},
 {'keyword': 'algorithm', 'score': 0.005237887385421213, 'year': '1997'},
 {'keyword': 'algor

In [46]:
with open('RAKE_lc_9_1_filtered.csv', 'w', newline='') as f:
    header_present = False
    for word in flattened_combo:
        if not header_present:
            w = csv.DictWriter(f, word.keys())
            w.writeheader()
            header_present = True
        w.writerow(word)