In [1]:
from elasticsearch import Elasticsearch
from rake_nltk import Rake

es = Elasticsearch([{'host': 'nyuvis-web.poly.edu', 'port': 80, 'url_prefix': 'es'}])

In [2]:
doc_list = []

year = "2007"

while year != "2018":
    num_docs = es.count(index = 'nips_papers', body = {
        "query": {
            "bool": {
                "must": {
                    "match": { "year" : year }
                },
                "must_not": {
                    "match": {
                        "abstract": "Abstract Missing"
                    }
                }
            }
        }
    })['count']
    res = es.search(index = 'nips_papers', body = {
        "size": num_docs,
        "_source": ['id', 'year', 'title', 'abstract'],
        "query": {
            "bool": {
                "must": {
                    "match": { "year" : year }
                },
                "must_not": {
                    "match": {
                        "abstract": "Abstract Missing"
                    }
                }
            }
        }
    })
    for entry in res['hits']['hits']:
        doc_list.append(entry['_source'])
    tmp = int(year) + 1
    year = str(tmp)

doc_list

[{'abstract': 'It is known that determinining whether a DEC-POMDP, namely, a cooperative partially observable stochastic game (POSG), has a cooperative strategy with positive expected reward is complete for NEXP. It was not known until now how cooperation affected that complexity. We show that, for competitive POSGs, the complexity of determining whether one team has a positive-expected-reward strategy is complete for the class NEXP with an oracle for NP.',
  'id': '3163',
  'title': 'Competition Adds Complexity',
  'year': '2007'},
 {'abstract': 'We propose a model that leverages the millions of clicks received by web search engines, to predict document relevance. This allows the comparison of ranking functions when clicks are available but complete relevance judgments are not. After an initial training phase using a set of relevance judgments paired with click data, we show that our model can predict the relevance score of documents that have not been judged. These predictions can be

In [3]:
#making sure doc_list only contains documents from years 2007 to 2017

years = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017']
not_in_range = False

for doc in doc_list:
    if doc['year'] not in years:
        not_in_range = True
        break
        
not_in_range

False

In [4]:
len(doc_list)

3804

In [5]:
#obtain a set of significant terms (unigrams) for filtering after RAKE

unigrams = set()

res = es.search(index = "nips_papers", body = {
    "size": 0,
    "aggs": {
        "years": {
            "terms": { "field" : "year" },
            "aggs": {
                "keywords": {
                    "significant_terms": {
                        "field": "title",
                        "size": 30
                    }
                }
            }
        }
    }
})

tmp = res['aggregations']['years']['buckets']

for year in tmp:
    words = year['keywords']['buckets']
    for word in words:
        unigrams.add(word['key'])
        
unigrams

{'3d',
 'acceleration',
 'action',
 'active',
 'adaptation',
 'additive',
 'adversarial',
 'agent',
 'aggregation',
 'algorithm',
 'alignment',
 'analysi',
 'annotation',
 'armed',
 'assumption',
 'asynchronous',
 'attention',
 'attribute',
 'average',
 'balanced',
 'bandit',
 'bayesian',
 'binary',
 'bound',
 'budget',
 'carlo',
 'cascade',
 'case',
 'chain',
 'choice',
 'class',
 'classifier',
 'coding',
 'cold',
 'collaborative',
 'completion',
 'composition',
 'compressed',
 'conjugate',
 'construction',
 'context',
 'contextual',
 'convex',
 'convolution',
 'convolutional',
 'coordinate',
 'coupled',
 'cover',
 'crowdsourcing',
 'dataset',
 'decentralized',
 'decision',
 'decomposition',
 'deep',
 'dependence',
 'descent',
 'design',
 'detection',
 'determinantal',
 'dictionary',
 'dimensional',
 'disentangled',
 'distributed',
 'distribution',
 'domain',
 'driven',
 'dropout',
 'dynamical',
 'effect',
 'embedding',
 'estimating',
 'estimation',
 'evaluation',
 'expectation',
 'ex

In [6]:
len(unigrams)

232

In [7]:
#RAKE list for both title and abstracts (keywords from length 1 to 3)

k_list = []

for doc in doc_list:
    r = Rake(min_length = 1, max_length = 3)
    r.extract_keywords_from_text(doc['title'])
    keywords = r.get_ranked_phrases_with_scores()
    for word in keywords:
        k_list.append({
            'doc_id': doc['id'],
            'year': doc['year'],
            'keyword': word[1],
            'RAKE_score': word[0]
        })
    r.extract_keywords_from_text(doc['abstract'])
    keywords = r.get_ranked_phrases_with_scores()
    for word in keywords:
        k_list.append({
            'doc_id': doc['id'],
            'year': doc['year'],
            'keyword': word[1],
            'RAKE_score': word[0]
        })
        
k_list

[{'RAKE_score': 9.0,
  'doc_id': '3163',
  'keyword': 'competition adds complexity',
  'year': '2007'},
 {'RAKE_score': 6.5,
  'doc_id': '3163',
  'keyword': 'positive expected reward',
  'year': '2007'},
 {'RAKE_score': 4.5,
  'doc_id': '3163',
  'keyword': 'reward strategy',
  'year': '2007'},
 {'RAKE_score': 4.0, 'doc_id': '3163', 'keyword': 'posg ),', 'year': '2007'},
 {'RAKE_score': 4.0,
  'doc_id': '3163',
  'keyword': 'determinining whether',
  'year': '2007'},
 {'RAKE_score': 4.0,
  'doc_id': '3163',
  'keyword': 'cooperative strategy',
  'year': '2007'},
 {'RAKE_score': 4.0,
  'doc_id': '3163',
  'keyword': 'cooperation affected',
  'year': '2007'},
 {'RAKE_score': 4.0,
  'doc_id': '3163',
  'keyword': 'competitive posgs',
  'year': '2007'},
 {'RAKE_score': 3.5,
  'doc_id': '3163',
  'keyword': 'class nexp',
  'year': '2007'},
 {'RAKE_score': 2.0, 'doc_id': '3163', 'keyword': 'positive', 'year': '2007'},
 {'RAKE_score': 2.0, 'doc_id': '3163', 'keyword': 'expected', 'year': '20

In [8]:
len(k_list)

176983

In [9]:
#filter the keywords list using significant terms set

filtered_list = []

for word in k_list:
    tokens = word['keyword'].split()
    for token in tokens:
        if token in unigrams:
            filtered_list.append(word)
            break
            
len(filtered_list)

37714

In [10]:
filtered_list

[{'RAKE_score': 6.5,
  'doc_id': '3163',
  'keyword': 'positive expected reward',
  'year': '2007'},
 {'RAKE_score': 4.5,
  'doc_id': '3163',
  'keyword': 'reward strategy',
  'year': '2007'},
 {'RAKE_score': 4.0,
  'doc_id': '3163',
  'keyword': 'cooperative strategy',
  'year': '2007'},
 {'RAKE_score': 3.5,
  'doc_id': '3163',
  'keyword': 'class nexp',
  'year': '2007'},
 {'RAKE_score': 2.0, 'doc_id': '3163', 'keyword': 'positive', 'year': '2007'},
 {'RAKE_score': 1.0, 'doc_id': '3163', 'keyword': 'oracle', 'year': '2007'},
 {'RAKE_score': 1.0, 'doc_id': '3190', 'keyword': 'modeling', 'year': '2007'},
 {'RAKE_score': 4.0,
  'doc_id': '3190',
  'keyword': 'ranking functions',
  'year': '2007'},
 {'RAKE_score': 4.0,
  'doc_id': '3190',
  'keyword': 'novel formalization',
  'year': '2007'},
 {'RAKE_score': 4.0,
  'doc_id': '3190',
  'keyword': 'general enough',
  'year': '2007'},
 {'RAKE_score': 3.5,
  'doc_id': '3190',
  'keyword': 'better ranking',
  'year': '2007'},
 {'RAKE_score': 

In [11]:
#make the score out of 1.0

max_score = 0

for word in filtered_list:
    if word['RAKE_score']>max_score:
        max_score = word['RAKE_score']
        
for word in filtered_list:
    word['RAKE_score'] = word['RAKE_score']/max_score
    
filtered_list

[{'RAKE_score': 0.7222222222222222,
  'doc_id': '3163',
  'keyword': 'positive expected reward',
  'year': '2007'},
 {'RAKE_score': 0.5,
  'doc_id': '3163',
  'keyword': 'reward strategy',
  'year': '2007'},
 {'RAKE_score': 0.4444444444444444,
  'doc_id': '3163',
  'keyword': 'cooperative strategy',
  'year': '2007'},
 {'RAKE_score': 0.3888888888888889,
  'doc_id': '3163',
  'keyword': 'class nexp',
  'year': '2007'},
 {'RAKE_score': 0.2222222222222222,
  'doc_id': '3163',
  'keyword': 'positive',
  'year': '2007'},
 {'RAKE_score': 0.1111111111111111,
  'doc_id': '3163',
  'keyword': 'oracle',
  'year': '2007'},
 {'RAKE_score': 0.1111111111111111,
  'doc_id': '3190',
  'keyword': 'modeling',
  'year': '2007'},
 {'RAKE_score': 0.4444444444444444,
  'doc_id': '3190',
  'keyword': 'ranking functions',
  'year': '2007'},
 {'RAKE_score': 0.4444444444444444,
  'doc_id': '3190',
  'keyword': 'novel formalization',
  'year': '2007'},
 {'RAKE_score': 0.4444444444444444,
  'doc_id': '3190',
  'k

In [12]:
def avg(curr_score, new_score, curr_count):
    numerator = curr_score*curr_count + new_score
    denominator = curr_count+1
    average = numerator/denominator
    return average

In [13]:
#make a nested dictionary-- keyword: { year: (count, avg_score) }

info_1 = {}

for word in filtered_list:
    if word['keyword'] in info_1:
        if word['year'] in info_1[word['keyword']]:
            #add count, take average
            prev_count = info_1[word['keyword']][word['year']][0]
            avg_score = avg(info_1[word['keyword']][word['year']][1], word['RAKE_score'], prev_count)
            info_1[word['keyword']][word['year']] = (prev_count+1, avg_score)
        else:
            info_1[word['keyword']][word['year']] = (1, word['RAKE_score'])
    else:
        info_1[word['keyword']] = { word['year']: (1, word['RAKE_score']) }

In [14]:
info_1

{'optimal regret minimization': {'2014': (1, 1.0)},
 'memorization task': {'2017': (1, 0.4444444444444444)},
 'highly variable indicator': {'2008': (1, 1.0)},
 'setting helps': {'2016': (1, 0.4444444444444444)},
 'applying functional alignment': {'2017': (1, 0.8888888888888888)},
 'graphical gaussian vector': {'2012': (2, 0.9722222222222222)},
 'large noise': {'2013': (1, 0.4444444444444444)},
 'sample predictive performance': {'2007': (1, 1.0)},
 'ranking annotators': {'2011': (1, 0.4444444444444444)},
 'probabilistic code': {'2013': (1, 0.5), '2016': (1, 0.4444444444444444)},
 'dynamical framework': {'2014': (1, 0.4444444444444444)},
 'spatiotemporal modeling': {'2016': (1, 0.4444444444444444)},
 'conditional convolutional connections': {'2015': (1, 1.0)},
 'performance benefits': {'2009': (1, 0.4444444444444444)},
 'rice distribution': {'2013': (1, 0.4444444444444444)},
 'parametric mapping': {'2015': (1, 0.4166666666666667)},
 'large images': {'2009': (1, 0.5), '2014': (2, 0.472222

In [15]:
len(info_1)

19147

In [16]:
#get all unigrams

unigrams = set()

for word in info_1.keys():
    if len(word.split()) == 1:
        unigrams.add(word)
        
len(unigrams)

222

In [17]:
#nested dictionary: per unigram, list of years and unigram counts, per year, count of each variation

info_2 = []

for uni in unigrams:
    for word, years in info_1.items():
        if word == uni:
            info_2.append({
                'word': uni,
                'variations': [],
                'total_count': []
            })
            
info_2

[{'total_count': [], 'variations': [], 'word': 'multivariate'},
 {'total_count': [], 'variations': [], 'word': 'sensing'},
 {'total_count': [], 'variations': [], 'word': 'regularization'},
 {'total_count': [], 'variations': [], 'word': 'maximum'},
 {'total_count': [], 'variations': [], 'word': 'probabilistic'},
 {'total_count': [], 'variations': [], 'word': 'symmetric'},
 {'total_count': [], 'variations': [], 'word': 'construction'},
 {'total_count': [], 'variations': [], 'word': 'deep'},
 {'total_count': [], 'variations': [], 'word': 'structured'},
 {'total_count': [], 'variations': [], 'word': 'transfer'},
 {'total_count': [], 'variations': [], 'word': 'modeling'},
 {'total_count': [], 'variations': [], 'word': 'decentralized'},
 {'total_count': [], 'variations': [], 'word': 'shot'},
 {'total_count': [], 'variations': [], 'word': 'making'},
 {'total_count': [], 'variations': [], 'word': 'conjugate'},
 {'total_count': [], 'variations': [], 'word': 'metric'},
 {'total_count': [], 'vari

In [18]:
len(info_2)

222

In [19]:
for entry in info_2:
    uni = entry['word']
    for word, years in info_1.items():
        if len(years.items()) >= 2:
            tokens = word.split()
            for token in tokens:
                if token == uni:
                    entry['variations'].append({
                        'n-gram': word,
                        'years': [ { 'year': int(year), 'count': count[0] } for year, count in years.items() ]
                    })
                    if len(entry['total_count']) == 0:
                        for year, count in years.items():
                            entry['total_count'].append({
                                'year': int(year),
                                'count': count[0]
                            })
                    else:
                        for year, count in years.items():
                            in_entry = False
                            for ct in entry['total_count']:
                                if int(year) == ct['year']:
                                    in_entry = True
                                    ct['count'] += count[0]
                            if not in_entry:
                                entry['total_count'].append({
                                    'year': int(year),
                                    'count': count[0]
                                })
                    break

In [20]:
info_2

[{'total_count': [{'count': 1, 'year': 2009},
   {'count': 1, 'year': 2014},
   {'count': 1, 'year': 2010},
   {'count': 2, 'year': 2016},
   {'count': 2, 'year': 2017},
   {'count': 1, 'year': 2015}],
  'variations': [{'n-gram': 'multivariate regression',
    'years': [{'count': 1, 'year': 2009},
     {'count': 1, 'year': 2014},
     {'count': 1, 'year': 2010}]},
   {'n-gram': 'multivariate losses',
    'years': [{'count': 1, 'year': 2016},
     {'count': 1, 'year': 2017},
     {'count': 1, 'year': 2015}]},
   {'n-gram': 'multivariate hawkes process',
    'years': [{'count': 1, 'year': 2016}, {'count': 1, 'year': 2017}]}],
  'word': 'multivariate'},
 {'total_count': [{'count': 5, 'year': 2016},
   {'count': 3, 'year': 2017},
   {'count': 2, 'year': 2010},
   {'count': 4, 'year': 2009},
   {'count': 1, 'year': 2015},
   {'count': 2, 'year': 2013},
   {'count': 7, 'year': 2012},
   {'count': 3, 'year': 2014},
   {'count': 5, 'year': 2011},
   {'count': 2, 'year': 2008}],
  'variations':

In [21]:
import json

with open('final.json', 'w+') as f:
    json.dump(info_2, f, indent=4)