In [47]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node at nyuvis-web.poly.edu, port 80, URL prefix 'es'.
es = Elasticsearch([
    {
        'host': 'nyuvis-web.poly.edu',
        'port': 80,
        'url_prefix': 'es',
    },
])

In [48]:
# Per-year document counts, stored as a list of single-entry dicts
# of the form {year_string: count}.
doc_count_year = []

# Issue one count request per year, from 1987 through 2017 inclusive.
for yr in range(1987, 2018):
    year_key = str(yr)
    year_query = {
        'query': {
            'match': {
                'year': year_key
            }
        }
    }
    response = es.count(index="nips_papers", body=year_query)
    doc_count_year.append({year_key: response['count']})

doc_count_year

[{'1987': 90},
 {'1988': 94},
 {'1989': 101},
 {'1990': 143},
 {'1991': 144},
 {'1992': 127},
 {'1993': 158},
 {'1994': 140},
 {'1995': 152},
 {'1996': 152},
 {'1997': 150},
 {'1998': 151},
 {'1999': 150},
 {'2000': 152},
 {'2001': 197},
 {'2002': 207},
 {'2003': 198},
 {'2004': 207},
 {'2005': 207},
 {'2006': 204},
 {'2007': 217},
 {'2008': 250},
 {'2009': 262},
 {'2010': 292},
 {'2011': 306},
 {'2012': 368},
 {'2013': 360},
 {'2014': 411},
 {'2015': 403},
 {'2016': 569},
 {'2017': 679}]

In [105]:
# Multi-set analysis: documents are bucketed by year, and inside each year
# bucket a keyword aggregation is run over the `title` field.


def _per_year_title_agg(keyword_agg):
    """Build a search body that buckets by ``year`` and nests ``keyword_agg``.

    Args:
        keyword_agg: Aggregation definition (dict) to run inside every year
            bucket; it is exposed in the response under the name ``keywords``.

    Returns:
        dict: Request body with ``size`` 0 (no hits, aggregations only) and a
        ``keywords_by_years`` terms aggregation of up to 50 year buckets.
    """
    return {
        "size": 0,
        "aggs": {
            "keywords_by_years": {
                "terms": {
                    "field": "year",
                    "size": 50
                },
                "aggs": {
                    "keywords": keyword_agg
                }
            }
        }
    }


# Pass 1: up to 30 significant title terms per year (scored with `gnd`).
sig_terms_by_year = es.search(index="nips_papers", body=_per_year_title_agg({
    "significant_terms": {
        "field": "title",
        "gnd": {},
        "size": 30
    }
}))

# Union of the per-year significant terms, in first-seen order.
# A companion set keeps the membership test O(1) instead of scanning the list.
words_list = []
seen_words = set()
data = sig_terms_by_year['aggregations']['keywords_by_years']['buckets']

for year in data:
    for word in year['keywords']['buckets']:
        key = word['key']
        if key not in seen_words:
            seen_words.add(key)
            words_list.append(key)

# Pass 2: plain per-year document counts, restricted to the collected keywords,
# so every keyword is counted in every year where it occurs.
sig_terms_res = es.search(index="nips_papers", body=_per_year_title_agg({
    "terms": {
        "field": "title",
        "include": words_list,
        "size": len(words_list)
    }
}))

sig_terms_res

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'aggregations': {'keywords_by_years': {'buckets': [{'doc_count': 679,
     'key': '2017',
     'keywords': {'buckets': [{'doc_count': 180, 'key': 'learning'},
       {'doc_count': 85, 'key': 'network'},
       {'doc_count': 66, 'key': 'model'},
       {'doc_count': 63, 'key': 'neural'},
       {'doc_count': 61, 'key': 'deep'},
       {'doc_count': 33, 'key': 'data'},
       {'doc_count': 32, 'key': 'algorithm'},
       {'doc_count': 32, 'key': 'multi'},
       {'doc_count': 29, 'key': 'optimization'},
       {'doc_count': 27, 'key': 'gaussian'},
       {'doc_count': 27, 'key': 'online'},
       {'doc_count': 25, 'key': 'adversarial'},
       {'doc_count': 25, 'key': 'inference'},
       {'doc_count': 25, 'key': 'stochastic'},
       {'doc_count': 24, 'key': 'reinforcement'},
       {'doc_count': 22, 'key': 'dynamic'},
       {'doc_count': 22, 'key': 'efficient'},
       {'doc_count': 22, 'key': 'method'},
       {'d

In [106]:
import csv

# Planned CSV columns: 1) keyword, 2) year, 3) score, 4) score proportion,
# 5) fg_count, 6) doc_count by year, 7) proportion in fg, 8) proportion in bg

# Accumulator for the CSV rows.
sig_list = []

data = sig_terms_res['aggregations']['keywords_by_years']['buckets']
tot_num_docs = es.count(index="nips_papers")['count']

# Total document count per keyword: the keyword's per-year doc_count summed
# over every year bucket present in `data`.
cnt_keywords = {}
for year_bucket in data:
    for term in year_bucket['keywords']['buckets']:
        key = term['key']
        cnt_keywords[key] = cnt_keywords.get(key, 0) + term['doc_count']

cnt_keywords

{'2': 11,
 '3d': 36,
 'acceleration': 7,
 'accuracy': 18,
 'action': 44,
 'active': 88,
 'activity': 41,
 'adaptation': 47,
 'adaptive': 161,
 'additive': 15,
 'adversarial': 43,
 'agent': 30,
 'aggregation': 13,
 'algorithm': 330,
 'alignment': 22,
 'analog': 67,
 'analysi': 244,
 'annealing': 10,
 'annotation': 8,
 'application': 162,
 'approach': 140,
 'approximate': 75,
 'approximation': 96,
 'architecture': 44,
 'armed': 20,
 'array': 10,
 'artificial': 20,
 'associative': 35,
 'assumption': 6,
 'asymmetric': 7,
 'asynchronous': 17,
 'attention': 41,
 'attribute': 12,
 'auditory': 30,
 'augmented': 10,
 'automatic': 25,
 'average': 16,
 'backpropagation': 27,
 'balanced': 11,
 'bandit': 92,
 'based': 280,
 'basi': 30,
 'baye': 40,
 'bayesian': 293,
 'behavior': 23,
 'belief': 59,
 'binding': 10,
 'blind': 28,
 'boosting': 58,
 'bound': 124,
 'budget': 11,
 'carlo': 53,
 'cascade': 17,
 'case': 28,
 'cell': 39,
 'central': 5,
 'chain': 24,
 'channel': 21,
 'character': 12,
 'chip':

In [107]:
# Rebuild the rows from scratch on every run: appending to a list created in an
# earlier cell would duplicate every CSV row each time this cell is re-executed.
sig_list = []

for year in data:
    year_total = year['doc_count']

    # One CSV row per (year, keyword) bucket.
    for word in year['keywords']['buckets']:
        # bg_prop: keyword's summed count (cnt_keywords) over all documents in the index.
        # fg_prop: keyword's count in this year over this year's document count.
        bg_prop = cnt_keywords[word['key']] / tot_num_docs
        fg_prop = word['doc_count'] / year_total
        sig_list.append({
            'score': fg_prop / bg_prop,  # foreground share relative to background share
            'bg_prop': bg_prop,
            'fg_prop': fg_prop,
            'total_docs_year': year_total,
            'doc_count': word['doc_count'],
            'year': year['key'],
            'keyword': word['key']
        })

# Explicit column list: the header no longer depends on the key order of the
# first row, and it is written even when there are no rows.
csv_fields = [
    'score',
    'bg_prop',
    'fg_prop',
    'total_docs_year',
    'doc_count',
    'year',
    'keyword',
]

with open('sig_terms_res_v2.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, fieldnames=csv_fields)
    w.writeheader()
    w.writerows(sig_list)