In [1]:
from rake_nltk import Rake
from elasticsearch import Elasticsearch
import csv

In [2]:
# Client for the Elasticsearch node reached on port 80 of the host below,
# with every request path prefixed by 'es'.
es = Elasticsearch([
    dict(host='nyuvis-web.poly.edu', port=80, url_prefix='es'),
])

In [3]:
# Ask the index how many documents it holds so one search can return them all.
tot_num_docs = es.count(index = 'nips_papers')['count']

# Single request sized to the whole index, restricted to the fields used below.
res = es.search(
    index = 'nips_papers',
    body = {
        "size": tot_num_docs,
        "_source": ['id', 'year', 'title']
    }
)

# Keep only the stored fields ('_source') of each hit.
title_list = [hit['_source'] for hit in res['hits']['hits']]

len(title_list)

7241

In [4]:
# Extract RAKE key phrases from every paper title: one record per (paper, phrase).
#
# A single Rake instance is shared by all titles. Constructing it does not
# depend on the document, so building a new one per iteration was repeated
# loop-invariant work; each extract_keywords_from_text() call ranks only the
# text it is given.
rake = Rake()
keywords_list = []

for doc in title_list:
    rake.extract_keywords_from_text(doc['title'])
    # get_ranked_phrases_with_scores() yields (score, phrase) pairs.
    for score, phrase in rake.get_ranked_phrases_with_scores():
        keywords_list.append({
            'doc_id': doc['id'],
            'year': doc['year'],
            'keyword': phrase,
            'RAKE score': score
        })

keywords_list

[{'RAKE score': 9.0,
  'doc_id': '1001',
  'keyword': 'neural network ensembles',
  'year': '1994'},
 {'RAKE score': 4.0,
  'doc_id': '1001',
  'keyword': 'cross validation',
  'year': '1994'},
 {'RAKE score': 4.0,
  'doc_id': '1001',
  'keyword': 'active learning',
  'year': '1994'},
 {'RAKE score': 16.0,
  'doc_id': '1004',
  'keyword': 'iceg morphology classification using',
  'year': '1994'},
 {'RAKE score': 16.0,
  'doc_id': '1004',
  'keyword': 'analogue vlsi neural network',
  'year': '1994'},
 {'RAKE score': 9.0,
  'doc_id': '1006',
  'keyword': 'volatile analogue amorphous',
  'year': '1994'},
 {'RAKE score': 4.0,
  'doc_id': '1006',
  'keyword': 'silicon memories',
  'year': '1994'},
 {'RAKE score': 4.0,
  'doc_id': '1006',
  'keyword': 'pulsestream synapses',
  'year': '1994'},
 {'RAKE score': 1.0, 'doc_id': '1006', 'keyword': 'non', 'year': '1994'},
 {'RAKE score': 1.0, 'doc_id': '1007', 'keyword': 'play', 'year': '1994'},
 {'RAKE score': 1.0, 'doc_id': '1007', 'keyword': '

In [5]:
# Disabled export of the keyword records to 'RAKE_title_keywords.csv'.
# The code is wrapped in a bare triple-quoted string, so nothing is written:
# the cell only evaluates to that string, which is why the string itself is
# what the cell displays.
'''with open('RAKE_title_keywords.csv', 'w', newline = '') as f:
    header_present = False
    for doc in keywords_list:
        if not header_present:
            w = csv.DictWriter(f, doc.keys())
            w.writeheader()
            header_present = True
        w.writerow(doc)'''

"with open('RAKE_title_keywords.csv', 'w', newline = '') as f:\n    header_present = False\n    for doc in keywords_list:\n        if not header_present:\n            w = csv.DictWriter(f, doc.keys())\n            w.writeheader()\n            header_present = True\n        w.writerow(doc)"

In [9]:
abstract_list = []

# Exclude papers whose abstract is the placeholder text 'Abstract Missing'.
#
# match_phrase is used instead of match: on an analyzed text field a match
# query ORs the individual terms, so must_not/match would also drop every real
# abstract that merely contains the word "abstract" or "missing".
# NOTE(review): assumes 'abstract' is an analyzed text field - confirm against
# the index mapping.
has_abstract_query = {
    'bool': {
        'must_not': {
            'match_phrase': { 'abstract': 'Abstract Missing' }
        }
    }
}

# The same query object drives both the count and the search, so the requested
# size always corresponds to the filter that is actually applied.
tot_num_docs = es.count(index = 'nips_papers', body = {
    'query': has_abstract_query
})['count']

res = es.search(index = 'nips_papers', body = {
    '_source': ['id', 'year', 'abstract'],
    'size': tot_num_docs,
    'query': has_abstract_query
})

res

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': '3168',
    '_index': 'nips_papers',
    '_score': 1.0,
    '_source': {'abstract': 'We show that under suitable assumptions (primarily linearization) a simple and perspicuous online learning rule for Information Bottleneck optimization with spiking neurons can be derived. This rule performs on common benchmark tasks as well as a rather complex rule that has previously been proposed \\cite{KlampflETAL:07b}. Furthermore, the transparency of this new learning rule makes a theoretical analysis of its convergence properties feasible. A variation of this learning rule (with sign changes) provides a theoretically founded method for performing Principal Component Analysis {(PCA)} with spiking neurons. By applying this rule to an ensemble of neurons, different principal components of the input can be extracted. In addition, it is possible to preferentially extract those principal components from in

In [10]:
# Rebuild the list from the search response rather than appending to the
# existing one, so re-running this cell does not duplicate every abstract.
abstract_list = [entry['_source'] for entry in res['hits']['hits']]

abstract_list

[{'abstract': 'We show that under suitable assumptions (primarily linearization) a simple and perspicuous online learning rule for Information Bottleneck optimization with spiking neurons can be derived. This rule performs on common benchmark tasks as well as a rather complex rule that has previously been proposed \\cite{KlampflETAL:07b}. Furthermore, the transparency of this new learning rule makes a theoretical analysis of its convergence properties feasible. A variation of this learning rule (with sign changes) provides a theoretically founded method for performing Principal Component Analysis {(PCA)} with spiking neurons. By applying this rule to an ensemble of neurons, different principal components of the input can be extracted. In addition, it is possible to preferentially extract those principal components from incoming signals $X$ that are related or are not related to some additional target signal $Y_T$. In a biological interpretation, this target signal $Y_T$ (also called re

In [11]:
# Extract RAKE key phrases from every abstract: one record per (paper, phrase).
#
# A single Rake instance is shared by all abstracts. Constructing it does not
# depend on the document, so building a new one per iteration was repeated
# loop-invariant work; each extract_keywords_from_text() call ranks only the
# text it is given.
rake = Rake()
keywords_list = []

for doc in abstract_list:
    rake.extract_keywords_from_text(doc['abstract'])
    # get_ranked_phrases_with_scores() yields (score, phrase) pairs.
    for score, phrase in rake.get_ranked_phrases_with_scores():
        keywords_list.append({
            'doc_id': doc['id'],
            'year': doc['year'],
            'keyword': phrase,
            'RAKE score': score
        })

keywords_list

[{'RAKE score': 43.5,
  'doc_id': '3168',
  'keyword': 'performing principal component analysis {( pca )}',
  'year': '2007'},
 {'RAKE score': 16.0,
  'doc_id': '3168',
  'keyword': 'could represent proprioceptive feedback',
  'year': '2007'},
 {'RAKE score': 16.0,
  'doc_id': '3168',
  'keyword': 'also called relevance variable',
  'year': '2007'},
 {'RAKE score': 14.0,
  'doc_id': '3168',
  'keyword': 'perspicuous online learning rule',
  'year': '2007'},
 {'RAKE score': 14.0,
  'doc_id': '3168',
  'keyword': 'new learning rule makes',
  'year': '2007'},
 {'RAKE score': 9.5,
  'doc_id': '3168',
  'keyword': 'different principal components',
  'year': '2007'},
 {'RAKE score': 9.0,
  'doc_id': '3168',
  'keyword': 'theoretically founded method',
  'year': '2007'},
 {'RAKE score': 9.0,
  'doc_id': '3168',
  'keyword': 'information bottleneck optimization',
  'year': '2007'},
 {'RAKE score': 9.0,
  'doc_id': '3168',
  'keyword': 'convergence properties feasible',
  'year': '2007'},
 {'RA

In [12]:
# Write one CSV row per extracted key phrase.
#
# - The columns are listed explicitly so the header does not depend on the key
#   order of the first record and is still written when there are no records.
# - The encoding is pinned to UTF-8 so the export does not depend on the
#   platform default, which can fail on non-ASCII characters in the phrases.
fieldnames = ['doc_id', 'year', 'keyword', 'RAKE score']

with open('RAKE_abstract_keywords.csv', 'w', newline = '', encoding = 'utf-8') as f:
    w = csv.DictWriter(f, fieldnames = fieldnames)
    w.writeheader()
    w.writerows(keywords_list)