In [1]:
# Give access to utils
import os
import sys
# Resolve the parent directory to an absolute path and put it on the import
# path (only once) so the `import util` that follows can be resolved.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
import util

In [2]:
# Load an example publication page from a local HTML file into a single string
fname = 'nature07365.html'
with open(fname, 'r') as example_html:
    html = example_html.read()

In [3]:
# Import into Beautiful Soup
from bs4 import BeautifulSoup
import re
soup = BeautifulSoup(html, 'html.parser')

# Article sections: <section> tags selected by their aria-labelledby
# attribute; the pattern r".*" accepts any value.
sections = soup.find_all('section', attrs={'aria-labelledby': re.compile(r".*")})
# Not echoed: a notebook cell only displays its last expression.
len(sections)

# Article title: the first <h1> selected by its data-article-title attribute
# (again with a match-anything pattern, so an empty value is accepted).
title_el = soup.find('h1', attrs={'data-article-title': re.compile(r".*")})
title_el

<h1 class="tighten-line-height small-space-below" data-article-title="" itemprop="name headline">Transient nature of late Pleistocene climate variability</h1>

In [4]:
# Grab the first <time> element of the page; its text is later parsed into
# the article date.
time_el = soup.find('time')
time_el

<time datetime="2008-03-21">21 March 2008</time>

In [5]:
# We define a method that turns HTML into text
# Source: https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
def html2txt(html, strip_tags=('script', 'style', 'a', 'h2')):
    """Convert an HTML fragment to plain text.

    Parameters
    ----------
    html : str
        HTML markup to convert.
    strip_tags : iterable of str, optional
        Names of the tags that are removed, together with everything inside
        them, before the text is collected. The default removes scripts,
        style sheets, links and second-level headings.

    Returns
    -------
    The text of what is left of the document, as produced by
    ``BeautifulSoup.get_text()``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Calling the soup object is shorthand for find_all(); every matching
    # element is detached from the tree so its text is not collected below.
    for element in soup(list(strip_tags)):
        element.extract()

    return soup.get_text()

In [6]:
# Reduce the title and date elements to their plain-text content.
# NOTE: `time` here is the date string taken from the page, not the stdlib module.
title, time = (html2txt(str(element)) for element in (title_el, time_el))

In [7]:
import re

# Sections whose content is not useful and is therefore left out
skip_sections = {
    'acknowledgements',
    'references',
    'author-information',
    'supplementary-information',
    'article-comments',
}

section_texts = []
section_titles = []

# Convert every section that is not on the skip list to plain text,
# keeping its title alongside (the two lists stay index-aligned).
for section in sections:
    s_title = section['aria-labelledby']
    if s_title not in skip_sections:
        text = html2txt(unicode(section))
        # Replace every non-ASCII character with a single space
        text = re.sub(r'[^\x00-\x7f]', r' ', text)
        section_texts.append(text)
        section_titles.append(s_title)
    

In [8]:
# Join all kept section texts into one newline-separated string
texts = '\n'.join(section_texts)
#print texts

Certain forums have recommended number substitution, which could be worth looking into: replace each number with one of a few tokens (negative, between 0 and 1, or positive).

Additionally, we want to split on every character that is not a letter.

In [9]:
def is_number(s):
    """Return True if the token ``s`` starts with a digit.

    Only the first character is inspected, so a token such as '18Ocal'
    counts as a number. An empty string is not a number.
    """
    # Slicing instead of indexing avoids an IndexError on the empty string:
    # ''[:1] is '' and ''.isdigit() is False.
    return s[:1].isdigit()

# Check if numbers can be identified
is_number('18Ocal')

True

In [10]:
import re

# Tokenize each section: split on whitespace, collapse every chunk that
# starts with a digit into one placeholder token, and break the remaining
# chunks into runs of ASCII letters.
texts_word_lists = []
for text in section_texts:
    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty strings. Splitting on ' ' alone would
    # keep e.g. '12\nfoo' as one chunk, which is_number() would turn into
    # TOKEN_NUMBER, silently dropping 'foo'.
    text_list = text.split()
    text_list_out = []
    for word in text_list:
        if is_number(word):
            text_list_out.append('TOKEN_NUMBER')
        else:
            subwords = re.split('[^a-zA-Z]', word)
            # re.split leaves empty strings around separators; discard them
            text_list_out.extend(filter(None, subwords))
    texts_word_lists.append(text_list_out)
    
#texts_word_lists

In [11]:
# Inspect the extracted text of the second kept section
section_texts[1]

u'A recent composite of benthic  18Ocal (that is, calcite) records summarizes the general pattern of climate back to 3.0 Myr ago (). A 500-kyr moving boxcar window (see ref. ) highlights long-term changes in  18Ocal variability (). This window length was chosen to mute glacial interglacial variations and any possible influence of the 413-kyr eccentricity cycle. There is an increase in variability after the Pliocene/Pleistocene boundary ( 1.8 Myr ago), with clear evidence for steps  665 kyr and  885 kyr ago. The trend is similar () to strontium isotope changes that are sometimes interpreted to reflect variations in continental weathering.Figure 1: Evidence for trends in climate variability in the Plio-Pleistocene.a, Plio-Pleistocene composite  18Ocal record from ref. ; more positive values indicate colder climates. ( 18O = (18O/16O)sample/(18O/16O)standard - 1 is a measure of the difference in a sample, with respect to a standard, in the stable isotope ratios of 18O to 16O, usually expr

In [15]:
# The last step is to ingest each section text
# We want to include: section title, section text, article title
import json
import dateutil.parser

# Parse the date string taken from the page into a datetime object
date = dateutil.parser.parse(time)

# Document for the first kept section: article metadata plus section content
result = {
    'article_title': title,
    'article_date': date,
    'section_title': section_titles[0],
    'text': section_texts[0],
}

result

{'article_date': datetime.datetime(2008, 3, 21, 0, 0),
 'article_title': u'Transient nature of late Pleistocene climate variability',
 'section_title': u'abstract',
 'text': u"Climate in the early Pleistocene varied with a period of 41 kyr and was related to variations in Earth's obliquity. About 900 kyr ago, variability increased and oscillated primarily at a period of  100 kyr, suggesting that the link was then with the eccentricity of Earth's orbit. This transition has often,,, been attributed to a nonlinear response to small changes in external boundary conditions. Here we propose that increasing variablility within the past million years may indicate that the climate system was approaching a second climate bifurcation point, after which it would transition again to a new stable state characterized by permanent mid-latitude Northern Hemisphere glaciation. From this perspective the past million years can be viewed as a transient interval in the evolution of Earth's climate. We suppo

In [16]:
# We have a good bit of data. But we still need an index and mapping
from datetime import datetime
from elasticsearch import Elasticsearch
from elasticsearch import Elasticsearch, RequestsHttpConnection

# NOTE(review): the cluster endpoint is hard-coded; consider reading it from
# configuration instead.
host = 'search-earth-data-joihn22ik6zepetuzpa4hdvik4.us-east-1.es.amazonaws.com'

# Client talking to the cluster on port 443 with SSL and certificate
# verification switched on, using the requests-based connection class.
es = Elasticsearch(
    hosts=[dict(host=host, port=443)],
    connection_class=RequestsHttpConnection,
    use_ssl=True,
    verify_certs=True,
)

In [17]:
# Sanity check: ask the cluster for its basic info (name, version, ...)
es.info()

{u'cluster_name': u'632795836081:earth-data',
 u'cluster_uuid': u'MEcY8X53QLKrkYhp4RgFnA',
 u'name': u'dTPFcJ8',
 u'tagline': u'You Know, for Search',
 u'version': {u'build_date': u'2017-12-07T01:43:54.348Z',
  u'build_hash': u'd951bbf',
  u'build_snapshot': False,
  u'lucene_version': u'7.0.1',
  u'minimum_index_compatibility_version': u'5.0.0',
  u'minimum_wire_compatibility_version': u'5.6.0',
  u'number': u'6.0.1'}}

In [19]:
# Index the example document into 'nature' as type 'section' under a fixed id
# (id=1), so running this cell again targets the same document.
res = es.index(index='nature', doc_type='section', id=1, body=result)

In [20]:
# Show the response returned by the index call
res

{u'_id': u'1',
 u'_index': u'nature',
 u'_primary_term': 1,
 u'_seq_no': 2,
 u'_shards': {u'failed': 0, u'successful': 1, u'total': 2},
 u'_type': u'section',
 u'_version': 3,
 u'result': u'updated'}

In [48]:
# The section text should be analyzed as English, so start by fetching the
# current mapping of the 'section' type in the 'nature' index.
mapping = es.indices.get_mapping(index='nature', doc_type='section')

import json
# Dump the mapping as JSON text for inspection. The call form of print with
# a single argument produces the same output on Python 2 and Python 3.
mapping_str = json.dumps(mapping)
print(mapping_str)

{"nature": {"mappings": {"section": {"properties": {"article_title": {"fields": {"keyword": {"ignore_above": 256, "type": "keyword"}}, "type": "text"}, "text": {"fields": {"keyword": {"ignore_above": 256, "type": "keyword"}}, "type": "text"}, "article_date": {"type": "date"}, "section_title": {"fields": {"keyword": {"ignore_above": 256, "type": "keyword"}}, "type": "text"}}}}}}


In [49]:
# Tweak the mapping so that document bodies are analyzed as English.
# mapping_body is a reference into `mapping` (no copy is made), so the
# change below is visible through `mapping` as well.
mapping_body = mapping['nature']['mappings']['section']
mapping_body['properties'].update({
    u'text': {u'type': u'text', u'analyzer': u'english'},
})
mapping_body


{u'properties': {u'article_date': {u'type': u'date'},
  u'article_title': {u'fields': {u'keyword': {u'ignore_above': 256,
     u'type': u'keyword'}},
   u'type': u'text'},
  u'section_title': {u'fields': {u'keyword': {u'ignore_above': 256,
     u'type': u'keyword'}},
   u'type': u'text'},
  u'text': {u'analyzer': u'english', u'type': u'text'}}}

In [50]:
# Create a separate index that will receive the English-analyzed mapping.
# NOTE(review): presumably this fails once the index exists, which would make
# the cell non-re-runnable as written — confirm.
resp = es.indices.create(index='nature-english')
resp

{u'acknowledged': True,
 u'index': u'nature-english',
 u'shards_acknowledged': True}

In [52]:
# Apply the tweaked mapping to the 'section' type of the new index
es.indices.put_mapping(index='nature-english', doc_type='section', body=mapping_body)

{u'acknowledged': True}

In [53]:
# Make sure the example document built earlier is still available in the kernel
result

{'article_date': datetime.datetime(2008, 3, 21, 0, 0),
 'article_title': u'Transient nature of late Pleistocene climate variability',
 'section_title': u'abstract',
 'text': u"Climate in the early Pleistocene varied with a period of 41 kyr and was related to variations in Earth's obliquity. About 900 kyr ago, variability increased and oscillated primarily at a period of  100 kyr, suggesting that the link was then with the eccentricity of Earth's orbit. This transition has often,,, been attributed to a nonlinear response to small changes in external boundary conditions. Here we propose that increasing variablility within the past million years may indicate that the climate system was approaching a second climate bifurcation point, after which it would transition again to a new stable state characterized by permanent mid-latitude Northern Hemisphere glaciation. From this perspective the past million years can be viewed as a transient interval in the evolution of Earth's climate. We suppo

In [70]:
import copy
from elasticsearch.helpers import bulk

# Send the example document to the English-analyzed index through the bulk
# helper. A deep copy is submitted, so `result` itself is not handed over.
doc = copy.deepcopy(result)
docs = [doc]
resp = bulk(es, docs, index='nature-english', doc_type='section', stats_only=True)


In [71]:
# Show what the bulk helper returned
resp

(1, 0)

In [56]:
# The final piece is in the script: ingest all docs in data-html.
# Let's examine the output.

import cPickle as pickle
REVISION_NUM = 1
# A context manager closes the file handle as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code; only load files that this
# project produced itself.
with open('doc%d.p' % REVISION_NUM, 'rb') as doc_file:
    doc_list = pickle.load(doc_file)

In [57]:
# Number of entries in the loaded document list
len(doc_list)

187