In [14]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, streaming_bulk
import json
import math
import numbers
import pandas as pd

In [39]:
host_url = ['http://127.0.0.1:9200']

In [40]:
es_conn = Elasticsearch(host_url)

In [41]:
def sanitize(value):
    if isinstance(value, numbers.Number) and math.isnan(value):
        return None
    else:
        return value

def parse_metadata(metadata_df):
    for index, row in metadata_df.iterrows():
        yield json.dumps({'index' : 'metadata',
        'doc_type' : 'record',
        'doc' : {
        'schema' : sanitize(row[0]),
        'uuid' : sanitize(row[1]), 
        'id' : sanitize(row[2]), 
        'title' : sanitize(row[3]), 
        'abstract' : sanitize(row[4]), 
        'keyword' : sanitize(row[5]),
        'link' : sanitize(row[6]),
        'responsibleParty' : sanitize(row[7]),
        'metadatacreationdate' : sanitize(row[8]),
        'geoBox' : sanitize(row[9]),
        'image' : sanitize(row[10]),
        'LegalConstraints' : sanitize(row[11]),
        'temporalExtent' : sanitize(row[12]),
        'parentId' : sanitize(row[13]),
        'datasetcreationdate' : sanitize(row[14]),
        'Constraints' : sanitize(row[15]),
        'SecurityConstraints' : sanitize(row[16])
              },}, 
        allow_nan = False)

In [42]:
df = pd.read_csv('clean_metadata.csv')

In [43]:
bulk(es_conn, parse_metadata(df), index = 'metadata', doc_type = 'record')

(864, [])

In [44]:
print (es_conn.count(index='metadata')['count'], 'documents in index')

864 documents in index


In [45]:
es_conn.search(index = 'metadata')

{'took': 4,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 864,
  'max_score': 1.0,
  'hits': [{'_index': 'metadata',
    '_type': 'record',
    '_id': 'QeJ4DGkBjyVARiGt5y-7',
    '_score': 1.0,
    '_source': {'index': 'metadata',
     'doc_type': 'record',
     'doc': {'schema': ' iso19139',
      'uuid': '50e557e7e4b0a4aa5bb02c02',
      'id': '1291',
      'title': 'Trophic transfer efficiency of methylmercury and inorganic mercury to lake trout <i>Salvelinus namaycush</i> from its prey',
      'abstract': 'Based on a laboratory experiment, we estimated the net trophic transfer efficiency of methylmercury to lake trout <i>Salvelinus namaycush</i> from its prey to be equal to 76.6 %. Under the assumption that gross trophic transfer efficiency of methylmercury to lake trout from its prey was equal to 80 %, we estimated that the rate at which lake trout eliminated methylmercury was 0.000244 day<sup>−1</sup>. Our laboratory

In [37]:
es_conn.search(index="metadata", body={"query": 
{"multi_match": 
{'query': 'Milwaukee', 
'fields': ['schema', 'uuid', 'id', 'title', 'abstract', 'keyword', 'geoBox', 'image', 'responsibleParty', 'link', 'LegalConstraints']}}})

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 0, 'max_score': None, 'hits': []}}

In [50]:
es_conn.search(index = 'metadata', doc_type = 'record', body = {"query" : {"match" : {"keyword": "shore"}}})

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 0, 'max_score': None, 'hits': []}}