In [30]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, streaming_bulk
import json
import math
import numbers
import pandas as pd

In [31]:
# Elasticsearch node(s) to connect to; here a single local node on port 9200.
host_url = [
    "http://127.0.0.1:9200",
]

In [32]:
# Client handle shared by every indexing/search cell below.
es_conn = Elasticsearch(hosts=host_url)

In [70]:
def sanitize(value):
    """Return ``None`` in place of a numeric NaN; pass anything else through.

    Non-numeric values (strings, ``None``, ...) are returned untouched, as are
    numbers that are not NaN.
    """
    if not isinstance(value, numbers.Number):
        return value
    return None if math.isnan(value) else value

def parse_metadata(metadata_df):
    """Yield one JSON-encoded document per row of ``metadata_df``.

    Columns are consumed by position, not by header name: column 0 becomes
    ``'schema'``, column 1 ``'uuid'``, and so on through column 16
    (``'SecurityConstraints'``), in the order listed in ``field_names`` below.
    Every cell is passed through ``sanitize`` so that NaN cells are written
    as JSON ``null``.

    Args:
        metadata_df: DataFrame with at least 17 columns in the order above.

    Yields:
        str: the JSON text of one row's document.

    Raises:
        IndexError: if the frame has fewer than 17 columns.
        ValueError: from ``json.dumps(..., allow_nan=False)`` if a value is a
            non-finite float that ``sanitize`` left in place (e.g. infinity).
    """
    # Output key for each column position; the order here is the column order.
    field_names = (
        'schema',
        'uuid',
        'id',
        'title',
        'abstract',
        'keyword',
        'link',
        'responsibleParty',
        'metadatacreationdate',
        'geoBox',
        'image',
        'LegalConstraints',
        'temporalExtent',
        'parentId',
        'datasetcreationdate',
        'Constraints',
        'SecurityConstraints',
    )

    for _, row in metadata_df.iterrows():
        # `row` is a Series labelled by column name. Integer keys on such a
        # Series (`row[0]`) rely on a positional fallback that pandas has
        # deprecated, so positions are addressed explicitly through `.iloc`.
        document = {
            name: sanitize(row.iloc[position])
            for position, name in enumerate(field_names)
        }
        yield json.dumps(document, allow_nan=False)

In [71]:
# Load the metadata table; parse_metadata reads its columns by position.
df = pd.read_csv(
    'clean_metadata.csv',
)

In [54]:
## Should only be run once for a given index: the documents are sent without
## an explicit _id, so a re-run would presumably index every row again as a
## new document (verify against the cluster before re-running).
bulk(
    es_conn,
    parse_metadata(df),
    index='metadata',
    doc_type='record',
)

(864, [])

In [59]:
# Ask the cluster how many documents the 'metadata' index currently holds.
indexed_total = es_conn.count(index='metadata')['count']
print(indexed_total, 'documents in index')

864 documents in index


In [56]:
## Sample search query to show that it works!

# Fields the multi_match query looks in for the search term.
searched_fields = [
    'schema',
    'uuid',
    'id',
    'title',
    'abstract',
    'keyword',
    'geoBox',
    'image',
    'responsibleParty',
    'link',
    'LegalConstraints',
]
sample_query = {
    "query": {
        "multi_match": {
            'query': 'Milwaukee',
            'fields': searched_fields,
        }
    }
}
es_conn.search(index="metadata", doc_type='record', body=sample_query)

{'took': 3,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 7,
  'max_score': 7.421124,
  'hits': [{'_index': 'metadata',
    '_type': 'record',
    '_id': 'PugUF2kB3T1IAmSeHoqf',
    '_score': 7.421124,
    '_source': {'schema': ' iso19139',
     'uuid': '8e94fefa-ba6b-44eb-b843-6968bc67ba39',
     'id': '1394',
     'title': 'Milwaukee Buoy ATW10 (45182) [out of the water for the season]',
     'abstract': 'The University of Wisconsin-Milwaukee School of Freshwater Sciences operates a monitoring buoy just north of Milwaukee off of Atwater Beach.  Approximately 1 km offshore, in 10 meters of water, the buoy provides real-time water quality and meteorological data.  The buoy has a unique CO2 sampling system that provides a measure of primary production in the near-shore zone.',
     'keyword': 'WindDirection WindSpeed Wind Gust AirTemperature WaterTemperature Relative Humidity Significant Wave Period Significant Wave Height 