In [11]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, streaming_bulk
import json
import math
import numbers
import pandas as pd

In [12]:
# Elasticsearch endpoint(s) for the metadata cluster; passed to the client constructor below.
host_url = [
    'https://search-glos-metadata-jy4xxxs6o26fgmdj7guj32nvje.us-east-2.es.amazonaws.com',
]

In [13]:
# Client for the cluster listed in host_url; the indexing, count and search cells below all use it.
es_conn = Elasticsearch(host_url)

In [14]:
def sanitize(value):
    """Return ``None`` for a numeric NaN and the value itself otherwise.

    Only instances of ``numbers.Number`` are ever tested with ``math.isnan``;
    strings, ``None`` and any other non-numeric object are returned unchanged.
    Infinite values are not NaN and are therefore returned unchanged too.
    """
    is_numeric = isinstance(value, numbers.Number)
    # `and` short-circuits, so math.isnan never sees a non-numeric value.
    return None if is_numeric and math.isnan(value) else value

def parse_metadata(metadata_df):
    """Yield one JSON-encoded document per row of ``metadata_df``.

    Columns are read strictly by position: the first column becomes
    ``schema``, the second ``uuid``, and so on in the order of
    ``field_names`` below. Column labels are ignored, so the frame must
    supply at least 17 columns in that order.

    Every cell is passed through ``sanitize`` before encoding, and the
    encoder runs with ``allow_nan=False`` so that any non-finite float that
    is still present raises instead of producing non-standard JSON.

    Args:
        metadata_df: pandas DataFrame holding the metadata records.

    Yields:
        str: the JSON object text for one row, with keys in the order of
        ``field_names``.

    Raises:
        ValueError: if a sanitized value is still a non-finite float
            (for example infinity).
    """
    # Document keys, listed in the same order as the source columns.
    field_names = (
        'schema',
        'uuid',
        'id',
        'title',
        'abstract',
        'keyword',
        'link',
        'responsibleParty',
        'metadatacreationdate',
        'geoBox',
        'image',
        'LegalConstraints',
        'temporalExtent',
        'parentId',
        'datasetcreationdate',
        'Constraints',
        'SecurityConstraints',
    )
    for _, row in metadata_df.iterrows():
        # The row Series is indexed by column label, so plain row[i] only
        # works through pandas' deprecated integer-key positional fallback.
        # .iloc states the positional intent explicitly.
        document = {
            name: sanitize(row.iloc[position])
            for position, name in enumerate(field_names)
        }
        yield json.dumps(document, allow_nan=False)

In [16]:
# Load the cleaned metadata export (relative path, resolved from the working directory).
# parse_metadata reads columns by position, so the file's column order must match its field order.
df = pd.read_csv('search/clean_metadata.csv')

In [17]:
## Should only be run once per index: the documents produced by parse_metadata carry no
## explicit _id, so re-running this cell presumably indexes every row again as a duplicate.
## NOTE(review): confirm against the cluster before re-executing.
bulk(es_conn, parse_metadata(df), index = 'metadata', doc_type = 'record')

(864, [])

In [18]:
# Report how many documents the 'metadata' index holds.
print('{} documents in index'.format(es_conn.count(index='metadata')['count']))

864 documents in index


In [19]:
## Sample search query to show that it works!
# Matches 'Milwaukee' against each of the listed document fields.
es_conn.search(
    index="metadata",
    doc_type='record',
    body={
        "query": {
            "multi_match": {
                'query': 'Milwaukee',
                'fields': [
                    'schema',
                    'uuid',
                    'id',
                    'title',
                    'abstract',
                    'keyword',
                    'geoBox',
                    'image',
                    'responsibleParty',
                    'link',
                    'LegalConstraints',
                ],
            },
        },
    },
)

{'took': 135,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 7,
  'max_score': 7.367199,
  'hits': [{'_index': 'metadata',
    '_type': 'record',
    '_id': 'yVg0K2kBvWhuiexrSZS4',
    '_score': 7.367199,
    '_source': {'schema': ' iso19139',
     'uuid': '8e94fefa-ba6b-44eb-b843-6968bc67ba39',
     'id': '1394',
     'title': 'Milwaukee Buoy ATW10 (45182) [out of the water for the season]',
     'abstract': 'The University of Wisconsin-Milwaukee School of Freshwater Sciences operates a monitoring buoy just north of Milwaukee off of Atwater Beach.  Approximately 1 km offshore, in 10 meters of water, the buoy provides real-time water quality and meteorological data.  The buoy has a unique CO2 sampling system that provides a measure of primary production in the near-shore zone.',
     'keyword': 'WindDirection WindSpeed Wind Gust AirTemperature WaterTemperature Relative Humidity Significant Wave Period Significant Wave Heigh

In [21]:
# Single-field match query: documents whose abstract matches 'water'.
es_conn.search(
    index='metadata',
    doc_type='record',
    body={
        'query': {
            'match': {
                'abstract': 'water',
            },
        },
    },
)

{'took': 35,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 282,
  'max_score': 1.9084189,
  'hits': [{'_index': 'metadata',
    '_type': 'record',
    '_id': 'nlg0K2kBvWhuiexrLZIJ',
    '_score': 1.9084189,
    '_source': {'schema': ' iso19139',
     'uuid': 'c81b0e67-616f-45e5-8828-fb32ff77c6b7',
     'id': '455',
     'title': 'Lake Simcoe Region Conservation Authority (LSRCA) Watershed Data',
     'abstract': 'Contains watershed data (water level, air temperature, water temperature, ground water, flow, rainfall) for 11 stations in the Lake Simcoe, Ontario watershed.',
     'keyword': 'water levels water level plots water level gauges water level gages air temperature water temperature groundwater flow rainfall Conservation Ontario LSCRA Lake Simcoe Ontario Beaver River Black River E. Holland River Hawkestone Creek Maskinonge River Pefferlaw River Tannery Creek Uxbridge Brook W. Holland River',
     'link': ' http://lsrc