## SETUP

In [1]:
# First load credentials
import json
(access, secret) = json.load(open('../credentials/elasticsearch.json'))

In [2]:
# Set up the request object
from elasticsearch import Elasticsearch
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

host = 'search-e5ud2nsmhsp52orvyvpqur0k09x-bkkbyr2hfjeiyujryuqkm5y3hu.us-east-1.es.amazonaws.com'
region = 'us-east-1'

awsauth = AWS4Auth(access, secret, region, 'es')

es = Elasticsearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

In [3]:
# Test that it is working
es.info()

{u'cluster_name': u'632795836081:e5ud2nsmhsp52orvyvpqur0k09x',
 u'cluster_uuid': u'ZtlOYew8SCS7wuE60S5QwQ',
 u'name': u'-w-P0bT',
 u'tagline': u'You Know, for Search',
 u'version': {u'build_date': u'2018-02-28T15:42:08.616107Z',
  u'build_hash': u'10b1edd',
  u'build_snapshot': False,
  u'lucene_version': u'7.2.1',
  u'minimum_index_compatibility_version': u'5.0.0',
  u'minimum_wire_compatibility_version': u'5.6.0',
  u'number': u'6.2.2'}}

## ANALYSIS

In [4]:
# First we create a custom analyzer for the index
ANALYZER_VERSION = '1'
ANALYZER_NAME = 'english'
analyzer_id = 'an-' + ANALYZER_VERSION + '-' + ANALYZER_NAME
an_prefix = analyzer_id + '-'
analyzer_settings = {
    "filter": {
        an_prefix + "english_stop": {
            "type":       "stop",
            "stopwords":  "_english_" 
        },
        an_prefix + "english_keywords": {
            "type":       "keyword_marker",
            "keywords":   ["example"] 
        },
        an_prefix + "english_stemmer": {
            "type":       "stemmer",
            "language":   "english"
        },
        an_prefix + "english_possessive_stemmer": {
            "type":       "stemmer",
            "language":   "possessive_english"
        }
    },
    "analyzer": {
        analyzer_id: {
            "tokenizer":  "standard",
            "filter": [
                an_prefix + "english_possessive_stemmer",
                "lowercase",
                an_prefix + "english_stop",
                an_prefix + "english_keywords",
                an_prefix + "english_stemmer"
            ]
        }
    }
}

In [5]:
# Try out the analyzer
# DO INDEX CREATION FIRST

test = "Please analyze this text for me."
test_body = {
    "analyzer": analyzer_id,
    "text": test
}
es.indices.analyze(index=INDEX_NAME, body=test_body)

SyntaxError: invalid syntax (<ipython-input-5-06bd7d2945b9>, line 8)

## MAPPINGS

In [None]:
# Define the index mapping type
MAPPING_NAME = '_doc'
mapping_type = {
    "date_detection": False,
    "properties": {
        "article_date": {"type": "date"},
        "article_title": {
            "fields": {
                "keyword": {
                    "ignore_above": 256,
                    "type": "keyword"
                }
            },
            "type": "text",
            "analyzer": analyzer_id
        },
        "text": {
            "type": "text",
            "analyzer": analyzer_id
        }
    },
    
    # We want unknown string fields to be analyzed - no keywords
    "dynamic_templates": [
        {
            "strings": {
                "match_mapping_type": "string",
                "mapping": {
                    "type": "text",
                    "analyzer": analyzer_id
                }
            }
        }
    ]
}

## INDEX CREATION

In [None]:
INDEX_NAME = 'nature-papers-english'

In [None]:
# Create the index with the analyzer
creation_body = {
    "settings": {
        "analysis": analyzer_settings
    },
    "mappings": {
        MAPPING_NAME: mapping_type
    }
}
es.indices.create(index=INDEX_NAME, body=creation_body)

## INDEX DESTRUCTION

In [None]:
TO_DELETE = 'nature-papers-english123'
es.indices.delete(index=TO_DELETE)