Use this notebook to manage Elasticsearch indices and analyzers.

In [None]:
# Connection and path settings for this notebook.
# Directory holding credential files.
# NOTE(review): the original note read "make sure this is not checked" -
# presumably "not checked into version control"; confirm.
CREDENTIALS_DIR = '../credentials' # make sure this is not checked in
# Directory of input data; not referenced by the cells visible here.
DATA_DIR        = '../data/nature-html/'
# Presumably the base name of the credentials JSON file in CREDENTIALS_DIR.
CRED_NAME       = 'elasticsearch'
# Host name of the Elasticsearch server (no scheme or port, despite the name).
CLOUD_URL       = 'ec2-34-201-3-67.compute-1.amazonaws.com'
# Port appended to CLOUD_URL when the client URL is built.
PORT            = 9200

In [None]:
from elasticsearch import Elasticsearch
import json

In [None]:
# First load credentials.
# The path is assembled from CREDENTIALS_DIR and CRED_NAME so that editing
# those settings actually changes which file is read, and the file is opened
# in a `with` block so the handle is closed once parsing is done.
# The parsed JSON must unpack into exactly two values - presumably a
# two-element array [user, secret]; confirm against the credentials file.
credentials_path = CREDENTIALS_DIR + '/' + CRED_NAME + '.json'
with open(credentials_path) as credentials_file:
    (user, secret) = json.load(credentials_file)

In [None]:
# Build the Elasticsearch client from a single URL with embedded credentials.
# The user name and secret are percent-encoded first (safe='' also encodes
# '/'), so characters such as '@', ':' or '/' in either value cannot corrupt
# the host part of the URL.
# NOTE(review): the scheme is plain http, so the credentials are sent
# unencrypted - confirm this is intended for this host.
from urllib.parse import quote

es = Elasticsearch([
    'http://' + quote(user, safe='') + ':' + quote(secret, safe='')
    + '@' + CLOUD_URL + ':' + str(PORT)
])

In [None]:
# Test that it is working: call the client's info() endpoint as a connection
# smoke test. Being the last expression in the cell, its return value is what
# the notebook displays.
es.info()

Analysis

In [None]:
# Custom analyzer for the index.
# The analyzer id has the form "an-<version>-<name>", and every custom token
# filter is namespaced with that id so several analyzer versions can be told
# apart by name.
ANALYZER_VERSION = '1'
ANALYZER_NAME = 'english'
analyzer_id = '-'.join(('an', ANALYZER_VERSION, ANALYZER_NAME))
an_prefix = '{}-'.format(analyzer_id)

# Namespaced filter names, bound once so the definitions under "filter" and
# the references in the analyzer's filter chain cannot drift apart.
_stop_filter = an_prefix + "english_stop"
_keywords_filter = an_prefix + "english_keywords"
_stemmer_filter = an_prefix + "english_stemmer"
_possessive_filter = an_prefix + "english_possessive_stemmer"

analyzer_settings = {
    "filter": {
        _stop_filter: {"type": "stop", "stopwords": "_english_"},
        _keywords_filter: {"type": "keyword_marker", "keywords": ["example"]},
        _stemmer_filter: {"type": "stemmer", "language": "english"},
        _possessive_filter: {
            "type": "stemmer",
            "language": "possessive_english",
        },
    },
    "analyzer": {
        analyzer_id: {
            "tokenizer": "standard",
            "filter": [
                _possessive_filter,
                "lowercase",
                _stop_filter,
                _keywords_filter,
                _stemmer_filter,
            ],
        },
    },
}

In [None]:
# Try out the analyzer on a sample sentence.
# DO INDEX CREATION FIRST: this cell uses INDEX_NAME, which is assigned in the
# "Index creation" section further down, and sends the request to that index.

test = "Please analyze this text for me."
test_body = dict(analyzer=analyzer_id, text=test)
es.indices.analyze(body=test_body, index=INDEX_NAME)

Mappings

In [None]:
# Define the index mapping type.
MAPPING_NAME = '_doc'


def _analyzed_text():
    """Return a fresh text-field mapping that uses the custom analyzer."""
    return {"type": "text", "analyzer": analyzer_id}


# article_title: analyzed text plus a "keyword" sub-field.
_article_title = {
    "fields": {
        "keyword": {"ignore_above": 256, "type": "keyword"},
    },
}
_article_title.update(_analyzed_text())

mapping_type = {
    "date_detection": False,
    "properties": {
        "article_date": {"type": "date"},
        "article_title": _article_title,
        "text": _analyzed_text(),
    },
    # We want unknown string fields to be analyzed - no keywords
    "dynamic_templates": [
        {
            "strings": {
                "match_mapping_type": "string",
                "mapping": _analyzed_text(),
            },
        },
    ],
}

Index creation

In [None]:
# Name of the index that this notebook creates; the analyzer test cell in the
# "Analysis" section also sends its request to this index.
INDEX_NAME = 'nature-papers-1'

In [None]:
# Create the index, sending the custom analysis settings and the mapping
# together in one creation request.
creation_body = {
    "settings": {"analysis": analyzer_settings},
    "mappings": {MAPPING_NAME: mapping_type},
}
es.indices.create(body=creation_body, index=INDEX_NAME)

Index destruction

In [None]:
# Delete an index by name. Set TO_DELETE to the index you want removed before
# running this cell; the value below is not the same as INDEX_NAME.
# NOTE(review): double-check the name first - presumably the deletion cannot
# be undone.
TO_DELETE = 'nature-papers-english123'
es.indices.delete(index=TO_DELETE)