# Simulate Plans Data

In [3]:
import numpy as np
import json
import requests
import us
from elasticsearch import Elasticsearch
from faker import Factory

In [4]:
# Elasticsearch client created with the library's default connection settings
# (no host is passed here). The original note says the instance is hosted on
# AWS EC2; the curl cells in this notebook talk to localhost:9200, so presumably
# the notebook itself runs on that host -- TODO confirm.
es = Elasticsearch()

### Generate Simulated Data

In [16]:
# Simulate some data for testing

In [18]:
def simulate_data(output_path, N=10000):
    '''
    Simulate healthcare plans data and write it to a file in json format.

    Parameters
    ----------
    output_path : str
        Path of the json file to write (overwritten if it already exists).
    N : int
        Number of plans to generate.

    Each plan is a dict with the keys plan_name, level, premium, weights,
    url, state and providers. Providers are drawn without replacement from
    a fixed pool of fake name/address pairs shared by all plans.

    Both the faker generator and numpy's global random state are seeded
    with 1, so numpy's global random state is reset as a side effect.
    '''
    # Create fake names generator
    fake = Factory.create()
    fake.seed(1)
    np.random.seed(1)
    # Set parameters
    n_providers_pool = 50
    n_providers_per_plan = 10
    n_weights = 5
    # Initial data pool
    levels = ["Platinum", "Gold", "Silver", "Bronze", "Catastrophic"]
    # Materialise the keys: np.random.choice needs a 1-d sequence, and on
    # Python 3 dict.keys() is a view object that it cannot sample from.
    states = list(us.states.mapping('abbr', 'name').keys())
    providers_pool = [dict(name=fake.name(), address=fake.address())
                      for i in range(n_providers_pool)]

    # Output
    # NOTE: the keyword arguments are evaluated in the order written, so the
    # order below fixes the sequence of random draws for a given seed.
    result = [dict(
                plan_name = fake.company(),
                level = np.random.choice(levels),
                premium = np.random.normal(loc=100, scale=15),
                weights = list(np.random.random(n_weights)),
                url = fake.url(),
                state = np.random.choice(states),
                providers = list(np.random.choice(providers_pool, size=n_providers_per_plan, replace=False))
              ) for i in range(N)]

    # Use a context manager so the file is flushed and closed before any
    # later cell (e.g. a shell `head`) reads it.
    with open(output_path, "w") as output_file:
        json.dump(result, output_file, indent=2)

In [21]:
simulate_data("sim_plans_data_v3.json", 10000)

In [22]:
!head -n100 sim_plans_data_v3.json

[
  {
    "state": "OK", 
    "premium": 87.96740742027036, 
    "level": "Bronze", 
    "url": "http://www.adams.com/", 
    "weights": [
      0.092338594768797799, 
      0.1862602113776709, 
      0.34556072704304774, 
      0.39676747423066994, 
      0.53881673400335695
    ], 
    "providers": [
      {
        "name": "Cheryl Hancock", 
        "address": "18526 Klein Brook\nLake Brandonchester, OH 29752-6353"
      }, 
      {
        "name": "Stephanie Esparza", 
        "address": "95365 Clark Inlet\nEast Tabitha, MA 77772-9198"
      }, 
      {
        "name": "Lori Carlson", 
        "address": "73599 Sergio Terrace Apt. 359\nPort Nicholas, DE 01603-1489"
      }, 
      {
        "name": "Steve Davis", 
        "address": "USNV Owens\nFPO AA 60596"
      }, 
      {
        "name": "Kristin Jenkins", 
        "address": "USNS Brock\nFPO AE 67686"
      }, 
      {
        "name": "Kimberly Quinn", 
        "address": "12852 Cruz Missio

### Define ES Mapping

In [17]:
# Create Elasticsearch mapping

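Index: `data`  
Type: `plan`  
Fields:  
- plan_name (string, analyzed, with a non-analyzed `raw` sub-field)
- premium (float)
- level (string, analyzed, with a non-analyzed `raw` sub-field)
- url (string, non-analyzed)
- weights (float)
- state (string, non-analyzed)
- providers (nested)
    - name: string, analyzed, with a non-analyzed `raw` sub-field
    - address: string, non-analyzed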

In [23]:
!curl -XDELETE 'localhost:9200/data'

{"acknowledged":true}

In [24]:
def define_plan_mappings(es):
    '''
    Create the "data" index with the mapping for the "plan" document type.

    `es` is the client object whose `indices.create` method is called with
    the index name and the request body (index settings plus mappings).
    The index is configured with 5 shards and 1 replica.
    '''
    def text_with_raw():
        # Analyzed string that also exposes a not_analyzed "raw" sub-field.
        return {
            "type": "string",
            "index": "analyzed",
            "fields": {
                "raw": {"type": "string", "index": "not_analyzed"}
            }
        }

    def exact_string():
        # String stored as a single, un-analyzed term.
        return {"type": "string", "index": "not_analyzed"}

    # Fields of each entry in the nested "providers" list
    provider_fields = {
        "name": text_with_raw(),
        "address": exact_string()
    }

    # Top-level fields of a plan document
    plan_fields = {
        "plan_name": text_with_raw(),
        "premium": {"type": "float"},
        "level": text_with_raw(),
        "url": exact_string(),
        "weights": {"type": "float"},
        "state": exact_string(),
        "providers": {"type": "nested", "properties": provider_fields}
    }

    index_body = {
        "settings": {
            "index": {"number_of_shards": 5, "number_of_replicas": 1}
        },
        "mappings": {
            "plan": {"properties": plan_fields}
        }
    }

    # Define mappings in ES
    es.indices.create(index="data", body=index_body)

In [25]:
define_plan_mappings(es)

In [26]:
!curl 'localhost:9200/data/_mapping/plan?pretty'

{
  "data" : {
    "mappings" : {
      "plan" : {
        "properties" : {
          "level" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "plan_name" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "premium" : {
            "type" : "float"
          },
          "providers" : {
            "type" : "nested",
            "properties" : {
              "address" : {
                "type" : "string",
                "index" : "not_analyzed"
              },
              "name" : {
                "type" : "string",
                "fields" : {
                  "raw" : {
                    "type" : "string",
           

### Load Data

In [27]:
def load_data(input_path, es):
    '''
    Index every plan from a json file into the "data" index.

    Parameters
    ----------
    input_path : str
        Path of a json file containing a list of plan documents.
    es :
        Client object; its `index` method is called once per plan with
        index='data', doc_type='plan', the plan's position in the list as
        the document id, and the plan itself as the body.
    '''
    # Get data from file; the context manager closes the handle once the
    # content has been parsed instead of leaving it open.
    with open(input_path) as input_file:
        data = json.load(input_file)
    # Add each plan (ids are the 0-based positions in the file)
    for i, plan in enumerate(data):
        es.index(index='data', doc_type='plan', id=i, body=plan)

In [28]:
load_data("sim_plans_data_v3.json", es)

In [58]:
# Check data

In [29]:
!curl 'localhost:9200/_cat/indices?v'

health status index                  pri rep docs.count docs.deleted store.size pri.store.size 
yellow open   data                     5   1     110000            0     12.7mb         12.7mb 
yellow open   get-together             2   1         20            0     28.4kb         28.4kb 
yellow open   myindex                  5   1          0            0       800b           800b 
yellow open   november_2014_invoices   5   1          0            0       800b           800b 
yellow open   december_2014_invoices   5   1          0            0       800b           800b 
yellow open   blog                     5   1          1            0      3.6kb          3.6kb 
yellow open   logs                     5   1          1            0      3.7kb          3.7kb 


In [30]:
!curl 'localhost:9200/data/plan/0?pretty'

{
  "_index" : "data",
  "_type" : "plan",
  "_id" : "0",
  "_version" : 1,
  "found" : true,
  "_source" : {
    "premium" : 87.96740742027036,
    "providers" : [ {
      "name" : "Cheryl Hancock",
      "address" : "18526 Klein Brook\nLake Brandonchester, OH 29752-6353"
    }, {
      "name" : "Stephanie Esparza",
      "address" : "95365 Clark Inlet\nEast Tabitha, MA 77772-9198"
    }, {
      "name" : "Lori Carlson",
      "address" : "73599 Sergio Terrace Apt. 359\nPort Nicholas, DE 01603-1489"
    }, {
      "name" : "Steve Davis",
      "address" : "USNV Owens\nFPO AA 60596"
    }, {
      "name" : "Kristin Jenkins",
      "address" : "USNS Brock\nFPO AE 67686"
    }, {
      "name" : "Kimberly Quinn",
      "address" : "12852 Cruz Mission\nMarcusview, PR 74970"
    }, {
      "name" : "Daniel Burnett",
      "address" : "57634 Bowers Mount\nSouth Thomas, AS 95315-9758"
    }, {
      "name" : "Jeffery Russell",
      "address" : "5587 Thomas Grov