# Simulate Plans Data

In [1]:
import numpy as np
import json
import requests
import us
from elasticsearch import Elasticsearch
from faker import Factory

In [2]:
# Elasticsearch client built with default connection settings (no hosts are passed).
# NOTE(review): the original note says the instance is hosted on AWS EC2, while the
# curl cells in this notebook target localhost:9200 - presumably the notebook runs
# on that EC2 host; confirm.
es = Elasticsearch()

### Generate Simulated Data

In [16]:
# Simulate some data for testing

In [8]:
from itertools import izip

In [4]:
def generate_premiums(N):
    '''
    Lazily yield N simulated premium records.

    Every record is a dict with the keys age_30, age_40 and age_50. The
    age_30 value is a single draw from a normal distribution (mean 100,
    standard deviation 15); age_40 and age_50 are that same draw plus 10
    and plus 20. One draw is made per yielded record, so the global NumPy
    random state only advances as the generator is consumed.
    '''
    for _ in range(N):
        base = np.random.normal(loc=100, scale=15)
        yield {"age_30": base, "age_40": base + 10, "age_50": base + 20}

In [22]:
def simulate_data(output_path, N=10000):
    '''
    Simulate healthcare plans data and write it to a file in json format.

    Parameters
    ----------
    output_path : str
        Destination file; it is overwritten with a JSON array of N plans.
    N : int, optional
        Number of plans to generate (default 10000).

    Each plan carries the keys plan_name, premium (per-age dict produced by
    generate_premiums), level, url, state and providers (a list of
    n_providers_per_plan name/address dicts sampled without replacement from
    a fixed pool of n_providers_pool fake providers). Both the faker and the
    NumPy generators are seeded with 1 before any value is drawn.
    '''
    # Create fake names generator
    fake = Factory.create()
    fake.seed(1)
    np.random.seed(1)
    # Set parameters
    n_providers_pool = 50
    n_providers_per_plan = 10
    # Initial data pool
    levels = ["Platinum", "Gold", "Silver", "Bronze", "Catastrophic"]
    # Materialise the keys: np.random.choice needs a sequence, not a keys view
    states = list(us.states.mapping('abbr', 'name').keys())
    providers_pool = [dict(name=fake.name(), address=fake.address()) for i in range(n_providers_pool)]

    # Simulate data points (the order of these draws is kept stable so that a
    # given seed keeps producing the same file)
    plan_name_list = [fake.company() for i in range(N)]
    level_array = np.random.choice(levels, size=N)
    url_list = [fake.url() for i in range(N)]
    state_array = np.random.choice(states, size=N)
    providers_list = [list(np.random.choice(providers_pool, size=n_providers_per_plan, replace=False))
                      for i in range(N)]

    # Combine columns; generate_premiums yields exactly N records, so
    # enumerate supplies the matching row index without needing izip
    result = []
    for i, current_premium in enumerate(generate_premiums(N)):
        current_data = dict(
            plan_name = plan_name_list[i],
            premium = current_premium,
            level = level_array[i],
            url = url_list[i],
            state = state_array[i],
            providers = providers_list[i]
        )
        result.append(current_data)

    # Close the file deterministically so the data is flushed before it is read back
    with open(output_path, "w") as output_file:
        json.dump(result, output_file, indent=2)

In [23]:
simulate_data("sim_plans_data_v2.json", 10000)

In [25]:
!head -n100 sim_plans_data_v2.json

[
  {
    "state": "SC", 
    "premium": {
      "age_30": 91.30685150355869, 
      "age_50": 111.30685150355869, 
      "age_40": 101.30685150355869
    }, 
    "level": "Bronze", 
    "url": "http://www.feeney-denesik.com/", 
    "providers": [
      {
        "name": "Dr. Bell Lubowitz DDS", 
        "address": "24787 Bogisich Junctions\nLucienport, VI 81392"
      }, 
      {
        "name": "Channie Nolan DDS", 
        "address": "083 Daniel Roads Apt. 396\nKoelpinview, MO 71455-9813"
      }, 
      {
        "name": "Dr. Elza Stokes DDS", 
        "address": "2080 Windler Parkway Apt. 897\nRoscoeborough, GA 41162-4388"
      }, 
      {
        "name": "Lonnie Adams", 
        "address": "3584 Miller Landing Suite 460\nNorth Jesse, AL 11532"
      }, 
      {
        "name": "Corie Rowe PhD", 
        "address": "2869 Collins Oval\nNorth Cecilhaven, RI 36735-2028"
      }, 
      {
        "name": "Miss Erykah Lemke DDS", 
        "address": "3

### Define ES Mapping

In [17]:
# Create Elasticsearch mapping

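Index: `data`  
Type: `plan`  
Fields:  
- plan_name (string, analyzed, with a not-analyzed `raw` sub-field)
- premium (nested; float fields age_30, age_40, age_50)
- level (string, analyzed, with a not-analyzed `raw` sub-field)
- url (string, not analyzed)
- state (string, not analyzed)
- providers (nested)
    - name: string, analyzed, with a not-analyzed `raw` sub-field
    - address: string, not analyzed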

In [26]:
!curl -XDELETE 'localhost:9200/data'

{"acknowledged":true}

In [30]:
def define_plan_mappings(es):
    '''
    Create the "data" index with its settings and the "plan" type mapping.

    The request body asks for 5 shards and 1 replica and maps:
      - plan_name, level: analyzed strings with a not_analyzed "raw" sub-field
      - url, state: not_analyzed strings
      - premium: nested object with float fields age_30, age_40, age_50
      - providers: nested objects with name (analyzed + "raw" sub-field)
        and address (not_analyzed)

    es is the client whose indices.create(index=..., body=...) is called;
    nothing is returned.
    '''
    def text_with_raw():
        # Analyzed string that also keeps an exact, not_analyzed copy under "raw"
        return {
            "type": "string",
            "index": "analyzed",
            "fields": {
                "raw": {"type": "string", "index": "not_analyzed"}
            }
        }

    def exact_string():
        # String stored as a single un-analyzed term
        return {"type": "string", "index": "not_analyzed"}

    premium_mapping = {
        "type": "nested",
        "properties": {
            "age_30": {"type": "float"},
            "age_40": {"type": "float"},
            "age_50": {"type": "float"}
        }
    }

    providers_mapping = {
        "type": "nested",
        "properties": {
            "name": text_with_raw(),
            "address": exact_string()
        }
    }

    plan_properties = {
        "plan_name": text_with_raw(),
        "premium": premium_mapping,
        "level": text_with_raw(),
        "url": exact_string(),
        "state": exact_string(),
        "providers": providers_mapping
    }

    index_body = {
        "settings": {
            "index": {
                "number_of_shards": 5,
                "number_of_replicas": 1
            }
        },
        "mappings": {
            "plan": {"properties": plan_properties}
        }
    }

    # Define mappings in ES
    es.indices.create(index="data", body=index_body)

In [31]:
define_plan_mappings(es)

In [32]:
!curl 'localhost:9200/data/_mapping/plan?pretty'

{
  "data" : {
    "mappings" : {
      "plan" : {
        "properties" : {
          "level" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "plan_name" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "premium" : {
            "type" : "nested",
            "properties" : {
              "age_30" : {
                "type" : "float"
              },
              "age_40" : {
                "type" : "float"
              },
              "age_50" : {
                "type" : "float"
              }
            }
          },
          "providers" : {
            "type" : "nested",
            "properties" : {
           

### Load Data

In [33]:
def load_data(input_path, es):
    '''
    Index every plan stored in a JSON file into the "data" index.

    Parameters
    ----------
    input_path : str
        Path to a JSON file holding a list of plan documents.
    es : object
        Client whose index(index=..., doc_type=..., id=..., body=...) method
        is called once per plan.

    Documents are indexed one at a time as doc_type "plan"; the document id
    is the plan's position in the list, starting at 0.
    '''
    # Get data from file; the with-block closes the handle once parsing is done
    with open(input_path) as input_file:
        data = json.load(input_file)
    # Add each plan
    for i, plan in enumerate(data):
        es.index(index='data', doc_type='plan', id=i, body=plan)

In [34]:
load_data("sim_plans_data_v2.json", es)

In [58]:
# Check data

In [35]:
!curl 'localhost:9200/_cat/indices?v'

health status index                  pri rep docs.count docs.deleted store.size pri.store.size 
yellow open   data                     5   1     120000            0     12.5mb         12.5mb 
yellow open   get-together             2   1         20            0     28.4kb         28.4kb 
yellow open   myindex                  5   1          0            0       800b           800b 
yellow open   november_2014_invoices   5   1          0            0       800b           800b 
yellow open   december_2014_invoices   5   1          0            0       800b           800b 
yellow open   blog                     5   1          1            0      3.6kb          3.6kb 
yellow open   logs                     5   1          1            0      3.7kb          3.7kb 


In [36]:
!curl 'localhost:9200/data/plan/0?pretty'

{
  "_index" : "data",
  "_type" : "plan",
  "_id" : "0",
  "_version" : 1,
  "found" : true,
  "_source" : {
    "premium" : {
      "age_30" : 91.30685150355869,
      "age_50" : 111.30685150355869,
      "age_40" : 101.30685150355869
    },
    "providers" : [ {
      "name" : "Dr. Bell Lubowitz DDS",
      "address" : "24787 Bogisich Junctions\nLucienport, VI 81392"
    }, {
      "name" : "Channie Nolan DDS",
      "address" : "083 Daniel Roads Apt. 396\nKoelpinview, MO 71455-9813"
    }, {
      "name" : "Dr. Elza Stokes DDS",
      "address" : "2080 Windler Parkway Apt. 897\nRoscoeborough, GA 41162-4388"
    }, {
      "name" : "Lonnie Adams",
      "address" : "3584 Miller Landing Suite 460\nNorth Jesse, AL 11532"
    }, {
      "name" : "Corie Rowe PhD",
      "address" : "2869 Collins Oval\nNorth Cecilhaven, RI 36735-2028"
    }, {
      "name" : "Miss Erykah Lemke DDS",
      "address" : "33283 Colette Summit\nMuellerchester, KS 23241"
    }, {

In [39]:
# Look up a plan by its full name through the not_analyzed `plan_name.raw` sub-field
!curl 'localhost:9200/data/plan/_search?pretty' -d '{\
"query": {\
"match": {\
"plan_name.raw": "Collier, Rau and Funk"}}}'

{
  "took" : 52,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 1,
    "max_score" : 10.386642,
    "hits" : [ {
      "_index" : "data",
      "_type" : "plan",
      "_id" : "0",
      "_score" : 10.386642,
      "_source" : {
        "premium" : {
          "age_30" : 91.30685150355869,
          "age_50" : 111.30685150355869,
          "age_40" : 101.30685150355869
        },
        "providers" : [ {
          "name" : "Dr. Bell Lubowitz DDS",
          "address" : "24787 Bogisich Junctions\nLucienport, VI 81392"
        }, {
          "name" : "Channie Nolan DDS",
          "address" : "083 Daniel Roads Apt. 396\nKoelpinview, MO 71455-9813"
        }, {
          "name" : "Dr. Elza Stokes DDS",
          "address" : "2080 Windler Parkway Apt. 897\nRoscoeborough, GA 41162-4388"
        }, {
          "name" : "Lonnie Adams",
          "address" : "3584 Miller Landing Su

In [46]:
# Exact-match provider lookup on the not_analyzed raw sub-field (another name to try: "Dr. Bell Lubowitz DDS")
!curl 'localhost:9200/data/plan/_search?pretty' -d '{\
"query": {\
    "nested": {\
        "path": "providers",\
        "filter": {\
            "term": {\
                "providers.name.raw": "Channie Nolan DDS"}}}}}'

{
  "took" : 43,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 1968,
    "max_score" : 1.0,
    "hits" : [ {
      "_index" : "data",
      "_type" : "plan",
      "_id" : "1494",
      "_score" : 1.0,
      "_source" : {
        "premium" : {
          "age_30" : 105.50996217076194,
          "age_50" : 125.50996217076194,
          "age_40" : 115.50996217076194
        },
        "providers" : [ {
          "name" : "Channie Nolan DDS",
          "address" : "083 Daniel Roads Apt. 396\nKoelpinview, MO 71455-9813"
        }, {
          "name" : "Dr. Marcie Schmitt DVM",
          "address" : "53179 Patty Hollow Suite 970\nWest Lucian, SD 66506"
        }, {
          "name" : "Mr. Daryl Fay I",
          "address" : "85619 Cartwright Highway\nJordinside, MD 63588-5502"
        }, {
          "name" : "Lupe Batz",
          "address" : "Unit 9254 Box 3099\nDPO AP 26956"
 

In [18]:
# Same exact-match provider lookup, additionally requesting inner_hits (another name to try: "Dr. Bell Lubowitz DDS")
!curl 'localhost:9200/data/plan/_search?pretty' -d '{\
"query": {\
    "nested": {\
        "path": "providers",\
        "filter": {\
            "term": {\
                "providers.name.raw": "Channie Nolan DDS"}},\
        "inner_hits": {}\
}}}'

{
  "took" : 189,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 1968,
    "max_score" : 1.0,
    "hits" : [ {
      "_index" : "data",
      "_type" : "plan",
      "_id" : "1494",
      "_score" : 1.0,
      "_source" : {
        "premium" : {
          "age_30" : 105.50996217076194,
          "age_50" : 125.50996217076194,
          "age_40" : 115.50996217076194
        },
        "providers" : [ {
          "name" : "Channie Nolan DDS",
          "address" : "083 Daniel Roads Apt. 396\nKoelpinview, MO 71455-9813"
        }, {
          "name" : "Dr. Marcie Schmitt DVM",
          "address" : "53179 Patty Hollow Suite 970\nWest Lucian, SD 66506"
        }, {
          "name" : "Mr. Daryl Fay I",
          "address" : "85619 Cartwright Highway\nJordinside, MD 63588-5502"
        }, {
          "name" : "Lupe Batz",
          "address" : "Unit 9254 Box 3099\nDPO AP 26956"
        }, {
          "name" : "C

In [48]:
# Filter by state
!curl 'localhost:9200/data/plan/_search?pretty' -d '{\
"query": {\
    "filtered": {\
        "query": {\
            "match_all": {}\
        },\
        "filter": {\
        "term": {"state": "ME"}\
        }}}}'

{
  "took" : 60,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 164,
    "max_score" : 1.0,
    "hits" : [ {
      "_index" : "data",
      "_type" : "plan",
      "_id" : "1494",
      "_score" : 1.0,
      "_source" : {
        "premium" : {
          "age_30" : 105.50996217076194,
          "age_50" : 125.50996217076194,
          "age_40" : 115.50996217076194
        },
        "providers" : [ {
          "name" : "Channie Nolan DDS",
          "address" : "083 Daniel Roads Apt. 396\nKoelpinview, MO 71455-9813"
        }, {
          "name" : "Dr. Marcie Schmitt DVM",
          "address" : "53179 Patty Hollow Suite 970\nWest Lucian, SD 66506"
        }, {
          "name" : "Mr. Daryl Fay I",
          "address" : "85619 Cartwright Highway\nJordinside, MD 63588-5502"
        }, {
          "name" : "Lupe Batz",
          "address" : "Unit 9254 Box 3099\nDPO AP 26956"
  

In [52]:
!curl 'localhost:9200/data/plan/_search?pretty' -d '{\
"query": {\
    "match": {\
        "plan_name": "Nikolaus"\
    }\
}}'

{
  "took" : 18,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 37,
    "max_score" : 6.0785437,
    "hits" : [ {
      "_index" : "data",
      "_type" : "plan",
      "_id" : "7989",
      "_score" : 6.0785437,
      "_source" : {
        "premium" : {
          "age_30" : 77.02226245530724,
          "age_50" : 97.02226245530724,
          "age_40" : 87.02226245530724
        },
        "providers" : [ {
          "name" : "Terrie Klocko",
          "address" : "060 Pink Camp Apt. 941\nEast Kimberleeton, VI 99224"
        }, {
          "name" : "Moe Jast",
          "address" : "5956 Tilman Gardens\nKobeberg, LA 36186"
        }, {
          "name" : "Maynard Maggio",
          "address" : "7005 Davey Forks\nBrittanieton, AR 61803-8728"
        }, {
          "name" : "Mr. Darrion Dare V",
          "address" : "828 Boehm Bridge Apt. 721\nOthoton, DC 12743-4034"
       

In [60]:
!curl 'localhost:9200/data/plan/_search?pretty' -d '{\
"query": {\
    "nested": {\
        "path": "providers",\
        "query": {\
            "match": {\
                "providers.name": {\
                    "type": "phrase_prefix",\
                    "query": "nol",\
                    "max_expansions": 1\
                }\
            }\
    }}}}'

{
  "took" : 58,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 1968,
    "max_score" : 2.5643346,
    "hits" : [ {
      "_index" : "data",
      "_type" : "plan",
      "_id" : "1485",
      "_score" : 2.5643346,
      "_source" : {
        "premium" : {
          "age_30" : 109.03926849119046,
          "age_50" : 129.03926849119046,
          "age_40" : 119.03926849119046
        },
        "providers" : [ {
          "name" : "Isla Torphy",
          "address" : "PSC 2485, Box 2734\nAPO AE 69925-4782"
        }, {
          "name" : "Terrie Klocko",
          "address" : "060 Pink Camp Apt. 941\nEast Kimberleeton, VI 99224"
        }, {
          "name" : "Constantine Hansen",
          "address" : "PSC 7326, Box 8938\nAPO AP 92701"
        }, {
          "name" : "Lonnie Adams",
          "address" : "3584 Miller Landing Suite 460\nNorth Jesse, AL 11532"
        }, {
          "name" : "Isa D'Amore",


In [77]:
!curl 'localhost:9200/data/plan/_search?pretty' -d '{\
"query": {\
    "nested": {\
        "path": "providers",\
        "query": {\
            "match": {\
                "providers.name": {\
                    "type": "phrase_prefix",\
                    "query": "susie w"\
                }\
            }\
    }}}}'

{
  "took" : 131,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 2010,
    "max_score" : 12.796818,
    "hits" : [ {
      "_index" : "data",
      "_type" : "plan",
      "_id" : "909",
      "_score" : 12.796818,
      "_source" : {
        "premium" : {
          "age_30" : 96.90321438680202,
          "age_50" : 116.90321438680202,
          "age_40" : 106.90321438680202
        },
        "providers" : [ {
          "name" : "Boss Tromp",
          "address" : "8001 Cronin River\nCeceliaberg, WI 52635-3602"
        }, {
          "name" : "Susie Walsh",
          "address" : "7374 Adalyn Cove\nWest Ambers, FM 85292"
        }, {
          "name" : "Nelly Pollich",
          "address" : "7497 Dorian Vista Apt. 024\nHyattchester, TN 69553"
        }, {
          "name" : "Dempsey Dietrich",
          "address" : "48553 Epsie Cliff Suite 073\nDelphaland, NE 23269"
       

In [73]:
!curl 'localhost:9200/data/plan/_search?pretty' -d '{\
"query": {\
    "nested": {\
        "path": "providers",\
        "query": {\
            "prefix": {\
                "providers.name": "susie"\
            }\
    }}}}'

{
  "took" : 11,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 2010,
    "max_score" : 1.0,
    "hits" : [ {
      "_index" : "data",
      "_type" : "plan",
      "_id" : "1498",
      "_score" : 1.0,
      "_source" : {
        "premium" : {
          "age_30" : 81.49991986168219,
          "age_50" : 101.49991986168219,
          "age_40" : 91.49991986168219
        },
        "providers" : [ {
          "name" : "Dr. Lyndsey Gleason DVM",
          "address" : "867 Sharman Walk\nLake Vinson, DC 19916"
        }, {
          "name" : "Dempsey Dietrich",
          "address" : "48553 Epsie Cliff Suite 073\nDelphaland, NE 23269"
        }, {
          "name" : "Gerold Quigley",
          "address" : "0800 Katlyn Harbor\nLake Malissahaven, IN 25337"
        }, {
          "name" : "Deyanira Lindgren",
          "address" : "29466 Howell Groves\nWest Monica, VT 35552"
      

In [79]:
!curl 'localhost:9200/data/plan/_search?pretty' -d '{\
"query": {\
    "nested": {\
        "path": "providers",\
        "query": {\
            "match_phrase_prefix": {\
                "providers.name": {\
                    "query": "s"\
                }\
            }\
    }}}}'

{
  "took" : 39,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 8562,
    "max_score" : 1.1518447,
    "hits" : [ {
      "_index" : "data",
      "_type" : "plan",
      "_id" : "1529",
      "_score" : 1.1518447,
      "_source" : {
        "premium" : {
          "age_30" : 116.02683430639102,
          "age_50" : 136.02683430639104,
          "age_40" : 126.02683430639102
        },
        "providers" : [ {
          "name" : "Dempsey Dietrich",
          "address" : "48553 Epsie Cliff Suite 073\nDelphaland, NE 23269"
        }, {
          "name" : "Nelly Pollich",
          "address" : "7497 Dorian Vista Apt. 024\nHyattchester, TN 69553"
        }, {
          "name" : "Miss Bennie O'Hara DDS",
          "address" : "PSC 7875, Box 8821\nAPO AA 03508-6323"
        }, {
          "name" : "Moe Jast",
          "address" : "5956 Tilman Gardens\nKobeberg, LA 36186"
     

### Exists Query

In [4]:
# Check exists query
def exists_index(client=None):
    '''
    Index one hand-written plan (id 10001) whose premium only has age_30.

    The document deliberately omits premium.age_40 and premium.age_50 -
    presumably so the exists/missing query has something to match.

    Parameters
    ----------
    client : object, optional
        Client whose index(...) method receives the document. When omitted,
        the notebook-level `es` client is used, which keeps the original
        zero-argument call working.
    '''
    # Resolve the global lazily so an explicit client never touches `es`
    target = es if client is None else client
    plan = {
        "premium" : {
            "age_30" : 50
        },
        "providers" : [{
          "name" : "Moe Jast",
          "address" : "5956 Tilman Gardens\nKobeberg, LA 36186"
        }],
        "url" : "http://trantow.com/",
        "level" : "Catastrophic",
        "state" : "DE",
        "plan_name" : "Yost, Auer and Haley - EXISTS"
    }
    target.index(index='data', doc_type='plan', id=10001, body=plan)

In [5]:
exists_index()

In [6]:
!curl 'localhost:9200/_cat/indices?v'

health status index                  pri rep docs.count docs.deleted store.size pri.store.size 
yellow open   data                     5   1     120003            0     12.5mb         12.5mb 
yellow open   get-together             2   1         20            0     28.4kb         28.4kb 
yellow open   myindex                  5   1          0            0       800b           800b 
yellow open   november_2014_invoices   5   1          0            0       800b           800b 
yellow open   december_2014_invoices   5   1          0            0       800b           800b 
yellow open   blog                     5   1          1            0      3.6kb          3.6kb 
yellow open   logs                     5   1          1            0      3.7kb          3.7kb 


In [7]:
!curl 'localhost:9200/data/plan/10001?pretty'

{
  "_index" : "data",
  "_type" : "plan",
  "_id" : "10001",
  "_version" : 1,
  "found" : true,
  "_source" : {
    "premium" : {
      "age_30" : 50
    },
    "level" : "Catastrophic",
    "url" : "http://trantow.com/",
    "providers" : [ {
      "name" : "Moe Jast",
      "address" : "5956 Tilman Gardens\nKobeberg, LA 36186"
    } ],
    "state" : "DE",
    "plan_name" : "Yost, Auer and Haley - EXISTS"
  }
}


In [16]:
# Find plans whose nested premium object has no age_50 value; note this uses the
# `missing` filter (the complement of `exists`) inside a nested query on path "premium"
!curl 'localhost:9200/data/plan/_search?pretty' -d '{\
"query": {\
    "nested": {\
        "path": "premium",\
        "query": {\
            "filtered": {\
                "query": {\
                    "match_all": {}\
                },\
                "filter": {\
                    "missing": { "field": "premium.age_50" }\
                }\
            }\
        }\
    }\
}\
}'

{
  "took" : 16,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 1,
    "max_score" : 1.0,
    "hits" : [ {
      "_index" : "data",
      "_type" : "plan",
      "_id" : "10001",
      "_score" : 1.0,
      "_source" : {
        "premium" : {
          "age_30" : 50
        },
        "level" : "Catastrophic",
        "url" : "http://trantow.com/",
        "providers" : [ {
          "name" : "Moe Jast",
          "address" : "5956 Tilman Gardens\nKobeberg, LA 36186"
        } ],
        "state" : "DE",
        "plan_name" : "Yost, Auer and Haley - EXISTS"
      }
    } ]
  }
}
