In [1]:
import elasticsearch

In [2]:
import json

In [3]:
import pandas

In [4]:
# Load the main App Store table. Besides the app attributes it carries a
# leftover 'Unnamed: 0' index column, which is dropped after the merge.
app = pandas.read_csv('AppleStore.csv')

In [5]:
# Load the companion table holding the free-text 'app_desc' column together
# with the 'id', 'track_name' and 'size_bytes' columns it shares with `app`.
desc = pandas.read_csv('appleStore_description.csv')

## 1. Data Preprocessing

### 1.1 Merge the two data sets on their common column names

In [6]:
# Column labels of the main table (for a DataFrame, keys() is the column index).
app.columns

Index(['Unnamed: 0', 'id', 'track_name', 'size_bytes', 'currency', 'price',
       'rating_count_tot', 'rating_count_ver', 'user_rating',
       'user_rating_ver', 'ver', 'cont_rating', 'prime_genre',
       'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'],
      dtype='object')

In [7]:
# Column labels of the description table, to spot the columns shared with `app`.
desc.columns

Index(['id', 'track_name', 'size_bytes', 'app_desc'], dtype='object')

In [8]:
# Inner-join the two tables on the three columns they have in common, so each
# shared column appears once and 'app_desc' is attached to every matching app.
df_new = app.merge(desc, how='inner', on=['id', 'track_name', 'size_bytes'])

In [9]:
# Delete the leftover CSV index column. errors='ignore' keeps this cell
# re-runnable: without it, a second execution raises KeyError because the
# column has already been removed from df_new.
df_new = df_new.drop(columns='Unnamed: 0', errors='ignore')

In [10]:
# Replace the dots in three column names with underscores (presumably so they
# become plain field names rather than dotted paths in Elasticsearch - confirm).
# Reassigning instead of using inplace=True avoids mutating a frame that other
# cells may still reference and keeps the cell's input/output lineage explicit.
df_new = df_new.rename(columns={
    'sup_devices.num': 'sup_devices_num',
    'ipadSc_urls.num': 'ipadSc_urls_num',
    'lang.num': 'lang_num',
})

In [11]:
# Show the dtype of every column in the merged, renamed frame.
df_new.dtypes

id                    int64
track_name           object
size_bytes            int64
currency             object
price               float64
rating_count_tot      int64
rating_count_ver      int64
user_rating         float64
user_rating_ver     float64
ver                  object
cont_rating          object
prime_genre          object
sup_devices_num       int64
ipadSc_urls_num       int64
lang_num              int64
vpp_lic               int64
app_desc             object
dtype: object

In [12]:
# Turn the frame into a list with one {column: value} dict per row; these
# dicts are the document bodies written to the bulk file further down.
app_list = df_new.to_dict(orient='records')

### 1.2 Add index info and convert into json file

In [13]:
# One bulk "index" action per row of df_new, with document ids numbered from 1.
action_list = [
    {"index": {"_index": "app", "_type": "_doc", "_id": doc_id}}
    for doc_id in range(1, len(df_new) + 1)
]
    

In [14]:
# Write the bulk payload as newline-delimited JSON: every action line is
# immediately followed by the document it applies to.
with open('apps.json', 'w') as bulk_file:
    for action_line, document in zip(action_list, app_list):
        bulk_file.write(json.dumps(action_line) + '\n')
        bulk_file.write(json.dumps(document) + '\n')

## 2. Index documents

In [15]:
# Create a client with the library's default connection settings (presumably
# localhost:9200, matching the curl call in section 2.2 - confirm).
# NOTE(review): `es` is not referenced again in the visible cells; the import
# and the queries below are run outside Python.
es = elasticsearch.Elasticsearch()

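### 2.1 Create mappings for index

_Note: `prime_genre` is mapped below as a plain `keyword` field, so it has no `.keyword` sub-field. The queries in section 3 address `prime_genre.keyword`, which exists only when the field is mapped as `text` with a `keyword` sub-field (the dynamic-mapping default); against the mapping shown here, query `prime_genre` directly._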

```
{
  "app" : {
    "mappings" : {
      "properties" : {
        "app_desc" : {
          "type" : "text",
          "analyzer": "standard"
        },
        "cont_rating" : {
          "type" : "text"
        },
        "currency" : {
          "type" : "keyword"
        },
        "id" : {
          "type" : "integer"
        },
        "ipadSc_urls_num" : {
          "type" : "integer"
        },
        "lang_num" : {
          "type" : "integer"
        },
        "price" : {
          "type" : "float"
        },
        "prime_genre" : {
          "type" : "keyword"
        },
        "rating_count_tot" : {
          "type" : "integer"
        },
        "rating_count_ver" : {
          "type" : "integer"
        },
        "size_bytes" : {
          "type" : "integer"
        },
        "sup_devices_num" : {
          "type" : "integer"
        },
        "track_name" : {
          "type" : "text"
        },
        "user_rating" : {
          "type" : "float"
        },
        "user_rating_ver" : {
          "type" : "float"
        },
        "ver" : {
          "type" : "text"
        },
        "vpp_lic" : {
          "type" : "integer"
        }
      }
    }
  }
}
```

### 2.2 Import the JSON file

```
curl -H 'Content-Type: application/x-ndjson' -XPOST 'localhost:9200/app/_bulk?pretty' --data-binary @apps.json
```

## 3. Run simple queries

### 3.1 Number of apps in each genre


__Query:__

```
GET app/_doc/_search?size=0
{
  "aggs": {
    "types_count": {
      "terms": {
        "field": "prime_genre.keyword"
      }
    }
  }
}
```

__Response:__
```
{...
"aggregations" : {
    "types_count" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 943,
      "buckets" : [
        {
          "key" : "Games",
          "doc_count" : 3862
        },
        {
          "key" : "Entertainment",
          "doc_count" : 535
        },
        {
          "key" : "Education",
          "doc_count" : 453
        },
        {
          "key" : "Photo & Video",
          "doc_count" : 349
        },
        {
          "key" : "Utilities",
          "doc_count" : 248
        },
        {
          "key" : "Health & Fitness",
          "doc_count" : 180
        },
        {
          "key" : "Productivity",
          "doc_count" : 178
        },
        {
          "key" : "Social Networking",
          "doc_count" : 167
        },
        {
          "key" : "Lifestyle",
          "doc_count" : 144
        },
        {
          "key" : "Music",
          "doc_count" : 138
        }
      ]
    }

```
-------------

### 3.2 Number of apps in the Food & Drink category

__Query:__
```
GET app/_doc/_search
{
    "query": {
        "term": {
            "prime_genre.keyword": {
                "value": "Food & Drink",
                "boost": 1.0
            }
        }
    }
}
```

__Response: 63 apps in total in the Food & Drink genre; the first result is "OpenTable - Restaurant Reservations"__

```
{...  
 "hits" : {
    "total" : {
      "value" : 63,
      "relation" : "eq"
    },
    "max_score" : 4.730519,
    "hits" : [
      {
        "_index" : "app",
        "_type" : "_doc",
        "_id" : "57",
        "_score" : 4.730519,
        "_source" : {
          "id" : 296581815,
          "track_name" : "OpenTable - Restaurant Reservations",
          "size_bytes" : 93420544,
          "currency" : "USD",
          "price" : 0.0,
          "rating_count_tot" : 113936,
          "rating_count_ver" : 150,
          "user_rating" : 4.5,
          "user_rating_ver" : 5.0,
          "ver" : "10.18.0",
          "cont_rating" : "4+",
          "prime_genre" : "Food & Drink",
          "sup_devices_num" : 37,
          "ipadSc_urls_num" : 1,
          "lang_num" : 6,
          "vpp_lic" : 1,
          "app_desc" : """... 
          }
    }
```

____________

### 3.3 Apps that contain breakfast recipes

__Query:__

```
GET app/_doc/_search

{
    "query": {
        "match" : {
            "app_desc" : {
                "query" : "breakfast recipe",
                "fuzziness": "AUTO"
            }
        }
    }
}
```

__Response: 723 hits in total; the first result is "Deliciously Ella"__
```
{...
  "hits" : {
    "total" : {
      "value" : 723,
      "relation" : "eq"
    },
    "max_score" : 21.456356,
    "hits" : [
      {
        "_index" : "app",
        "_type" : "_doc",
        "_id" : "2468",
        "_score" : 21.456356,
        "_source" : {
          "id" : 813952961,
          "track_name" : "Deliciously Ella",
          "size_bytes" : 522395648,
          "currency" : "USD",
          "price" : 3.99,
          "rating_count_tot" : 62,
          "rating_count_ver" : 0,
          "user_rating" : 3.5,
          "user_rating_ver" : 0.0,
          "ver" : "4.0.0",
          "cont_rating" : "4+",
          "prime_genre" : "Food & Drink",
          "sup_devices_num" : 37,
          "ipadSc_urls_num" : 4,
          "lang_num" : 1,
          "vpp_lic" : 1,
          "app_desc" : """
          }
         }
```
-----

### 3.4 Game apps price distributions

__Query:__

```
GET app/_doc/_search?size=0

{
  "aggs": {
    "game_App": {
      "filter": {
        "term": {
          "prime_genre.keyword": "Games"
        }
      },
      "aggs": {
        "price_range": {
          "range": {
            "field": "price",
            "keyed": true,
            "ranges": [
              {"key" : "free",
                "to": 0.01
              },
              {"key" : "cheap",
                "from": 0.01,
                "to": 1
              },
              {"key" : "average",
                "from": 1,
                "to": 5
              },
              {"key" : "expensive",
                "from": 5
              }
            ]
          }
        }
      }
    }
  }
}
```

#### Response:

```
{...
    "aggregations" : {
    "game_App" : {
      "doc_count" : 3862,
      "price_range" : {
        "buckets" : {
          "free" : {
            "to" : 0.01,
            "doc_count" : 2257
          },
          "cheap" : {
            "from" : 0.01,
            "to" : 1.0,
            "doc_count" : 435
          },
          "average" : {
            "from" : 1.0,
            "to" : 5.0,
            "doc_count" : 937
          },
          "expensive" : {
            "from" : 5.0,
            "doc_count" : 233
          }
        }
      }
    }
```
---------------

### 3.5 Add more searching criteria

```
GET app/_doc/_search
{
  "query": {
    "bool": {
      "filter": [
        {
          "range": {
            "user_rating": {
              "gte": 4.0
            }
          }
        },
        {
          "term": {
            "prime_genre.keyword": "Food & Drink"
          }
        }
      ],
      "must": {
        "match": {
          "app_desc": {
            "query": "breakfast recipe",
            "fuzziness": "AUTO"
          }
        }
      }
    }
  }
}
```

__Response:__

```
{...
"hits" : {
    "total" : {
      "value" : 16,
      "relation" : "eq"
    },
    "max_score" : 16.91071,
    "hits" : [
      {
        "_index" : "app",
        "_type" : "_doc",
        "_id" : "5318",
        "_score" : 16.91071,
        "_source" : {
          "id" : 1079607960,
          "track_name" : "Oh She Glows - Healthy Plant-Based Recipes",
          "size_bytes" : 69006336,
          "currency" : "USD",
          "price" : 1.99,
          "rating_count_tot" : 764,
          "rating_count_ver" : 22,
          "user_rating" : 5.0,
          "user_rating_ver" : 4.5,
          "ver" : "1.1.6",
          "cont_rating" : "4+",
          "prime_genre" : "Food & Drink",
          "sup_devices_num" : 37,
          "ipadSc_urls_num" : 5,
          "lang_num" : 1,
          "vpp_lic" : 1,
          "app_desc" : """...}
          
```
          
------------

## 4. Document similarity search

__Add text analyzer to app_desc field__

```
PUT /app
{
  "settings": {
    "analysis": {
      "analyzer": {
        "english_analyzer": {
          "type": "standard",
          "stopwords": "_english_"
        }
      }
    }
  }
}

app/_mappings
{
...
"app_desc": {
      "type": "text",
      "fields": {
        "keyword": {
          "type": "keyword",
          "ignore_above": 256
        }
      },
      "analyzer": "english_analyzer"
      }
...
```

### 4.1 Execute a _"more like this"_ query for similarity search

```
GET app/_doc/_search
{
  "query": {
    "more_like_this": {
      "fields": ["app_desc"],
      "like": "breakfast egg and toast cooking recipe",
      "min_term_freq": 1,
      "max_query_terms": 50,
      "min_doc_freq": 4
    }
  }
}
```

__Response:__
```
 "hits" : {
    "total" : {
      "value" : 132,
      "relation" : "eq"
    },
    "max_score" : 20.824436,
    "hits" : [
      {
        "_index" : "app",
        "_type" : "_doc",
        "_id" : "5318",
        "_score" : 20.824436,
        "_source" : {
          "id" : 1079607960,
          "track_name" : "Oh She Glows - Healthy Plant-Based Recipes",
          "size_bytes" : 69006336,
          "currency" : "USD",
          "price" : 1.99,
          "rating_count_tot" : 764,
          "rating_count_ver" : 22,
          "user_rating" : 5.0,
          "user_rating_ver" : 4.5,
          "ver" : "1.1.6",
          "cont_rating" : "4+",
          "prime_genre" : "Food & Drink",
          "sup_devices_num" : 37,
          "ipadSc_urls_num" : 5,
          "lang_num" : 1,
          "vpp_lic" : 1,
          "app_desc" : """ We're thrilled to announce that we're one of App Store's Best of 2016!...}""" 
```
