In [1]:
import requests
import json
import os

# Input data is expected in <parent of the current working directory>/data,
# so the resolved path depends on where the kernel was started.
DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), 'data')
# JSON dump (tmdb.json) that is read by extract().
JSON_PATH = os.path.join(DATA_DIR, 'tmdb.json')
# Base URL of the Elasticsearch node; note the trailing slash.
ELASTIC_URL = 'http://localhost:9200/'

## <center> Проверка связи с Elasticsearch </center>

```
curl -XGET "http://elastic:9200/"
```

In [2]:
def check_conn(url=None, timeout=10):
    """Send a GET request to the Elasticsearch root endpoint and return the response.

    Args:
        url: Base URL to probe. Defaults to the notebook-level ``ELASTIC_URL``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would freeze the
            kernel when the node is unreachable.

    Returns:
        The ``requests`` response object (also printed, so the status is
        visible in the cell output).

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            timeout expires.
    """
    if url is None:
        # Resolved at call time so a later change to ELASTIC_URL is honoured.
        url = ELASTIC_URL
    resp = requests.get(url, timeout=timeout)
    print(resp)
    return resp

In [3]:
# Probe the Elasticsearch root endpoint; check_conn() prints the response it returns.
resp = check_conn()

<Response [200]>


In [4]:
# Show the decoded JSON body of the root-endpoint response (node, cluster and version info).
resp.json()

{'name': '0fzEmm7',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': '0jl7TR0GTBS7WyV38Una8w',
 'version': {'number': '6.5.4',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': 'd2ef93d',
  'build_date': '2018-12-17T21:17:40.758843Z',
  'build_snapshot': False,
  'lucene_version': '7.5.0',
  'minimum_wire_compatibility_version': '5.6.0',
  'minimum_index_compatibility_version': '5.0.0'},
 'tagline': 'You Know, for Search'}

In [5]:
# Pull the server version string out of the root-endpoint response and display it.
ELASTIC_VERSION = resp.json()['version']['number']
ELASTIC_VERSION

'6.5.4'

## <center> Переиндексация всех документов </center>

Удаляем и снова создаем индекс. С помощью bulk API индексируем документы.

```
curl -XDELETE "http://elastic:9200/tmdb"

curl -XPUT "http://elastic:9200/tmdb" -H 'Content-Type: application/json' -d'
{
    "settings" : {
        "index" : {
            "number_of_shards" : 1, 
            "number_of_replicas" : 0 
        }
    }
}'

```

Чтобы убедиться, что индекс создался с правильными настройками, выполните:
```
curl -XGET "http://elastic:9200/tmdb/_settings"
```

In [6]:
def extract(path=None):
    """Load and parse the movie JSON dump.

    Args:
        path: Location of the JSON file. Defaults to the notebook-level
            ``JSON_PATH``.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    if path is None:
        path = JSON_PATH
    # Read explicitly as UTF-8: the platform default encoding (e.g. cp1252 on
    # Windows) would corrupt or reject non-ASCII text in the dump.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [7]:
# Parse the JSON dump into memory.
data = extract()

In [8]:
# Peek at a single record (the first key in iteration order) to see which fields it carries.
some_key = list(data.keys())[0]
data[some_key].keys()

dict_keys(['poster_path', 'production_countries', 'revenue', 'overview', 'video', 'id', 'genres', 'title', 'tagline', 'vote_count', 'homepage', 'belongs_to_collection', 'original_language', 'status', 'spoken_languages', 'imdb_id', 'adult', 'backdrop_path', 'production_companies', 'release_date', 'popularity', 'original_title', 'budget', 'cast', 'directors', 'vote_average', 'runtime'])

In [9]:
def _build_index_body(analysisSettings, mappingSettings):
    """Assemble the create-index request body: base settings plus optional analysis/mappings."""
    body = {
        "settings": {
            "index": {
                "number_of_shards": 1,
                "number_of_replicas": 0
            }
        }
    }

    if analysisSettings:
        # Analysis configuration is an index setting. As a top-level key of the
        # request body Elasticsearch rejects it as unknown.
        body["settings"]["index"]["analysis"] = analysisSettings

    if mappingSettings:
        body["mappings"] = mappingSettings

    return body


def _build_bulk_payload(movieDict, index):
    """Serialize movies into the newline-delimited action/source pairs used by the bulk API."""
    lines = []
    for movie in movieDict.values():
        action = {"index": {"_index": index,
                            "_type": "_doc",
                            "_id": movie["id"]}}
        lines.append(json.dumps(action))
        lines.append(json.dumps(movie))
    # Every line, including the last one, must be terminated by a newline.
    return "".join(line + "\n" for line in lines)


def reindex(analysisSettings=None, mappingSettings=None, movieDict=None,
            index="tmdb", esUrl=None):
    """Drop and recreate an index, then bulk-index the given movies into it.

    Args:
        analysisSettings: Optional analysis configuration, stored under
            ``settings.index.analysis`` of the new index.
        mappingSettings: Optional value for the ``mappings`` key of the
            create-index request.
        movieDict: Mapping whose values are movie documents; each document
            must contain an ``"id"`` key, which becomes the document ``_id``.
        index: Name of the index to rebuild.
        esUrl: Base URL of Elasticsearch. Defaults to the notebook-level
            ``ELASTIC_URL``.

    Returns:
        The response of the bulk request, or the response of the create-index
        request when there are no movies to index.

    Raises:
        requests.exceptions.HTTPError: if the index could not be created.
    """
    base_url = (esUrl or ELASTIC_URL).rstrip("/")
    index_url = base_url + "/" + index

    # Best effort: the index may not exist yet, so the outcome is not checked.
    requests.delete(index_url)

    resp = requests.put(index_url,
                        data=json.dumps(_build_index_body(analysisSettings, mappingSettings)),
                        headers={'Content-Type': 'application/json'})
    # Fail loudly here: otherwise the bulk request below would auto-create the
    # index with default settings and hide the rejected configuration.
    resp.raise_for_status()

    if not movieDict:
        # Nothing to index; an empty bulk body would only produce an error response.
        return resp

    return requests.post(base_url + "/_bulk",
                         headers={'Content-Type': 'application/x-ndjson'},
                         data=_build_bulk_payload(movieDict, index))

In [10]:
%%time
# Destructive: deletes the existing `tmdb` index, recreates it and bulk-indexes every loaded movie.
resp = reindex(movieDict=data)

CPU times: user 176 ms, sys: 7.57 ms, total: 183 ms
Wall time: 1.53 s


In [11]:
# HTTP status of the bulk request.
# NOTE(review): the bulk API reports per-document failures inside the response
# body, so 200 alone may not mean every movie was indexed — confirm via resp.json()['errors'].
resp.status_code

200