In [4]:
import urllib.request
import os
import json
from elasticsearch import Elasticsearch

# Base URL of The Movie Database (TMDB) API, version 3.
API_URL = 'https://api.themoviedb.org/3/'
# Endpoint URLs; each ends with '?' so query parameters can be appended directly.
API_URL_MOVIE_DISCOVER = API_URL + 'discover/movie?'
API_URL_MOVIE_GENRES = API_URL + 'genre/movie/list?'

# Address handed to the Elasticsearch client when indexing.
ES_URL = "http://192.168.11.2:9201"

# a. ways to open a URL and read its contents
# contents = urllib.request.urlopen("https://stackoverflow.com/questions/645312/what-is-the-quickest-way-to-http-get-in-python").read()
# OR
# response = requests.get("https://jsonplaceholder.typicode.com/todos")
# todos = json.loads(response.text)

# b. read the TMDB API key from the host environment; os.environ[...] raises
#    KeyError when this line runs if API_KEY is not set
_apiKey = os.environ['API_KEY']

# ----------------------------
# get movies by a given range
# ----------------------------
def _getMoviesByRange(startDate, endDate, pageNum):
    """Fetch one page of TMDB "discover movie" results for a release-date range.

    Results are requested in English, sorted by ascending release date, with
    adult titles and videos excluded.

    Args:
        startDate: value sent as ``primary_release_date.gte``,
            e.g. '2009-01-01'.
        endDate: value sent as ``primary_release_date.lte``.
        pageNum: page number to request; any falsy value (None, 0) selects
            page 1.

    Returns:
        The decoded JSON response, or None (after printing a message) when
        either date is missing.

    Raises:
        urllib.error.URLError: if the HTTP request fails.
        ValueError: if the response body is not valid JSON.
    """
    # validation on date(s); reject None / empty values
    if not startDate:
        print('missing StartDate')
        return None
    if not endDate:
        print('missing EndDate')
        return None

    # local import keeps this function self-contained
    from urllib.parse import urlencode

    # urlencode escapes every value, so an API key or date containing
    # reserved characters cannot corrupt the query string
    query = urlencode({
        'api_key': _apiKey,
        'language': 'en-US',
        'sort_by': 'release_date.asc',
        'include_adult': 'false',
        'include_video': 'false',
        'primary_release_date.gte': startDate,
        'primary_release_date.lte': endDate,
        'page': pageNum or 1,
    })

    # the context manager closes the HTTP response even if read/parse fails
    with urllib.request.urlopen(API_URL_MOVIE_DISCOVER + query) as response:
        return json.loads(response.read())

# ----------------------------
# get genres description
# ----------------------------
def _getGenreDescription():
    """Fetch the TMDB movie genre list, requested with language en-US.

    Returns:
        The value stored under the 'genres' key of the decoded JSON response.
        _translateGenreId2Desc reads an 'id' and a 'name' from each entry.

    Raises:
        urllib.error.URLError: if the HTTP request fails.
        KeyError: if the response has no 'genres' key.
    """
    # local import keeps this function self-contained
    from urllib.parse import urlencode

    # urlencode escapes the API key instead of pasting it in verbatim
    _api = API_URL_MOVIE_GENRES + urlencode({
        'api_key': _apiKey,
        'language': 'en-US',
    })

    # the context manager closes the HTTP response once it has been read
    with urllib.request.urlopen(_api) as _response:
        _jsonContent = json.loads(_response.read())

    return _jsonContent['genres']

# ----------------------------
# translate the genre_id to 
# genre_description as well
# ----------------------------
def _translateGenreId2Desc(jsonMovie, _jsonGenreArr):
    """Attach genre names to a movie dict, based on its 'genre_ids'.

    Each id in jsonMovie['genre_ids'] is looked up in _jsonGenreArr (entries
    with 'id' and 'name'); the first entry with a matching id supplies the
    name, and ids with no match are skipped. When at least one name was
    found, the list is stored under 'genre_descs'.

    The movie dict is modified in place and also returned.
    """
    # first genre entry matching each id, or None when the id is unknown
    firstMatches = (
        next((genre for genre in _jsonGenreArr if genre['id'] == genreId), None)
        for genreId in jsonMovie['genre_ids']
    )
    genreNames = [genre['name'] for genre in firstMatches if genre is not None]

    # only add the key when there is something to report
    if genreNames:
        jsonMovie['genre_descs'] = genreNames

    return jsonMovie

# ----------------------------
def _indexToES(startDate, endDate, jsonResponse, jsonGenresArr):
    """Bulk-index every movie of a date-range query into Elasticsearch.

    Page 1 is taken from jsonResponse; the remaining pages (up to
    jsonResponse['total_pages']) are fetched with _getMoviesByRange. Each
    movie gets its genre names added via _translateGenreId2Desc and is sent
    to the 'movies_4_blogs' index in a single bulk request.

    Args:
        startDate: start of the release-date range, passed on when fetching
            further pages.
        endDate: end of the release-date range.
        jsonResponse: decoded response for page 1; must contain
            'total_pages' and 'results'.
        jsonGenresArr: genre entries used to translate genre ids to names.

    Returns:
        None. Progress dots and a final status line are printed.
    """
    numPages = jsonResponse['total_pages']
    # collect bulk lines in a list and join once, rather than growing a
    # string with += for every document
    bulkLines = []

    # loop through all pages
    for i in range(numPages):
        if i == 0:
            # page 1 was already fetched by the caller
            pageResponse = jsonResponse
        else:
            # progress marker every 5th page; end='' avoids a newline
            if i % 5 == 0:
                print('.', end='')
            # i is 0 based... so actual page => i+1
            pageResponse = _getMoviesByRange(startDate, endDate, i + 1)

        for res in pageResponse['results']:
            # translate the genre
            res = _translateGenreId2Desc(res, jsonGenresArr)
            # action meta line followed by the document itself
            bulkLines.append('{ "index" : { } }\n')
            bulkLines.append(json.dumps(res) + '\n')

    # an empty bulk body is not a valid request, so skip the round trip
    if not bulkLines:
        print('nothing to index ', startDate, ' - ', endDate)
        return

    body = ''.join(bulkLines)

    # create es connector
    es = Elasticsearch([ES_URL])
    # NOTE(review): doc_type looks deprecated in newer Elasticsearch
    # clients - confirm against the installed version
    esResponse = es.bulk(body=body, index='movies_4_blogs', doc_type='_doc')
    print('all done, errors? ', esResponse['errors'], ' ', startDate, ' - ', endDate)

# c1. index all movies released from 2009-01-01 to 2010-12-31:
#     fetch page 1 (which also carries the total page count), fetch the genre
#     list, then let _indexToES page through the rest and bulk-index them
sDate = '2009-01-01'
eDate = '2010-12-31'
_json = _getMoviesByRange(sDate, eDate, 1)
_jsonGenres = _getGenreDescription()
_indexToES(sDate, eDate, _json, _jsonGenres)





...................................................................................................................................................................................all done, errors?  False   2007-01-01  -  2008-12-31
