In [None]:
from pyelasticsearch.client import ElasticSearch
import ast, csv, requests, json, operator

def list_to_string_ingredients(recipes):
    """Flatten each recipe's ingredient list into one comma-separated string.

    Every recipe dict in ``recipes`` is modified in place: its
    ``'ingredients'`` value is replaced by its items joined with ``', '``.

    Returns the same ``recipes`` object that was passed in.
    """
    separator = ', '
    for entry in recipes:
        entry['ingredients'] = separator.join(entry['ingredients'])
    return recipes

def index_recipes_in_ES(index, recipes):
    """Index every recipe into the given Elasticsearch index.

    Each recipe is stored with doc type ``'recipe'`` under its own ``'id'``
    value. Whenever the ``'created'`` flag of a response is falsy, the whole
    response is printed.

    index   -- name of the target index
    recipes -- iterable of recipe dicts, each with an ``'id'`` key
    """
    es = ElasticSearch()
    for recipe in recipes:
        res = es.index(index=index, doc_type='recipe', id=recipe['id'], doc=recipe)
        if not res['created']:
            # print(...) with a single argument produces the same output
            # under Python 2 and Python 3.
            print(res)

def build_mlt(nb, doc_id, index="test-recipe", doc_type="recipe"):
    """Build the body of a ``more_like_this`` search request.

    The query looks for documents whose ``ingredients`` field resembles that
    of an already-indexed reference document.

    nb       -- number of hits to request (sent as ``size``)
    doc_id   -- ``_id`` of the reference document
    index    -- index that holds the reference document
    doc_type -- document type of the reference document

    Returns a plain dict that can be serialised as JSON.
    """
    reference_doc = {"_index": index, "_type": doc_type, "_id": doc_id}
    return {
        "from": 0,
        "size": nb,
        "query": {
            "more_like_this": {
                "fields": ["ingredients"],
                "like": [reference_doc],
                "min_term_freq": 1,
                "max_query_terms": 50,
                "minimum_should_match": "25%",
            },
        },
    }

def diff_distribution(similar_recipes, baseline=None):
    """Predict the cuisine that is most over-represented among neighbours.

    For every cuisine found in ``similar_recipes``, the share of neighbours
    having that cuisine is compared with the cuisine's share in ``baseline``;
    the cuisine with the largest difference is returned.

    similar_recipes -- sequence of recipe dicts, each with a ``'cuisine'`` key
    baseline        -- mapping cuisine -> share expressed as a fraction;
                       defaults to the module-level ``CUISINE_DISTRIBUTION``.
                       A cuisine missing from the mapping counts as 0.0.

    Returns the predicted cuisine. When ``similar_recipes`` is empty, the
    most frequent cuisine of ``baseline`` is returned instead.

    Raises ValueError when both ``similar_recipes`` and ``baseline`` are
    empty.
    """
    if baseline is None:
        baseline = CUISINE_DISTRIBUTION

    if not similar_recipes:
        if not baseline:
            raise ValueError("no similar recipes and no baseline distribution")
        # No neighbours to compare: fall back to the overall favourite.
        return max(baseline, key=baseline.get)

    counts = {}
    for recipe in similar_recipes:
        cuisine = recipe['cuisine']
        counts[cuisine] = counts.get(cuisine, 0) + 1

    # Both sides of the subtraction are fractions, so the baseline actually
    # weighs in; float() avoids integer division under Python 2.
    total = float(len(similar_recipes))
    scores = {}
    for cuisine, count in counts.items():
        scores[cuisine] = count / total - baseline.get(cuisine, 0.0)
    return max(scores, key=scores.get)

def extract_from_json(json):
    """Return the ``_source`` document of every hit in a search response.

    ``json`` is the decoded response body; the hits are read from
    ``json['hits']['hits']`` and returned in the same order.
    """
    return [entry['_source'] for entry in json['hits']['hits']]


def get_similar(nb, doc_id):
    """Fetch recipes from the ``train-recipe`` index similar to ``doc_id``.

    Builds a more_like_this query with ``build_mlt``, POSTs it as JSON to
    the ``train-recipe`` search endpoint on localhost:9200 and returns the
    ``_source`` documents extracted from the response.

    nb     -- number of hits to request
    doc_id -- id of the reference document
    """
    payload = json.dumps(build_mlt(nb, doc_id))
    reply = requests.post("http://localhost:9200/train-recipe/recipe/_search", data=payload)
    decoded = json.loads(reply.text)
    return extract_from_json(decoded)

def get_document_by_id(index, doc_id):
    """Look up one recipe document and return its ``_source`` content.

    index  -- name of the index to read from
    doc_id -- id of the document (doc type ``'recipe'``)
    """
    client = ElasticSearch()
    return client.get(index=index, doc_type='recipe', id=doc_id)['_source']

def predict(recipes_to_predict, predict_origin, mlt_nb=100):
    """Predict a value for each recipe from its most similar recipes.

    recipes_to_predict -- iterable of recipe dicts with an ``'id'`` key
    predict_origin     -- callable that receives the list of similar recipes
                          and returns a prediction
    mlt_nb             -- how many similar recipes to request per recipe

    Returns a list of ``(prediction, recipe_id)`` tuples in input order.
    """
    def classify(recipe_id):
        # Look up the neighbours first, then let the strategy decide.
        neighbours = get_similar(mlt_nb, recipe_id)
        return predict_origin(neighbours), recipe_id

    return [classify(recipe['id']) for recipe in recipes_to_predict]

def to_csv(predictions):
    """Write the predictions to ``results.csv`` in the working directory.

    predictions -- iterable of ``(cuisine, recipe_id)`` pairs

    The file starts with an ``id,cuisine`` header row, followed by one
    ``recipe_id,cuisine`` row per prediction. An existing file is
    overwritten.
    """
    import sys

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        out_file = open("results.csv", "wb")
    else:
        out_file = open("results.csv", "w", newline="")

    # The with-block guarantees the rows are flushed and the handle closed.
    with out_file:
        writer = csv.writer(out_file)
        writer.writerow(["id", "cuisine"])
        for prediction in predictions:
            # Stored as (cuisine, id); the file wants id first.
            writer.writerow([prediction[1], prediction[0]])

def get_cuisine_distribution(recipes):
    """Compute the share of each cuisine among ``recipes``.

    recipes -- sequence of recipe dicts, each with a ``'cuisine'`` key

    Returns a dict mapping cuisine -> float fraction (occurrences divided by
    the number of recipes). An empty input yields an empty dict.
    """
    tally = {}
    for entry in recipes:
        label = entry['cuisine']
        tally[label] = tally.get(label, 0) + 1

    return {
        label: float(count) / float(len(recipes))
        for label, count in tally.items()
    }

# Load the training and test recipes. The data files are JSON, so they are
# parsed with the json module; the with-blocks close each file after reading.
with open('data/train.json', 'r') as data_file:
    train_recipes = json.load(data_file)
with open('data/test.json', 'r') as data_file:
    test_recipes = json.load(data_file)

# Share of each cuisine in the training set; read by diff_distribution.
CUISINE_DISTRIBUTION = get_cuisine_distribution(train_recipes)

# Ingredient lists are turned into single strings before indexing.
train_recipes = list_to_string_ingredients(train_recipes)
test_recipes = list_to_string_ingredients(test_recipes)

index_recipes_in_ES('train-recipe', train_recipes)
index_recipes_in_ES('test-recipe', test_recipes)

# Predict each test recipe from 15 requested similar recipes and write the
# predictions out with to_csv.
results = predict(test_recipes, diff_distribution, 15)
to_csv(results)