In [174]:
import json
from time import sleep

import requests
from bs4 import BeautifulSoup


def parse(u):
    title = '-'
    submit_by = '-'
    description = '-'
    calories = 0
    ingredients = []
    rec = {}

    try:
        r = requests.get(u, headers=headers)

        if r.status_code == 200:
            html = r.text
            soup = BeautifulSoup(html, 'lxml')
            # title
            title_section = soup.select('.recipe-summary__h1')
            # submitter
            submitter_section = soup.select('.submitter__name')
            # description
            description_section = soup.select('.submitter__description')
            # ingredients
            ingredients_section = soup.select('.recipe-ingred_txt')

            # calories
            calories_section = soup.select('.calorie-count')
            if calories_section:
                calories = calories_section[0].text.replace('cals', '').strip()

            if ingredients_section:
                for ingredient in ingredients_section:
                    ingredient_text = ingredient.text.strip()
                    if 'Add all ingredients to list' not in ingredient_text and ingredient_text != '':
                        ingredients.append({'step': ingredient.text.strip()})

            if description_section:
                description = description_section[0].text.strip().replace('"', '')

            if submitter_section:
                submit_by = submitter_section[0].text.strip()

            if title_section:
                title = title_section[0].text

            rec = {'title': title, 'submitter': submit_by, 'description': description, 'calories': calories,
                   'ingredients': ingredients}
    except Exception as ex:
        print('Exception while parsing')
        print(str(ex))
    finally:
        return json.dumps(rec)


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        'Pragma': 'no-cache'
    }
    url = 'https://www.allrecipes.com/recipes/96/salad/'
    r = requests.get(url, headers=headers)
    if r.status_code == 200:
        html = r.text
        soup = BeautifulSoup(html, 'lxml')
        links = soup.select('.fixed-recipe-card__h3 a')
        results_list =[]
        for link in links:
            sleep(2)
            result = parse(link['href'])
            results_list.append(result)
print (results_list)
            

['{"title": "Apple Avocado Salad with Tangerine Dressing", "submitter": "Donna Smally", "description": "This easy salad is perfect for a hot day and it is very filling. You wouldn\'t think this mixture of apples, avocados, blue cheese, and tangerine dressing would go together, but it\'s wonderful!", "calories": "144", "ingredients": [{"step": "1 (10 ounce) package baby greens"}, {"step": "1/4 cup chopped red onion"}, {"step": "1/2 cup chopped walnuts"}, {"step": "1/3 cup crumbled blue cheese"}, {"step": "2 teaspoons lemon zest"}, {"step": "1 apple - peeled, cored and sliced"}, {"step": "1 avocado - peeled, pitted and diced"}, {"step": "4 mandarin oranges, juiced"}, {"step": "1/2 lemon, juiced"}, {"step": "1/2 teaspoon lemon zest"}, {"step": "1 clove garlic, minced"}, {"step": "2 tablespoons olive oil"}, {"step": "salt to taste"}]}', '{"title": "All-American Loaded Baked Potato Salad", "submitter": "spicejenmom", "description": "This is a great twist on two all-American favorites--the p

In [175]:
import logging
def connect_elasticsearch():
    _es = None
    _es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    if _es.ping():
        print('Yay Connect')
    else:
        print('Awww it could not connect!')
    return _es

if __name__ == '__main__':
    logging.basicConfig(level=logging.ERROR)

In [176]:
def create_index(es_object, index_name='recipes'):
    created = False
    # index settings
    """
    settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "salads": {
                "dynamic": "strict",
                "properties": {
                    "title": {
                        "type": "text"
                    },
                    "submitter": {
                        "type": "text"
                    },
                    "description": {
                        "type": "text"
                    },
                    "calories": {
                        "type": "integer"
                    },
                    "ingredients": {
                        "type": "nested",
                        "properties": {
                            "step": {"type": "text"}
                        }
                    },
                }
            }
        }

    }
    """
    try:
        if not es_object.indices.exists(index_name):
            # Ignore 400 means to ignore "Index Already Exist" error.
            es_object.indices.create(index=index_name, ignore=400, body=settings)
            print('Created Index')
        created = True
    except Exception as ex:
        print(str(ex))
    finally:
        return created

In [177]:
# Import Elasticsearch package 
from elasticsearch import Elasticsearch 
# Connect to the elastic cluster
es=Elasticsearch([{'host':'localhost','port':9200}])
es

<Elasticsearch([{'host': 'localhost', 'port': 9200}])>

In [178]:
#es.indices.delete(index = 'recipes')

In [179]:
connect_elasticsearch()

Yay Connect


<Elasticsearch([{'host': 'localhost', 'port': 9200}])>

In [180]:
create_index(connect_elasticsearch(),index_name='recipes')

Yay Connect


True

In [181]:
res= es.search(index='recipes')

In [182]:
res

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 0, 'max_score': None, 'hits': []}}

In [183]:
es.indices.get_mapping('recipes')

{'recipes': {'mappings': {}}}

In [184]:
es.search(index = 'recipes')

{'took': 4,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 0, 'max_score': None, 'hits': []}}

In [185]:
def store_record(elastic_object, index_name, record):
    try:
        outcome = elastic_object.index(index=index_name, doc_type='salads', body=record)
    except Exception as ex:
        print('Error in indexing data')
        print(str(ex))

In [193]:
es.indices.get(index='recipes')

{'recipes': {'aliases': {},
  'mappings': {},
  'settings': {'index': {'creation_date': '1539083122536',
    'number_of_shards': '5',
    'number_of_replicas': '1',
    'uuid': '0CwaEGtCTUiNgoXQXxvQ_w',
    'version': {'created': '6040199'},
    'provided_name': 'recipes'}}}}

In [194]:
import json
json.loads(results_list[0])

{'title': 'Apple Avocado Salad with Tangerine Dressing',
 'submitter': 'Donna Smally',
 'description': "This easy salad is perfect for a hot day and it is very filling. You wouldn't think this mixture of apples, avocados, blue cheese, and tangerine dressing would go together, but it's wonderful!",
 'calories': '144',
 'ingredients': [{'step': '1 (10 ounce) package baby greens'},
  {'step': '1/4 cup chopped red onion'},
  {'step': '1/2 cup chopped walnuts'},
  {'step': '1/3 cup crumbled blue cheese'},
  {'step': '2 teaspoons lemon zest'},
  {'step': '1 apple - peeled, cored and sliced'},
  {'step': '1 avocado - peeled, pitted and diced'},
  {'step': '4 mandarin oranges, juiced'},
  {'step': '1/2 lemon, juiced'},
  {'step': '1/2 teaspoon lemon zest'},
  {'step': '1 clove garlic, minced'},
  {'step': '2 tablespoons olive oil'},
  {'step': 'salt to taste'}]}

In [195]:
for e in results_list:
    #print (e)
    store_record(connect_elasticsearch(),index_name='recipes', record = json.loads(e))

Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect
Yay Connect


In [196]:
es.search(index = 'recipes')

{'took': 7,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 29,
  'max_score': 1.0,
  'hits': [{'_index': 'recipes',
    '_type': 'salads',
    '_id': 'cNONWGYBUEkhmG_89Qw4',
    '_score': 1.0,
    '_source': {'title': 'Watermelon Basil Salad',
     'submitter': 'Chefthompson.com',
     'description': 'A quick salad of watermelon and basil. The chili powder plays well with the sweetness of the melon.',
     'calories': '10',
     'ingredients': [{'step': '1/4 cup basil leaves'},
      {'step': '4 cups 1/2-inch cubes watermelon'},
      {'step': '2 teaspoons lemon juice'},
      {'step': '1/4 teaspoon kosher salt'},
      {'step': '1/4 teaspoon chili powder'}]}},
   {'_index': 'recipes',
    '_type': 'salads',
    '_id': 'iNONWGYBUEkhmG_8_wxn',
    '_score': 1.0,
    '_source': {'title': 'Black Bean and Couscous Salad',
     'submitter': 'Paula',
     'description': 'This is a great salad for a buffet, with interesting textur