# Crawl keto website

In [37]:
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup

In [164]:
def get_content(url):
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'html.parser')
    return soup

def get_element(soup, element):
    el = soup.find(element)
    return el

def get_elements(soup, element):
    el = soup.find_all(element)
    return el

In [17]:
url = "https://www.ruled.me/keto-buffalo-chicken-bowl/"
el = 'table'

soup = get_content(url)
table = get_element(soup, el)

## Cleaning the nutrition table

In [25]:
tds = table.find_all('td')

arr = list()

for t in tds:
    arr.append(t.text)
    
arr = np.array(arr)   

clean_df = pd.DataFrame(np.reshape(arr,(int(len(arr)/7), 7)))
clean_df.columns = clean_df.iloc[0,:].tolist()
clean_df.drop(clean_df.index[0], inplace=True)

In [26]:
clean_df

Unnamed: 0,NUTRITION,CALORIES,FAT,CARBS,FIBER,NET CARBS,PROTEIN
1,12.00 ounce cooked chicken,748,44.2,0.0,0.0,0.0,78.2
2,3.00 tablespoon butter,305,34.5,0.0,0.0,0.0,0.4
3,0.25 cup hot sauce,6,0.2,1.0,0.2,0.8,0.3
4,12.00 ounce romaine lettuce,58,1.0,11.2,7.1,4.1,4.2
5,3.00 ounce cooked bacon,466,36.8,1.2,0.0,1.2,30.4
6,0.25 cup banana peppers,8,0.2,1.7,1.1,0.6,0.5
7,4.00 medium green onion,19,0.1,4.4,1.6,2.8,1.1
8,3.00 ounce cherry tomatoes,15,0.2,3.3,1.0,2.3,0.8
9,1.00 medium avocado,301,27.7,15.6,12.2,3.3,3.5
10,6.00 tablespoon ranch dressing,387,40.1,5.3,0.0,5.3,1.2


## Convert metrics
* From once to gr
* From tablespoon to gr
* cup to gr

In [35]:
nutri_list = clean_df["NUTRITION"].tolist()
nutri_list

['12.00 ounce cooked chicken',
 '3.00 tablespoon butter',
 '0.25 cup hot sauce',
 '12.00 ounce romaine lettuce',
 '3.00 ounce cooked bacon',
 '0.25 cup banana peppers',
 '4.00 medium green onion',
 '3.00 ounce cherry tomatoes',
 '1.00 medium avocado',
 '6.00 tablespoon ranch dressing',
 'Totals',
 'Per Serving (/3)']

In [156]:
def find_numbers(nutrition):
        x = re.findall(r'(\d+)',nutrition)
        return float(x[0] + '.' + x[1])
    
def generalize_measurements(nutrition):
    measurements = ["ounce","tablespoon", "cup"]
    meas_dic = {
        "ounce":28.3495231,
        "tablespoon": 14.3, 
        "cup":128
    }
    
    nut_list = nutrition.split(" ")
        
    if len(nut_list) > 1:    
        if isfloat(nut_list[0]):
            if nut_list[1] in measurements:
                    grams = str(int(meas_dic[nut_list[1]] * float(nut_list[0])))
                    nut_list[0] = grams
                    nut_list[1] = "grams"
                    return ' '.join(nut_list)
            else:
                    grams = str(int(float(nut_list[0])))
                    nut_list[0] = grams
                    return ' '.join(nut_list)
                
    return nutrition        
        
def isfloat(num):
    try:
        float(num)
        return True
    except ValueError:
        return False

In [157]:
converted_nut = list()
for n in nutri_list:
    y = generalize_measurements(n)
    converted_nut.append(y)
converted_nut

['340 grams cooked chicken',
 '42 grams butter',
 '32 grams hot sauce',
 '340 grams romaine lettuce',
 '85 grams cooked bacon',
 '32 grams banana peppers',
 '4 medium green onion',
 '85 grams cherry tomatoes',
 '1 medium avocado',
 '85 grams ranch dressing',
 'Totals',
 'Per Serving (/3)']

## Translate

In [158]:
from deep_translator import GoogleTranslator

In [159]:
def translationFromEntoBg(arr):
    translated_arr = []
    for sentance in arr:     
        translated_arr.append(GoogleTranslator(source='auto', target='bg').translate(sentance))
    return translated_arr

In [162]:
translated = translationFromEntoBg(converted_nut)
translated

['340 грама варено пилешко',
 '42 грама масло',
 '32 грама лют сос',
 '340 грама маруля ромен',
 '85 грама варен бекон',
 '32 грама бананови чушки',
 '4 средни зелен лук',
 '85 грама чери домати',
 '1 средно авокадо',
 '85 грама ранчо дресинг',
 'Общо',
 'На порция (/3)']

In [163]:
clean_df["NUTRITION"] = translated
clean_df

Unnamed: 0,NUTRITION,CALORIES,FAT,CARBS,FIBER,NET CARBS,PROTEIN
1,340 грама варено пилешко,748,44.2,0.0,0.0,0.0,78.2
2,42 грама масло,305,34.5,0.0,0.0,0.0,0.4
3,32 грама лют сос,6,0.2,1.0,0.2,0.8,0.3
4,340 грама маруля ромен,58,1.0,11.2,7.1,4.1,4.2
5,85 грама варен бекон,466,36.8,1.2,0.0,1.2,30.4
6,32 грама бананови чушки,8,0.2,1.7,1.1,0.6,0.5
7,4 средни зелен лук,19,0.1,4.4,1.6,2.8,1.1
8,85 грама чери домати,15,0.2,3.3,1.0,2.3,0.8
9,1 средно авокадо,301,27.7,15.6,12.2,3.3,3.5
10,85 грама ранчо дресинг,387,40.1,5.3,0.0,5.3,1.2


## Get Images

In [190]:
import matplotlib.pyplot as plt
import matplotlib.image as mpim

In [205]:
title = get_element(soup, 'h1').text
title = title.lower().replace(' ','_')

images = get_elements(soup, 'img')
cnt = 1
for i in images[:1]:
    try:
        if int(i['width']) >= 600:
#             display_img(i['src'])
            lnk = i.get('src')
            print('images/'+title+'_'+str(cnt)+'.jpg')
            with open('images/'+title+'_'+str(cnt)+'.jpg', "r") as f:
                f.write(requests.get(lnk).content)
                cnt+=1

    except:
        print("An exception occurred")  

images/keto_buffalo_chicken_bowl_1.jpg
An exception occurred


In [181]:
import pandas as pd
from IPython.display import Image, HTML

def display_img(url):
    row_1 = [example.attrs['alt'], '<img src="images/book1.jpg"/>']
    df = pd.DataFrame(row_1).transpose()
    df.columns = ['Title', 'Cover']
    HTML(df.to_html(escape=False))