Scrape the recipe pages referenced in the input file to collect their food categories. The notebook scrapes all pages and then saves the results to a file.

In [1]:
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4 import Comment

## Read file

You have to specify the number of rows to read from the file - first check how many rows there are in the CSV file.
Without the 'nrows' parameter an exception occurs.

In [78]:
# Load the input CSV; `nrows` limits how many rows are parsed from the file.
df = pd.read_csv(
    'food_data_excerpt/food_data_excerpt.csv',
    sep=',',
    encoding='utf8',
    nrows=336936,
)

In [9]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,country,country_region,zip,city,date,target_link,origin_link,host_type,recipe_id
0,DE,7,47169,Duisburg,2013-02-28 23:59:59,/rezept/443097/Abendbrot-UEberbackenes-Broetch...,http://www.kochbar.de/kochen/Auflaeufe-zuberei...,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) ...,443097
1,DE,9,66564,Ottweiler,2013-03-01 00:00:01,/rezept/108268/Coq-au-vin.html,http://www.kochbar.de/rezepte/coq-au-vin.html,Mozilla/5.0 (Windows NT 5.1; rv:19.0) Gecko/20...,108268
2,DE,7,-1,Hattingen,2013-03-01 00:00:02,/rezept/231469/Kuchen-Blechkuchen-Kaesekuchen-...,-,Mozilla/5.0 (iPad; CPU OS 6_0_1 like Mac OS X)...,231469
3,DE,5,-1,Sulzbach,2013-03-01 00:00:04,/rezept/410839/Bier-Wackler-Dessert.html,http://www.google.de/url?sa=t&rct=j&q=wackler%...,Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKi...,410839
4,DE,11,-1,Calau,2013-03-01 00:00:06,/rezept/392545/Black-Bottom-Cupcake.html,http://www.kochbar.de/rezepte/cupcakes.html?of...,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:19.0) G...,392545


## Collect data

In [28]:
def find_comment(soup, wanted_comment='REZEPT-KATEGORIEN'):
    """Return the first HTML comment whose stripped text equals ``wanted_comment``.

    Args:
        soup: Parsed document (or any object) exposing ``find_all(string=...)``.
        wanted_comment: Exact comment text to look for, compared after
            stripping surrounding whitespace from each comment.

    Returns:
        The matching comment node, or ``None`` when the document contains no
        such comment (including the case where it contains no comments at all).
    """
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for node in comments:
        if node.strip() == wanted_comment:
            return node
    # Signal "not found" explicitly instead of handing back an unrelated comment.
    return None

In [29]:
def find_categories(soup, wanted_class = 'rtli-pb-small'):
    """Collect the category names listed after the category marker comment.

    Starting at the comment returned by ``find_comment(soup)``, the following
    sibling tags are visited in document order. Each sibling whose first CSS
    class equals ``wanted_class`` contributes the text of its first nested tag,
    provided that tag is an ``<a>`` or a ``<span>``. The walk stops at the
    first non-matching sibling encountered after at least one match.

    Args:
        soup: Parsed recipe page.
        wanted_class: CSS class that identifies a category item.

    Returns:
        list of str: Category names in page order. Empty when
        ``find_comment(soup)`` returns ``None`` or no sibling matches.
    """
    data = []
    marker = find_comment(soup)
    if marker is None:
        return data

    started = False
    sibling = marker.find_next_sibling()
    while sibling is not None:
        # Siblings without a class attribute are valid markup; treat them as non-matching.
        classes = sibling.get('class') or []
        if classes and classes[0] == wanted_class:
            started = True
            first_tag = sibling.find()
            # An item without any nested tag carries no category name; skip it.
            if first_tag is not None and first_tag.name in ('a', 'span'):
                data.append(first_tag.text)
        elif started:
            # The category items form one contiguous run; it has ended.
            break
        sibling = sibling.find_next_sibling()

    return data

In [16]:
# Base URL that is prepended to every relative `target_link` before it is requested.
domain = "http://www.kochbar.de"

Remember to specify which rows to process. If 'all_rows' is set to True, 'start_index' and 'end_index' are not taken into account.

In [76]:
# Row window (start inclusive, end exclusive) that is scraped when `all_rows` is False.
start_index = 40
end_index = 50
# When True, every loaded row is scraped and the window above is ignored.
all_rows = True

In [None]:
# Scrape the category list of every recipe page referenced in `df`.
# `data` receives one entry per row of `df`: the list returned by
# find_categories(), or None when the page could not be fetched.
REQUEST_TIMEOUT = 30  # seconds; without a timeout one stalled connection blocks the whole run
# Requests that end up on this URL are treated as "recipe probably no longer available".
REMOVED_RECIPE_URL = 'https://www.kochbar.de/rezepte/'

start = time.time()

data = []

if not all_rows:
    # .copy() so that adding columns later works on an independent frame, not a slice.
    df = df[start_index:end_index].copy()

# One session reuses the connection across the many requests to the same host.
with requests.Session() as session:
    for url in df['target_link']:
        try:
            response = session.get(domain + url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # A network failure on a single page must not abort the whole run.
            print('Request failed for', url, '-', error)
            data.append(None)
            continue

        if response.status_code != 200 or response.url == REMOVED_RECIPE_URL:
            # Bad status or redirected to the recipe overview: nothing to parse.
            data.append(None)
            continue

        soup = BeautifulSoup(response.content.decode('utf-8', 'ignore'), "lxml")
        data.append(find_categories(soup))

print(time.time() - start)

## Extend dataframe

In [72]:
# assign() returns a new frame instead of writing into `df` in place, so no
# SettingWithCopyWarning is raised when `df` is a slice of the loaded data.
# `data` must hold exactly one entry per row of `df`.
df = df.assign(categories=data)
df.head()

Unnamed: 0,country,country_region,zip,city,date,target_link,origin_link,host_type,recipe_id,categories
40,DE,13,4651,Bad_Lausick,2013-03-01 00:00:58,/rezept/363977/Entenbraten-klassisch.html,http://www.google.de/url?sa=t&rct=j&q=&esrc=s&...,Mozilla/5.0 (Windows NT 5.1; rv:19.0) Gecko/20...,363977,"[Fleisch, Sächsische Küche, Mittagstisch, Haup..."
41,DE,9,66564,Ottweiler,2013-03-01 00:01:00,/rezept/413547/Coq-au-Vin.html,http://www.kochbar.de/rezepte/coq-au-vin.html,Mozilla/5.0 (Windows NT 5.1; rv:19.0) Gecko/20...,413547,"[Europa, Frankreich, mit Alkohol, Fleisch, Mil..."
42,DE,11,-1,Brandenburg,2013-03-01 00:01:01,/rezept/397242/Spanferkelbraten-NT-mit-frische...,-,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,397242,"[Beilagen, mit Alkohol, Fleisch, Hauptspeise, ..."
43,DE,6,-1,Hanover,2013-03-01 00:01:01,/rezept/450876/Gemuesenudeln.html,http://www.kochbar.de/rezepte/alle-rezepte.htm...,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:19.0) G...,450876,"[Hauptspeise, schnell & einfach, Fleisch, lakt..."
44,DE,5,-1,Kassel,2013-03-01 00:01:01,/rezept/38466/Schneller-Himbeerkuchen.html,http://www.kochbar.de/rezepte/himbeertorte.htm...,Mozilla/5.0 (iPad; CPU OS 6_0_1 like Mac OS X)...,38466,"[Kuchen/Torte, ohne Weizen]"


In [73]:
# Reduce the frame to the columns that are written to the output file.
kept_columns = [
    'country',
    'country_region',
    'city',
    'date',
    'recipe_id',
    'categories',
]
df = df[kept_columns]
df.head(n=5)

Unnamed: 0,country,country_region,city,date,recipe_id,categories
40,DE,13,Bad_Lausick,2013-03-01 00:00:58,363977,"[Fleisch, Sächsische Küche, Mittagstisch, Haup..."
41,DE,9,Ottweiler,2013-03-01 00:01:00,413547,"[Europa, Frankreich, mit Alkohol, Fleisch, Mil..."
42,DE,11,Brandenburg,2013-03-01 00:01:01,397242,"[Beilagen, mit Alkohol, Fleisch, Hauptspeise, ..."
43,DE,6,Hanover,2013-03-01 00:01:01,450876,"[Hauptspeise, schnell & einfach, Fleisch, lakt..."
44,DE,5,Kassel,2013-03-01 00:01:01,38466,"[Kuchen/Torte, ohne Weizen]"


## Save data to file

In [74]:
# Create the output directory first so that a fresh checkout can run this cell.
Path('files').mkdir(parents=True, exist_ok=True)
# index=False is the documented way to leave the row index out of the file.
df.to_csv('files/categories.csv', index=False, header=True, encoding='utf8')

## Check correctness of the file

In [75]:
# Read the saved file back to inspect what was written.
pd.read_csv(filepath_or_buffer='files/categories.csv')

Unnamed: 0,country,country_region,city,date,recipe_id,categories
0,DE,13,Bad_Lausick,2013-03-01 00:00:58,363977,"['Fleisch', 'Sächsische Küche', 'Mittagstisch'..."
1,DE,9,Ottweiler,2013-03-01 00:01:00,413547,"['Europa', 'Frankreich', 'mit Alkohol', 'Fleis..."
2,DE,11,Brandenburg,2013-03-01 00:01:01,397242,"['Beilagen', 'mit Alkohol', 'Fleisch', 'Haupts..."
3,DE,6,Hanover,2013-03-01 00:01:01,450876,"['Hauptspeise', 'schnell & einfach', 'Fleisch'..."
4,DE,5,Kassel,2013-03-01 00:01:01,38466,"['Kuchen/Torte', 'ohne Weizen']"
5,DE,7,Essen,2013-03-01 00:01:02,317291,
6,DE,8,Koblenz,2013-03-01 00:01:02,458991,"['Konfekt/Süßware', 'Kuchen/Torte', 'Plätzchen..."
7,DE,2,Neusäß,2013-03-01 00:01:04,269164,"['Fisch', 'Hauptspeise', 'Gesund und Diät', 'o..."
8,DE,16,Berlin,2013-03-01 00:01:09,209594,"['Europa', 'Auflauf /Überbackenes', 'Fleisch',..."
9,CH,5,Bern,2013-03-01 00:01:10,421887,"['Salate', 'Fleisch', 'Mittagstisch', 'Abendes..."
