# Webscraping lab

Practice your webscraping and parsing skills! 🎉

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

### Step 1: Create a soup object from the home page

In [2]:
# Download the lab's home page. `url` ends with a slash because later cells
# build each restaurant page address by appending the relative href to it.
url = 'https://pages.git.generalassemb.ly/rldaggie/for-scraping/'
# requests has no default timeout; without one this cell can hang forever
# if the server accepts the connection but never answers.
res = requests.get(url, timeout=30)
# Shown as the cell output: 200 means the page was fetched successfully.
res.status_code

200

In [3]:
# Parse the downloaded home page. The parser is named explicitly: when it is
# omitted, BeautifulSoup picks whichever parser happens to be installed and
# emits a GuessedAtParserWarning, so the parsed tree can differ per machine.
soup = BeautifulSoup(res.content, 'html.parser')

### Step 2: Scrape the home page soup for every restaurant

Note: Your best bet is to create a list of dictionaries, one for each restaurant. Each dictionary contains the restaurant's name and path from the `href`. The result of your scrape should look something like this:

```python
restaurants = [
    {'name': 'A&W Restaurants', 'href': 'restaurants/1.html'}, 
    {'name': "Applebee's", 'href': 'restaurants/2.html'},
    ...
]
```

In [4]:
# Build one {'name': ..., 'href': ...} dictionary per restaurant link found
# in the home page's table cells.
restaurants = []
for cell in soup.find_all('td'):
    # Look the anchor up once and reuse it for both the text and the href.
    link = cell.find('a')
    # Skip cells that hold no link (or a link without an href) instead of
    # failing with AttributeError / KeyError on them.
    if link is None or not link.has_attr('href'):
        continue
    restaurants.append({'name': link.text, 'href': link['href']})

In [5]:
# Display the scraped list of {'name', 'href'} restaurant dictionaries.
restaurants

[{'name': 'A&W Restaurants', 'href': 'restaurants/1.html'},
 {'name': "Applebee's", 'href': 'restaurants/2.html'},
 {'name': "Arby's", 'href': 'restaurants/3.html'},
 {'name': 'Atlanta Bread Company', 'href': 'restaurants/4.html'},
 {'name': "Bojangle's Famous Chicken 'n Biscuits",
  'href': 'restaurants/5.html'},
 {'name': 'Buffalo Wild Wings', 'href': 'restaurants/6.html'},
 {'name': 'Burger King', 'href': 'restaurants/7.html'},
 {'name': "Captain D's", 'href': 'restaurants/8.html'},
 {'name': "Carl's Jr.", 'href': 'restaurants/9.html'},
 {'name': "Charley's Grilled Subs", 'href': 'restaurants/10.html'},
 {'name': 'Chick-fil-A', 'href': 'restaurants/11.html'},
 {'name': "Chili's", 'href': 'restaurants/12.html'},
 {'name': 'Chipotle Mexican Grill', 'href': 'restaurants/13.html'},
 {'name': "Church's", 'href': 'restaurants/14.html'},
 {'name': 'Corner Bakery Cafe', 'href': 'restaurants/15.html'},
 {'name': 'Dairy Queen', 'href': 'restaurants/16.html'},
 {'name': "Denny's", 'href': 'res

### Step 3: Using the `href`, scrape each restaurant's page and create a single list of food dictionaries.

Your list of foods should look something like this:
```python
foods = [
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    ...
]
```

**Note**: Remove extra white space from each category

In [6]:
# Scrape every restaurant page into a flat list of food dictionaries.
# Table columns on each page, in order: Name, Category, Calories, Fat, Carbs
foods = []
for restaurant in restaurants:
    # Timeout so one stalled request cannot hang the whole scrape, and fail
    # loudly on an HTTP error instead of parsing an error page.
    page_res = requests.get(url + restaurant['href'], timeout=30)
    page_res.raise_for_status()
    # Explicit parser avoids GuessedAtParserWarning and machine-dependent trees.
    html = BeautifulSoup(page_res.content, 'html.parser')
    menu_items = html.find('tbody')
    if menu_items is None:
        # No explicit <tbody> in the markup: fall back to every row on the page.
        menu_items = html
    for item in menu_items('tr'):
        # Fetch the row's cells once rather than re-querying for each column.
        cells = item('td')
        if len(cells) < 5:
            # Header rows (<th> only) or malformed rows carry no food data.
            continue
        food = {}
        food['name'] = cells[0].text
        # The lab asks for extra white space to be removed from each category.
        food['category'] = cells[1].text.strip()
        food['calories'] = cells[2].text
        food['fat'] = cells[3].text
        food['carbs'] = cells[4].text
        food['restaurant'] = restaurant['name']
        foods.append(food)
        

In [7]:
# Number of food dictionaries collected across all restaurant pages.
len(foods)

5131

In [8]:
# Turn the list of food dictionaries into a table: one row per food item,
# one column per dictionary key.
foods_df = pd.DataFrame(data=foods)

### Step 5: Export to csv

**Note:** Don't export the index column from your DataFrame

In [9]:
# Write the foods table to disk, leaving the DataFrame index out of the file.
foods_df.to_csv(path_or_buf='foods.csv', index=False)

### Step 6: Use `pd.read_html`
Do the same thing as above, but use `pd.read_html()` to scrape the table from each page instead of BS4.

In [10]:
# Same scrape as the BeautifulSoup version, but pandas parses each page's
# table. `foods` now collects one DataFrame per restaurant.
foods = []
for restaurant in restaurants:
    page_url = url + restaurant['href']
    # read_html returns a list of every table on the page; keep the first.
    menu_table = pd.read_html(page_url)[0]
    # Tag each row with the restaurant it came from.
    menu_table['restaurant'] = restaurant['name']
    foods.append(menu_table)

In [11]:
# Stack the per-restaurant tables into one DataFrame. ignore_index=True
# renumbers the rows 0..n-1; without it every restaurant's rows keep their
# own 0-based labels, so index values repeat and label-based lookups
# (e.g. foods_df.loc[0]) return many rows.
foods_df = pd.concat(foods, ignore_index=True)