# Web scraper with Beautiful soup

#### First we need to import some Python libraries like 'requests', 'BeautifulSoup', etc.

In [37]:
# my imports:

import requests
from bs4 import BeautifulSoup
import pandas as pd

#### We fetch our base URL and add as a variable

In [38]:
# URL of the page that is downloaded and parsed in the cells below.
base_url = 'https://www.scrapethissite.com/pages/forms/'

#### We then fetch the HTML code from the website using an HTTP GET request

In [39]:
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error response
# (4xx/5xx) into an exception here instead of letting an error page be parsed
# as if it were the real content.
page = requests.get(base_url, timeout=10)
page.raise_for_status()

#### We then create our BS object (in our case in a variable called 'soup')

In [40]:
# Name the parser explicitly. The generic 'html' feature lets Beautiful Soup
# choose whichever HTML parser happens to be installed (lxml, html5lib or the
# built-in one), so the resulting tree could differ between machines.
# 'html.parser' ships with Python and needs no extra install.
soup = BeautifulSoup(page.text, 'html.parser')

#### We can select a specific element by tag name. To fetch only the first matching element, we use the find() method; to get all matching elements, we use the find_all() method

In [41]:
# find() returns only the first <div> in the document, together with everything nested inside it.
soup.find('div')

<div class="container">
<div class="col-md-12">
<ul class="nav nav-tabs">
<li id="nav-homepage">
<a class="nav-link hidden-sm hidden-xs" href="/">
<img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                Scrape This Site
                            </a>
</li>
<li id="nav-sandbox">
<a class="nav-link" href="/pages/">
<i class="glyphicon glyphicon-console hidden-sm hidden-xs"></i>
                                Sandbox
                            </a>
</li>
<li id="nav-lessons">
<a class="nav-link" href="/lessons/">
<i class="glyphicon glyphicon-education hidden-sm hidden-xs"></i>
                                Lessons
                            </a>
</li>
<li id="nav-faq">
<a class="nav-link" href="/faq/">
<i class="glyphicon glyphicon-flag hidden-sm hidden-xs"></i>
                                FAQ
                            </a>
</li>
<li class="pull-right" id="nav-login">
<a class="nav-link" href="/login/">
                                Login

#### We can be more specific by adding class or id

In [42]:
# `class_` has a trailing underscore because `class` is a reserved word in Python.
# find_all() returns a list-like collection with every matching <div>.
soup.find_all('div', class_ = 'col-md-12')

[<div class="col-md-12">
 <ul class="nav nav-tabs">
 <li id="nav-homepage">
 <a class="nav-link hidden-sm hidden-xs" href="/">
 <img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                 Scrape This Site
                             </a>
 </li>
 <li id="nav-sandbox">
 <a class="nav-link" href="/pages/">
 <i class="glyphicon glyphicon-console hidden-sm hidden-xs"></i>
                                 Sandbox
                             </a>
 </li>
 <li id="nav-lessons">
 <a class="nav-link" href="/lessons/">
 <i class="glyphicon glyphicon-education hidden-sm hidden-xs"></i>
                                 Lessons
                             </a>
 </li>
 <li id="nav-faq">
 <a class="nav-link" href="/faq/">
 <i class="glyphicon glyphicon-flag hidden-sm hidden-xs"></i>
                                 FAQ
                             </a>
 </li>
 <li class="pull-right" id="nav-login">
 <a class="nav-link" href="/login/">
                                 

In [43]:
# All <p> elements carrying the "lead" class.
soup.find_all('p', class_ = 'lead')

[<p class="lead">
                             Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.
                             Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.
                         </p>]

#### To get the content of the element as text we use the .text attribute

In [44]:
# .text is an attribute (no parentheses); it holds the element's text with its original whitespace.
soup.find('p', class_ = 'lead').text

'\n                            Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.\n                            Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.\n                        '

#### We can remove leading and trailing whitespace (spaces, tabs, newlines) with the strip() method; whitespace inside the text is kept

In [45]:
# strip() trims only the leading and trailing whitespace; the newline and indentation inside the text remain.
soup.find('p', class_ = 'lead').text.strip()

'Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.\n                            Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.'

In [46]:
# Text of the first <th> (table header cell) found in the document.
soup.find('th').text.strip()

'Team Name'

#### We can target a nested element using nested find() or .find_all() methods and we can select the one we need by its index

In [47]:
# All <tr> elements carrying the "team" class.
teams = soup.find_all('tr', class_ = 'team')

In [48]:
# Pick one row by position: index 3 is the fourth matching <tr>.
my_team = teams[3]

# Nested lookup: search only inside that row for the <td> with class "name".
my_team_name = my_team.find('td', class_ = 'name').text.strip()
my_team_name

'Chicago Blackhawks'

In [49]:
# First <table> element with the "table" class; later cells search inside it.
table = soup.find('table', class_ = 'table')

In [50]:
# Every header cell (<th>) inside the table.
headers = table.find_all('th')

In [51]:
# Column titles: the whitespace-trimmed text of each header cell.
titles = [th.text.strip() for th in headers]

In [52]:
# Empty DataFrame whose columns are the scraped header titles.
df = pd.DataFrame(columns=titles)

In [53]:
# Skip the first <tr> — presumably the header row, which holds <th> cells rather than <td> data (verify against the page).
rows = table.find_all('tr')[1:]

In [59]:
# Collect the trimmed text of every <td> for each row, then build the
# DataFrame in a single call. Appending with df.loc[len(df)] grows the frame
# one row at a time (slow) and duplicates every row if this cell is executed
# again; rebuilding `df` from the full list makes the cell safe to re-run.
records = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in rows
]
df = pd.DataFrame(records, columns=titles)

In [61]:
# Bare last expression so the notebook renders the DataFrame as a table.
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
0,Boston Bruins,1990,44,24,,0.55,299,264,35
1,Buffalo Sabres,1990,31,30,,0.388,292,278,14
2,Calgary Flames,1990,46,26,,0.575,344,263,81
3,Chicago Blackhawks,1990,49,23,,0.613,284,211,73
4,Detroit Red Wings,1990,34,38,,0.425,273,298,-25
5,Edmonton Oilers,1990,37,37,,0.463,272,272,0
6,Hartford Whalers,1990,31,38,,0.388,238,276,-38
7,Los Angeles Kings,1990,46,24,,0.575,340,254,86
8,Minnesota North Stars,1990,27,39,,0.338,256,266,-10
9,Montreal Canadiens,1990,39,30,,0.487,273,249,24


In [63]:
# Save the scraped table. `index` is documented as a bool: pass False
# explicitly (rather than relying on None being falsy) so the row index
# is not written as an extra column.
df.to_csv('teams_stats.csv', index=False)