In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Using the requests library

In [2]:
url = 'https://rldaggie.github.io/sample-html/'
# Pass a timeout: without one, requests.get() can block forever on an
# unresponsive server and hang the kernel.
res = requests.get(url, timeout=10)

### Status Codes

In [3]:
# HTTP status code the server returned for the request above.
res.status_code

200

### Creating a `BeautifulSoup` object

In [4]:
# Parse the raw response body, using 'lxml' as the parser.
soup = BeautifulSoup(res.content, 'lxml')

# `soup.find()`

Returns either:

1. A `Tag` object for the first match
2. `None` if nothing matches

In [5]:
# First <h1> element in the parsed document.
h1 = soup.find('h1')

In [6]:
# Check what kind of object find() handed back.
type(h1)

bs4.element.Tag

In [7]:
# Text content of the tag.
h1.text

'This is an h1'

In [9]:
# The tag's HTML attributes.
h1.attrs

{'class': ['foobar'], 'id': 'title'}

In [10]:
# find() gives back None when nothing matches, so check before reading .text.
if h1 is not None:
    print(h1.text)

This is an h1


# `soup.find_all()`

Returns a **_LIST_** of `Tag` objects that match your query (an empty list if nothing matches)

In [11]:
# Every <h1> element in the document, not just the first one.
h1_tags = soup.find_all('h1')

In [12]:
# Text of each matched heading.
[tag.text for tag in h1_tags]

['This is an h1', 'This is yet another heading.']

In [13]:
# Attributes of each matched heading.
[tag.attrs for tag in h1_tags]

[{'class': ['foobar'], 'id': 'title'}, {'class': ['foobar']}]

# Creating a `pandas` DataFrame from a scrape

In [32]:
# The same two records could also be written row-wise as a list of dicts:
#   [{'name': 'Bethany', 'market': 'BOS'},
#    {'name': 'Tucker', 'market': 'NYC'}]
# Here they are laid out column-wise instead: one list of values per column.
people = dict(
    name=['Bethany', 'Tucker'],
    market=['BOS', 'NYC'],
)

pd.DataFrame(people)

Unnamed: 0,name,market
0,Bethany,BOS
1,Tucker,NYC


### Todo List

In [33]:
# Locate the ordered list carrying the "done" CSS class and display it.
ol = soup.find('ol', class_='done')
ol

<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>

In [34]:
# One record per <li> in the list; each record holds that item's text.
todos = [{'task': li.text} for li in ol.find_all('li')]
pd.DataFrame(todos)

Unnamed: 0,task
0,Mow lawn
1,Take out compost
2,Create scraping lecture


### GA Directory

In [35]:
# The directory table is identified by its id attribute.
table = soup.find('table', id='directory')

In [36]:
# Build one record per body row of the directory table.
people = []
for row in table.find('tbody').find_all('tr'):
    anchor = row.find('a')  # the row's first link supplies both name and email
    people.append({
        'name': anchor.text.strip(),
        # The href is a mailto: link; drop the scheme to leave the address.
        'email': anchor.attrs['href'].replace('mailto:', ''),
        # Role is read from the first <td> of the row.
        'role': row.find('td').text.strip(),
    })
pd.DataFrame(people)

Unnamed: 0,email,name,role
0,praveen@ga.co,Praveen,Student
1,fred@ga.co,Fred,Student
2,homer@ga.co,Homer,Student
3,kyle@ga.co,Kyle,Student
4,sam@ga.co,Sam,Student
5,javier@ga.co,Javier,Student
6,nengkuan@ga.co,Nengkuan,Student
7,kieth@ga.co,Kieth,Student
8,bola@ga.co,Bola,Student
9,steve@ga.co,Steve,Student


### Basketball Reference

In [38]:
# NOTE: this cell rebinds `url`, `res` and `soup`, which earlier cells also
# assign; cells that rely on the earlier values must be run before this one.
url = 'https://www.basketball-reference.com/'
# Pass a timeout so an unresponsive server cannot hang the kernel.
res = requests.get(url, timeout=10)
print(res.status_code)
# Stop here on a 4xx/5xx response instead of parsing an error page, which
# would otherwise surface later as an AttributeError on a missing table.
res.raise_for_status()

soup = BeautifulSoup(res.content, 'lxml')

200


In [21]:
# Scrape the two conference standings tables into one row-per-team DataFrame.
# Every value is kept as the scraped text, so all columns come out as strings.
teams = []
for conf in ['E', 'W']:
    # Each conference has its own table, identified by an id ending in E or W.
    table = soup.find('table', {'id': 'confs_standings_'+conf})
    for row in table.find('tbody').find_all('tr'):
        link = row.find('a')  # first link in the row; looked up once, used twice
        team = {
            'slug': link.text,
            'name': link.attrs['title'],
            # Pick cells by their data-stat attribute rather than by position,
            # so the lookup does not depend on the column order of the table.
            'wins': row.find('td', {'data-stat': 'wins'}).text,
            'losses': row.find('td', {'data-stat': 'losses'}).text,
            # Drop the first and last character of the span's stripped text.
            'rank': row.find('span').text.strip()[1:-1],
            'conference': conf,
        }

        teams.append(team)
df = pd.DataFrame(teams)
df

Unnamed: 0,conference,losses,name,rank,slug,wins
0,E,19,Milwaukee Bucks,1,MIL,53
1,E,21,Toronto Raptors,2,TOR,51
2,E,25,Philadelphia 76ers,3,PHI,47
3,E,28,Indiana Pacers,4,IND,44
4,E,29,Boston Celtics,5,BOS,43
5,E,34,Detroit Pistons,6,DET,36
6,E,36,Brooklyn Nets,7,BRK,37
7,E,36,Miami Heat,8,MIA,35
8,E,38,Orlando Magic,9,ORL,34
9,E,39,Charlotte Hornets,10,CHO,31


In [22]:
# Column dtypes of the scraped standings frame.
df.dtypes

conference    object
losses        object
name          object
rank          object
slug          object
wins          object
dtype: object