In [242]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Using the requests library

In [243]:
url = 'https://rldaggie.github.io/sample-html/'
# Always pass a timeout: without one, requests can wait forever on a stalled server.
res = requests.get(url, timeout=10)

### Status Codes

In [244]:
# HTTP status of the response; 200 means the request succeeded.
res.status_code

200

### Creating a `BeautifulSoup` object

In [245]:
# Parse the raw response bytes using the 'lxml' parser.
soup = BeautifulSoup(res.content, 'lxml')

In [246]:
# Display the parsed document to see which tags are available to search.
soup

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>The title</title>
<style media="screen">
      tbody tr {
        color: red;
      }
    </style>
</head>
<body>
<h1 class="foobar" id="title">This is an h1</h1>
<div>
<h1 class="foobar">This is yet another heading.</h1>

      Something inside the div
    </div>
<h3>Todo List</h3>
<ol class="todo">
<li class="foobar">Take out trash</li>
<li>Pay billz</li>
<li class="foobar">Feed dog</li>
</ol>
<h3>Completed</h3>
<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>
<p class="foobar">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. <span>Duis aute irure dolor</span> in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. <em>Excepteu

# `soup.find()`

Returns either:

1. A `Tag` object for the first match
2. `None` if nothing matches

In [247]:
# Only the first <h1> in the document is returned.
soup.find('h1')

<h1 class="foobar" id="title">This is an h1</h1>

In [248]:
# Keep the tag so its type, text and attributes can be inspected.
h1 = soup.find('h1')

In [249]:
# find() hands back a Tag object, not a plain string.
type(h1)

bs4.element.Tag

In [250]:
# .text is the content between the opening and closing tags, without markup.
h1.text

'This is an h1'

In [251]:
# Attributes as a dict. Note that 'class' comes back as a list while 'id' is a string.
h1.attrs

{'class': ['foobar'], 'id': 'title'}

In [252]:
# find() yields None when nothing matches, so check before touching .text.
if h1 is not None:
    print(h1.text)

This is an h1


# `soup.find_all()`

Returns a **_LIST_** of `Tag` objects, one per match (an empty list if nothing matches)

In [253]:
# Every <h1> in the document, not just the first one.
soup.find_all('h1')

[<h1 class="foobar" id="title">This is an h1</h1>,
 <h1 class="foobar">This is yet another heading.</h1>]

In [254]:
# Keep the matches so they can be looped over in the next cells.
h1_tags = soup.find_all('h1')

In [255]:
# Text content of each heading, in the order find_all() returned them.
[heading.text for heading in h1_tags]

['This is an h1', 'This is yet another heading.']

In [256]:
# Attribute dict of each heading, in the order find_all() returned them.
[heading.attrs for heading in h1_tags]

[{'class': ['foobar'], 'id': 'title'}, {'class': ['foobar']}]

# Creating a `pandas` DataFrame from a scrape

In [257]:
# Each dict is one row; its keys become the column names.
people = [
    dict(name='Bethany', market='BOS'),
    dict(name='Tucker', market='NYC'),
]

# A list of dicts converts directly into a DataFrame.
pd.DataFrame(people)

Unnamed: 0,name,market
0,Bethany,BOS
1,Tucker,NYC


### Todo List

In [258]:
# Select the completed-tasks list by matching on its class attribute.
ol = soup.find('ol', {'class': 'done'})

# `ol` is itself a Tag, so it supports the same find()/find_all() calls as
# `soup`; searching on it only looks inside this <ol>.



In [259]:
# Only the <li> items inside the "done" list, not every <li> on the page.
ol.find_all('li')

[<li>Mow lawn</li>,
 <li class="foobar"><span>Take out compost</span></li>,
 <li><span>Create scraping lecture</span></li>]

In [260]:
# One {'task', 'status'} record per completed item, then tabulate them.
todos = [
    {'task': item.text, 'status': 'done'}
    for item in ol.find_all('li')
]
pd.DataFrame(todos)

Unnamed: 0,task,status
0,Mow lawn,done
1,Take out compost,done
2,Create scraping lecture,done


In [261]:
# Select the open-tasks list by its class attribute, then display it.
notdone = soup.find('ol', {'class': 'todo'})
notdone

<ol class="todo">
<li class="foobar">Take out trash</li>
<li>Pay billz</li>
<li class="foobar">Feed dog</li>
</ol>

In [262]:
# The <li> items inside the "todo" list only.
notdone.find_all('li')

[<li class="foobar">Take out trash</li>,
 <li>Pay billz</li>,
 <li class="foobar">Feed dog</li>]

In [263]:
# Text of each open task.
[item.text for item in notdone.find_all('li')]

['Take out trash', 'Pay billz', 'Feed dog']

In [264]:
# Add the open tasks to the completed ones already held in `todos`.
# Any 'notdone' rows left by an earlier run of this cell are dropped first, so
# re-running the cell does not append the same tasks twice.
todos = [todo for todo in todos if todo['status'] != 'notdone']
todos.extend(
    {'task': li.text, 'status': 'notdone'}
    for li in notdone.find_all('li')
)
pd.DataFrame(todos)

Unnamed: 0,task,status
0,Mow lawn,done
1,Take out compost,done
2,Create scraping lecture,done
3,Take out trash,notdone
4,Pay billz,notdone
5,Feed dog,notdone


### GA Directory

In [271]:
# First <tbody> on the page; its <tr> rows are read in the cells below.
tbl = soup.find('tbody')

In [275]:
# name: text of the first row's link, with surrounding whitespace removed
tbl.find('tr').find('a').text.strip()

'Praveen'

In [285]:
# email: the link's href with its 'mailto:' scheme removed
tbl.find('tr').find('a').attrs['href'].replace('mailto:', '')

'praveen@ga.co'

In [288]:
# role: text of the first <td> in the first row
tbl.find('tr').find('td').text

'Student'

In [292]:
# One dict per directory row; the list is turned into a DataFrame at the end.
person_empty = []
for row in tbl.find_all('tr'):
    link = row.find('a')  # looked up once; supplies both the name and the email
    person_dict = {}
    person_dict['name'] = link.text.strip()
    # Remove the scheme by name rather than slicing off a magic 7 characters,
    # which would silently corrupt any href that lacks the 'mailto:' prefix.
    person_dict['email'] = link.attrs['href'].replace('mailto:', '')
    # Strip whitespace so the role is cleaned the same way as the name.
    person_dict['role'] = row.find('td').text.strip()
    person_empty.append(person_dict)

pd.DataFrame(person_empty)

Unnamed: 0,name,email,role
0,Praveen,praveen@ga.co,Student
1,Fred,fred@ga.co,Student
2,Homer,homer@ga.co,Student
3,Kyle,kyle@ga.co,Student
4,Sam,sam@ga.co,Student
5,Javier,javier@ga.co,Student
6,Nengkuan,nengkuan@ga.co,Student
7,Kieth,kieth@ga.co,Student
8,Bola,bola@ga.co,Student
9,Steve,steve@ga.co,Student


In [None]:
# Build one record per directory row.
# Note: this rebinds `people`, replacing the two-row example list defined earlier.
people = []
# `table` is not assigned anywhere before this cell, so on a fresh kernel the
# original `table.find('tbody')` raised NameError. Iterate `tbl`, the <tbody>
# already located above, instead.
for row in tbl.find_all('tr'):
    link = row.find('a')  # the row's link holds both the name and the email
    person = {}
    person['name'] = link.text.strip()
    person['email'] = link.attrs['href'].replace('mailto:', '')
    person['role'] = row.find('td').text.strip()

    people.append(person)
pd.DataFrame(people)

Unnamed: 0,name,email,role
0,Praveen,praveen@ga.co,Student
1,Fred,fred@ga.co,Student
2,Homer,homer@ga.co,Student
3,Kyle,kyle@ga.co,Student
4,Sam,sam@ga.co,Student
5,Javier,javier@ga.co,Student
6,Nengkuan,nengkuan@ga.co,Student
7,Kieth,kieth@ga.co,Student
8,Bola,bola@ga.co,Student
9,Steve,steve@ga.co,Student


### Basketball Reference

In [None]:
url = 'https://www.basketball-reference.com/'
# Without a timeout, requests can block forever on a stalled connection.
res = requests.get(url, timeout=10)
# A bare `res.status_code` in the middle of a cell is never displayed, so it
# checked nothing. Fail loudly on a 4xx/5xx instead of parsing an error page.
res.raise_for_status()

# Note: this rebinds `soup`, replacing the sample-page document parsed earlier.
soup = BeautifulSoup(res.content, 'lxml')

In [None]:
# Build one record per team from the two conference standings tables.
teams = []
for conf in ['E', 'W']:
    # Each conference table is looked up by id: 'confs_standings_' + conf.
    table = soup.find('table', {'id': 'confs_standings_'+conf})
    for row in table.find('tbody').find_all('tr'):
        link = row.find('a')  # looked up once; supplies both slug and full name
        team = {}
        team['slug'] = link.text
        team['name'] = link.attrs['title']
        # Cells are selected by their data-stat attribute. The earlier positional
        # lookup (find_all('td')[2]) was overwritten immediately and would break
        # if the column order changed, so it has been removed.
        team['wins'] = row.find('td', {'data-stat': 'wins'}).text
        team['losses'] = row.find('td', {'data-stat': 'losses'}).text
        # Drops the first and last character of the span text -- presumably the
        # parentheses around the seed number; confirm against the page markup.
        team['rank'] = row.find('span').text.strip()[1:-1]
        team['conference'] = conf

        teams.append(team)
df = pd.DataFrame(teams)
df

Unnamed: 0,slug,name,wins,losses,rank,conference
0,MIA,Miami Heat,47,24,1,E
1,PHI,Philadelphia 76ers,43,26,2,E
2,MIL,Milwaukee Bucks,44,27,3,E
3,BOS,Boston Celtics,43,28,4,E
4,CHI,Chicago Bulls,41,29,5,E
5,CLE,Cleveland Cavaliers,41,30,6,E
6,TOR,Toronto Raptors,39,31,7,E
7,BRK,Brooklyn Nets,37,34,8,E
8,CHO,Charlotte Hornets,36,35,9,E
9,ATL,Atlanta Hawks,35,35,10,E


In [None]:
# Every value was read from a tag's .text, so even the numeric-looking columns
# (wins, losses, rank) are stored as strings and show up as object dtype.
df.dtypes

slug          object
name          object
wins          object
losses        object
rank          object
conference    object
dtype: object