# Scrape across a web page

How do you scrape data from anywhere on a web page, not just from a table? This notebook is a short introduction.

### Read in the webpage

In [1]:
from io import StringIO
from urllib.request import urlopen

from bs4 import BeautifulSoup
import pandas as pd

# What web page do we want?
pageAddress = "https://www.nfl.com/stats/player-stats/category/passing/2019/REG/all/passingyards/desc"

# Open the page and parse it into a BeautifulSoup tree.
# The `with` block closes the HTTP response as soon as parsing is done,
# instead of leaving the connection open for the rest of the kernel session.
with urlopen(pageAddress) as response:
    soup = BeautifulSoup(response, "html.parser")

### Print the HTML (source HTML for the webpage)

In [2]:
# Uncomment the next line to print the whole page as indented HTML.
# It is handy for looking up the tag names and classes to search for,
# but the output is very long.
#print(soup.prettify())

### Use Pandas to read a table of data.

In [3]:
# Read the first <table> on the page into pandas.
# read_html returns a list of DataFrames (one per table it parses),
# so the stats table itself is df[0].
table = soup.find_all('table')[0]

# Wrap the HTML in StringIO: passing a literal HTML string to read_html
# is deprecated in recent pandas versions and emits a FutureWarning.
df = pd.read_html(StringIO(str(table)))
print(df[0])

               Player  Pass Yds  Yds/Att  Att  Cmp  Cmp %  TD  INT   Rate  \
0      Jameis Winston      5109      8.2  626  380  0.607  33   30   84.3   
1        Dak Prescott      4902      8.2  596  388  0.651  30   11   99.7   
2          Jared Goff      4638      7.4  626  394  0.629  22   16   86.5   
3       Philip Rivers      4615      7.8  591  390  0.660  23   20   88.5   
4           Matt Ryan      4466      7.3  616  408  0.662  26   14   92.1   
5      Russell Wilson      4110      8.0  516  341  0.661  31    5  106.3   
6           Tom Brady      4057      6.6  613  373  0.608  24    8   88.0   
7          Derek Carr      4054      7.9  513  361  0.704  21    8  100.8   
8        Carson Wentz      4039      6.7  607  388  0.639  27    7   93.1   
9     Patrick Mahomes      4031      8.3  484  319  0.659  26    5  105.3   
10      Aaron Rodgers      4002      7.0  569  353  0.620  26    4   95.4   
11    Jimmy Garoppolo      3978      8.4  476  329  0.691  27   13  102.0   

### Find all the links in the page (which have a tag of `<a ...>`)

In [4]:
# Find every anchor (<a>) tag in the webpage
links = soup.find_all('a')

# Print where each link points.
# Only the href is needed here; reading link.contents[0] would raise an
# IndexError on an anchor that has no content.
for link in links:
    fullLink = link.get('href')
    print(fullLink)

#main-content
https://www.nfl.com
https://www.nfl.com/news/
https://www.nfl.com/scores/
https://www.nfl.com/schedules/
https://www.nfl.com/videos/
https://www.nfl.com/teams/
https://www.nfl.com/players/
https://www.nfl.com/stats/player-stats/
https://www.nfl.com/standings/
https://www.nfl.com/draft/
#2ndlevel
https://www.nfl.com/photos/
https://www.nfl.com/super-bowl/
https://www.nfl.com/gamepass?icampaign=gpg-nav-gno-gamepass
https://www.nfl.com/free-agency/
https://www.nfl.com/ways-to-watch/
https://www.foxdeportes.com/nfl/
https://www.nfl.com/causes/inspire-change/
https://www.nfl.com/network/watch/nfl-network-live
https://smart.link/qd5unmrz3lfwv
https://www.ticketmaster.com/nfl?wt.mc_id=NFL_LEAGUE_TICKETS_LINK&utm_source=NFL.com&utm_medium=client&utm_campaign=NFL_LEAGUE&utm_content=TICKETS_LINK
http://www.nflshop.com/?bm-nflcom-2017-Header-Shop-Tab
https://www.nfl.com/account/sign-in
https://www.nflshop.com/?_s=bm-nflcom-2017-Header-Shop-Tab-Peak
https://www.nfl.com/network/watch/

### Find specific types (classes) of links

In [5]:
# Find only the links to players.
# The class names come from viewing the HTML source of the page.
# A CSS selector matches anchors that carry both classes in any order,
# whereas class_="a b" only matches that exact attribute string.
links = soup.select('a.d3-o-player-fullname.nfl-o-cta--link')

# Print each player's (site-relative) link.
# Only the href is needed here; reading link.contents[0] would raise an
# IndexError on an anchor that has no content.
for link in links:
    fullLink = link.get('href')
    print(fullLink)

/players/jameis-winston/
/players/dak-prescott/
/players/jared-goff/
/players/philip-rivers/
/players/matt-ryan/
/players/russell-wilson/
/players/tom-brady/
/players/derek-carr/
/players/carson-wentz/
/players/patrick-mahomes/
/players/aaron-rodgers/
/players/jimmy-garoppolo/
/players/deshaun-watson/
/players/baker-mayfield/
/players/kyler-murray/
/players/kirk-cousins/
/players/ryan-fitzpatrick/
/players/andy-dalton/
/players/kyle-allen/
/players/gardner-minshew/
/players/mitchell-trubisky/
/players/lamar-jackson/
/players/josh-allen-4/
/players/daniel-jones/
/players/sam-darnold/
