# Web Scraping

In [None]:
import numpy as np
import pandas as pd

## Scraping a very basic webpage

In [None]:
# Whenever you want to scrape a website without an API
import requests
from bs4 import BeautifulSoup

BeautifulSoup documentation [here](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).


Let's look at [this](https://web.ics.purdue.edu/~gchopra/class/public/pages/webdesign/05_simple.html) very simple webpage.

In [None]:
# Get the content of a website
site = 

In [None]:
# What did we just get?


In [None]:
# Check the status


In [None]:
# With APIs, the output was typically JSON, but regular webpages return HTML.
# Response.json() raises a ValueError subclass when the body is not valid JSON;
# catch it so the error is still shown without halting a "Run All".
try:
    print(site.json())
except ValueError as err:  # error!
    print(f"{type(err).__name__}: {err}")

In [None]:
# inspect the contents with text

# messy!

In [None]:
# Let's beautify this and make it easier to parse
soup = 

In [None]:
# What does our soup look like?
soup

In [None]:
# Make it even prettier


### Parse the html

In [None]:
# Find a level 1 header


In [None]:
# Find a level 2 header


In [None]:
# Find all the level 2 headers


In [None]:
# Find all the level 3 headers

# There were none.

In [None]:
# Find all the paragraphs


In [None]:
# Find all the hyperlinks


In [None]:
# Get the links
# NOTE(review): find returns only the first <a> Tag; the link target itself is
# presumably meant to be read from that tag's 'href' attribute — TODO complete.
soup.find('a')

In [None]:
# Get the text for the hyperlink
# NOTE(review): as written this displays the whole <a> Tag; its text content is
# available via .text / .get_text() — TODO complete.
soup.find('a')

In [None]:
# Get all the list items
# TODO: `...` (Ellipsis) is a fill-in placeholder — pass the tag name instead;
# list items are <li> elements.
soup.find_all(...)

In [None]:
# Specifically get the ordered lists
# TODO: `...` (Ellipsis) is a fill-in placeholder — pass the tag name instead;
# ordered lists are <ol> elements.
soup.find_all(...)

In [None]:
# Specifically get the unordered lists
# TODO: `...` (Ellipsis) is a fill-in placeholder — pass the tag name instead;
# unordered lists are <ul> elements.
soup.find_all(...)

## Another example

Let's check out [this](https://www.humboldt.edu/irar/fall-semester-fast-facts) Cal Poly Humboldt website.

In [None]:
# Get the data
# requests applies no timeout by default, so without one this cell could hang
# indefinitely if the server never responds.
cph_stats = requests.get(
    'https://www.humboldt.edu/irar/fall-semester-fast-facts',
    timeout=10,
)
# Display the HTTP status code of the response (200 means success)
cph_stats.status_code

In [None]:
# Parse the response's HTML text into a searchable tree,
# using the parser that ships with Python
cph_soup = BeautifulSoup(markup=cph_stats.text, features='html.parser')

In [None]:
# Pretty! prettify() returns the parse tree as one indented string;
# print() makes the line breaks render instead of showing up as "\n".
print(cph_soup.prettify())

In [None]:
# Check for specific tags: here, every <a> (anchor / hyperlink) element
cph_soup.find_all('a')

In [None]:
# Refine the search by CSS class: class_ (trailing underscore because `class`
# is a reserved word in Python) keeps only <a> tags whose class attribute
# includes "top-level". This is a keyword filter, not a CSS selector.
cph_soup.find_all('a', class_ = "top-level")

In [None]:
# A shorthand way of searching that: select() takes a CSS selector,
# written as tag name, a dot, then the class name
cph_soup.select('a.top-level')

In [None]:
# How many tables are there?
# TODO: '...' is a fill-in placeholder — replace it with the tag name to count
# (as written, no tag is named '...', so this presumably evaluates to 0).
len(cph_soup.find_all('...'))

In [None]:
# Some tables are labeled with h3 headers
cph_soup.find_all('h3')

In [None]:
# Let's just focus on one of the tables
# NOTE(review): find_all returns a list-like ResultSet of every <table>, not a
# single table. The cells below call .find_all on this variable, which only
# works on one Tag, so index into the result here to pick the table you want
# (which position holds the ethnicity table is not visible here — TODO confirm).
student_ethnicity_table = cph_soup.find_all('table')

In [None]:
# Look at the rows
# TODO: the empty string is a fill-in placeholder for the tag name;
# table rows are <tr> elements.
student_ethnicity_table.find_all('')

In [None]:
# Look at a single row
# NOTE(review): find_all('tr') returns every row; index into the result
# (e.g. [0]) to isolate one — TODO complete.
student_ethnicity_table.find_all('tr')

In [None]:
# Look at all the data points in that row
# NOTE(review): as written this displays the first <tr> Tag itself; the
# individual cells presumably come from a further .find_all('td') — TODO complete.
student_ethnicity_table.find_all('tr')[0]

In [None]:
# Look at one specific data point
# NOTE(review): find_all('td') returns every cell in the row; index into the
# result to pick a single one — TODO complete.
student_ethnicity_table.find_all('tr')[0].find_all('td')

In [None]:
# Get the text of that data point
# NOTE(review): as written this displays the second <td> Tag (index 1) of the
# first row; its text content is available via .text / .get_text() — TODO complete.
student_ethnicity_table.find_all('tr')[0].find_all('td')[1]

### Create a Pandas DataFrame

We'll start with the manual approach.

In [None]:
# Create a nested list with the data
table_vals = []

for i in ...:
    row_i = ...
    for j in ...:
        ...
    table_vals...

In [None]:
# Check out the result
table_vals

In [None]:
# Make it a dataframe
df = 
df

In [None]:
# Clean it up (reset column labels)
# The first row of the frame is used as the header: promote it to the column
# labels, then remove it from the data.
# The guard keeps the cell safe to re-run: once row label 0 has been dropped,
# running the two steps again would promote a real data row to the header and
# then fail because label 0 no longer exists.
if 0 in df.index:
    df.columns = df.iloc[0]
    df = df.drop(index=0)
df

In [None]:
df.columns[0]

In [None]:
# Clean it up (reset row labels)
# '\xa0' is a non-breaking space — presumably the label of the first column
# (see the df.columns[0] check) — so that column becomes the row index.
# Skip the move when it is already the index, so that re-running the cell
# does not raise a KeyError for the now-missing column.
if df.index.name != '\xa0':
    df = df.set_index('\xa0')

In [None]:
df

In [None]:
# Would require further cleanup
df.dtypes

There is also a Pandas approach.

In [None]:
# We've taken a chunk of html that we want to parse
student_ethnicity_table

In [None]:
# What is the type
type(student_ethnicity_table)

In [None]:
# Read the html into a dataframe


In [None]:
# We can use it on the whole page if we want to
# TODO: `...` (Ellipsis) is a fill-in placeholder — replace it with the HTML to parse.
# NOTE(review): recent pandas versions deprecate passing a literal HTML string
# to read_html; wrapping the text in io.StringIO avoids the warning — confirm
# against the installed pandas version.
pd.read_html(...)

## Activity

1. From the same webpage we scraped last (Cal Poly Humboldt IRAR), put the data in the Fall 2024 Geographic Origin of Current Students table into a Pandas DataFrame. Clean the dataframe.