# Class 5 - Web Scraping

## Using Pandas

In [1]:
import pandas as pd

In [2]:
# read_html returns a list of DataFrames, one per table parsed from the page.
# flavor='bs4' selects the BeautifulSoup-based HTML parser instead of pandas' default choice.
dfs = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population', flavor='bs4')

In [3]:
# Number of tables pandas parsed from the page
len(dfs)

2

In [4]:
# Keep the first table in the list; it is previewed in the next cell
df_countries = dfs[0]

In [5]:
# Preview the first rows to check that the table parsed as expected
df_countries.head()

Unnamed: 0,Rank,Country / Dependency,Region,Population,Percentage of the world,Date,Source (official or from the United Nations),Notes
0,–,World,,7932767000,100%,26 Feb 2022,UN projection[1],
1,1,China,Asia,1412600000,17.8%,31 Dec 2021,National annual estimate[2],The population figure refers to mainland China...
2,2,India,Asia,1388517013,17.5%,26 Feb 2022,National population clock[3],The figure includes the population of Jammu an...
3,3,United States,Americas,333293321,4.20%,26 Feb 2022,National population clock[4],The figure includes the 50 states and the Dist...
4,4,Indonesia[b],Asia,271350000,3.42%,31 Dec 2020,National annual estimate[5],


In [6]:
# Save the table as countries.csv in the current working directory.
# NOTE: with the default index=True the row index is also written, as an extra unnamed first column.
df_countries.to_csv('countries.csv')

In [7]:
# Challenge: Scrape a Wikipedia Table
# Parse every table on the page; note that this rebinds `dfs` from the cells above.
villains_url = 'https://en.wikipedia.org/wiki/List_of_My_Little_Pony_villains'
dfs = pd.read_html(villains_url)
# The table at position 4 is the one previewed below.
df_ponies_villians = dfs[4]
df_ponies_villians.head()

Unnamed: 0,Name,Species,Gender,Body color,Hair color,Year of toy/animation debut,"Special, Episode and Film debut",Voiced by,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,Tirac,Centaur,Male,Dark Gray,,1984,Rescue from Midnight Castle,Victor Caroli,,,,,
1,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...
2,Scorpan,Demon Gargoyle (Corrupted form)Human (Purified...,Male,Ash Brown,,1984,Rescue from Midnight Castle,Ron Taylor,,,,,
3,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...
4,Katrina,Humanoid Feline Witch,Female,Dark Brown,Orange,1985,Escape from Katrina,Tammy Grimes,,,,,


## Using Requests and BeautifulSoup 

In [8]:
import requests
from bs4 import BeautifulSoup

In [9]:
# Download the Spring 2022 MBA course listing page.
# Pass a timeout: without one, requests can wait indefinitely on an unresponsive server.
response = requests.get('https://www8.gsb.columbia.edu/courses/mba/2022/Spring', timeout=30)
# Stop here on an HTTP error status (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()
# Parse the returned HTML with the lxml parser
soup = BeautifulSoup(response.text, 'lxml')

In [10]:
soup.find('a') # Finds the first <a> (link) tag in the page

<a class="cbs-local-nav-search mobile" href="#mMenuSearch" tabindex="-1"></a>

In [11]:
soup.find_all('a') # Finds every <a> (link) tag and returns them as a list

[<a class="cbs-local-nav-search mobile" href="#mMenuSearch" tabindex="-1"></a>,
 <a class="active-trail active-trail" href="/courses/mba">MBA Courses</a>,
 <a href="/courses/mba/archived-schedules">Archived Schedules</a>,
 <a href="/courses/mba/2021/Summer" title="">Summer 2021</a>,
 <a href="/courses/mba/2021/Fall" title="">Fall 2021</a>,
 <a class="active-trail active" href="/courses/mba/2022/Spring" title="">Spring 2022</a>,
 <a href="/courses/emba">EMBA Courses</a>,
 <a href="/courses/emba/2021/Summer" title="EMBA Summer 2021">Summer 2021</a>,
 <a href="/courses/emba/2021/Fall" title="EMBA Fall 2021">Fall 2021</a>,
 <a href="/courses/emba/2022/Spring" title="">Spring 2022</a>,
 <a href="/courses/phd" title="">PhD and MS Courses</a>,
 <a href="/courses/content/phd-archived-schedules">PhD Archived Schedules</a>,
 <a href="/courses/phd/2021/Spring" title="">Spring 2021</a>,
 <a href="/courses/phd/2021/Fall" title="">Fall 2021</a>,
 <a href="/courses/phd/2022/Spring" title="">Spring 20

In [12]:
# A string passed as the second argument to find_all() filters on CSS class.
# [0] takes the first match and raises IndexError if the page has no such <div>.
first_course = soup.find_all('div', 'course-name')[0]

In [13]:
# All of the text inside the tag, with the markup removed
first_course.get_text()

'\nB8784-001 - Business and Society: Reconciling Shareholder and Stakeholder Interests \n1.5 credit hours '

In [14]:
# Dictionary of the tag's HTML attributes
first_course.attrs

{'class': ['views-field', 'views-field-title', 'course-name']}

In [15]:
# Look up a single attribute; 'class' comes back as a list of class names
first_course.get('class')

['views-field', 'views-field-title', 'course-name']

In [16]:
# find() on a tag searches only inside that tag; here it returns the course's link
first_course.find('a')

<a href="/courses/mba/2022/spring/b8784-001">B8784-001 - Business and Society: Reconciling Shareholder and Stakeholder Interests </a>

In [17]:
# Print Course Names Challenge
# The hrefs on the listing page are site-relative, so prefix them with the site root.
site_root = "https://www8.gsb.columbia.edu"
for course in soup.find_all('div', 'course-name'):
    course_link = course.find('a')  # look the anchor up once and reuse it
    print(course_link.get_text())
    print(site_root + course_link.get('href'))

B8784-001 - Business and Society: Reconciling Shareholder and Stakeholder Interests 
https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8784-001
B8782-001 - Innovation Salon 
https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8782-001
B8779-001 - Global Immersion: Africa's Consumer Market: The Case of Ghana 
https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8779-001
B8767-001 - Investing in Social Ventures 
https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8767-001
B8744-001 - The Psychology and Economics of Consumer Finance 
https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8744-001
B8725-001 - Global Immersion: Economic Growth in the UAE 
https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8725-001
B8716-001 - Global Family Enterprise: Stakeholdership, Sustainability, and Innovation 
https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8716-001
B8714-001 - Global Immersion: Doing Business in Brazil - Challenges & Opportunities 
https://www8.gsb.columbia.ed

In [18]:
# First sibling after first_course that is a <div> with the class "instructor"
first_course.find_next_sibling('div', 'instructor')

<div class="instructor">
<span class="desktop">R. Glenn Hubbard <span class="uni">(rgh1)</span><br/></span> </div>

In [19]:
# For every course container, print the title followed by its
# day/time, instructor and location, one value per line.
detail_classes = ('date-time', 'instructor', 'course-location')
for course in soup.find_all('div', 'mba-course'):
    title_link = course.find('div', 'course-name').find('a')
    print(title_link.get_text())
    for css_class in detail_classes:
        print(course.find('div', css_class).get_text().strip())

B8784-001 - Business and Society: Reconciling Shareholder and Stakeholder Interests 
W - B Term02:00PM to 05:15PM
R. Glenn Hubbard (rgh1)
Geffen-590
B8782-001 - Innovation Salon 
R - Full Term03:50PM to 07:05PM
Sheena Iyengar (ss957)

B8779-001 - Global Immersion: Africa's Consumer Market: The Case of Ghana 
W - Full Term12:10PM to 01:40PM
Stephan Meier (sm3087)
Geffen-570
B8767-001 - Investing in Social Ventures 
R - B Term02:00PM to 05:15PM
Vikas Raj (vr2235)Bruce Usher (bmu2001)
Geffen-420
B8744-001 - The Psychology and Economics of Consumer Finance 
Block Week 1 - TWRFS - 09:00AM to 05:00PM
Eric Johnson (ejj3)Stephen Zeldes (spz1)
Geffen-620
B8725-001 - Global Immersion: Economic Growth in the UAE 
T - Full Term08:30AM to 10:00AM
Pierre Yared (py2114)
Geffen-440
B8716-001 - Global Family Enterprise: Stakeholdership, Sustainability, and Innovation 
T - Full Term10:20AM to 11:50AM
Patricia Angus (pma36)
Kravis-880
B8714-001 - Global Immersion: Doing Business in Brazil - Challenges & 

### Saving to CSV

In [20]:
import csv

In [21]:
# Write one row per course to courses.csv.
# newline='' lets the csv module control line endings itself (avoids blank rows on Windows).
with open('courses.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Course Name", "Day & Time", "Instructor", "Location"])
    for course in soup.find_all('div', 'mba-course'):
        # get_text(' ', strip=True) strips every text fragment and joins them with a
        # single space. Plain get_text() glues neighbouring fragments together
        # ("B Term02:00PM", "(vr2235)Bruce Usher") and left a trailing space on the name.
        name = course.find('div', 'course-name').find('a').get_text(' ', strip=True)
        date = course.find('div', 'date-time').get_text(' ', strip=True)
        instructor = course.find('div', 'instructor').get_text(' ', strip=True)
        location = course.find('div', 'course-location').get_text(' ', strip=True)
        writer.writerow([name, date, instructor, location])

### Downloading Files

In [22]:
# Download the page of a single course (B8784-001).
# Pass a timeout: without one, requests can wait indefinitely on an unresponsive server.
response = requests.get('https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8784-001', timeout=30)
# Stop here on an HTTP error status (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()
# NOTE: this rebinds `soup` from the course listing to this single course page.
soup = BeautifulSoup(response.text, 'lxml')

In [23]:
# Locate the link whose text is exactly "Download Syllabus".
syllabus_link = soup.find('a', string='Download Syllabus')
if syllabus_link is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None.
    raise ValueError('No "Download Syllabus" link found on the course page')
# Download the file the link points to; `response` now holds the syllabus, not the course page.
response = requests.get(syllabus_link.get('href'), timeout=30)
# Do not continue with an error page in place of the syllabus.
response.raise_for_status()

In [24]:
# Save the downloaded bytes unchanged; 'wb' because a PDF is binary data, not text.
with open('syllabus.pdf', 'wb') as pdf_file:
    pdf_file.write(response.content)

In [25]:
# Challenge: Downloading All Syllabi
import os
import re
from urllib.parse import urljoin

# open() does not create missing folders, so make sure the output folder exists.
os.makedirs('syllabi', exist_ok=True)

response = requests.get('https://www8.gsb.columbia.edu/courses/mba/2022/Spring', timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Loop over all the courses
for course in soup.find_all('div', 'course-name'):
    # Find and save the course name and URL
    course_link = course.find('a')
    course_name = course_link.get_text().strip()
    course_url = "https://www8.gsb.columbia.edu" + course_link.get('href')
    # Visit the course page
    print("Visiting page", course_name)
    try:
        # Separate names inside the loop so the listing page's `soup`/`response` are not overwritten.
        # The timeout keeps one unresponsive page from stalling the whole run.
        course_response = requests.get(course_url, timeout=30)
        course_response.raise_for_status()
        course_soup = BeautifulSoup(course_response.text, 'lxml')
        syllabus_link = course_soup.find('a', string='Download Syllabus')
        if syllabus_link is None:
            # This course page offers no syllabus; move on to the next course.
            continue
        # Find the syllabus URL (urljoin leaves absolute URLs as they are and resolves relative ones)
        syllabus_url = urljoin(course_url, syllabus_link.get('href'))
        # Visit the syllabus URL
        syllabus_response = requests.get(syllabus_url, timeout=30)
        # Without this check an HTML error page would be saved with a .pdf extension.
        syllabus_response.raise_for_status()
    except requests.RequestException as error:
        # Report the failure and carry on with the remaining courses.
        print("Skipping", course_name, "-", error)
        continue
    # Download syllabus
    print("Downloading syllabus", course_name)
    # Course names contain characters such as ':' (and possibly '/') that are
    # path separators or not allowed in file names on common file systems.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', course_name)
    with open(f'syllabi/{safe_name}.pdf', 'wb') as file:
        file.write(syllabus_response.content)

Visiting page B8784-001 - Business and Society: Reconciling Shareholder and Stakeholder Interests
Downloading syllabus B8784-001 - Business and Society: Reconciling Shareholder and Stakeholder Interests
Visiting page B8782-001 - Innovation Salon
Downloading syllabus B8782-001 - Innovation Salon
Visiting page B8779-001 - Global Immersion: Africa's Consumer Market: The Case of Ghana
Downloading syllabus B8779-001 - Global Immersion: Africa's Consumer Market: The Case of Ghana
Visiting page B8767-001 - Investing in Social Ventures
Downloading syllabus B8767-001 - Investing in Social Ventures
Visiting page B8744-001 - The Psychology and Economics of Consumer Finance
Downloading syllabus B8744-001 - The Psychology and Economics of Consumer Finance
Visiting page B8725-001 - Global Immersion: Economic Growth in the UAE
Downloading syllabus B8725-001 - Global Immersion: Economic Growth in the UAE
Visiting page B8716-001 - Global Family Enterprise: Stakeholdership, Sustainability, and Innovatio

KeyboardInterrupt: 

In [26]:
# Trim whitespace around the base name of every PDF in syllabi/
# (strip() removes leading as well as trailing whitespace).
import os
for entry in os.listdir('syllabi'):
    stem, suffix = os.path.splitext(entry)
    if suffix != ".pdf":
        # Leave anything that is not a .pdf untouched.
        continue
    os.rename(f'syllabi/{stem}.pdf', f'syllabi/{stem.strip()}.pdf')