# 6. Web Scraping

## Using Pandas' read_html()

In [1]:
import pandas as pd

In [2]:
pip install html5lib

Note: you may need to restart the kernel to use updated packages.


In [3]:
dfs = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population', flavor='bs4')

In [4]:
len(dfs)

2

In [5]:
df_countries = dfs[0]

In [6]:
df_countries.head()

Unnamed: 0,Rank,Country / Dependency,Continent,Population,Percentage of the world,Date,Source (official or from the United Nations),Notes
0,–,World,All,7982394000,100%,4 Oct 2022,UN projection[3],
1,1,China,Asia,1412600000,17.7%,31 Dec 2021,Official estimate[4],The population figure refers to mainland China...
2,2,India,Asia,1375586000,17.2%,1 Mar 2022,Official projection[5],The figure includes the population of Indian-a...
3,3,United States,North America,331893745,4.16%,1 Jul 2021,Official estimate[6],The figure includes the 50 states and the Dist...
4,4,Indonesia,Asia[b],275773800,3.45%,1 Jul 2022,Official estimate[7],


In [7]:
df_countries.to_csv('countries.csv')

In [8]:
# Challenge: Scrape a Wikipedia Table
dfs = pd.read_html('https://en.wikipedia.org/wiki/List_of_My_Little_Pony_villains')
df_ponies_villians = dfs[4]
df_ponies_villians.head()

Unnamed: 0,Name,Species,Gender,Body color,Hair color,Year of toy/animation debut,"Special, Episode and Film debut",Voiced by,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,Tirac,Centaur,Male,Dark Gray,,1984,Rescue from Midnight Castle,Victor Caroli,,,,,
1,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...,Tirac is a centaur with elements of a demon (m...
2,Scorpan,Demon Gargoyle (Corrupted form)Human (Purified...,Male,Ash Brown,,1984,Rescue from Midnight Castle,Ron Taylor,,,,,
3,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...,Scorpan is a demon gargoyle (originally a huma...
4,Katrina,Humanoid Feline Witch,Female,Dark Brown,Orange,1985,Escape from Katrina,Tammy Grimes,,,,,


## Using Requests and BeautifulSoup

In [9]:
import requests
from bs4 import BeautifulSoup

In [10]:
response = requests.get('https://www.google.com')
soup = BeautifulSoup(response.text)

In [11]:
print(soup.prettify())

<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en">
 <head>
  <meta content="Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking for." name="description"/>
  <meta content="noodp" name="robots"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/>
  <title>
   Google
  </title>
  <script nonce="okfcz91_TeZTba4ZEzj4vw">
   (function(){window.google={kEI:'5-g9Y6qtFMrm2roP6MSvqAM',kEXPI:'0,1302530,56879,1709,4350,206,4804,2316,383,246,5,5367,1123753,1197748,380743,16114,28683,1121,21311,1361,284,12029,17586,4998,13124,104,3847,10622,22741,1832,3249,1593,1279,2742,149,1103,840,1983,4314,108,3406,606,2023,1777,520,14670,3227,2845,7,4773,826,28171,1851,15324,432,3,1590,1,5445,148,11323,2652,4,1528,2304,7039,22023,5708,7355,933,1,12726,2980,1457,

In [12]:
soup.find('a') # Finds the first link

<a class="gb1" href="https://www.google.com/imghp?hl=en&amp;tab=wi">Images</a>

In [13]:
soup.find_all('a') # Finds all links

[<a class="gb1" href="https://www.google.com/imghp?hl=en&amp;tab=wi">Images</a>,
 <a class="gb1" href="https://maps.google.com/maps?hl=en&amp;tab=wl">Maps</a>,
 <a class="gb1" href="https://play.google.com/?hl=en&amp;tab=w8">Play</a>,
 <a class="gb1" href="https://www.youtube.com/?tab=w1">YouTube</a>,
 <a class="gb1" href="https://news.google.com/?tab=wn">News</a>,
 <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a>,
 <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a>,
 <a class="gb1" href="https://www.google.com/intl/en/about/products?tab=wh" style="text-decoration:none"><u>More</u> »</a>,
 <a class="gb4" href="http://www.google.com/history/optout?hl=en">Web History</a>,
 <a class="gb4" href="/preferences?hl=en">Settings</a>,
 <a class="gb4" href="https://accounts.google.com/ServiceLogin?hl=en&amp;passive=true&amp;continue=https://www.google.com/&amp;ec=GAZAAQ" id="gb_70" target="_top">Sign in</a>,
 <a href="/advanced_search?hl=en&amp;authuser=0">Advanc

In [14]:
soup.find_all('a', class_='gb1') # Finds all links with the class of gb1

[<a class="gb1" href="https://www.google.com/imghp?hl=en&amp;tab=wi">Images</a>,
 <a class="gb1" href="https://maps.google.com/maps?hl=en&amp;tab=wl">Maps</a>,
 <a class="gb1" href="https://play.google.com/?hl=en&amp;tab=w8">Play</a>,
 <a class="gb1" href="https://www.youtube.com/?tab=w1">YouTube</a>,
 <a class="gb1" href="https://news.google.com/?tab=wn">News</a>,
 <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a>,
 <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a>,
 <a class="gb1" href="https://www.google.com/intl/en/about/products?tab=wh" style="text-decoration:none"><u>More</u> »</a>]

In [15]:
soup.find_all(class_='gb1') # Finds all html tags with the class of gb1

[<b class="gb1">Search</b>,
 <a class="gb1" href="https://www.google.com/imghp?hl=en&amp;tab=wi">Images</a>,
 <a class="gb1" href="https://maps.google.com/maps?hl=en&amp;tab=wl">Maps</a>,
 <a class="gb1" href="https://play.google.com/?hl=en&amp;tab=w8">Play</a>,
 <a class="gb1" href="https://www.youtube.com/?tab=w1">YouTube</a>,
 <a class="gb1" href="https://news.google.com/?tab=wn">News</a>,
 <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a>,
 <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a>,
 <a class="gb1" href="https://www.google.com/intl/en/about/products?tab=wh" style="text-decoration:none"><u>More</u> »</a>]

In [16]:
first_link = soup.find('a')
first_link

<a class="gb1" href="https://www.google.com/imghp?hl=en&amp;tab=wi">Images</a>

In [17]:
first_link.get_text()

'Images'

In [18]:
first_link.get('href')

'https://www.google.com/imghp?hl=en&tab=wi'

In [19]:
first_link.get('class')

['gb1']

In [20]:
first_link.attrs

{'class': ['gb1'], 'href': 'https://www.google.com/imghp?hl=en&tab=wi'}

In [21]:
# Google Scraper Challenge
links = soup.find_all('a')
for link in links:
    print(link.get_text(), link.get('href'))

Images https://www.google.com/imghp?hl=en&tab=wi
Maps https://maps.google.com/maps?hl=en&tab=wl
Play https://play.google.com/?hl=en&tab=w8
YouTube https://www.youtube.com/?tab=w1
News https://news.google.com/?tab=wn
Gmail https://mail.google.com/mail/?tab=wm
Drive https://drive.google.com/?tab=wo
More » https://www.google.com/intl/en/about/products?tab=wh
Web History http://www.google.com/history/optout?hl=en
Settings /preferences?hl=en
Sign in https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.com/&ec=GAZAAQ
Advanced search /advanced_search?hl=en&authuser=0
Advertising Programs /intl/en/ads/
Business Solutions /services/
About Google /intl/en/about.html
Privacy /intl/en/policies/privacy/
Terms /intl/en/policies/terms/


In [22]:
from urllib.parse import urljoin

In [23]:
# Google Scraper Challenge
links = soup.find_all('a')
for link in links:
    print(link.get_text(), urljoin('https://google.com', link.get('href')))

Images https://www.google.com/imghp?hl=en&tab=wi
Maps https://maps.google.com/maps?hl=en&tab=wl
Play https://play.google.com/?hl=en&tab=w8
YouTube https://www.youtube.com/?tab=w1
News https://news.google.com/?tab=wn
Gmail https://mail.google.com/mail/?tab=wm
Drive https://drive.google.com/?tab=wo
More » https://www.google.com/intl/en/about/products?tab=wh
Web History http://www.google.com/history/optout?hl=en
Settings https://google.com/preferences?hl=en
Sign in https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.com/&ec=GAZAAQ
Advanced search https://google.com/advanced_search?hl=en&authuser=0
Advertising Programs https://google.com/intl/en/ads/
Business Solutions https://google.com/services/
About Google https://google.com/intl/en/about.html
Privacy https://google.com/intl/en/policies/privacy/
Terms https://google.com/intl/en/policies/terms/


In [24]:
soup.find('img', 'lnXdpd')

In [25]:
logo = soup.find('img', id='hplogo')

In [26]:
logo

<img alt="Google" height="92" id="hplogo" src="/images/branding/googlelogo/1x/googlelogo_white_background_color_272x92dp.png" style="padding:28px 0 14px" width="272"/>

In [27]:
logo.get_text()

''

In [28]:
logo.get('src')

'/images/branding/googlelogo/1x/googlelogo_white_background_color_272x92dp.png'

In [29]:
logo.get('alt')

'Google'

## CBS Courses

In [30]:
response = requests.get('https://www8.gsb.columbia.edu/courses/mba/2022/Spring')
soup = BeautifulSoup(response.text)

In [31]:
courses = soup.find_all('div', class_='mba-course')

In [32]:
first_course = courses[0]

In [33]:
first_course.get_text()

'\n\nB8784-001 - Business and Society: Reconciling Shareholder and Stakeholder Interests \n1.5 credit hours \n\n            W - B Term02:00PM to 05:15PM \n\nR. Glenn Hubbard (rgh1) \n\nGeffen-590 \n\nIn-person. Attendance at first class is mandatory for all enrolled students as well as those on a waitlist or who hope to add the class during Add/Drop.\n\n\n            Division(s): Business \n\n            Center(s) and Program(s): Social Enterprise \n\n\n\n\n\n            Method of Instruction: In Person          \n'

In [34]:
first_course.attrs

{'class': ['odd', 'mba-course', 'MBA-course', 'views-row-first']}

In [35]:
first_course

<div class="odd mba-course MBA-course views-row-first">
<div class="views-field views-field-title course-name">
<a href="/courses/mba/2022/spring/b8784-001">B8784-001 - Business and Society: Reconciling Shareholder and Stakeholder Interests </a>
<div class="course-credithours">1.5 credit hours</div><a></a> </div>
<div class="date-time">
            W - B Term<br/><span class="date-display-single"><div class="date-display-range"><span class="date-display-start" content="1970-01-01T14:00:00-05:00" datatype="xsd:dateTime" property="dc:date">02:00PM</span> to <span class="date-display-end" content="1970-01-01T17:15:00-05:00" datatype="xsd:dateTime" property="dc:date">05:15PM</span></div></span><br/> </div>
<div class="instructor">
<span class="desktop">R. Glenn Hubbard <span class="uni">(rgh1)</span><br/></span> </div>
<div class="views-field views-field-field-room course-location">
<span class="desktop">Geffen-590</span> </div>
<div class="views-field views-field-field-webnotes course-web

In [36]:
first_course.find('a').get_text()

'B8784-001 - Business and Society: Reconciling Shareholder and Stakeholder Interests '

In [37]:
first_course.find('a').get_text().split(' - ')

['B8784-001',
 'Business and Society: Reconciling Shareholder and Stakeholder Interests ']

In [38]:
# Get All Course Names and URLs Challenge
for course in courses:
    course_name = course.find('a').get_text()
    url = "https://www8.gsb.columbia.edu" + course.find('a').get('href')
    print(course_name, url)

B8784-001 - Business and Society: Reconciling Shareholder and Stakeholder Interests  https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8784-001
B8782-001 - Innovation Salon  https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8782-001
B8779-001 - Global Immersion: Africa's Consumer Market: The Case of Ghana  https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8779-001
B8767-001 - Investing in Social Ventures  https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8767-001
B8744-001 - The Psychology and Economics of Consumer Finance  https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8744-001
B8725-001 - Global Immersion: Economic Growth in the UAE  https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8725-001
B8716-001 - Global Family Enterprise: Stakeholdership, Sustainability, and Innovation  https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8716-001
B8714-001 - Global Immersion: Doing Business in Brazil - Challenges & Opportunities  https://www8.gsb.columbia.ed

In [39]:
for course in courses:
    course_name = course.find('a').get_text()
    url = "https://www8.gsb.columbia.edu" + course.find('a').get('href')
    credits = course.find(class_="course-credithours").get_text()
    datetime = course.find(class_="date-time").get_text().strip()
    instructor = course.find(class_="instructor").get_text().strip()
    location = course.find(class_="course-location").get_text().strip()
    print(course_name, url, credits, datetime, instructor, location)

B8784-001 - Business and Society: Reconciling Shareholder and Stakeholder Interests  https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8784-001 1.5 credit hours W - B Term02:00PM to 05:15PM R. Glenn Hubbard (rgh1) Geffen-590
B8782-001 - Innovation Salon  https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8782-001 1.5 credit hours R - Full Term03:50PM to 07:05PM Sheena Iyengar (ss957) 
B8779-001 - Global Immersion: Africa's Consumer Market: The Case of Ghana  https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8779-001 3.0 credit hours W - Full Term12:10PM to 01:40PM Stephan Meier (sm3087) Geffen-570
B8767-001 - Investing in Social Ventures  https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8767-001 1.5 credit hours R - B Term02:00PM to 05:15PM Vikas Raj (vr2235)Bruce Usher (bmu2001) Geffen-420
B8744-001 - The Psychology and Economics of Consumer Finance  https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8744-001 3.0 credit hours Block Week 1 - TWRFS - 09:00AM to 0

In [40]:
temp = []
for course in courses:
    row = {}
    row['course_name'] = course.find('a').get_text()
    row['url'] = "https://www8.gsb.columbia.edu" + course.find('a').get('href')
    row['credits'] = course.find(class_="course-credithours").get_text()
    row['datetime'] = course.find(class_="date-time").get_text().strip()
    row['instructor'] = course.find(class_="instructor").get_text().strip()
    row['location'] = course.find(class_="course-location").get_text().strip()
    temp.append(row)

In [41]:
temp

[{'course_name': 'B8784-001 - Business and Society: Reconciling Shareholder and Stakeholder Interests ',
  'url': 'https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8784-001',
  'credits': '1.5 credit hours',
  'datetime': 'W - B Term02:00PM to 05:15PM',
  'instructor': 'R. Glenn Hubbard (rgh1)',
  'location': 'Geffen-590'},
 {'course_name': 'B8782-001 - Innovation Salon ',
  'url': 'https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8782-001',
  'credits': '1.5 credit hours',
  'datetime': 'R - Full Term03:50PM to 07:05PM',
  'instructor': 'Sheena Iyengar (ss957)',
  'location': ''},
 {'course_name': "B8779-001 - Global Immersion: Africa's Consumer Market: The Case of Ghana ",
  'url': 'https://www8.gsb.columbia.edu/courses/mba/2022/spring/b8779-001',
  'credits': '3.0 credit hours',
  'datetime': 'W - Full Term12:10PM to 01:40PM',
  'instructor': 'Stephan Meier (sm3087)',
  'location': 'Geffen-570'},
 {'course_name': 'B8767-001 - Investing in Social Ventures ',
  'url': 'htt

In [42]:
df_courses = pd.DataFrame(temp)

In [43]:
df_courses.to_csv('courses.csv', index=False)