# Intro to Web Scraping

## Using Pandas

In [32]:
import pandas as pd

In [47]:
# read_html downloads the page and returns a list with one DataFrame per
# HTML <table> it finds; index 0 selects the first table for display.
population_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population')
population_tables[0]

Unnamed: 0,Rank,Country(or dependent territory),Population,Date,% of worldpopulation,Source
0,1,China[Note 2],1397920000,"June 24, 2019",18.1%,Official population clock
1,2,India[Note 3],1348880000,"June 24, 2019",17.5%,Official population clock
2,3,United States[Note 4],329431000,"June 24, 2019",4.27%,Official population clock
3,4,Indonesia,268074600,"July 1, 2019",3.47%,Official annual projection
4,5,Brazil,210081000,"June 24, 2019",2.72%,Official population clock
5,6,Pakistan,205106000,"June 24, 2019",2.66%,Official population clock
6,7,Nigeria,200963599,"July 1, 2019",2.6%,UN Projection
7,8,Bangladesh,166780000,"June 24, 2019",2.16%,Official population clock
8,9,Russia[Note 5],146793744,"January 1, 2019",1.9%,Official estimate
9,10,Mexico,126577691,"July 1, 2019",1.64%,Official annual projection


In [48]:
# Fetch the first table on the page and save it as CSV.
# index=False leaves the DataFrame's row index out of the file.
population_url = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population'
df = pd.read_html(population_url)[0]
df.to_csv('countries_by_population.csv', index=False)

## Using Requests and BeautifulSoup

In [49]:
import requests

In [51]:
# Send an HTTP GET request and keep the Response object for inspection.
# requests applies no timeout by default, so a stalled server would block
# this cell forever; give up after 10 seconds instead.
response = requests.get('https://www.google.com', timeout=10)

In [55]:
# URL the response was served from; note the trailing slash that was not
# part of the URL passed to requests.get().
response.url

'https://www.google.com/'

In [56]:
# Numeric HTTP status code of the response; 200 means the request succeeded.
response.status_code

200

In [65]:
# Body of the response as a string -- here the raw HTML of the page.
response.text

'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many special features to help you find exactly what you\'re looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="fUnPhj6Vkp45blJk2CkgrA==">(function(){window.google={kEI:\'uEUSXfq9Oe6k_Qbv6orQCQ\',kEXPI:\'0,1353804,1958,2422,1225,591,139,224,510,1065,3152,56,321,207,1017,626,109,381,87,120,93,67,54,187,21,2331853,329526,1294,12383,4855,32691,15248,867,6056,6107,5281,1100,3335,2,2,6801,363,3320,5505,224,2218,260,5107,575,835,284,2,579,727,2431,59,2,1,3,1297,3471,852,3700,1268,773,2247,1410,3337,1146,9,1491,256,212,2599,3601,669,1048,3,1807,1397,81,7,1,2,490,2042,8909,5295,798,1220,38,920,7

In [2]:
from bs4 import BeautifulSoup

In [71]:
# Parse the downloaded HTML into a searchable tree.
# The parser is named explicitly: without it BeautifulSoup emits a
# "no parser was explicitly specified" warning and picks whichever parser
# happens to be installed, so results could differ between machines.
soup = BeautifulSoup(response.text, 'html.parser')

In [85]:
# prettify() returns the document re-indented with one tag per line.
# print() is used so the newlines render as layout instead of appearing as
# escape sequences in a quoted string.
print(soup.prettify())

<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en">
 <head>
  <meta content="Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking for." name="description"/>
  <meta content="noodp" name="robots"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/>
  <title>
   Google
  </title>
  <script nonce="fUnPhj6Vkp45blJk2CkgrA==">
   (function(){window.google={kEI:'uEUSXfq9Oe6k_Qbv6orQCQ',kEXPI:'0,1353804,1958,2422,1225,591,139,224,510,1065,3152,56,321,207,1017,626,109,381,87,120,93,67,54,187,21,2331853,329526,1294,12383,4855,32691,15248,867,6056,6107,5281,1100,3335,2,2,6801,363,3320,5505,224,2218,260,5107,575,835,284,2,579,727,2431,59,2,1,3,1297,3471,852,3700,1268,773,2247,1410,3337,1146,9,1491,256,212,2599,3601,669,1048,3,1807,1397,81,7,1,2,490,20

In [87]:
# find() returns the first tag that matches -- here the page's <title> element.
soup.find('title')

<title>Google</title>

In [91]:
# find_all() returns a list-like collection of every matching tag -- here all
# <a> (link) elements in the document.
soup.find_all('a')

[<a class="gb1" href="https://www.google.com/imghp?hl=en&amp;tab=wi">Images</a>,
 <a class="gb1" href="https://maps.google.com/maps?hl=en&amp;tab=wl">Maps</a>,
 <a class="gb1" href="https://play.google.com/?hl=en&amp;tab=w8">Play</a>,
 <a class="gb1" href="https://www.youtube.com/?gl=US&amp;tab=w1">YouTube</a>,
 <a class="gb1" href="https://news.google.com/nwshp?hl=en&amp;tab=wn">News</a>,
 <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a>,
 <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a>,
 <a class="gb1" href="https://www.google.com/intl/en/about/products?tab=wh" style="text-decoration:none"><u>More</u> »</a>,
 <a class="gb4" href="http://www.google.com/history/optout?hl=en">Web History</a>,
 <a class="gb4" href="/preferences?hl=en">Settings</a>,
 <a class="gb4" href="https://accounts.google.com/ServiceLogin?hl=en&amp;passive=true&amp;continue=https://www.google.com/" id="gb_70" target="_top">Sign in</a>,
 <a href="/advanced_search?hl=en&amp;authuse

In [92]:
# Gather every anchor tag, then print only the visible text of each one
# (the markup and href attributes are dropped).
links = soup.find_all('a')
for anchor in links:
    print(anchor.get_text())

Images
Maps
Play
YouTube
News
Gmail
Drive
More »
Web History
Settings
Sign in
Advanced search
Language tools
Explore Yoga with Google
Advertising Programs
Business Solutions
About Google
Privacy
Terms


In [9]:
# Small hand-written product snippet for practising lookups by CSS class.
# The span's class attribute uses plain ASCII double quotes: a typographic
# opening quote there becomes part of the attribute value, so a lookup for
# class "price" would find nothing.
html = """
<div class="product">
    <h2 class="title">Black Tee</h2>
    <span class="price">$54.00</span>
</div>"""
# Name the parser explicitly to avoid BeautifulSoup's "no parser was
# explicitly specified" warning and machine-dependent parser selection.
soup = BeautifulSoup(html, 'html.parser')

In [12]:
# class_ (with a trailing underscore, because `class` is a Python keyword)
# matches on the CSS class. .text joins the text of every tag nested inside
# the matched element, so the title and the price come back in one string.
soup.find(class_='product').text

'\nBlack Tee\n$54.00\n'