## Webscraping

In [9]:
import pandas as pd

#### Basic html page

```
<!DOCTYPE html>
<html>
<head>
    <title>Web Page!</title>
    <style>
        body {background-color: powderblue;}
        h1   {color: blue;}
        p    {color: red;}
    </style>
    <link rel="stylesheet" href="styles.css">
    <script>
        document.getElementById("demo").innerHTML = "Hello JavaScript!";
    </script>
</head>
<body>
    <h1>A Very Bold Header</h1>
    <div style="background-color:lightblue">
        <p>This is a paragraph.</p>
    </div>
</body>
</html>
```

### nyc weather history

http://w1.weather.gov/data/obhistory/KNYC.html

In [10]:
knyc_link = 'http://w1.weather.gov/data/obhistory/KNYC.html'

In [11]:
import requests

knyc_page = requests.get(knyc_link)
knyc_page

<Response [200]>

In [12]:
# the first 1000 characters of the page
print(knyc_page.content[:1000])

b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\r\n\t\t\t\t\t\t\t<html><meta name="Author" content="Leon Minton"><head><title>\r\n\t\t\t\t\t\t\tNational Weather Service : Observed Weather for past 3 Days : New York City, Central Park</title>\r\n\t\t\t\t\t\t\t<link rel="STYLESHEET" type="text/css" href="/images/weather/fcicons/main.css"></head>\r\n\t\t\t\t\t\t\t<body bgcolor="#ffffff" leftmargin="0" topmargin="0" marginwidth="0" marginheight="0" background="/images/weather/fcicons/gray_background.gif">\r\n\t\t\t\t\t\t\t<table cellspacing="0" cellpadding="0" border="0" width="670" background="/images/weather/fcicons/topbanner.jpg">\r\n\t\t\t\t\t\t\t<tr><td align="right" height="19"><a href="http://weather.gov"><span class="nwslink">weather.gov</span></a>&nbsp;&nbsp;&nbsp;</td></tr></table>\r\n\t\t\t\t\t\t\t<table cellspacing="0" cellpadding="0" border="0" width="670"><tr valign="top">\r\n\t\t\t\t\t\t\t<td rowspan="2"><a href="http://www.noaa.gov"><img src="/images/weathe

In [13]:
# need to parse some html!
from bs4 import BeautifulSoup

In [14]:
knyc_soup = BeautifulSoup(knyc_page.content)

In [15]:
# first 1000 characters more legibly
print(knyc_soup.prettify()[:1000])

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html>
 <head>
  <meta content="Leon Minton" name="Author"/>
  <title>
   National Weather Service : Observed Weather for past 3 Days : New York City, Central Park
  </title>
  <link href="/images/weather/fcicons/main.css" rel="STYLESHEET" type="text/css"/>
 </head>
 <body background="/images/weather/fcicons/gray_background.gif" bgcolor="#ffffff" leftmargin="0" marginheight="0" marginwidth="0" topmargin="0">
  <table background="/images/weather/fcicons/topbanner.jpg" border="0" cellpadding="0" cellspacing="0" width="670">
   <tr>
    <td align="right" height="19">
     <a href="http://weather.gov">
      <span class="nwslink">
       weather.gov
      </span>
     </a>
    </td>
   </tr>
  </table>
  <table border="0" cellpadding="0" cellspacing="0" width="670">
   <tr valign="top">
    <td rowspan="2">
     <a href="http://www.noaa.gov">
      <img alt="NOAA logo - Click to go to the NOAA homepage" border="0" height="78" s

In [16]:
# print the 4rd table in the page
print(knyc_soup.find_all('table')[3])

<table border="0" cellpadding="2" cellspacing="3" width="670"><tr align="center" bgcolor="#b0c4de"><th rowspan="3" width="17">D<br/>a<br/>t<br/>e</th><th rowspan="3" width="32">Time<br/>(est)</th>
<th rowspan="3" width="80">Wind<br/>(mph)</th><th rowspan="3" width="40">Vis.<br/>(mi.)</th><th rowspan="3" width="80">Weather</th><th rowspan="3" width="65">Sky Cond.</th>
<th colspan="4">Temperature (ºF)</th><th rowspan="3" width="65">Relative<br/>Humidity</th><th rowspan="3" width="80">Wind<br/>Chill<br/>(°F)</th><th rowspan="3" width="80">Heat<br/>Index<br/>(°F)</th><th colspan="2">Pressure</th><th colspan="3">Precipitation (in.)</th></tr>
<tr align="center" bgcolor="#b0c4de"><th rowspan="2" width="45">Air</th><th rowspan="2" width="26">Dwpt</th><th colspan="2">6 hour</th>
<th rowspan="2" width="40">altimeter<br/>(in)</th><th rowspan="2" width="40">sea level<br/>(mb)</th><th rowspan="2" width="24">1 hr</th>
<th rowspan="2" width="24">3 hr</th><th rowspan="2" width="30">6 hr</th></tr>
<tr 

In [17]:
# extract data from the 4th table in the page into a dataframe

data_table = knyc_soup.find_all('table')[3]

table_rows = data_table.find_all('tr') # get rows from table

data = []
for idx,tr in enumerate(table_rows):
    if idx < 3 :                       # skip header rows
        continue
    td = tr.find_all('td')             # get table cells
    row = [elem.text for elem in td]   # pull text from cells
    data.append(row)                   # add to dataset
    
pd.DataFrame(data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,2,13:51,Vrbl 6,0.75,Light Snow Fog/Mist,BKN007 OVC012,33,31,,,92%,27,,29.51,998.5,0.07,,
1,2,12:51,Vrbl 5,0.75,Light Snow Fog/Mist,OVC006,33,31,36.0,33.0,92%,28,,29.51,998.6,0.07,,0.13
2,2,11:51,Vrbl 7,1.75,Light Snow Fog/Mist,BKN007 OVC012,34,31,,,89%,28,,29.52,999.0,0.04,,
3,2,10:51,Vrbl 5,3.0,Light Rain Fog/Mist,OVC009,35,33,,,93%,31,,29.53,999.4,,,
4,2,09:51,NE 10 G 21,7.0,Overcast,OVC009,35,33,,,93%,27,,29.54,999.6,,0.02,


#### central park weather history summary
https://www.wunderground.com/history/daily/us/ny/new-york-city/KNYC/date/2018-12-3?cm_ven=localwx_history

In [18]:
wu_link = 'https://www.wunderground.com/history/daily/us/ny/new-york-city/KNYC/date/2018-12-3?cm_ven=localwx_history'

In [19]:
# get the page
wu_page = requests.get(wu_link)
wu_page

<Response [404]>

In [20]:
wu_soup = BeautifulSoup(wu_page.content)

In [21]:
print(wu_soup.prettify()[:1000])

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Oops! There's been an error | Weather Underground
  </title>
  <base href="/"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <link href="https://www.wunderground.com/static/favicon.png" rel="shortcut icon" type="image/png"/>
  <link href="https://www.wunderground.com/static/favicon.png" rel="apple-touch-icon"/>
  <link href="/bundle-next/styles.70b207e6d3b5af6ea45e.css" rel="stylesheet"/>
  <style ng-transition="app-root">
   {}  body,   p{font-size:.875rem;color:#1e2023}  :focus{outline:0!important}  a:link{color:#1088b0}  a:visited{color:#1088b0}  a:hover{color:#1088b0}  a:hover:not(.button){text-decoration:underline}  a:active{color:#1088b0}  a.button{color:#fff}  a.hook{text-decoration:underline}  a:focus,   button:focus{outline:0}  input[type=date],   input[type=datetime-local],   input[type=datetime],   inpu

In [22]:
# the table we want doesn't exist! culprit: javascript
wu_soup.find_all('div',class_='tablesaw-sortable')

[]

In [23]:
# get the text from the page
wu_text = wu_soup.get_text()

# clean up the whitespace
import re
wu_text = re.sub(r'\n+','\n',text.strip())
print(text[:1000])

NameError: name 'text' is not defined

### Need to actually render page to process scripts!

In [32]:
# need to install chromedriver
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)

In [33]:
# this will actually render the page
driver.get(wu_link)

In [47]:
# two ways to find the table we want
wu_table = driver.find_element_by_class_name('city-history-observation')
#wu_table = driver.find_element_by_id('history-observation-table')

In [48]:
# text in the table
wu_table.text

'Daily Observations\nTime Temperature Dew Point Humidity Wind Wind Speed Wind Gust Pressure Precip.\n12:51 AM\n55 F 52 F 89 %\n0 mph 0 mph 29.4 in 0.0 in\n1:51 AM\n55 F 52 F 89 %\n0 mph 0 mph 29.4 in 0.0 in\n2:51 AM\n55 F 53 F 93 %\n0 mph 0 mph 29.4 in 0.0 in\n3:38 AM\n54 F 53 F 97 %\n0 mph 0 mph 29.4 in 0.0 in\n3:51 AM\n54 F 53 F 97 %\n0 mph 0 mph 29.4 in 0.0 in\n4:51 AM\n54 F 53 F 97 %\n0 mph 0 mph 29.4 in 0.0 in\n5:01 AM\n54 F 53 F 97 %\n0 mph 0 mph 29.4 in 0.0 in\n5:28 AM\n54 F 52 F 93 %\n0 mph 0 mph 29.4 in 0.0 in\n5:51 AM\n54 F 51 F 90 %\n0 mph 0 mph 29.4 in 0.0 in\n6:51 AM\n53 F 49 F 86 %\n0 mph 0 mph 29.4 in 0.0 in\n7:51 AM\n54 F 46 F 75 %\n0 mph 0 mph 29.5 in 0.0 in\n8:51 AM\n54 F 44 F 69 %\n0 mph 0 mph 29.5 in 0.0 in\n9:51 AM\n54 F 41 F 62 %\n0 mph 0 mph 29.5 in 0.0 in\n10:51 AM\n54 F 39 F 57 %\n0 mph 0 mph 29.5 in 0.0 in\n11:51 AM\n52 F 37 F 57 %\n0 mph 0 mph 29.5 in 0.0 in\n12:51 PM\n52 F 34 F 50 %\n0 mph 0 mph 29.5 in 0.0 in\n1:51 PM\n53 F 33 F 47 %\n0 mph 0 mph 29.5 in 0.

In [49]:
# extracting text into a datafram
wu_data = []
for tr in wu_table.find_elements_by_css_selector('tr'):
    tmp_row = []
    for td in tr.find_elements_by_css_selector('td'):
        tmp_row.append(td.text.strip())
    wu_data.append(tmp_row)
df_wu = pd.DataFrame(wu_data)
df_wu.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,,,,,,,,,,,
1,12:51 AM,55 F,52 F,89 %,,0 mph,0 mph,29.4 in,0.0 in,,
2,1:51 AM,55 F,52 F,89 %,,0 mph,0 mph,29.4 in,0.0 in,,
3,2:51 AM,55 F,53 F,93 %,,0 mph,0 mph,29.4 in,0.0 in,,
4,3:38 AM,54 F,53 F,97 %,,0 mph,0 mph,29.4 in,0.0 in,,


In [52]:
# visualize the rendered table, still missing some stuff, need to debug
wu_table.screenshot('./images/test1.png')

True

<img src='./images/test2.png'>