In [None]:
from io import StringIO

import pandas as pd
import requests

In [None]:
# Download the page's HTML source.
# - timeout: without one, requests waits indefinitely on an unresponsive
#   server, which would stall a "Run All".
# - raise_for_status(): fail here on a 4xx/5xx reply instead of letting an
#   error page reach the table parsing in a later cell.
url = 'https://www.worldcoinindex.com/'
crypto_url = requests.get(url, timeout=30)
crypto_url.raise_for_status()
crypto_url

In [None]:
# Decoded text of the HTTP response fetched above.
body = crypto_url.text

# `body` now holds the full HTML source of the page.
# If that HTML contains a table (marked up with the <table></table> tag,
# which is how tables are defined in HTML),
# pandas can extract it from the document with read_html().

In [None]:
# read_html returns a list with one DataFrame per <table> found in the markup.
# The HTML text is wrapped in StringIO because passing literal HTML to
# read_html is deprecated since pandas 2.1; a file-like object is accepted by
# older versions as well.
crypto_data = pd.read_html(StringIO(body))
print(type(crypto_data))
print(len(crypto_data))

In [None]:
# The output above shows that read_html returned a list of tables; keep the
# first one. The isinstance guard makes this cell safe to re-run: once
# `crypto_data` is already a DataFrame, indexing it with [0] would be a column
# lookup and raise KeyError.

if isinstance(crypto_data, list):
    crypto_data = crypto_data[0]
crypto_data.head()

In [None]:
#If we want to extract information from HTML that doesn't contain a table, we need a different approach: scraping.
#Fortunately, Python has a great package for this called Beautiful Soup.

In [None]:
from bs4 import BeautifulSoup

# The original cell parsed `page.content` before any cell had assigned `page`,
# so a fresh kernel stopped here with NameError. Fetch a page first.
# NOTE(review): this URL is an assumption — the index-based navigation in the
# next cells (children[2] -> children[3] -> children[1]) fits a minimal
# doctype/html/head/body/p document such as the sample page hosted alongside
# the ids_and_classes.html example used further down; confirm it is the
# intended document.
page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html", timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
# Render the parsed tree as indented markup and echo it to the cell output.
pretty_markup = soup.prettify()
print(pretty_markup)

In [None]:
# Top-level nodes of the parsed document, materialised as a list.
[node for node in soup.children]

In [None]:
# Python type of each top-level node, in document order.
list(map(type, soup.children))

In [None]:
# Pick the third top-level node by position.
# NOTE(review): presumably this is the <html> element — it depends on the
# node layout of the parsed page; confirm.
top_level_nodes = list(soup.children)
html = top_level_nodes[2]

In [None]:
# Walk down by position: html -> its 4th child -> that node's 2nd child.
# NOTE(review): the fixed indices assume the page's exact node layout
# (whitespace text nodes count as children) — confirm against the fetched page.
html_children = list(html.children)
# Named `body_tag` rather than `body`: `body` already holds the HTML string
# from the earlier requests cell, and rebinding it to a different kind of
# object here would break that cell's consumers on an out-of-order re-run.
body_tag = html_children[3]
p = list(body_tag.children)[1]
p.get_text()

In [None]:
# All at once: re-parse the page and query for <p> tags directly instead of
# walking the tree by position. Only the final expression is rendered.
soup = BeautifulSoup(page.content, 'html.parser')
paragraphs = soup.find_all('p')
paragraphs[0].get_text()
soup.find('p')

In [None]:
#searching by class/id

# timeout keeps the request from blocking forever; raise_for_status() stops
# here on an HTTP error instead of parsing an error page.
page = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html", timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
soup

# <p> tags whose class is "outer-text"
soup.find_all('p', class_='outer-text')

# any tag whose class is "outer-text"
soup.find_all(class_="outer-text")

# tags whose id is "first" (only this last expression is displayed by the cell)
soup.find_all(id="first")

In [None]:
#css selectors

# Every <p> that is a descendant of a <div>.
div_paragraphs = soup.select("div p")
div_paragraphs

In [None]:
#http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168


#chrome devtools

# Fetch the forecast page. timeout keeps the request from blocking forever;
# raise_for_status() stops here on an HTTP error so the lookups below are not
# run against an error page.
page = requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168", timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
# Element with id "seven-day-forecast", then every "tombstone-container"
# element inside it; the first of those is kept as `tonight`.
seven_day = soup.find(id="seven-day-forecast")
forecast_items = seven_day.find_all(class_="tombstone-container")
tonight = forecast_items[0]
print(tonight.prettify())

In [None]:
#extracting info

# Pull the three text fields out of the first forecast item and print them,
# one per line.
period = tonight.find(class_="period-name").get_text()
short_desc = tonight.find(class_="short-desc").get_text()
temp = tonight.find(class_="temp").get_text()
print(period, short_desc, temp, sep="\n")

# The description is read from the title attribute of the item's <img>.
img = tonight.find("img")
desc = img['title']
print(desc)

In [None]:
#extracting all info

# Text of every .period-name element nested inside a .tombstone-container.
period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
periods

#apply the same technique to get the other 3 fields:

short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
descs = [d["title"] for d in seven_day.select(".tombstone-container img")]
# Each statement on its own line: the original had the three print calls fused
# onto the end of the `descs` assignment, which is a SyntaxError.
print(short_descs)
print(temps)
print(descs)

In [None]:
#into pandas

# One row per forecast period. The four lists must have the same length;
# pd.DataFrame raises ValueError otherwise.
weather = pd.DataFrame({
    "period": periods,
    "short_desc": short_descs,
    "temp": temps,
    "desc":descs
})
weather

#analysis

# Capture the run of digits in each temperature string. The pattern needs \d
# (a digit) in a raw string; the original "d+" matched the literal letter "d",
# so no number was captured and the integer conversion below could not succeed.
temp_nums = weather["temp"].str.extract(r"(?P<temp_num>\d+)", expand=False)
weather["temp_num"] = temp_nums.astype('int')
temp_nums

# Flag rows whose temperature text contains "Low".
is_night = weather["temp"].str.contains("Low")
weather["is_night"] = is_night
is_night

In [None]:
# Python Libraries for Web Scraping

# requests — this critical library is needed to actually get the data from the web server onto your machine, 
# and it contains some additional cool features like caching too.

# Beautiful Soup 4 — This is the library we’ve used here, and it’s designed to make filtering data based on HTML 
# tags straightforward.

# lxml — An HTML and XML parser that’s fast (and now, integrated with Beautiful Soup, too!)

# Selenium — A web driver tool that’s useful when you need to get data from a website that the requests library can’t access, 
# because it’s hidden behind things like login forms or mandatory mouse-clicks.

# Scrapy — A full-on web scraping framework that might be overkill for one-off data analysis projects, but a good fit 
# when scraping’s required for production projects, pipelines, etc.
