In [12]:
# import & read text from a Web Page using HTTP
import urllib.request, urllib.parse, urllib.error

fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
for line in fhand:
    x = line.decode()
    for char in x:
        print(char)
    print(line.decode().strip()) 

B
u
t
 
s
o
f
t
 
w
h
a
t
 
l
i
g
h
t
 
t
h
r
o
u
g
h
 
y
o
n
d
e
r
 
w
i
n
d
o
w
 
b
r
e
a
k
s


But soft what light through yonder window breaks
I
t
 
i
s
 
t
h
e
 
e
a
s
t
 
a
n
d
 
J
u
l
i
e
t
 
i
s
 
t
h
e
 
s
u
n


It is the east and Juliet is the sun
A
r
i
s
e
 
f
a
i
r
 
s
u
n
 
a
n
d
 
k
i
l
l
 
t
h
e
 
e
n
v
i
o
u
s
 
m
o
o
n


Arise fair sun and kill the envious moon
W
h
o
 
i
s
 
a
l
r
e
a
d
y
 
s
i
c
k
 
a
n
d
 
p
a
l
e
 
w
i
t
h
 
g
r
i
e
f


Who is already sick and pale with grief


In [15]:
import xml.etree.ElementTree as ET

input = '''
    <stuff>
        <users>
            <user x="2">
                <id>001</id>
                <name>Chuck</name>
            </user>
            <user x="7">
                <id>009</id>
                <name>Brent</name>
            </user>
            <user x="5">
                <id>025</id>
                <name>Stewart</name>
            </user>
        </users>
    </stuff>'''

stuff = ET.fromstring(input)
lst = stuff.findall('users/user')
print('User count:', len(lst))

for item in lst:
    print('Name', item.find('name').text)
    print('Id', item.find('id').text)
    print('Attribute', item.get("x"))

User count: 3
Name Chuck
Id 001
Attribute 2
Name Brent
Id 009
Attribute 7
Name Stewart
Id 025
Attribute 5


In [16]:
# Complex example of pulling XML from the internet
import xml.etree.ElementTree as ET
import urllib.request, urllib.parse, urllib.error
import datetime as dt

# pull a list of winning PowerBall numbers since 2010
url = 'https://data.ny.gov/api/views/d6yy-54nr/rows.xml?accessType=DOWNLOAD'
response = urllib.request.urlopen(url).read()

# parse the XML response to build a tree
tree = ET.fromstring(response)
listLookup = tree.findall('row/row')
print('Record count:', len(listLookup))

for item in listLookup:
    drawDate = item.find('draw_date').text
    winningNumber = item.find('winning_numbers').text

    # reformat the date to a prettier format
    isoDate = dt.datetime.fromisoformat(drawDate)
    formatDate = isoDate.strftime("%A %d. %B %Y")

    print('\nDrawing Date: - ', formatDate)
    print('Winning Numbers: - ', winningNumber)

Record count: 1516

Drawing Date: -  Saturday 26. September 2020
Winning Numbers: -  11 21 27 36 62 24

Drawing Date: -  Wednesday 30. September 2020
Winning Numbers: -  14 18 36 49 67 18

Drawing Date: -  Saturday 03. October 2020
Winning Numbers: -  18 31 36 43 47 20

Drawing Date: -  Wednesday 07. October 2020
Winning Numbers: -  06 24 30 53 56 19

Drawing Date: -  Saturday 10. October 2020
Winning Numbers: -  05 18 23 40 50 18

Drawing Date: -  Wednesday 14. October 2020
Winning Numbers: -  21 37 52 53 58 05

Drawing Date: -  Saturday 17. October 2020
Winning Numbers: -  06 10 31 37 44 23

Drawing Date: -  Wednesday 21. October 2020
Winning Numbers: -  01 03 13 44 56 26

Drawing Date: -  Saturday 24. October 2020
Winning Numbers: -  18 20 27 45 65 06

Drawing Date: -  Wednesday 28. October 2020
Winning Numbers: -  11 28 37 40 53 13

Drawing Date: -  Saturday 31. October 2020
Winning Numbers: -  02 06 40 42 55 24

Drawing Date: -  Wednesday 04. November 2020
Winning Numbers: -  23 3

In [18]:
# example of JSON data with Python
import json

input = '''
[
    { "id" : "001",
        "x" : "2",
        "name" : "Max"
    } ,
    { "id" : "009",
        "x" : "7",
        "name" : "Chuck"
    } ,
    { "id" : "025",
        "x" : "5",
        "name" : "Stewart"
    }
]'''

info = json.loads(input)
print('User count:', len(info))

for item in info:
    print("\n")
    print('Name', item['name'])
    print('Id', item['id'])
    print('Attribute', item['x'])

User count: 3


Name Max
Id 001
Attribute 2


Name Chuck
Id 009
Attribute 7


Name Stewart
Id 025
Attribute 5


In [19]:
### Here is an example of pulling JSON from the web
# import urllib library and json
from urllib.request import urlopen
import json

# store the URL to import in ourURL as parameter for urlopen
ourURL = "https://jsonplaceholder.typicode.com/users"

# store the response of URL
response = urlopen(ourURL)

# storing the JSON response from url in data
data_json = json.loads(response.read())

# print the unformatted JSON response data
print("\nUnformatted JSON data ...\n", data_json)

# transform the raw JSON into a structured object
data_object = json.dumps(data_json, indent = 4)

# print the formatted JSON Object
print("\nFormatted JSON Object ...\n", data_object)


Unformatted JSON data ...
 [{'id': 1, 'name': 'Leanne Graham', 'username': 'Bret', 'email': 'Sincere@april.biz', 'address': {'street': 'Kulas Light', 'suite': 'Apt. 556', 'city': 'Gwenborough', 'zipcode': '92998-3874', 'geo': {'lat': '-37.3159', 'lng': '81.1496'}}, 'phone': '1-770-736-8031 x56442', 'website': 'hildegard.org', 'company': {'name': 'Romaguera-Crona', 'catchPhrase': 'Multi-layered client-server neural-net', 'bs': 'harness real-time e-markets'}}, {'id': 2, 'name': 'Ervin Howell', 'username': 'Antonette', 'email': 'Shanna@melissa.tv', 'address': {'street': 'Victor Plains', 'suite': 'Suite 879', 'city': 'Wisokyburgh', 'zipcode': '90566-7771', 'geo': {'lat': '-43.9509', 'lng': '-34.4618'}}, 'phone': '010-692-6593 x09125', 'website': 'anastasia.net', 'company': {'name': 'Deckow-Crist', 'catchPhrase': 'Proactive didactic contingency', 'bs': 'synergize scalable supply-chains'}}, {'id': 3, 'name': 'Clementine Bauch', 'username': 'Samantha', 'email': 'Nathan@yesenia.net', 'address

In [49]:
#open url json file 
from urllib.request import urlopen
import json
import pandas as pd

urlToOpen = "https://jsonplaceholder.typicode.com/photos"
covers = urlopen(urlToOpen)
album_info = json.loads(covers.read())
album_clean= json.dumps(album_info, indent = 4)
print("\nformatted JSON data ...\n", album_clean)
try_again = pd.read_json(album_clean)
album_list = {}
/
album = pd.DataFrame(album_info)
print(album)



formatted JSON data ...
 [
    {
        "albumId": 1,
        "id": 1,
        "title": "accusamus beatae ad facilis cum similique qui sunt",
        "url": "https://via.placeholder.com/600/92c952",
        "thumbnailUrl": "https://via.placeholder.com/150/92c952"
    },
    {
        "albumId": 1,
        "id": 2,
        "title": "reprehenderit est deserunt velit ipsam",
        "url": "https://via.placeholder.com/600/771796",
        "thumbnailUrl": "https://via.placeholder.com/150/771796"
    },
    {
        "albumId": 1,
        "id": 3,
        "title": "officia porro iure quia iusto qui ipsa ut modi",
        "url": "https://via.placeholder.com/600/24f355",
        "thumbnailUrl": "https://via.placeholder.com/150/24f355"
    },
    {
        "albumId": 1,
        "id": 4,
        "title": "culpa odio esse rerum omnis laboriosam voluptate repudiandae",
        "url": "https://via.placeholder.com/600/d32776",
        "thumbnailUrl": "https://via.placeholder.com/150/d32776"
    }

In [50]:
# import the libraries
import requests

# request the URL of the webpage you want to access
URL = "http://www.dr-chuck.com/page1.htm"
response = requests.get(URL)

#  print 'response.content' to get the raw HTML content of the webpage. It's of ‘string’ type
print(response.content)

b'<h1>The First Page</h1>\n<p>\nIf you like, you can switch to the \n<a href="http://www.dr-chuck.com/page2.htm">\nSecond Page</a>.\n</p>\n'


In [2]:
# request the URL of the webpage you want to access
import requests
from bs4 import BeautifulSoup as soup

URL = "http://www.values.com/inspirational-quotes"
response = requests.get(URL)

# parse the response into a readable form
results = soup(response.content, 'lxml')
print(results.prettify())

<!DOCTYPE html>
<html class="no-js" dir="ltr" lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width,initial-scale=1.0" name="viewport"/>
  <title>
   Inspirational Quotes - Motivational Quotes - | The Foundation for a Better Life
  </title>
  <meta content="Find the perfect quotation from our hand-picked collection of inspiring quotes by hundreds of authors." name="description"/>
  <meta content="pass, it, on, passiton, values, kindness" name="keywords"/>
  <meta content="The Foundation for a Better Life" name="twitter:site_name"/>
  <meta content="@passiton_values" name="twitter:site"/>
  <meta content="summary" name="twitter:card"/>
  <meta content="Thank you for visiting." name="twitter:description"/>
  <meta content="https://www.passiton.com/passiton_fbl.jpg" name="twitter:image"/>
  <meta content="https://www.passiton.com/ins

In [8]:
#Program to scrape website and save quotes from website
import requests
from bs4 import BeautifulSoup  as bsoup
import csv

# request the URL of the webpage you want to access
URL = "http://www.values.com/inspirational-quotes"
response = requests.get(URL)

# parse the response into a readable form
results = bsoup(response.content, 'lxml')
#print(soup.prettify(results))

quotes=[] # a list to store quotes

# search the response for the HTML container that holds the quotes
table = results.find('div', attrs = {'id':'all_quotes'})
# print(table)

# iterate the able rows to find each quote info
for row in table.findAll('div'):
    quote = {}  # create a dictionary for each quote
    quote['url'] = "https:/" + row.a['href']
    quote['lines'] = row.img['alt'].split(" #")[0]
    quote['theme'] = row.h5.a.text
    quote['img'] = row.img['src']
    quotes.append(quote) 
       # attache each quote to the list

print(quotes) 

# save the quotes list of dictionaries into a CSV file
filename = 'inspirational_quotes.csv'
with open(filename, 'w', newline='') as f:
	w = csv.DictWriter(f,['theme','url','img','lines'])
	w.writeheader()
	for quote in quotes:
		w.writerow(quote)

[{'url': 'https://inspirational-quotes/4218-there-is-no-chance-no-destiny-no-fate-that', 'lines': 'There is no chance, no destiny, no fate, that can hinder or control the firm resolve of a determined soul.', 'theme': 'DETERMINATION', 'img': 'https://assets.passiton.com/quotes/quote_artwork/4218/medium/20230829_tuesday_quote.jpg'}, {'url': 'https://inspirational-quotes/6860-a-dream-doesnt-become-reality-through-magic-it', 'lines': "A dream doesn't become reality through magic; it takes sweat, determination and hard work.", 'theme': 'DETERMINATION', 'img': 'https://assets.passiton.com/quotes/quote_artwork/6860/medium/20230828_monday_quote.jpg'}, {'url': 'https://inspirational-quotes/7169-have-patience-with-all-things-but-first-of', 'lines': 'Have patience with all things, but, first of all with yourself.', 'theme': 'PATIENCE', 'img': 'https://assets.passiton.com/quotes/quote_artwork/7169/medium/20230825_friday_quote.jpg'}, {'url': 'https://inspirational-quotes/7725-patience-is-not-simply

In [14]:
import requests
from bs4 import BeautifulSoup  as bsoup

url = "https://www.baseball-reference.com/leagues/?__hstc=213859787.ef8803363fb75ee07ede07e784569[…]60.1&__hssc=213859787.1.1693359702060&__hsfp=1204492269"
response = requests.get(url)
results = bsoup(response.content, 'lxml')
print(results)

#table = results.find('div', attrs = {'id':'all_quotes'})


<!DOCTYPE html>
<html class="no-js" data-root="/home/br/build" data-version="klecko-" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport"/>
<link href="https://cdn.ssref.net/req/202308191" rel="dns-prefetch"/>
<!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
<script async="true" type="text/javascript">
    (function() {
	var host = window.location.hostname;
	var element = document.createElement('script');
	var firstScript = document.getElementsByTagName('script')[0];
	var url = 'https://cmp.quantcast.com'
	    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, 
		    '/choice.js?tag_version=V2');
	var uspTries = 0;
	var uspTriesLimit = 3;
	element.async = true;
	element.type = 'text/javascript';
	element.src = url;
	
	firstScript.parentNode.insertBefore(element, firstScript);
	
	function makeStub() {
	    var TCF_LOCATOR_NAME = '__tcfapiLoc

In [None]:
#Using a website to get the average temp by state
import requests as rq
from bs4 import BeautifulSoup as soup   

url = 'http://www.usa.com/rank/us--average-temperature--state-rank.htm'
data = rq.get(url).text

temps = soup(data, "html.parser")

table = temps.find_all('table')
table = temps.find('table', width='648')

rank = []
average_temp = []
state_pop = []

for row in table.find_all('tr'):
    columns = row.find_all('td')
    rank.append(columns[0].text.strip())
    average_temp.append(columns[1].text.strip())
    state_pop.append(columns[2].text.strip())

r = pd.DataFrame(rank)
a = pd.DataFrame(average_temp)
s = pd.DataFrame(state_pop)

df = pd.concat([r,a,s], axis = 1)
df.columns = df.iloc[0]
df=df.drop([0])
df = df.sort_values(by=['State / Population'])
print(df)
#df.to_csv('AveTempbyState.csv')

In [None]:
#Template for web scraping 
import requests as rq
from bs4 import BeautifulSoup as soup   

#copying from the internet
url = 'url copied from internet'
data = rq.get(url).text

#using soup to convert data
soupify_data = soup(data, "html.parser")

#creating table using soup lib (I'm not sure what temps means and if I should change it?)
table = temps.find_all('table')
table = temps.find('table', width='648')

#iniating list, if need another number than 3, fix the amount
variable1 = []
variable2 = []
variable3 = []

#Adding to the created list, made need modifying if more or less lists
for row in table.find_all('tr'):
    columns = row.find_all('td')
    variable1.append(columns[0].text.strip())
    variable2.append(columns[1].text.strip())
    variable3.append(columns[2].text.strip())

#Converting list into Dataframes, may need more or less
r = pd.DataFrame(variable1)
a = pd.DataFrame(variable2)
s = pd.DataFrame(variable3)

#Combining all Dataframes to make a complete dataset
df = pd.concat([r,a,s], axis = 1)
df.columns = df.iloc[0]
df=df.drop([0])
df = df.sort_values(by=['Assign by what you need'])
print(df)
#df.to_csv('AveTempbyState.csv')