In [2]:
import pandas as pd  # pandas supplies the DataFrame type used below

# Column name -> list of values; each key becomes one DataFrame column.
dict_ = dict(a=[11, 21, 31], b=[12, 22, 32])

# Build a DataFrame instance from the dictionary via the DataFrame constructor.
df = pd.DataFrame(dict_)

# head() is a DataFrame method: the call runs pandas code and returns the first rows.
df.head()

Unnamed: 0,a,b
0,11,12
1,21,22
2,31,32


In [3]:
df.mean()
# mean() is another DataFrame method: called with no arguments here, pandas computes and returns one mean per column

a    21.0
b    22.0
dtype: float64

# REST APIs
## Stands for REpresentational State Transfer API
## Communication through the internet, giving you access to more resources (storage, data, AI algorithms)
### e.g. HTTP methods (data usually exchanged as JSON)

In [12]:
#Example: pycoingecko

!pip install pycoingecko
from pycoingecko import CoinGeckoAPI
cg=CoinGeckoAPI()
bitcoin_data=cg.get_coin_market_chart_by_id(id='bitcoin',vs_currency='usd',days='30')



In [13]:
# The 'prices' entry is a nested list of [timestamp, price] pairs;
# load it into a two-column DataFrame so it is easier to work with.
data = pd.DataFrame(
    data=bitcoin_data['prices'],
    columns=['TimeStamp', 'Price'],
)

In [14]:
# Bare last expression: Jupyter renders the DataFrame (the display is truncated to the first and last rows)
data

Unnamed: 0,TimeStamp,Price
0,1745694516883,94222.647904
1,1745698189954,94375.500198
2,1745701747622,94211.330261
3,1745705092473,94379.965077
4,1745708530001,94749.595696
...,...,...
716,1748275428086,110094.414744
717,1748279019775,109470.221518
718,1748282629693,109188.595765
719,1748286158736,109254.393630


In [18]:
# 'TimeStamp' is in milliseconds since the epoch (hence unit='ms');
# to_datetime converts it to readable datetimes, stored in a new 'Date' column
data['Date']=pd.to_datetime(data['TimeStamp'],unit='ms')

In [20]:
# Verify: the frame should now show the new 'Date' column next to 'TimeStamp' and 'Price'
data

Unnamed: 0,TimeStamp,Price,Date
0,1745694516883,94222.647904,2025-04-26 19:08:36.883
1,1745698189954,94375.500198,2025-04-26 20:09:49.954
2,1745701747622,94211.330261,2025-04-26 21:09:07.622
3,1745705092473,94379.965077,2025-04-26 22:04:52.473
4,1745708530001,94749.595696,2025-04-26 23:02:10.001
...,...,...,...
716,1748275428086,110094.414744,2025-05-26 16:03:48.086
717,1748279019775,109470.221518,2025-05-26 17:03:39.775
718,1748282629693,109188.595765,2025-05-26 18:03:49.693
719,1748286158736,109254.393630,2025-05-26 19:02:38.736


# URL = Uniform Resource Locator

## Elements of URL:
* Scheme / protocol (e.g. http://)
* Address (e.g. ibm.com)
* Route (e.g. /images/logo.png)

## Status Codes
* 100s informational (request received, processing continues)
* 200s okay / success
* 300s redirection, multiple choices
* 400s client errors: 401 unauthorized, 403 forbidden, 404 not found
* 500s server errors, e.g. 501 not implemented

In [23]:
# Import the requests library, which sends HTTP requests from Python
import requests

url = 'http://www.ibm.com'  # short name for the URL being requested
# Send a GET request to the URL. The timeout matters: without one, requests
# would wait indefinitely if the server never responds.
r = requests.get(url, timeout=10)
r.status_code  # check the status code (200 is okay)

200

In [27]:
r.request.body #(None) -- body of the request that was sent; not displayed, because only a cell's last expression is shown
header=r.headers # response headers, kept under a shorter name
header # last expression in the cell, so this is what gets displayed

{'Content-Security-Policy': 'upgrade-insecure-requests', 'x-frame-options': 'SAMEORIGIN', 'Last-Modified': 'Mon, 26 May 2025 19:22:08 GMT', 'ETag': '"2d783-6360edd297518-gzip"', 'Accept-Ranges': 'bytes', 'Content-Type': 'text/html;charset=utf-8', 'X-Content-Type-Options': 'nosniff', 'Cache-Control': 'max-age=76', 'Expires': 'Mon, 26 May 2025 19:36:44 GMT', 'X-Akamai-Transformed': '0 - 0 -', 'Content-Encoding': 'gzip', 'Date': 'Mon, 26 May 2025 19:35:28 GMT', 'Content-Length': '32232', 'Connection': 'keep-alive', 'Vary': 'Accept-Encoding', 'Strict-Transport-Security': 'max-age=31536000'}

In [28]:
# Look up the date the response was sent; the lowercase key 'date' still matches the header displayed as 'Date'
header['date']

'Mon, 26 May 2025 19:35:28 GMT'

In [30]:
# Content-Type tells us what kind of data the body holds (here HTML, with its character set)
header['Content-Type']

'text/html;charset=utf-8'

In [35]:
r.encoding # not displayed: only the cell's last expression is shown
r.text[0:500] # review the first 500 characters of the response body

'\n<!DOCTYPE HTML>\n<html lang="en">\n<head>\r\n    \r\n    \r\n    \r\n    \r\n    \r\n    \r\n    \r\n      \r\n    \r\n    \r\n    \r\n    \r\n    <meta charset="UTF-8"/>\r\n    <meta name="languageCode" content="en"/>\r\n    <meta name="countryCode" content="us"/>\r\n    <meta name="searchTitle" content="IBM - United States"/>\r\n    <meta name="focusArea" content="Cross IBM SDRs"/>\r\n    <title>IBM - United States</title>\r\n      <script defer="defer" type="text/javascript" src="https://rum.hlx.page/.rum/@adobe/helix-rum-js@%5E2/'

In [46]:
# Create a query string
# AKA: playing Go Fish with a website. The code below says: "Hey, httpbin.org, got any Joseph with ID 123?"

url_get = 'http://httpbin.org/get'  # base URL with /get at the end
# Dictionary of query-string parameters: keys are parameter names, values are their values
payload = {'name': 'Joseph', 'ID': '123'}
# requests.get(url_get, ...) sends an HTTP GET request to the web server;
# params=payload has requests encode the dictionary into the URL's query string;
# timeout=10 stops the call from waiting forever if the server never answers.
r = requests.get(url_get, params=payload, timeout=10)
r.url  # the full URL, including the query string (not displayed: not the last expression)
r.request.body  # None in this case: a GET carries its parameters in the URL, not in a body
r.status_code  # 200 in this case... all good

200

In [49]:
# Response body as one raw string (JSON text in this case)
r.text

'{\n  "args": {\n    "ID": "123", \n    "name": "Joseph"\n  }, \n  "headers": {\n    "Accept": "*/*", \n    "Accept-Encoding": "gzip, deflate, br, zstd", \n    "Host": "httpbin.org", \n    "User-Agent": "python-requests/2.32.3", \n    "X-Amzn-Trace-Id": "Root=1-6834c85c-493a67644569b90c6e7ce771"\n  }, \n  "origin": "162.245.89.101", \n  "url": "http://httpbin.org/get?name=Joseph&ID=123"\n}\n'

In [50]:
# Check the Content-Type header to see what format the body is in
r.headers['Content-Type']

'application/json'

In [51]:
# Since the body is JSON, json() parses it into a Python dictionary
r.json()

{'args': {'ID': '123', 'name': 'Joseph'},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate, br, zstd',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.32.3',
  'X-Amzn-Trace-Id': 'Root=1-6834c85c-493a67644569b90c6e7ce771'},
 'origin': '162.245.89.101',
 'url': 'http://httpbin.org/get?name=Joseph&ID=123'}

In [53]:
r.json()['args'] # the 'args' key holds the names and values that were sent in the query string

{'ID': '123', 'name': 'Joseph'}

In [55]:
# POST request: send data to a server in the request body
url_post = 'http://httpbin.org/post'
payload = {'name': 'Joseph', 'ID': '123'}
# data=payload sends the dictionary as the request body rather than in the URL;
# timeout=10 keeps the call from hanging if the server stalls.
r_post = requests.post(url_post, data=payload, timeout=10)
# The POST URL has no query string, unlike the URL of the earlier GET request
print('POST request URL:', r_post.url)
print('GET request URL:', r.url)

POST request URL: http://httpbin.org/post
GET request URL: http://httpbin.org/get?name=Joseph&ID=123


In [58]:
#Compare POST and GET: the POST carries the payload in its request body, while the GET request has no body
print('POST request body:',r_post.request.body)
print('GET request body:',r.request.body)

POST request body: name=Joseph&ID=123
GET request body: None


In [59]:
#The 'form' key of the JSON reply holds the payload that was posted
r_post.json()['form']

{'ID': '123', 'name': 'Joseph'}

# Importing Beautiful Soup to scrape web pages

In [60]:
from bs4 import BeautifulSoup

In [62]:
import requests
from bs4 import BeautifulSoup

# Page to scrape (give the URL a name)
url = 'https://en.wikipedia.org/wiki/IBM'

# Send an HTTP GET request; the timeout keeps the cell from hanging on a stalled connection
response = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx reply instead of parsing an error page as if it were the article
response.raise_for_status()

# Store the HTML content in a variable
html_content = response.text

# Create a BeautifulSoup object to parse the HTML
soup = BeautifulSoup(html_content, 'html.parser')

# Display a snippet of the HTML content
print(html_content[0:500])

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vect


In [65]:
# Pull out common HTML content: collect every <a> (anchor) tag in the page
links = soup.find_all('a')

# Walk the anchors and print the text inside each one, one per line
for link in links:
    print(link.get_text())


Jump to content
Main page
Contents
Current events
Random article
About Wikipedia
Contact us
Help
Learn to edit
Community portal
Recent changes
Upload file
Special pages








Search

Donate
Create account
Log in
Donate
 Create account
 Log in
learn more
Contributions
Talk

(Top)



1
History




1.1
1910s–1950s




1.2
1960s–1980s




1.3
1990s–2000s




1.4
2010s–present




2
Corporate affairs




2.1
Business trends




2.2
Board and shareholders




2.3
Headquarters and offices




3
Products




3.1
Hardware




3.1.1
Mainframe computers




3.1.2
Microprocessors




3.1.3
Quantum computing




3.2
Software




3.3
Cloud services




3.4
Artificial intelligence




3.5
Consulting




4
Research




4.1
Patents




5
Brand and reputation




5.1
Environmental




5.2
Tax avoidance




6
People and culture




6.1
Employees




6.1.1
Notable current and former employees




6.2
Workplace culture




7
Leadership




7.1
President




7.2
Chairman of the Board




8
See also




9


In [66]:
# A couple of lines are enough to get the page's raw text... super useful!

all_text = soup.get_text()
# Banner first, then the extracted text on the following line
print("--- All Text (raw) ---", all_text, sep="\n")

--- All Text (raw) ---




IBM - Wikipedia



































Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages



















Search











Search






















Appearance
















Donate

Create account

Log in








Personal tools





Donate Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
History




Toggle History subsection





1.1
1910s–1950s








1.2
1960s–1980s








1.3
1990s–2000s








1.4
2010s–present










2
Corporate affairs




Toggle Corporate affairs subsection





2.1
Business trends








2.2
Board and shareholders








2.3
Headquarters and offices










3
Products




Toggle Products sub

# You can also use BeautifulSoup for parsing
# pandas has a read_html function for table extraction