In [None]:
import requests

## Simple API Call with Requests Library

Before starting, it is worth skimming the quickstart in the reference documentation for the [requests library](https://requests.readthedocs.io/en/latest/user/quickstart/).

First, let's have a look at the [GitHub API](https://developer.github.com/v3/).

In [None]:
# Fetch a public GitHub user profile. The 10-second timeout keeps the
# cell from hanging forever if the API does not answer.
github_user_url = 'https://api.github.com/users/nmattei'
r = requests.get(github_user_url, timeout=10)

# HTTP status code of the response is shown as the cell output.
r.status_code

In [None]:
# Look up the Content-Type header the server sent back.
r.headers['content-type']

In [None]:
# URL of the response.
r.url

In [None]:
# Raw response body as bytes.
r.content

In [None]:
# Decode the response body as JSON into Python objects.
r.json()

## Looking at HTTP Requests

We'll try to get some data from Google. Note that scraping search results this way is against Google's terms of service, so **we should not do it this way in general -- Google has very [specific rules on their site](https://developers.google.com/custom-search/v1/).**

In [None]:
# Query-string parameters go in a dict; requests URL-encodes the values
# and appends them to the URL for us.
params = dict(q='Tulane University')
r = requests.get('http://www.google.com/search', params=params, timeout=10)

# HTTP status code of the response is shown as the cell output.
r.status_code

In [None]:
# Inspect the URL of the response to see how the query parameters were encoded.
r.url

In [None]:
# Look up the Content-Type header the server sent back.
r.headers['content-type']

In [None]:
# Response body decoded to text.
r.text

## More Complicated with Parameters

We'll look up some information using the [Apple iTunes Search API](https://affiliate.itunes.apple.com/resources/documentation/itunes-store-web-service-search-api/).

In [None]:
# Search the iTunes catalogue for "the meters".
#
# Pass the term as plain text: requests URL-encodes parameter values itself
# (a space becomes '+'). Supplying an already-encoded "the+meters" would have
# the '+' encoded a second time as '%2B', so the API would receive a literal
# plus sign instead of a space.
params = {'term': "the meters"}
r = requests.get('https://itunes.apple.com/search', params=params, timeout=10)

# HTTP status code of the response is shown as the cell output.
r.status_code

In [None]:
# Inspect the URL of the response to see how the query parameters were encoded.
r.url

In [None]:
# Decode the response body as JSON into Python objects.
r.json()

In [None]:
# URL of the response again (r has not been reassigned since the request above).
r.url

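We can pass several parameters at once by adding more keys to the `params` dictionary, as described in the [requests quickstart](https://requests.readthedocs.io/en/latest/user/quickstart/#passing-parameters-in-urls).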

In [None]:
# Same search, restricted to albums via a second query parameter.
#
# Pass the term as plain text: requests URL-encodes parameter values itself
# (a space becomes '+'). Supplying an already-encoded "the+meters" would have
# the '+' encoded a second time as '%2B', so the API would receive a literal
# plus sign instead of a space.
params = {'term': "the meters", 'entity': 'album'}
r = requests.get('https://itunes.apple.com/search', params=params, timeout=10)

# HTTP status code of the response is shown as the cell output.
r.status_code


In [None]:
# Inspect the URL of the response to see how both parameters were encoded.
r.url

In [None]:
# Keep the decoded JSON in a variable so later cells can index into it.
x = r.json()

In [None]:
# Drill into the parsed JSON: the 'wrapperType' field of the first entry in 'results'.
x['results'][0]['wrapperType']

## Converting the returned JSON to an object!

In [None]:
import json

In [None]:
# Parse the raw response bytes with the standard-library json module
# instead of the response's own .json() helper.
data = json.loads(r.content)

In [None]:
# Top-level keys of the parsed JSON object.
data.keys()

In [None]:
# Second entry (index 1) of the 'results' list.
data['results'][1]

## Using Beautiful Soup to Parse a Webpage.

The [beautifulsoup4 documentation](https://www.crummy.com/software/BeautifulSoup/).

In [None]:
# Grab the course webpage.
import requests
from bs4 import BeautifulSoup

# Use the same 10-second timeout as the earlier requests so a stalled
# connection cannot hang the notebook.
r = requests.get('https://tulaneintrodatascience.github.io/', timeout=10)

# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so the resulting tree could
# differ from machine to machine. 'html.parser' ships with Python.
root = BeautifulSoup(r.content, 'html.parser')

In [None]:
# Raw response body as bytes (the unparsed page).
r.content

In [None]:
# First <table> element in the parsed document (None if there is no table).
root.find("table")

In [None]:
# All <a> elements inside the first <table>.
# find_all is the current name of this method; findAll is a deprecated alias.
root.find("table").find_all("a")

## Trying out some Regular Expressions.

In [None]:
import re
# Find the index in the raw HTML where we first mention "CMPS 3660".

# Note the r'' prefix on the pattern: it makes the literal a raw string, so
# backslashes reach the regex engine unchanged. (It has nothing to do with the
# response variable that is also called r.)

match = re.search(r'CMPS 3660', r.text)

# re.search returns None when the pattern is absent; guard so the cell prints
# None instead of raising AttributeError if the page text ever changes.
print(match.start() if match else None)

In [None]:
# Peek at a fixed 40-character window of the raw HTML.
# NOTE(review): the 410/450 offsets are hard-coded -- presumably chosen from the
# index printed by the previous cell; confirm they still line up with the live page.
r.text[410:450]

In [None]:
# Does the page text *begin* with the pattern?
# Unlike search, match only succeeds at the very start of the string,
# so this prints None whenever the text starts with anything else.
match = re.compile(r'CMPS 3660').match(r.text)
print(match)

In [None]:
# Iterate over all occurrences and print a few characters of context
# (up to 30 before the match start and 20 from the match start onward).
for match in re.finditer(r'CMPS 3660', r.text):
    # Clamp the left edge at 0: for a match within the first 30 characters,
    # start-30 would be negative and Python would count from the END of the
    # string, producing an empty or unrelated slice.
    context_start = max(0, match.start() - 30)
    print(r.text[context_start:match.start() + 20])


In [None]:
# Find them all: collect every non-overlapping occurrence of the pattern
# as a list of the matched strings.
match = re.compile(r'CMPS 3660').findall(r.text)
print(match)

In [None]:
# More complicated RegExes - Groups
# The pattern has three capture groups: "university" and "of" (each with an
# upper- or lower-case first letter) and a following word of 3+ characters.
regex = r'\s*([Uu]niversity)\s([Oo]f)\s(\w{3,})'
text = ''' The university of kentucky is the best
            basketball team and an ok university.
            The University Of Kentucky can be put in 
            some weird capitalization'''

# search stops at the first place the pattern matches; groups() returns
# the captured pieces of that match as a tuple.
m = re.compile(regex).search(text)
print(m.groups())

In [None]:
# Find all: with capture groups in the pattern, findall returns one tuple
# of captured pieces per match.
print(re.compile(regex).findall(text))

In [None]:
# Named Groups.
# (?P<school>...) gives the third group a name, so it can be read from the
# match by name instead of by position.
regex = r'\s*([Uu]niversity)\s([Oo]f)\s(?P<school>\w{3,})'
text = ''' The university of kentucky is the best
            basketball team and an ok university.
            The University Of Kentucky can be put in 
            some weird capitalization'''

# groupdict() returns only the named groups, keyed by name.
m = re.compile(regex).search(text)
print(m.groupdict())


In [None]:
# Find all named groups: walk every match in the text and show the
# named captures of each one.
regex = r'\s*([Uu]niversity)\s([Oo]f)\s(?P<school>\w{3,})'
text = ''' The university of kentucky is the best
            basketball team and an ok university.
            The University Of Kentucky can be put in 
            some weird capitalization.  And Kentucky is much better than
            the University of Mississippi.'''

# finditer yields one match object per non-overlapping match, in order.
for m in re.compile(regex).finditer(text):
    print(m.groupdict())


In [None]:
# Plain string replacement (no regex): every 'a' becomes 'X'.
'abcabcabc'.replace('a', 'X')

In [None]:
# Regex-based substitution: replace each occurrence of the pattern in text
# with the replacement string; the result is shown as the cell output.
text = 'I love Introduction to Data Science'
re.compile(r'Data Science').sub(r'Schmada Schmience', text)

In [None]:
# Backreferences in the replacement: \1 is the word captured before
# "Science"/"science" and \2 is that science word itself, so the output puts
# group 2 first, then group 1 with "hmience" appended.
re.sub(r'(\w+)\s([Ss]cience)', r'\2 \1hmience', text) 


## Downloading All the PDFs from the course website.

Using beautiful soup and some regular expressions.

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import urljoin
import os

In [None]:
# HTTP GET request sent to the URL url.
# The timeout matches the earlier requests in this notebook so a stalled
# connection cannot hang the cell.
url = "https://tulaneintrodatascience.github.io/"
r = requests.get(url, timeout=10)

# Use BeautifulSoup to parse the GET response. The parser is named explicitly
# so the tree does not depend on which optional parsers are installed.
root = BeautifulSoup(r.content, 'html.parser')

# Collect every anchor in the first table. Whether a <tbody> element exists
# depends on the page markup and the parser, so fall back to the table itself
# when there is none. find_all is the current name for the deprecated findAll.
table = root.find("table")
lnks = (table.find("tbody") or table).find_all("a")
lnks

In [None]:
# Cycle through the href for each anchor, checking
# to see if it's a PDF/PPTX link or not
pdfs = []
for lnk in lnks:
    # Anchors are not guaranteed to carry an href (e.g. <a name="...">);
    # .get avoids a KeyError and the empty default simply fails the test below.
    href = lnk.get('href', '')

    # If it's a PDF/PPTX link, queue a download
    if href.lower().endswith(('.pdf', '.pptx')):
        pdfs.append(href)
print(pdfs)

In [None]:
# Download all the files to whatever directory you're running the notebook from.

# Be careful with href! It may be a relative path, so resolve it against the
# page URL before requesting it.

for href in pdfs:
    urld = urljoin(url, href)
    print(urld)

    # The with-block releases the connection when we are done with it, and the
    # timeout keeps a stalled download from hanging the notebook.
    with requests.get(urld, stream=True, timeout=10) as rd:
        # Do not save an HTTP error page under a .pdf/.pptx name.
        if not rd.ok:
            print("Skipping (HTTP status", rd.status_code, "):", urld)
            continue

        # Write the downloaded file to disk.
        # The href is a path, so keep only its last component as the filename;
        # this works regardless of how many directories the path contains.
        outfile = os.path.join("./", os.path.basename(urlparse(urld).path))
        print("Writing: ",outfile)
        with open(outfile, 'wb') as f:
            # Stream the body in chunks rather than loading it all into memory.
            for chunk in rd.iter_content(chunk_size=8192):
                f.write(chunk)
