## Reading files
### basics of reading

In [16]:
import re

In [21]:
from collections import Counter

In [24]:
import csv

In [3]:
# r -  read-only, default is 'r'
file_for_reading = open('../random-text.txt', 'r')
file_for_reading2 = open('../random-text.txt')

In [5]:
# w - writing, file will be destroyed if it exists already
file_for_writing = open('../random-write-text.txt','w') # there is no such file, so will be created

In [6]:
# a - append, adding to the end of the file
file_for_appending = open('../random-append-text.txt', 'a') # file will be created but will not be converted to blank.

In [11]:
# closing the file, always close the opened file
file_for_reading.close()
file_for_reading2.close()
file_for_writing.close()
file_for_appending.close()

> Because opening and closing are two tasks, many times one forgets whether all files are closed or not. You’ll see the codes written in this manner.

In [14]:
def process(file):
    # Function does nothing except print a message
    print("Processing complete. Did nothing")

In [15]:
filename = '../random-text.txt'
with open(filename) as f:
    text = process(f)

Processing complete. Did nothing


In [17]:
# read a whole text file

starts_with_hash = 0
with open('../random-text.txt') as f:
    for line in f:                       # look at each line in the file
        if re.match("^#", line):         # use a regex to see if it starts with #
            starts_with_hash += 1        # if it does, add 1 to the count

------

Public Suffix list - https://www.publicsuffix.org

-----
**Update**: https://pypi.org/project/publicsuffixlist/

In [19]:
def get_domain(email_address: str) -> str:
    """Split on '@' and return the last piece"""
    return email_address.lower().split("@")[-1]

In [20]:
# tests
assert get_domain('nikhilsingh@gmail.com') == 'gmail.com'
assert get_domain('nikhils@a.machinelearning.com') == 'a.machinelearning.com'

In [23]:
def get_email_addrs(email_addr_file_path):
    with open(email_addr_file_path) as f:
        domain_counts = Counter(get_domain(line.strip())
                               for line in f)


## Delimited files

#### word of advice:
> Never parse a csv file yourself, you'll screw up the edge cases.

In [55]:
import csv

filename = '../dummy_stocks_data.txt'

# Manually specify the field names if the automatic detection is failing
fieldnames = ['Company Name', 'Day-1', 'Day-2', 'Day-3', 'Day-4', 'Day-5']



In [59]:
with open(filename, 'r') as f:
    tab_reader = csv.DictReader(f, delimiter='\t', fieldnames=fieldnames)
    next(tab_reader) # to avoid header
    for dict_row in tab_reader:

        ### uncomment for parsing
        
        # institution = dict_row["Company Name"]
        # day1_price = dict_row["Day-1"]
        # day2_price = dict_row["Day-2"]
        # day3_price = dict_row["Day-3"]
        # day4_price = dict_row["Day-4"]
        # day5_price = dict_row["Day-5"]

        
        print(dict_row)

{'Company Name': 'SUDITI INDUSTRIES  120.946 122.13 123.32 124.11 125.95', 'Day-1': None, 'Day-2': None, 'Day-3': None, 'Day-4': None, 'Day-5': None}
{'Company Name': 'HB PORTFOLIO       113.98  114.55 115.20 116.45 117.30', 'Day-1': None, 'Day-2': None, 'Day-3': None, 'Day-4': None, 'Day-5': None}
{'Company Name': 'CHETANA EDUCATION  100.90  101.35 101.85 102.50 103.15', 'Day-1': None, 'Day-2': None, 'Day-3': None, 'Day-4': None, 'Day-5': None}
{'Company Name': 'ENERGY MISSION     429.50  430.25 431.75 433.00 434.55', 'Day-1': None, 'Day-2': None, 'Day-3': None, 'Day-4': None, 'Day-5': None}
{'Company Name': 'INVESTMENT & PREC  1029.45 1031.00 1032.50 1034.00 1035.50', 'Day-1': None, 'Day-2': None, 'Day-3': None, 'Day-4': None, 'Day-5': None}


## Scraping the web
### HTML and the Parsing

> https://www.crummy.com/software/BeautifulSoup/bs4/doc/#    
> Version 4.12.0

In [65]:
from bs4 import BeautifulSoup
import requests

In [67]:
url = ("https://raw.githubusercontent.com/"
      "joelgrus/data/master/getting-data.html")

html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')

In [70]:
first_para = soup.find('p')
first_para

<p id="p1">This is the first paragraph.</p>

In [71]:
first_para_text = soup.p.text
first_para_words = soup.p.text.split()

first_para_text, first_para_words

('This is the first paragraph.', ['This', 'is', 'the', 'first', 'paragraph.'])

In [73]:
first_para_id = soup.p['id']       # KeyError if no id
first_para_id2 = soup.p.get('id')  # returns None of no 'id' # safer option

first_para_id, first_para_id2

('p1', 'p1')

In [77]:
# multiple tags

all_paras = soup.find_all('p')
paras_with_ids = [p for p in soup('p') if p.get('id')]

print(all_paras)
print('\n---')
print(paras_with_ids)

[<p id="p1">This is the first paragraph.</p>, <p class="important">This is the second paragraph.</p>]

---
[<p id="p1">This is the first paragraph.</p>]


In [78]:
important_paras = soup('p', {'class': 'important'})
important_paras2 = soup('p', 'important')
important_paras3 = [p for p in soup('p') 
                    if 'important' in p.get('class', [])]

important_paras, important_paras2, important_paras3

([<p class="important">This is the second paragraph.</p>],
 [<p class="important">This is the second paragraph.</p>],
 [<p class="important">This is the second paragraph.</p>])

In [80]:
# a more elaborative logic example

spans_inside_divs = [span
                    for div in soup('div')
                    for span in div('span')]

spans_inside_divs

[<span id="name">Joel</span>,
 <span id="twitter">@joelgrus</span>,
 <span id="email">joelgrus-at-gmail</span>]

--------------
## Example: Keeping tabs on congress (as per the book)

https://www.house.gov/representatives

In [81]:
from bs4 import BeautifulSoup
import requests

In [83]:
url = "https://www.house.gov/representatives"
text = requests.get(url).text
soup = BeautifulSoup(text, "html.parser")

In [85]:
all_urls = [a['href']
            for a in soup('a')
            if a.has_attr('href')]

print(len(all_urls))

967


In [89]:
import re

regex = r"^https?://.*\.house\.gov/?$"

In [90]:
assert re.match(regex, "http://nikhil.house.gov")
assert re.match(regex, "https://nikhil.house.gov")
assert re.match(regex, "http://nikhil.house.gov/")
assert re.match(regex, "https://nikhil.house.gov/")

assert not re.match(regex, "joel.house.gov")
assert not re.match(regex, "http://joel.house.com")
assert not re.match(regex, "https://joel.house.com/biography")

In [91]:
good_urls = [url for url in all_urls if re.match(regex, url)]

print(len(good_urls))

872


In [92]:
good_urls = list(set(good_urls))

print(len(good_urls))

436


In [94]:
html = requests.get('https://jayapal.house.gov').text
soup = BeautifulSoup(html, 'html.parser')

In [95]:
# the usage of set is advised because the links might appear multiple times

links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}

print(links)

{'https://jayapal.house.gov/category/news/', 'https://jayapal.house.gov/category/press-releases/'}


In [96]:
from typing import Dict, Set

press_releases: Dict[str, Set[str]] = {}

for house_url in good_urls:
    html = requests.get(house_url).text
    soup = BeautifulSoup(html, 'html.parser')
    pr_links = {a['href'] for a in soup('a') if 'press releases'
                in a .text.lower()}

    print(f"{house_url}: {pr_links}")
    press_releases[house_url] = pr_links

https://gaetz.house.gov: {'/media/press-releases'}
https://adriansmith.house.gov/: {'/media/press-releases'}
https://cammack.house.gov: {'/media/press-releases'}
https://meuser.house.gov: {'/media/press-releases'}
https://meng.house.gov: {'/media-center/press-releases'}
https://smucker.house.gov/: {'/media/press-releases'}
https://guest.house.gov: {'/media/press-releases'}
https://bice.house.gov: {'/media/press-releases'}
https://kim.house.gov/: {'/media/press-releases'}
https://owens.house.gov: set()
https://fischbach.house.gov: {'/press-releases'}
https://frankel.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://loudermilk.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://gwenmoore.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://ruppersberger.house.gov: {'/news-room/press-releases'}
https://feenstra.house.gov: {'/media/press-releases', '/node/1119'}
https://grijalva.house.gov/: set()
https://foster.house.gov: {'/media/press-releases'}

-------
_Writing a general function that checks whether a page of press releases mentions any given term_

In [99]:
def paragraph_mentions(text: str, keyword: str) -> bool:
    """
    Returns True if a <p> inside the text mentions {keyword}
    """

    soup = BeautifulSoup(text, 'html.parser')
    paragraphs = [p.get_text() for p in soup('p')]

    return any(keyword.lower() in paragraph.lower()
              for paragraph in paragraphs)

In [100]:
text = """<body><h1>Facebook</h1><p>Twitter</p>"""
assert paragraph_mentions(text, "twitter")              # inside a <p>
assert not paragraph_mentions(text, "facebook")         # not inside a <p>

In [101]:
for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        url = f"{house_url}/{pr_link}"
        text = requests.get(url).text

        if paragraph_mentions(text, 'data'):
            print(f"{house_url}")
            break # done with this house_url

https://foster.house.gov
https://fallon.house.gov
https://curtis.house.gov/
https://mace.house.gov
https://cartwright.house.gov
https://mikejohnson.house.gov
https://edwards.house.gov
https://danbishop.house.gov
https://luetkemeyer.house.gov/
https://garretgraves.house.gov/
https://bucshon.house.gov/
https://phillips.house.gov/
https://collins.house.gov
https://obernolte.house.gov
https://sherman.house.gov


## Using APIs
### JSON and XML

In [104]:
import json

serialized = """{
"title": "Some title",
"author": "some author",
"publicationYear": 2024,
"topics": ["data", "science", "data science"]
}"""

In [105]:
deserialized = json.loads(serialized)

assert deserialized["publicationYear"]==2024

In [106]:
assert "data science" in deserialized["topics"]

### Using an unauthenticated API

In [107]:
import requests, json
github_user = "nikhilsingh13"
endpoint = f"https://api.github.com/users/{github_user}/repos"

repos = json.loads(requests.get(endpoint).text)

> the date parser suggested in the book is `python-dateutil`, I prefer `pendulum` so continuing with that

In [110]:
import pendulum

from collections import Counter

In [111]:
dates = [pendulum.parse(repo["created_at"]) for repo in repos]

In [113]:
month_counts = Counter(date.month for date in dates)
weekday_counts = Counter(date.weekday() for date in dates)

In [114]:
last_5_repos = sorted(repos,
                      key = lambda r: r["pushed_at"],
                      reverse=True)[:5]

In [116]:
last_5_langs = [repo["language"] for repo in last_5_repos]

In [118]:
last_5_langs

['Jupyter Notebook',
 'HTML',
 'Jupyter Notebook',
 'Jupyter Notebook',
 'JavaScript']

## Example: Using the twitter API

In [121]:
import tweepy
import json

In [123]:
with open('../credentials.json') as file:
    credentials = json.load(file)

In [125]:
client = tweepy.Client(bearer_token=credentials['Twitter keys'][2]['Bearer'])

In [128]:
query = "#AI"

In [129]:
try:
    tweets = client.search_recent_tweets(query=query, max_results=5)
    # Display tweets
    for tweet in tweets.data:
        print(tweet.text)
except tweepy.TweepyException as e:
    print(f"An error occurred: {e}")

An error occurred: 403 Forbidden
When authenticating requests to the Twitter API v2 endpoints, you must use keys and tokens from a Twitter developer App that is attached to a Project. You can create a project via the developer portal.
