In [None]:
import re
import xlwings as xw
import pandas as pd

from urllib.request import urlopen

from pathlib import Path
from pyquery import PyQuery
from datetime import datetime

In [None]:
# EIA open-data query-browser pages for the gasoline price series; the
# two URLs differ only in the final letter of the series id
# ('A' for the annual page, 'M' for the monthly page).
annual_url, monthly_url = (
    'https://www.eia.gov/opendata/qb.php?'
    'sdid=PET.EMM_EPMR_PTE_NUS_DPG.' + frequency
    for frequency in ('A', 'M')
)

In [3]:
# Fetch both pages up front.  The explicit timeout (in seconds) keeps a
# stalled connection from hanging the notebook indefinitely; without it
# urlopen falls back to the global default socket timeout.
ann_res = urlopen(annual_url, timeout=30)
mon_res = urlopen(monthly_url, timeout=30)

In [4]:
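# # Dead code, kept for reference only: an earlier attempt at fetching
# # the pages with the `requests` library, which the author found
# # unsatisfactory here; urllib.request.urlopen is used instead.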
# heads = {
#     'User-Agent': ('Mozilla/5.0 (Windows NT 10.0;'
#                    ' Win64; x64; rv:74.0) Gecko/20'
#                    '100101 Firefox/74.0'),
# }

# # Get the pages.
# # NOTE: this code reportedly only works with verify=False, yet the
# # calls below pass verify=True.
# with requests.Session() as sess:
#     retry = Retry(connect=3, backoff_factor=0.5)
#     adapter = HTTPAdapter(max_retries=retry)
#     sess.mount('https://', adapter)
#     ann_res = sess.get(annual_url, verify=True, headers=heads)
#     mon_res = sess.get(monthly_url, verify=True, headers=heads)

# if ann_res.status_code != 200:
#     raise Exception('Doomed request, annual...')

# if mon_res.status_code != 200:
#     raise Exception('Doomed, monthly...')

In [5]:
# Parse each response body into a PyQuery document.  The `with` block
# closes both HTTP responses once their bodies have been read, so the
# underlying connections are not left open for the rest of the session.
with ann_res, mon_res:
    ann = PyQuery(ann_res.read())
    mon = PyQuery(mon_res.read())

In [6]:
# Text of every <th> element on the annual page.
ann_cols = tuple(map(lambda header: header.text, ann('th')))

In [7]:
# Display the column headers scraped from the annual page.
ann_cols

('Series Name', 'Period', 'Frequency', 'Value', 'Units')

In [8]:
# Text of every <td> element on the annual page, as one flat sequence
# (cells are not grouped into rows at this point).
ann_rows = tuple(map(lambda cell: cell.text, ann('td')))

In [9]:
# Text of every <th> element on the monthly page.
mon_cols = tuple(map(lambda header: header.text, mon('th')))

In [10]:
# Display the column headers scraped from the monthly page.
mon_cols

('Series Name', 'Period', 'Frequency', 'Value', 'Units')

In [11]:
# Text of every <td> element on the monthly page, as one flat sequence
# (cells are not grouped into rows at this point).
mon_rows = tuple(map(lambda cell: cell.text, mon('td')))

### Compiled Regex

In [12]:
# Annual period token: four consecutive digits.  It is applied with
# re.match further down, so it is anchored at the start of the cell text
# only (anything after the fourth digit is not inspected).
annual_period = re.compile(r'\d\d\d\d')

In [13]:
# Monthly period token: six consecutive digits (later parsed with the
# '%Y%m' format).  It is applied with re.match further down, so it is
# anchored at the start of the cell text only.
monthly_period = re.compile(r'\d\d\d\d\d\d')

In [14]:
# Price value: one or more integer digits, a literal dot, then at least
# one fractional digit (e.g. '1.258' or '10.049').  Allowing several
# integer digits keeps a price of ten dollars or more from being dropped
# when the pattern is applied with re.match.  Purely numeric cells such
# as the period tokens contain no dot and therefore never match.
deci_vals = re.compile(r'\d+[.]\d+')

In [15]:
# Start each table as a {column name: None} mapping; the cells further
# down replace every None with a tuple of that column's values.
ann_table = {column: None for column in ann_cols}
mon_table = {column: None for column in mon_cols}

#### Annual Table

In [16]:
# Exact 'Series Name' cell text used to pick that column out of ann_rows.
ann_dud = (
    'U.S. Regular All Formulations Retail '
    'Gasoline Prices, Annual'
)

In [17]:
# 'Series Name' column: every scraped cell that equals the series title.
ann_table['Series Name'] = tuple(
    filter(lambda cell: cell == ann_dud, ann_rows)
)

In [18]:
# 'Period' column: every scraped cell that starts with four digits.
ann_table['Period'] = tuple(
    cell for cell in ann_rows if annual_period.match(cell)
)

In [19]:
# 'Frequency' column: every scraped cell that is exactly 'A'.
ann_table['Frequency'] = tuple(
    filter(lambda cell: cell == 'A', ann_rows)
)

In [20]:
# 'Value' column: decimal prices, single-digit whole-dollar prices, and
# the literal 'null' that the page shows in place of a price.  The
# single-digit alternative mirrors the monthly 'Value' filter; without it
# a whole-dollar price would be skipped here and the dimension check on
# ann_table would fail.  No other annual cell is a lone digit, so the
# extra alternative cannot pull in cells from another column.
ann_table['Value'] = tuple(
    i for i in ann_rows
    if re.match(deci_vals, i)
    or re.match(r'^\d$', i)  # matches single number only
    or i == 'null'
)

In [21]:
# 'Units' column: every scraped cell that is exactly the unit label.
ann_table['Units'] = tuple(
    filter(lambda cell: cell == 'Dollars per Gallon', ann_rows)
)

In [22]:
# Every column must have collected the same number of cells; a mismatch
# means one of the per-column filters picked up too many or too few.
if len({len(column) for column in ann_table.values()}) != 1:
    raise Exception(
        'Annual-table dimensions are off: '
        + str(tuple(map(len, ann_table.values())))
    )

#### Monthly Table

In [23]:
# Peek at the last ten scraped cells of the monthly page.
mon_rows[-10:]

('U.S. Regular All Formulations Retail Gasoline Prices, Monthly',
 '199009',
 'M',
 '1.258',
 'Dollars per Gallon',
 'U.S. Regular All Formulations Retail Gasoline Prices, Monthly',
 '199008',
 'M',
 'null',
 'Dollars per Gallon')

In [24]:
# Exact 'Series Name' cell text used to pick that column out of mon_rows.
mon_dud = (
    'U.S. Regular All Formulations Retail '
    'Gasoline Prices, Monthly'
)

In [25]:
# 'Series Name' column: every scraped cell that equals the series title.
mon_table['Series Name'] = tuple(
    filter(lambda cell: cell == mon_dud, mon_rows)
)

In [26]:
# 'Period' column: every scraped cell that starts with six digits.
mon_table['Period'] = tuple(
    cell for cell in mon_rows if monthly_period.match(cell)
)

In [27]:
# 'Frequency' column: every scraped cell that is exactly 'M'.
mon_table['Frequency'] = tuple(
    filter(lambda cell: cell == 'M', mon_rows)
)

In [28]:
# 'Value' column: the literal 'null', decimal prices, and lone
# single-digit values.
mon_table['Value'] = tuple(
    cell for cell in mon_rows
    if cell == 'null'
    or deci_vals.match(cell)
    or re.match(r'^\d$', cell)  # a lone single digit
)

In [29]:
# 'Units' column: every scraped cell that is exactly the unit label.
mon_table['Units'] = tuple(
    filter(lambda cell: cell == 'Dollars per Gallon', mon_rows)
)

In [30]:
# Every column must have collected the same number of cells; a mismatch
# means one of the per-column filters picked up too many or too few.
if len({len(column) for column in mon_table.values()}) != 1:
    raise Exception(
        'Month-table dimensions are off: '
        + str(tuple(map(len, mon_table.values())))
    )

### The DataFrames

In [31]:
# One DataFrame column per key of ann_table.
annual = pd.DataFrame(data=ann_table)

In [32]:
# One DataFrame column per key of mon_table.
monthly = pd.DataFrame(data=mon_table)

In [33]:
# Parse the 'YYYYMM' period strings into timestamps with one vectorized
# call instead of invoking datetime.strptime once per row.  The format
# carries no day, so each timestamp falls on the first of its month.
monthly['Date'] = pd.to_datetime(monthly['Period'], format='%Y%m')

## Current Gas Average

In [34]:
# Mean price over the months of the current calendar year.
# - The year is read once, so every row is compared against the same
#   value instead of calling datetime.now() again for each row.
# - .loc selects rows and column in one step (no chained indexing).
# - pd.to_numeric(errors='coerce') turns the 'null' placeholder into NaN,
#   which mean() skips, rather than raising ValueError as float('null')
#   would.
current_year = datetime.now().year
mon_avg = pd.to_numeric(
    monthly.loc[
        monthly['Date'].apply(lambda x: x.year == current_year),
        'Value',
    ],
    errors='coerce',
).mean()

In [35]:
# Annual prices, as floats, for every period from 2010 onward, in the
# order the rows appear in `annual`.
ann_avgs = annual.loc[
    annual['Period'].apply(int) >= 2010, 'Value'
].apply(float).to_list()

In [36]:
# Spot-check the first row / first column of the annual frame.
annual.iloc[0, 0]

'U.S. Regular All Formulations Retail Gasoline Prices, Annual'

In [37]:
# Spot-check the first row / first column of the monthly frame.
monthly.iloc[0, 0]

'U.S. Regular All Formulations Retail Gasoline Prices, Monthly'

In [38]:
# Display the current-year mean of the monthly values.
mon_avg

2.495

In [39]:
# Display the annual values for periods from 2010 onward.
ann_avgs

[2.604, 2.719, 2.415, 2.143, 2.429, 3.358, 3.505, 3.618, 3.521, 2.782]