# Data Gathering and Cleaning

Data for this project is gathered from the U.S. Energy Information Administration (EIA) [website](https://www.eia.gov).

## Import packages

In [45]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import requests
import time
from bs4 import BeautifulSoup as BS

%load_ext autoreload
%autoreload 2
import helper_functions


We have to do a little manipulation to the text file from EIA. It is a text file with a bunch of line-separated JSON objects, but I massage it here to a proper JSON and export it as a new file.

In [24]:
lastline = None

# Open and read text file
with open("SEDS.txt","r") as f:
    lineList = f.readlines()
    
    # Keep track of last line
    lastline=lineList[-1]

# Open text file and create new json to be written
with open("SEDS.txt","r") as f, open("cleanfile.json","w") as g:
    
    # Iterate through each line of the text file
    for i,line in enumerate(f,0):
        
        # First line gets [ and , to initialize the json
        if i == 0:
            line = "["+str(line)+","
            g.write(line)
            
        # Last line gets ] to signal the end of the json
        elif line == lastline:            
            g.write(line)
            g.write("]")
            
        # Other lines get comma separation
        else:
            line = str(line)+","
            g.write(line)

In [26]:
file = open('cleanfile.json', 'r')
json_data = json.load(file)

In [42]:
string = json_data[3]
string['series_id']

'SEDS.TNRSB.AL.A'

In [44]:
re.findall('SEDS\.TNRSB\..*',string['series_id'])

['SEDS.TNRSB.AL.A']

The following energy types were selected based on the categories in the [EIA educational page](https://www.eia.gov/energyexplained/energy-and-the-environment/).

In [115]:
energy_types = [
    'All Petroleum Products excluding Fuel Ethanol',
    'Coal','Natural Gas including Supplemental Gaseous Fuels',
    'Nuclear Power',
    'Biomass',
    'Fuel Ethanol excluding Denaturant',
    'Geothermal',
    'Hydroelectricity',
    'Solar Energy',
    'Wind Energy',
    'Renewable Energy']

# Make all lowercase in case some pages have inconsistent letter casing
for i in range(len(energy_types)):
    energy_types[i] = energy_types[i].lower()

## Web scrape to get the information for each of the above types of energy

In [76]:
headers = {'user-agent': 'Safari/13.0.2 (Macintosh; Intel Mac OS X 10_15)'}
base_url = 'https://www.eia.gov/opendata/qb.php'
consumption_suffix = '?category=40204'

In [85]:
consumption_page = helper_functions.get_page(base_url+consumption_suffix,headers)

In [363]:
# Create empty dict to store all info across every sector and energy type by state
env_series_ids = {}

# Start by scraping the consumption website in order to get the list of available sectors  
consumption_sectors = consumption_page.find('div',{'class':'pagecontent mr_temp2'})

# Store sector url suffixes in a list
sector_url_suffixes = [sector.a['href'] for sector in consumption_sectors.find_all('li')[:7]]

# Loop 1 - iterate through each sector
for sector_url_suffix in sector_url_suffixes:
    
    # Scrape the sector page
    sector_page = helper_functions.get_page(base_url+sector_url_suffix,headers)    

    # Go into first url and grab tags of all children categories
    children_categories = sector_page.find('section').ul.find_all('li')

    # Store the urls of children cats (ccats = children categories)
    ccats_url_suffixes = [children_category.a['href'] 
                          for children_category in children_categories
                          if children_category.text.lower() in energy_types]
    
    # Loop 2 - for each sector, iterate through the relevant types of energy consumption to get state-level data
    for ccats_url_suffix in ccats_url_suffixes:
        
        # Scrape the child category page
        child_category_page = helper_functions.get_page(base_url+ccats_url_suffix,headers)

        # Grab tags of all energy unit children categories. Only want Btu
        energy_unit_cats = child_category_page.find('div',{'class':'main_col'}).ul.find_all('li')

        # Store only the url of the 'Btu' children category. I make a list and select only the first element 
        # because sometimes there will be two energy unit options or just one. This way ensures we only take 
        # the Btu option.
        btu_url_suffix = [energy_unit.a['href'] 
                   for energy_unit in energy_unit_cats
                   if energy_unit.text == 'Btu'][0]
        
        # Scrape the Btu page
        btu_page = helper_functions.get_page(base_url+btu_url_suffix,headers)
        
        # Get list of states by their tags
        states = btu_page.find('div',{'class':'main_col'}).ul.find_all('li')
        
        # Get url suffixes for each state
        state_url_suffixes = [state.a['href'] for state in states]
        
        # Isolate the sector and energy type
        sector = btu_page.find('div',{'class':'main_col'}).h3.find_all('a')[3].text
        energy_type = btu_page.find('div',{'class':'main_col'}).h3.find_all('a')[4].text
        
        # Add these to a dict which will be the values of the overarching env_series_ids dict
        series_id_values = {'sector':sector,'energy_type':energy_type}
        
        # Parse through url suffixes to get and store the series ids we want to use to parse the big JSON
        for state_suffix in state_url_suffixes:
            series_id = re.findall('SEDS.*',state_suffix)[0]
            env_series_ids[series_id] = series_id_values
            

### Parse energy data

In [399]:
# Set up empty bucket for parsed data
environmental_data = []

# Iterate through big json to parse relevant info
for single_json in json_data:
    
    # Only parse entries that have the series ids that we care about
    if single_json.get('series_id') in env_series_ids.keys():
        single_data_entry = {}
        single_data_entry['series_id'] = single_json['series_id']
        single_data_entry['sector'] = env_series_ids[single_json['series_id']]['sector']
        single_data_entry['energy_type'] = env_series_ids[single_json['series_id']]['energy_type']
        single_data_entry['data'] = single_json['data']
        single_data_entry['state'] = re.findall(', .*$',single_json['name'])[0][2:]
        single_data_entry['units'] = single_json['units']
        environmental_data.append(single_data_entry)

## Web scrape for population and GDP data

In [349]:
population_url_suffix = '?category=40367'
gdp_url_suffix = '?category=40828'
pop_gdp_url_suffixes = [population_url_suffix, gdp_url_suffix]

# Create container for  data
pop_gdp_series_ids = {}

for pop_gdp_url_suffix in pop_gdp_url_suffixes:
    
    # Scrape population page
    page = helper_functions.get_page(base_url + pop_gdp_url_suffix,headers)

    # Isolate html tags containing urls for each state
    state_tags = page.find('div',{'class':'main_col'}).ul.find_all('li')

    # Extract and save each state url suffix
    state_url_suffixes = [state_tag.a['href'] 
                          for state_tag in state_tags]

    # Iterate through each state url suffix to extract features
    for state_url_suffix in state_url_suffixes:

        # Scrape each state's series page
        state_page = helper_functions.get_page(base_url + state_url_suffix,headers)

        # Isolate html tags containing state name, get text from tag, parse for name
        api_call_tags = state_page.find('div',{'class':'main_col'}).find('div',{'class':'api_call_container'})
        state_text = api_call_tags.find_all('p')[1].text
        state = re.findall('(, )(.*)',state_text)[0][1]
        
        # Isolate html tags containing description (gdp or pop), get text from tag
        main_col_tags = state_page.find('div',{'class':'main_col'}).h3
        desc = main_col_tags.find_all('a')[2].text
        
        # Parse url suffix for series id
        series_id = re.findall('SEDS.*',state_url_suffix)[0]

        # Add to data container
        values = {'state':state,'description':desc}
        pop_gdp_series_ids[series_id] = values

### Parse population and gdp data

In [400]:
# Set up empty bucket for parsed data
pop_gdp_data = []

# Iterate through big json to parse relevant info
for single_json in json_data:
    
    # Only parse entries that have the series ids that we care about
    if single_json.get('series_id') in pop_gdp_series_ids.keys():
        single_data_entry = {}
        single_data_entry['series_id'] = single_json['series_id']
        single_data_entry['description'] = pop_gdp_series_ids[single_json['series_id']]['description']
        single_data_entry['units'] = single_json['units']
        single_data_entry['data'] = single_json['data']
        single_data_entry['state'] = pop_gdp_series_ids[single_json['series_id']]['state']
        pop_gdp_data.append(single_data_entry)

## Storing data into MongoDB

In [388]:
from pymongo import MongoClient

In [383]:
from pprint import pprint

In [389]:
client = MongoClient('mongodb://localhost/')

In [390]:
db = client.admin

In [391]:
# Issue the serverStatus command and print the results
serverStatusResult=db.command("serverStatus")
pprint(serverStatusResult)

 'connections': {'active': 1,
                 'available': 3272,
                 'current': 4,
                 'totalCreated': 4},
 'electionMetrics': {'averageCatchUpOps': 0.0,
                     'catchUpTakeover': {'called': 0, 'successful': 0},
                     'electionTimeout': {'called': 0, 'successful': 0},
                     'freezeTimeout': {'called': 0, 'successful': 0},
                     'numCatchUps': 0,
                     'numCatchUpsAlreadyCaughtUp': 0,
                     'numCatchUpsFailedWithError': 0,
                     'numCatchUpsFailedWithNewTerm': 0,
                     'numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd': 0,
                     'numCatchUpsSkipped': 0,
                     'numCatchUpsSucceeded': 0,
                     'numCatchUpsTimedOut': 0,
                     'numStepDownsCausedByHigherTerm': 0,
                     'priorityTakeover': {'called': 0, 'successful': 0},
                     'stepUpCmd': {'called': 0, 'suc

                         'file system read latency histogram (bucket 6) - 1000ms+': 0,
                         'file system write latency histogram (bucket 1) - 10-49ms': 0,
                         'file system write latency histogram (bucket 2) - 50-99ms': 0,
                         'file system write latency histogram (bucket 3) - 100-249ms': 0,
                         'file system write latency histogram (bucket 4) - 250-499ms': 0,
                         'file system write latency histogram (bucket 5) - 500-999ms': 0,
                         'file system write latency histogram (bucket 6) - 1000ms+': 0,
                         'operation read latency histogram (bucket 1) - 100-249us': 0,
                         'operation read latency histogram (bucket 2) - 250-499us': 0,
                         'operation read latency histogram (bucket 3) - 500-999us': 0,
                         'operation read latency histogram (bucket 4) - 1000-9999us': 0,
                         'ope

In [392]:
import pymongo

In [394]:
mydb = client['energy_data']

In [396]:
print(client.list_database_names())

['admin', 'config', 'local']


In [397]:
energy_collection = mydb['energy_data']

In [401]:
env_results = energy_collection.insert_many(environmental_data)

In [404]:
pop_gdp_results = energy_collection.insert_many(pop_gdp_data)

### Practice querying

In [409]:
query_1 = energy_collection.find({'state':'Oregon'})

In [414]:
query_1

TypeError: batch_size() missing 1 required positional argument: 'batch_size'