# Data Gathering and Cleaning

Energy data for this project is gathered from the U.S. Energy Information Administration (EIA) [website](https://www.eia.gov).

Weather data for this project is gathered from the National Centers for Environmental Information. Documentation for the API can be found [here](https://www.ncei.noaa.gov/support/access-data-service-api-user-documentation).

## Import packages

In [1]:
import json
import pandas as pd
import numpy as np
import re
import requests
import time
from bs4 import BeautifulSoup as BS
%load_ext autoreload
%autoreload 2
import helper_functions


We have to do a little manipulation to the text file from EIA. It is a text file with a bunch of line-separated JSON objects, but I massage it here to a proper JSON and export it as a new file.

In [24]:
lastline = None

# Open and read text file
with open("SEDS.txt","r") as f:
    lineList = f.readlines()
    
    # Keep track of last line
    lastline=lineList[-1]

# Open text file and create new json to be written
with open("SEDS.txt","r") as f, open("cleanfile.json","w") as g:
    
    # Iterate through each line of the text file
    for i,line in enumerate(f,0):
        
        # First line gets [ and , to initialize the json
        if i == 0:
            line = "["+str(line)+","
            g.write(line)
            
        # Last line gets ] to signal the end of the json
        elif line == lastline:            
            g.write(line)
            g.write("]")
            
        # Other lines get comma separation
        else:
            line = str(line)+","
            g.write(line)

In [21]:
file = open('cleanfile.json', 'r')
json_data = json.load(file)

The following energy types were selected based on the categories in the [EIA educational page](https://www.eia.gov/energyexplained/energy-and-the-environment/).

In [11]:
energy_types = [
    'All Petroleum Products excluding Fuel Ethanol',
    'Coal',
    'Natural Gas including Supplemental Gaseous Fuels',
    'Nuclear Power',
    'Biomass',
    'Fuel Ethanol excluding Denaturant',
    'Geothermal',
    'Hydroelectricity',
    'Solar Energy',
    'Wind Energy',
    'Renewable Energy'
]



# Make all lowercase in case some pages have inconsistent letter casing
for i in range(len(energy_types)):
    energy_types[i] = energy_types[i].lower()
    
nonrenewable_energies = energy_types[:4]

renewable_energies = energy_types[4:]

In [12]:
energy_types

['all petroleum products excluding fuel ethanol',
 'coal',
 'natural gas including supplemental gaseous fuels',
 'nuclear power',
 'biomass',
 'fuel ethanol excluding denaturant',
 'geothermal',
 'hydroelectricity',
 'solar energy',
 'wind energy',
 'renewable energy']

## Web scrape to get the information for each of the above types of energy

In [13]:
headers = {'user-agent': 'Safari/13.0.2 (Macintosh; Intel Mac OS X 10_15)'}
base_url = 'https://www.eia.gov/opendata/qb.php'
consumption_suffix = '?category=40204'

In [14]:
consumption_page = helper_functions.get_page(base_url+consumption_suffix,headers)

In [20]:
# Create empty dict to store all info across every sector and energy type by state
env_series_ids = {}

# Start by scraping the consumption website in order to get the list of available sectors  
consumption_sectors = consumption_page.find('div',{'class':'pagecontent mr_temp2'})

# Store sector url suffixes in a list
sector_url_suffixes = [sector.a['href'] for sector in consumption_sectors.find_all('li')[:7]]

# Loop 1 - iterate through each sector
for sector_url_suffix in sector_url_suffixes:
    
    # Scrape the sector page
    sector_page = helper_functions.get_page(base_url+sector_url_suffix,headers)    

    # Go into first url and grab tags of all children categories
    children_categories = sector_page.find('section').ul.find_all('li')

    # Store the urls of children cats (ccats = children categories)
    ccats_url_suffixes = [children_category.a['href'] 
                          for children_category in children_categories
                          if children_category.text.lower() in energy_types]
    
    # Loop 2 - for each sector, iterate through the relevant types of energy consumption to get state-level data
    for ccats_url_suffix in ccats_url_suffixes:
        
        # Scrape the child category page
        child_category_page = helper_functions.get_page(base_url+ccats_url_suffix,headers)

        # Grab tags of all energy unit children categories. Only want Btu
        energy_unit_cats = child_category_page.find('div',{'class':'main_col'}).ul.find_all('li')

        # Store only the url of the 'Btu' children category. I make a list and select only the first element 
        # because sometimes there will be two energy unit options or just one. This way ensures we only take 
        # the Btu option.
        btu_url_suffix = [energy_unit.a['href'] 
                   for energy_unit in energy_unit_cats
                   if energy_unit.text == 'Btu'][0]
        
        # Scrape the Btu page
        btu_page = helper_functions.get_page(base_url+btu_url_suffix,headers)
        
        # Get list of states by their tags
        states = btu_page.find('div',{'class':'main_col'}).ul.find_all('li')
        
        # Get url suffixes for each state
        state_url_suffixes = [state.a['href'] for state in states]
        
        # Isolate the sector and energy type
        sector = btu_page.find('div',{'class':'main_col'}).h3.find_all('a')[3].text
        energy_type = btu_page.find('div',{'class':'main_col'}).h3.find_all('a')[4].text
        
        # Add these to a dict which will be the values of the overarching env_series_ids dict
        series_id_values = {'sector':sector,'energy_type':energy_type}
        
        # Parse through url suffixes to get and store the series ids we want to use to parse the big JSON
        for state_suffix in state_url_suffixes:
            series_id = re.findall('SEDS.*',state_suffix)[0]
            env_series_ids[series_id] = series_id_values
              

### Parse energy data

In [36]:
# Set up empty bucket for parsed data
environmental_data = []

# Iterate through big json to parse relevant info
for single_json in json_data:
    
    # Only parse entries that have the series ids that we care about
    if single_json.get('series_id') in env_series_ids.keys():
        single_data_entry = {}
        single_data_entry['series_id'] = single_json['series_id']
        single_data_entry['sector'] = env_series_ids[single_json['series_id']]['sector']
        single_data_entry['data'] = single_json['data']
        single_data_entry['state'] = re.findall('(, )(\w* ?\w* ?\w*)',single_json['name'])[-1][-1]
        single_data_entry['units'] = single_json['units']
        single_data_entry['energy_type'] = env_series_ids[single_json['series_id']]['energy_type']
        # Add a column for whether or not this type of energy is renewable
        if single_data_entry['energy_type'] in renewable_energies:
            single_data_entry['renewable'] = 1
        else: 
            single_data_entry['renewable'] = 0
        environmental_data.append(single_data_entry)

In [51]:
sectors = [data['sector'] for data in environmental_data]
sectors = list(set(sectors))
sectors
del sector

## Web scrape for population and GDP data

In [47]:
population_url_suffix = '?category=40367'
gdp_url_suffix = '?category=40828'
pop_gdp_url_suffixes = [population_url_suffix, gdp_url_suffix]

# Create container for  data
pop_gdp_series_ids = {}

for pop_gdp_url_suffix in pop_gdp_url_suffixes:
    
    # Scrape population page
    page = helper_functions.get_page(base_url + pop_gdp_url_suffix,headers)

    # Isolate html tags containing urls for each state
    state_tags = page.find('div',{'class':'main_col'}).ul.find_all('li')

    # Extract and save each state url suffix
    state_url_suffixes = [state_tag.a['href'] 
                          for state_tag in state_tags]

    # Iterate through each state url suffix to extract features
    for state_url_suffix in state_url_suffixes:

        # Scrape each state's series page
        state_page = helper_functions.get_page(base_url + state_url_suffix,headers)

        # Isolate html tags containing state name, get text from tag, parse for name
        api_call_tags = state_page.find('div',{'class':'main_col'}).find('div',{'class':'api_call_container'})
        state_text = api_call_tags.find_all('p')[1].text
        state = re.findall('(, )(.*)',state_text)[0][1]
        
        # Isolate html tags containing description (gdp or pop), get text from tag
        main_col_tags = state_page.find('div',{'class':'main_col'}).h3
        desc = main_col_tags.find_all('a')[2].text
        
        # Parse url suffix for series id
        series_id = re.findall('SEDS.*',state_url_suffix)[0]

        # Add to data container
        values = {'state':state,'description':desc}
        pop_gdp_series_ids[series_id] = values

### Parse population and gdp data

In [48]:
# Set up empty bucket for parsed data
pop_gdp_data = []

# Iterate through big json to parse relevant info
for single_json in json_data:
    
    # Only parse entries that have the series ids that we care about
    if single_json.get('series_id') in pop_gdp_series_ids.keys():
        single_data_entry = {}
        single_data_entry['series_id'] = single_json['series_id']
        single_data_entry['description'] = pop_gdp_series_ids[single_json['series_id']]['description']
        single_data_entry['units'] = single_json['units']
        single_data_entry['data'] = single_json['data']
        single_data_entry['state'] = pop_gdp_series_ids[single_json['series_id']]['state']
        pop_gdp_data.append(single_data_entry)

# Get Weather data from NCEI 
National Centeres for Environmental Information has an Access Data Service that provides a RESTful API. From it, I gather weather information from 10 random cities per state to generate mean max and min temperatures of a given state on any day.


#### Load GDCND weather station codes

This is how we tell the API where we want to retrieve data from.

In [2]:
# List of GHCND station codes
with open("ghcnd.txt","r") as f:
    lineList = f.readlines()

# Turn each line into a list for easier sorting
split_lines = [line.split() for line in lineList]

# The city names got split too, so I rejoin them back together
for i in range(len(split_lines)):
    if len(split_lines[i]) > 6:
        split_lines[i] = split_lines[i][:5] + [' '.join(split_lines[i][5:])]

In [5]:
split_lines[0]

['USC00010008', '31.5703', '-85.2483', '139.0', 'AL', 'ABBEVILLE']

#### Load state names

In [321]:
# Import the state capitals, convert to dict
import csv
state_capitals = open('us-state-capitals.csv')
state_capitals_reader = csv.reader(state_capitals)
state_capitals_dict = dict(state_capitals_reader)
state_capitals_dict.pop('')
state_capitals_dict.keys()

dict_keys(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'])

#### Find indexes where each state starts and ends in the GHCND list

In [None]:
# Empty dict to store this info
state_indexes = {}

# Iterate through state abbreviations
for state in state_capitals_dict:
    
    # Create a flag to keep track of indexes where a state starts and ends in split_lines
    flag = False
    
    # Create start and end index variables
    start_index = None
    end_index = None
    
    # Iterate through station lines
    for index in range(len(split_lines)):
        
        # record index of only the first occurence where we find the state code
        if split_lines[index][4] == state and not flag:
            start_index = index
            
            # Switch flag to satisfy the next conditional
            flag = True
        
        # record index of only the first occurence where we don't find the state code AFTER having found it
        if split_lines[index][4] != state and flag:
            end_index = index
            
            # end this loop
            break
            
    # Store the indexes        
    state_indexes[state] = [start_index, end_index]
        

#### For one state, generate a random city code.

In [64]:
np.random.seed(100)

# Dict of form {state:[list of station codes]}
# state_station_codes = {}

states_remaining = 50

for state in state_indexes:
    """
    Everything before the while loop is just to set up the while loop
    """
    print(state)
    
    # List that will populate the values of state_station_codes dict
    state_station_codes_values = []

    # Keep track of how many stations we've added to the list. Only want to go to 10
    count = 0
    
    # Keep track of how many times the while loop has run
    tally = 0
    
    # Start and end indexes for each state
    start_index = state_indexes[state][0]
    end_index = state_indexes[state][1]
    max_num_iters = end_index - start_index
    print(f'max number of iterations is {max_num_iters}')
    
    # Keep track of used indexes to have no replacement
    remaining_indexes = list(range(start_index,end_index+1))

    while count < 10:

        if not len(remaining_indexes):
            break
            
        # Generate a random int to index from remaining_indexes
        rand_index = np.random.randint(0, len(remaining_indexes))
    
        # Remove that index from remaining indexes so as to not resample
        del remaining_indexes[rand_index]

        # Call the API w/ that code for 1990-01-01
        api_call = helper_functions.get_station_weather(split_lines[rand_index][0])
        
        # keep track of how many times it's called the api
        if not tally % 20:
            print(f'have made {tally} api calls')
            
        # restart loop if the api call yielded a blank entry
        if not api_call:
            continue
        
        # If output has both TMIN and TMAX then save that code and state
        elif api_call[0].get('TMIN') and api_call[0].get('TMAX'):
            state_station_codes_values.append(api_call[0]['STATION'])
            count += 1
        
        tally += 1
    
    # After while loop is done, add state_station_codes_values to state_station_codes
    state_station_codes[state] = state_station_codes_values
    
    print(f'just finished {state}. {states_remaining} states remaining')
    states_remaining -= 1

GA
max number of iterations is 367
have made 0 api calls
have made 0 api calls
have made 0 api calls
have made 0 api calls
just finished GA. 43 states remaining
HI
max number of iterations is 587
have made 0 api calls
have made 0 api calls
have made 20 api calls
just finished HI. 42 states remaining
ID
max number of iterations is 393
have made 0 api calls
have made 0 api calls
have made 0 api calls
just finished ID. 41 states remaining
IL
max number of iterations is 506
have made 0 api calls
have made 20 api calls
just finished IL. 40 states remaining
IN
max number of iterations is 379
have made 0 api calls
have made 0 api calls
just finished IN. 39 states remaining
IA
max number of iterations is 395
have made 0 api calls
have made 0 api calls
have made 0 api calls
have made 0 api calls
have made 0 api calls
just finished IA. 38 states remaining
KS
max number of iterations is 626
have made 0 api calls
just finished KS. 37 states remaining
KY
max number of iterations is 447
have made 0 

In [67]:
# Check how many stations in each state satisfied our requirements
for state in state_station_codes:
    print(state, len(state_station_codes[state]))

AL 10
AK 10
AZ 10
AR 10
CA 10
CO 10
CT 10
DE 2
FL 10
GA 10
HI 10
ID 10
IL 10
IN 10
IA 10
KS 10
KY 10
LA 10
ME 10
MD 10
MA 10
MI 10
MN 10
MS 10
MO 10
MT 10
NE 10
NV 10
NH 10
NJ 10
NM 10
NY 10
NC 10
ND 10
OH 10
OK 10
OR 10
PA 10
RI 4
SC 10
SD 10
TN 10
TX 10
UT 10
VT 10
VA 10
WA 10
WV 10
WI 10
WY 10


#### Gather and parse temperature data for each state from the API 

In [190]:
# Create empty container for final data to be uploaded to mongodb {'state' : [state data]}
average_temp_data = {}

states_left = 50

for state in state_station_codes: #state_station_codes:
    print(f'starting {state}. {states_left} states remaining')
    
    
    # Create empty container for state-level data {'station': [station data]}
    state_data = {}
    
    # Iterate through each station to get all of its weather data 
    for station in state_station_codes[state]:
        
        # Create empty container for station-level data [{'DATE':date, 'TMIN':tmin, 'TMAX':tmax}]
        station_data = []
        
        # Call api for this station to collect weather data
        api_call = helper_functions.get_station_weather(station,'1960-01-01','2018-12-31')
        
        
        for day in api_call:
            
            # Create empty container for daily-level data
            daily_data = {}
            
            # For each dict in the response, save 'DATE', api_call.get('STATION'), api_call.get('TMIN'), api_call.get('TMAX'), api_call.get('STATE')
            daily_data['DATE'] = day.get('DATE')
            daily_data['TMIN'] = day.get('TMIN')
            daily_data['TMAX'] = day.get('TMAX') 

            # Append to station level-data
            station_data.append(daily_data)
        
        # Append to state-level data
        state_data[station] = station_data        
    
    print('Done gathering and parsing data for each station. Now making lists of temps by day')
    
    # Loop through each station and add as ints to the appropriate numpy array
    dates_temp = {}

    # Iterate through each station within a state
    for station in state_data:
#         print(station)

        # Iterate through each day of data from a particular station to get TMIN and TMAX
        for daily_data in state_data[station]:
    #         print(daily_data)

            # Only do stuff if 1. it's not an empty list and 2. the temperature data is present
            if daily_data and (daily_data.get('TMAX') and daily_data.get('TMIN')):

                # Keep track of the date
                date = daily_data['DATE']

                # Create a key within our dict that keeps track of the date
                if not dates_temp.get(date):

                    # Assign empty lists where we'll store info from each station
                    dates_temp[date] = {'TMAX':[], 'TMIN':[]}

                # Append    
                dates_temp[date]['TMAX'].append(int(daily_data['TMAX']))
                dates_temp[date]['TMIN'].append(int(daily_data['TMIN']))
    
    print('Now finding average temps')
    
    # Loop through the arrays and store the mean value into final dict
    final_values = []
    
    # Iterate through dates_temp TMIN/TMAX arrays and store the mean value into final dict
    for date in dates_temp:

        daily_temps = {}
        daily_temps['date'] = date
        daily_temps['tmax'] = round(np.mean(np.array(dates_temp[date]['TMAX'])),2)
        daily_temps['tmin'] = round(np.mean(np.array(dates_temp[date]['TMIN'])),2)
        final_values.append(daily_temps)
    
    average_temp_data[state] = final_values
    states_left -= 1

starting AL. 50 states remaining
Done gathering and parsing data for each station. Now making lists of temps by day
Now finding average temps
starting AK. 49 states remaining
Done gathering and parsing data for each station. Now making lists of temps by day
Now finding average temps
starting AZ. 48 states remaining
Done gathering and parsing data for each station. Now making lists of temps by day
Now finding average temps
starting AR. 47 states remaining
Done gathering and parsing data for each station. Now making lists of temps by day
Now finding average temps
starting CA. 46 states remaining
Done gathering and parsing data for each station. Now making lists of temps by day
Now finding average temps
starting CO. 45 states remaining
Done gathering and parsing data for each station. Now making lists of temps by day
Now finding average temps
starting CT. 44 states remaining
Done gathering and parsing data for each station. Now making lists of temps by day
Now finding average temps
starti

In [194]:
# Check how many data points per state
for state in average_temp_data:
    print(state, len(average_temp_data[state]))

AL 21550
AK 21550
AZ 21550
AR 21550
CA 21550
CO 21550
CT 21550
DE 21139
FL 21550
GA 21550
HI 21550
ID 21550
IL 21550
IN 21550
IA 21550
KS 21550
KY 21550
LA 21550
ME 21550
MD 21550
MA 21550
MI 21550
MN 21550
MS 21550
MO 21550
MT 21550
NE 21550
NV 21550
NH 21550
NJ 21550
NM 21550
NY 21550
NC 21550
ND 21550
OH 21549
OK 21550
OR 21550
PA 21550
RI 21550
SC 21550
SD 21550
TN 21550
TX 21550
UT 21550
VT 21550
VA 21550
WA 21550
WV 21550
WI 21550
WY 21550


Unfortunately, I forgot all of the keys are now in state abbreviations, so I'm going to switch them all to the normal state name for easier querying from the db later.

In [201]:
state_abbrevs = open('state-abbreviations.csv')
state_abbrevs_reader = csv.reader(state_abbrevs)
state_abbrevs_dict = dict(state_abbrevs_reader)

In [203]:
final_temp_data = {}

for state in average_temp_data:
    full_state_name = state_abbrevs_dict[state]
    final_temp_data[full_state_name] = average_temp_data[state]

It turns out this file is too big to fit onto mongo. I'll therefore separate it into a list of dicts or make this cool trick

count number of days in each year that:

Temp is above 70, 75, 80, 85, 90, 95, or 100

Below 70, 65, 60, 55, 50, 45, 40, 35, 30, 25, 20, 15, 10, 5, or 0



In [308]:
final_temp_data.pop('_id')
final_temp_data.keys()

dict_keys(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'])

#### Engineering temperature data to have information on a yearly basis rather than daily basis

In [340]:
"""
The below code goes through the final temp data to make new lists of just the TMAX values 
for each year so that we can start getting year-level numbers that could be useful.
"""

# Create container for final output
processed_temp_data = []

# Iterate through each state's data
for state in final_temp_data:
    
    # Create container to store each state's individual yearly data
    years_temps_lists = {str(year):[] for year in np.arange(1960,2019)}

    # Iterate through all of a state's daily data
    for day in final_temp_data[state]:

        # Figure out which key to access in years_temps_lists and assign the tmax
        year = day['date'][:4]
        years_temps_lists[year].append(day['tmax'])

    # Create arrays with which to determine how many days in a year were above certain temps    
    above_temps = np.arange(100, 65, -5)
    below_temps = np.arange(70, 0, -5)

    # Create container for each state's output
    yearly_above_below = {}
    
    # Iterate through the data which has now been sorted into yearly chunks
    for year in years_temps_lists:
        
        # Convert to array for quick math
        array = np.array(years_temps_lists[year])

        # Find number of days that the temperature was above/below a certain threshold in a given year
        # Have to convert to int because mongo doesn't accept numpy.int64
        days_above_temp = [int((array>temp).sum()) for temp in above_temps]
        days_below_temp = [int((array<temp).sum()) for temp in below_temps]

        # Assign this information to a year
        yearly_above_below[year] = [days_above_temp, days_below_temp]
    
    # Append each state's data to the container
    processed_temp_data.append({'state' : state, 'data' : yearly_above_below, 'description':'Temperature','Units':'F'})


In [341]:
type(processed_temp_data[0]['data']['1960'][0][0])

int

#### Checking to make sure we have a complete data set

In [317]:
# Print any states that don't have a complete data set
for state in states_data:
    data = state['data']
    if len(data) < len(np.arange(1960,2019)):
        print(state['state'])
            

## Storing data into MongoDB

In [5]:
import pymongo
from pprint import pprint

client = pymongo.MongoClient('mongodb://localhost/')
db = client.admin

# Issue the serverStatus command and print the results
serverStatusResult=db.command("serverStatus")
pprint(serverStatusResult)

mydb = client['energy_data']

print(client.list_database_names())

 'connections': {'active': 1,
                 'available': 3271,
                 'current': 5,
                 'totalCreated': 18},
 'electionMetrics': {'averageCatchUpOps': 0.0,
                     'catchUpTakeover': {'called': 0, 'successful': 0},
                     'electionTimeout': {'called': 0, 'successful': 0},
                     'freezeTimeout': {'called': 0, 'successful': 0},
                     'numCatchUps': 0,
                     'numCatchUpsAlreadyCaughtUp': 0,
                     'numCatchUpsFailedWithError': 0,
                     'numCatchUpsFailedWithNewTerm': 0,
                     'numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd': 0,
                     'numCatchUpsSkipped': 0,
                     'numCatchUpsSucceeded': 0,
                     'numCatchUpsTimedOut': 0,
                     'numStepDownsCausedByHigherTerm': 0,
                     'priorityTakeover': {'called': 0, 'successful': 0},
                     'stepUpCmd': {'called': 0, 'su

                            'table verify failed calls': 0,
                            'table verify successful calls': 0},
                'snapshot-window-settings': {'cache pressure percentage threshold': 95,
                                             'current available snapshots window size in seconds': 0,
                                             'current cache pressure percentage': 0,
                                             'latest majority snapshot timestamp available': 'Dec '
                                                                                             '31 '
                                                                                             '19:00:00:0',
                                             'max target available snapshots window size in seconds': 5,
                                             'oldest majority snapshot timestamp available': 'Dec '
                                                                                         

In [6]:
energy_collection = mydb['energy_data']

In [67]:
env_results = energy_collection.insert_many(environmental_data)

In [49]:
pop_gdp_results = energy_collection.insert_many(pop_gdp_data)

In [342]:
weather_results = energy_collection.insert_many(processed_temp_data)

### Practice querying

In [68]:
energy = [x for x in energy_collection.find({})]

In [60]:
energy_collection.remove({'sector': {'$in':sectors}})

  """Entry point for launching an IPython kernel.


{'n': 2080, 'ok': 1.0}

In [69]:
len(energy)

2234

In [41]:
count = 0
gdp_count = 0
for series in all_states:
    if len(series.get('data')) < 58:
        count+=1
        if series.get('description') == 'GDP':
            gdp_count += 1
        
        
        
#         print(series)
    
print(f'there are {count} series with less than 58 data points')
print(f'of those, {gdp_count} of them are gdp')

there are 52 series with less than 58 data points
of those, 52 of them are gdp


Did some digging and turns out GDP only goes back to 1997 because that was when data transitioned from Standard Industrial Classification (SIC) to North American Industry Classification System (NAICS). This gives us only about 20 years of data to work with for GDP. Since our data is in annual increments, we only have roughly 20 data points which will not be enough for time series analysis (usually sounds like 50-60 points minimum). Thus, we will exclude GDP from the exogenous vars.

In [29]:
short_list['data']

[['2017', 2175],
 ['2016', 1983],
 ['2015', 1787],
 ['2014', 1679],
 ['2013', 1562],
 ['2012', 1475],
 ['2011', 1377],
 ['2010', 1282],
 ['2009', 1211],
 ['2008', 1150],
 ['2007', 1035],
 ['2006', 866],
 ['2005', 738],
 ['2004', 678],
 ['2003', 674],
 ['2002', 659],
 ['2001', 652],
 ['2000', 631],
 ['1999', 645],
 ['1998', 621],
 ['1997', 599],
 ['1996', 571],
 ['1995', 514],
 ['1994', 463],
 ['1993', 413],
 ['1992', 377],
 ['1991', 352],
 ['1990', 314],
 ['1989', 282],
 ['1988', 0],
 ['1987', 0],
 ['1986', 0],
 ['1985', 0],
 ['1984', 0],
 ['1983', 0],
 ['1982', 0],
 ['1981', 0],
 ['1980', 0],
 ['1979', 0],
 ['1978', 0],
 ['1977', 0],
 ['1976', 0],
 ['1975', 0],
 ['1974', 0],
 ['1973', 0],
 ['1972', 0],
 ['1971', 0],
 ['1970', 0],
 ['1969', 0],
 ['1968', 0],
 ['1967', 0],
 ['1966', 0],
 ['1965', 0],
 ['1964', 0],
 ['1963', 0],
 ['1962', 0],
 ['1961', 0],
 ['1960', 0]]

In [19]:
dates = np.arange(2018,1959,-1)
df = pd.DataFrame(index = dates)

for series in oregon:
    if series.get('sector') == 'Residential Sector':

        ts_values = [tuple_[1] for tuple_ in series['data']]
        df = pd.concat([df, pd.Series(data = ts_values,name=series['energy_type'], index=dates)],axis=1)


Natural Gas including Supplemental Gaseous Fuels
Solar energy


ValueError: Length of passed values is 58, index implies 59

In [132]:
df.head()

Unnamed: 0,Natural Gas including Supplemental Gaseous Fuels,Solar energy,Geothermal,Coal
2018,30969,30969,30969,30969
2017,33987,33987,33987,33987
2016,28570,28570,28570,28570
2015,26962,26962,26962,26962
2014,29185,29185,29185,29185


In [82]:
series.keys()

dict_keys(['_id', 'series_id', 'sector', 'data', 'state', 'units', 'energy_type', 'renewable'])

In [117]:
test_df = pd.concat([or_df, pd.Series(values,name='test',index=dates)],axis=1)

In [118]:
pd.concat([test_df, pd.Series(values, name='test 2',index=dates)],axis=1)

Unnamed: 0,test,test 2
2018,30969,30969
2017,33987,33987
2016,28570,28570
2015,26962,26962
2014,29185,29185
2013,30832,30832
2012,29451,29451
2011,31034,31034
2010,27461,27461
2009,30504,30504
