# Experimenting with EDA and Modeling 

To start off, I'm going to focus all efforts on just NY. Once everything is up and running, I'll scale up to all 50 states.

In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pymongo
from pprint import pprint
%load_ext autoreload
%autoreload 2
%matplotlib inline

import helper_functions
import plotly.express as px
import plotly.graph_objects as go


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### First off, connect to MongoDB

In [4]:
# Open a client against the MongoDB server addressed by the localhost URI.
client = pymongo.MongoClient('mongodb://localhost/')
# Handle to the `admin` database; used below only to run a server command.
db = client.admin

# Issue the serverStatus command and print the results
# NOTE(review): the full serverStatus document is long; if this is only a
# connectivity check, consider printing a small subset of it instead.
serverStatusResult=db.command("serverStatus")
pprint(serverStatusResult)

# Handle to the `energy_data` database used by the later cells.
mydb = client['energy_data']


 'connections': {'active': 1,
                 'available': 3273,
                 'current': 3,
                 'totalCreated': 16},
 'electionMetrics': {'averageCatchUpOps': 0.0,
                     'catchUpTakeover': {'called': 0, 'successful': 0},
                     'electionTimeout': {'called': 0, 'successful': 0},
                     'freezeTimeout': {'called': 0, 'successful': 0},
                     'numCatchUps': 0,
                     'numCatchUpsAlreadyCaughtUp': 0,
                     'numCatchUpsFailedWithError': 0,
                     'numCatchUpsFailedWithNewTerm': 0,
                     'numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd': 0,
                     'numCatchUpsSkipped': 0,
                     'numCatchUpsSucceeded': 0,
                     'numCatchUpsTimedOut': 0,
                     'numStepDownsCausedByHigherTerm': 0,
                     'priorityTakeover': {'called': 0, 'successful': 0},
                     'stepUpCmd': {'called': 0, 'su

In [5]:
# List the databases visible to this client (sanity check that `energy_data` exists).
print(client.list_database_names())

['admin', 'config', 'energy_data', 'local']


In [6]:
# Handle to the `energy_data` collection inside the database referenced by `mydb`.
energy_collection = mydb['energy_data']

## Get NY data and group it by sector, including totals (renewable vs. nonrenewable)

In [82]:
# Materialise every document whose `state` field is 'New York' into a list.
ny = list(energy_collection.find({'state': 'New York'}))

In [85]:
# Remove the second document from the NY list.
# NOTE(review): this positional delete is not idempotent — re-running the cell
# without reloading `ny` removes a different document each time. Which document
# sits at index 1 is not visible here; TODO confirm and select it by a field
# value instead of by position.
del ny[1]

In [87]:
# Distinct sector labels present in the NY documents.
sectors = {series.get('sector') for series in ny}
# Documents without a 'sector' key contribute None; `discard` (unlike `remove`)
# does not raise KeyError when every document happens to carry a sector.
sectors.discard(None)

In [88]:
# Display the distinct sector labels collected from the NY documents.
sectors

{'Commercial Sector',
 'Electric Power Sector',
 'Industrial Sector',
 'Residential Sector',
 'Total All Sectors',
 'Total End-Use Sectors',
 'Transportation Sector'}

In [44]:
# Keep only the NY documents whose sector label is 'Total All Sectors'.
all_sectors = list(filter(lambda doc: doc.get('sector') == 'Total All Sectors', ny))

In [183]:
# Year index for the combined frame: 2017 down to 1960 (58 entries, newest first).
dates = np.arange(2017, 1959, -1)
df = pd.DataFrame(index=dates)

# Sectors whose series become columns; swap in `sectors` to load every sector.
selected_sectors = ['Total All Sectors']


def _annual_values(data, n_years=len(dates)):
    """Return the values of one series, trimmed to the length of ``dates``.

    ``data`` is a sequence of pairs; element [1] of each pair is taken as the
    value. A series carrying exactly one entry more than ``n_years`` has its
    first entry dropped (presumably a year newer than 2017 — TODO confirm).
    Values are matched to ``dates`` by position, so this assumes each series is
    ordered newest-first like ``dates`` — TODO confirm against the stored data.
    """
    if len(data) == n_years + 1:
        data = data[1:]
    return [pair[1] for pair in data]


# Collect every column first and concatenate once: calling pd.concat inside the
# loop re-copies the whole frame for each added series.
energy_columns = [df]

for series in ny:

    if series.get('sector') in selected_sectors:
        energy_columns.append(pd.Series(data=_annual_values(series['data']),
                                        name=series['energy_type'],
                                        index=dates))

    # Checked once per document (not once per selected sector) so that widening
    # `selected_sectors` cannot add the population column several times.
    if series.get('description') == 'Population':
        energy_columns.append(pd.Series(data=_annual_values(series['data']),
                                        name=series.get('description'),
                                        index=dates))

df = pd.concat(energy_columns, axis=1)


In [255]:
# Column labels for the temperature features:
#   days_above_<T> for T = 100, 95, ..., 70
#   days_below_<T> for T = 70, 65, ..., 5
above_temps = [f'days_above_{cutoff}' for cutoff in np.arange(100, 65, -5)]
below_temps = [f'days_below_{cutoff}' for cutoff in np.arange(70, 0, -5)]

# "Above" labels first, then "below" labels.
temp_column_titles = [*above_temps, *below_temps]

In [242]:
# Build one row per key of the `data` mapping held by the document at ny[1].
# NOTE(review): relies on that document sitting at position 1 of `ny` — confirm,
# or select it by a field value instead.
yearly_record = ny[1]['data']

matrix = []
for year in yearly_record:
    first_part, second_part = yearly_record[year][0], yearly_record[year][1]
    # Join the two per-year sequences into a single row.
    matrix.append(first_part + second_part)

# Drop the final row (presumably a year outside the `dates` range — TODO confirm).
matrix = matrix[:-1]

In [248]:
# `dates` runs newest-first; the rows in `matrix` are labelled with the reversed
# (oldest-first) sequence.
temp_df = pd.DataFrame(
    data=matrix,
    index=dates[::-1],
    columns=temp_column_titles,
)

In [254]:
# Attach the temperature columns to the energy frame, aligned on the year index.
# Any copies already present in `df` are dropped first so that re-running this
# cell replaces the columns instead of appending duplicates.
df = pd.concat([df.drop(columns=temp_df.columns, errors='ignore'), temp_df], axis=1)

In [124]:
# Energy sources split by category; each list is spelled out explicitly rather
# than being sliced out of the combined list by position.
nonrenewable_sources = [
    'All Petroleum Products excluding Fuel Ethanol',
    'Coal',
    'Natural Gas including Supplemental Gaseous Fuels',
    'Nuclear Power',
]

renewable_sources = [
    'Biomass',
    'Fuel Ethanol excluding Denaturant',
    'Geothermal',
    'Hydroelectricity',
    'Solar Energy',
    'Wind Energy',
]

# Combined list: nonrenewable sources first, then renewable ones.
energy_types = nonrenewable_sources + renewable_sources

In [184]:
# Add one row-wise total column per source category.
for total_label, member_columns in (
    ('Renewable Sources', renewable_sources),
    ('Nonrenewable Sources', nonrenewable_sources),
):
    df[total_label] = df[member_columns].sum(axis=1)

In [185]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,Population,Natural Gas including Supplemental Gaseous Fuels,All Petroleum Products excluding Fuel Ethanol,Renewable Energy,Biomass,Solar Energy,Wind Energy,Geothermal,Hydroelectricity,Coal,Nuclear Power,Fuel Ethanol excluding Denaturant,Renewable Sources,Nonrenewable Sources
2017,19591,1276946,1290569,483912,152980,13918,38105,1185,277725,19553,441029,47948,531861,3028097
2016,19642,1336455,1309305,447113,150650,10670,36376,1185,248232,29679,434791,46644,493757,3110230
2015,19661,1396693,1299897,442961,154559,7726,37061,1185,242429,41241,466457,44702,487662,3204288
2014,19656,1392378,1297509,432848,140542,5296,37740,1185,248086,64655,450141,45832,478681,3204683
2013,19628,1315282,1242939,412444,135359,3860,33770,1185,238270,68707,467651,44273,456717,3094579


In [186]:
# Columns to plot as energy series.
# 'Renewable Energy' is left out (the 'Renewable Sources' total is plotted
# instead), and so are the temperature columns: when the notebook runs top to
# bottom they are already part of `df` and would otherwise be melted and drawn
# as energy consumption. Filtering (rather than list.remove) also avoids a
# ValueError if 'Renewable Energy' is absent.
# NOTE(review): 'Population' is still included and will be drawn on the Btu
# axis — confirm that is intended.
non_energy_columns = {'Renewable Energy', *temp_column_titles}
columns = [col for col in df.columns if col not in non_energy_columns]

In [187]:
# Reshape to long format: one row per (year, column) pair, with the year index
# exposed as a 'Year' column.
df_melt = (
    df.rename_axis('Year')
      .reset_index()
      .melt(
          id_vars='Year',
          value_vars=columns,
          var_name='Energy Source',
          value_name='Energy Consumed (Billion Btu)',
      )
)

In [268]:
# Line chart of the long-format energy frame, one trace per 'Energy Source'.
fig = px.line(
    data_frame=df_melt,
    x='Year',
    y='Energy Consumed (Billion Btu)',
    color='Energy Source',
    title='New York Energy Consumption',
    width=1100,
    height=600,
)

fig.show()


In [259]:
# Reshape the temperature frame to long format: one row per (year, cutoff) pair.
# NOTE(review): the value column is named 'Temperature', but the source column
# names ('days_above_*' / 'days_below_*') suggest the values are day counts —
# confirm before renaming, since the plotting cell refers to this name.
temp_df_melt = (
    temp_df.rename_axis('Year')
           .reset_index()
           .melt(
               id_vars='Year',
               value_vars=temp_column_titles,
               var_name='Temperature Cutoff',
               value_name='Temperature',
           )
)

In [267]:
# Line chart of the long-format temperature frame, one trace per cutoff label.
fig = px.line(
    data_frame=temp_df_melt,
    x='Year',
    y='Temperature',
    color='Temperature Cutoff',
    title='New York Temperature Trends',
    width=1000,
    height=600,
)

fig.show()