In [1]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sqlalchemy import create_engine

from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
from sqlalchemy import Column, Integer, String, Float
from sqlalchemy import create_engine

In [2]:
## Loading Datasets of Energy Production,Median Income & Population / State

energy_data = pd.read_csv('Data/annual_generation_state.csv')
income_data = pd.read_csv('Data/Median_Income_state-1990.csv')
population_data = pd.read_csv('Data/Population_data.csv')


In [3]:
## Cleaning energy dataset
energy_data.head()

Unnamed: 0,YEAR,STATE_id,TYPE OF PRODUCER,ENERGY SOURCE,GENERATION (Megawatthours)
0,1990,AK,Total Electric Power Industry,Total,5599506
1,1990,AK,Total Electric Power Industry,Coal,510573
2,1990,AK,Total Electric Power Industry,Hydroelectric Conventional,974521
3,1990,AK,Total Electric Power Industry,Natural Gas,3466261
4,1990,AK,Total Electric Power Industry,Petroleum,497116


In [4]:
energy_data['STATE_id'].unique()

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'D.C', 'DE', 'FL', 'GA',
       'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
       'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
       'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'US-TOTAL', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'DC', '  ',
       'US-Total'], dtype=object)

In [6]:
## Dropping Unwanted columns and rows
energy_data = energy_data.loc[energy_data['ENERGY SOURCE'] != 'Total']
energy_data = energy_data.loc[energy_data['GENERATION (Megawatthours)'] != 0]
energy_data = energy_data.loc[energy_data['STATE_id'] != 'US-TOTAL']
energy_data = energy_data.loc[energy_data['STATE_id'] != 'US-Total']

In [7]:
## Grouping Into Conventional  & Green Energy Sources

energy_data['ENERGY SOURCE'] = energy_data['ENERGY SOURCE'].replace({'Natural Gas':'Conventional','Petroleum':'Conventional','Coal':'Conventional','Other Biomass':'Green Energy',
                                                              'Hydroelectric Conventional':'Green Energy','Wood and Wood Derived Fuels':'Green Energy','Other':'Conventional',
                                                              'Other Gases':'Conventional','Wind':'Green Energy','Nuclear':'Green Energy','Solar Thermal and Photovoltaic':'Green Energy',
                                                              'Pumped Storage':'Green Energy','Geothermal':'Green Energy'})

In [8]:
## Summing Up Conventional & Green Energy Produced by each state per year, from 1990-2018
## Dropping the Type of Producer Column
energy_produced = energy_data[['STATE_id','YEAR','ENERGY SOURCE','GENERATION (Megawatthours)']]
energy_produced = energy_produced.groupby(['YEAR','STATE_id','ENERGY SOURCE']).sum()['GENERATION (Megawatthours)']

In [9]:
energy_produced = pd.DataFrame(energy_produced)
energy_produced['GENERATION (Megawatthours)'] = energy_produced['GENERATION (Megawatthours)'].astype(int).map("{:,}".format)

In [10]:
energy_produced.reset_index()

Unnamed: 0,YEAR,STATE_id,ENERGY SOURCE,GENERATION (Megawatthours)
0,1990,AK,Conventional,8947900
1,1990,AK,Green Energy,2251112
2,1990,AL,Conventional,110172788
3,1990,AL,Green Energy,49131478
4,1990,AR,Conventional,45732974
...,...,...,...,...
2913,2018,WI,Green Energy,31271251
2914,2018,WV,Conventional,127262795
2915,2018,WV,Green Energy,7235254
2916,2018,WY,Conventional,82156902


In [11]:
## Cleaning Income Data
income_data.head()

Unnamed: 0,YEAR,STATE,STATE_id,Median_Income
0,1990,Alaska,AK,42607
1,1990,Arizona,AL,71686
2,1990,Arizona,AZ,53309
3,1990,Arkansas,AR,41565
4,1990,California,CA,60726


In [13]:
income_data['Median_Income'] = income_data['Median_Income'].astype(int).map("{:,}".format)

In [14]:
income_data.head()

Unnamed: 0,YEAR,STATE,STATE_id,Median_Income
0,1990,Alaska,AK,42607
1,1990,Arizona,AL,71686
2,1990,Arizona,AZ,53309
3,1990,Arkansas,AR,41565
4,1990,California,CA,60726


In [15]:
# Population Data
population_data = population_data[['YEAR','STATE_id','STATE','Population']]
population_data.head()

Unnamed: 0,YEAR,STATE_id,STATE,Population
0,1990,AL,Alabama,4050055
1,1991,AL,Alabama,4099156
2,1992,AL,Alabama,4154014
3,1993,AL,Alabama,4214202
4,1994,AL,Alabama,4260229


In [16]:
## Data Types of Datasets
population_data.dtypes

YEAR           int64
STATE_id      object
STATE         object
Population    object
dtype: object

In [17]:
energy_produced.dtypes

GENERATION (Megawatthours)    object
dtype: object

In [18]:
income_data.dtypes

YEAR              int64
STATE            object
STATE_id         object
Median_Income    object
dtype: object

In [19]:
 ## Saving All Updated Datasets as csv files

energy_data.to_csv('Energy_data.csv')
income_data.to_csv('Income_data.csv')
population_data.to_csv('Population_data.csv')