# Electricity Usage Analytics of all states in USA
## Exploratory Data Analysis and Data Preprocessing

### Import Libraries and Packages

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

In [2]:
# Function to read data from URL
def read_data_from_URL(url):
    df=pd.read_csv(url)
    return df

In [3]:
#set up data urls from github
parent_url='https://raw.githubusercontent.com/nthammadi-uncc/electricity_usage_analysis/main/Data/'
electric_price_url=parent_url+'avgprice_annual.csv'
monthly_consumption_url=parent_url+'consumption_monthly.csv'
annual_customers_url=parent_url+'customers_annual.csv'
annual_emission_url=parent_url+'emission_annual.csv'
monthly_generation_url=parent_url+'generation_monthly.csv'
median_household_income_url=parent_url+'median_household_income.csv'
presidential_election_url=parent_url+'presidential_election_results.csv'

### Get Annual Emissions Data(CO2, SO2,NOx)

In [4]:
all_emission_df = read_data_from_URL(annual_emission_url)
all_emission_df.head()

Unnamed: 0,Year,State,Producer Type,Energy Source,CO2\n(Metric Tons),SO2\n(Metric Tons),NOx\n(Metric Tons)
0,1990,AK,Commercial Cogen,All Sources,824004.0,13198.0,3011.0
1,1990,AK,Commercial Cogen,Coal,821929.0,13191.0,3009.0
2,1990,AK,Commercial Cogen,Petroleum,2075.0,6.0,2.0
3,1990,AK,Commercial Non-Cogen,All Sources,0.0,149.0,42.0
4,1990,AK,Commercial Non-Cogen,Petroleum,0.0,149.0,42.0


In [5]:
print("Records:", all_emission_df.shape[0], "\nFeatures:", all_emission_df.shape[1])

Records: 43258 
Features: 7


In [6]:
all_emission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43258 entries, 0 to 43257
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               43258 non-null  int64  
 1   State              43258 non-null  object 
 2   Producer Type      43258 non-null  object 
 3   Energy Source      43258 non-null  object 
 4   CO2
(Metric Tons)  43258 non-null  float64
 5   SO2
(Metric Tons)  43258 non-null  float64
 6   NOx
(Metric Tons)  43258 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 2.3+ MB


In [7]:
all_emission_df['Producer Type'].value_counts()

Total Electric Power Industry    10006
Industrial Cogen                  7512
Electric Utility                  6776
IPP NAICS-22 Non-Cogen            5725
IPP NAICS-22 Cogen                4750
Commercial Cogen                  4138
Industrial Non-Cogen              2295
Commercial Non-Cogen              2056
Name: Producer Type, dtype: int64

#### We will only look into "Total Electric Power Industry" so we remove all the other types of producers

In [8]:
emission_df = all_emission_df[all_emission_df['Producer Type'] == 'Total Electric Power Industry']
emission_df.drop(['Producer Type'], axis = 1, inplace = True)
print("Records:", emission_df.shape[0], "\nFeatures:", emission_df.shape[1])

Records: 10006 
Features: 6


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [9]:
emission_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10006 entries, 17 to 43257
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               10006 non-null  int64  
 1   State              10006 non-null  object 
 2   Energy Source      10006 non-null  object 
 3   CO2
(Metric Tons)  10006 non-null  float64
 4   SO2
(Metric Tons)  10006 non-null  float64
 5   NOx
(Metric Tons)  10006 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 547.2+ KB


In [10]:
emission_df['Energy Source'].value_counts()

All Sources                    1560
Petroleum                      1556
Natural Gas                    1507
Coal                           1468
Other                          1164
Other Biomass                  1069
Wood and Wood Derived Fuels     851
Other Gases                     656
Geothermal                      175
Name: Energy Source, dtype: int64

### Get Median Household Income Data

In [11]:
median_income_df = read_data_from_URL(median_household_income_url)
median_income_df.head()

Unnamed: 0,State,2020,2019,2018,2017,2016,2015,2014,2013,2012,...,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,United States,67521.0,68703.0,63179.0,61136.0,59039.0,56516.0,53657.0,53585.0,51017.0,...,49777.0,50303.0,50233.0,48201.0,46326.0,44334.0,43318.0,42409.0,42228.0,41990.0
1,Alabama,54393.0,56200.0,49936.0,50865.0,47221.0,44509.0,42278.0,47320.0,43464.0,...,39980.0,44476.0,42212.0,37952.0,37150.0,36629.0,37255.0,37603.0,35160.0,35424.0
2,Alaska,74476.0,78394.0,68734.0,77987.0,75723.0,75112.0,67629.0,72472.0,63648.0,...,61604.0,63989.0,62993.0,56418.0,55891.0,55063.0,51837.0,52774.0,57363.0,52847.0
3,Arizona,66628.0,70674.0,62283.0,59700.0,57100.0,52248.0,49254.0,52611.0,47044.0,...,45739.0,46914.0,47215.0,46657.0,45245.0,43846.0,41166.0,39734.0,42704.0,39783.0
4,Arkansas,50540.0,54539.0,49781.0,49751.0,45907.0,42798.0,44922.0,39376.0,39018.0,...,36538.0,39586.0,40795.0,37057.0,36658.0,34984.0,32002.0,32387.0,33339.0,29697.0


In [12]:
print("Records:", median_income_df.shape[0], "\nFeatures:", median_income_df.shape[1])

Records: 52 
Features: 22


In [13]:
median_income_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 22 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   State   52 non-null     object 
 1   2020    52 non-null     float64
 2   2019    52 non-null     float64
 3   2018    52 non-null     float64
 4   2017    52 non-null     float64
 5   2016    52 non-null     float64
 6   2015    52 non-null     float64
 7   2014    52 non-null     float64
 8   2013    52 non-null     float64
 9   2012    52 non-null     float64
 10  2011    52 non-null     float64
 11  2010    52 non-null     float64
 12  2009    52 non-null     float64
 13  2008    52 non-null     float64
 14  2007    52 non-null     float64
 15  2006    52 non-null     float64
 16  2005    52 non-null     float64
 17  2004    52 non-null     float64
 18  2003    52 non-null     float64
 19  2002    52 non-null     float64
 20  2001    52 non-null     float64
 21  2000    52 non-null     float64
dtypes: f

In [14]:
median_income_df = median_income_df.iloc[1: , :]

In [15]:
median_income_df = median_income_df.melt(id_vars = 'State', var_name = 'Year', value_name = 'Median Income')
median_income_df['Year']  = median_income_df['Year'].astype(np.int64)

In [16]:
median_income_df.head()

Unnamed: 0,State,Year,Median Income
0,Alabama,2020,54393.0
1,Alaska,2020,74476.0
2,Arizona,2020,66628.0
3,Arkansas,2020,50540.0
4,California,2020,77358.0


In [17]:
median_income_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1071 entries, 0 to 1070
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   State          1071 non-null   object 
 1   Year           1071 non-null   int64  
 2   Median Income  1071 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 25.2+ KB
