In [1]:
# Select matplotlib's interactive "notebook" backend for the figures drawn below.
%matplotlib notebook

In [2]:
# Import dependencies and set display format to decimals rather than scientific notation
import pandas as pd
import matplotlib.pyplot as plt
# NOTE(review): `np` is not referenced in the cells visible here — confirm before removing.
import numpy as np
# Display-only setting: floats in pandas output are rendered with two fixed decimals.
pd.options.display.float_format = '{:.2f}'.format

In [3]:
# Path of the 2010 energy-usage CSV, relative to the notebook
data = "Energy_Usage_2010.csv"

# Load the file into a DataFrame
energy_df = pd.read_csv(filepath_or_buffer=data)

# Preview the first five rows
energy_df.head(5)

Unnamed: 0,COMMUNITY AREA NAME,CENSUS BLOCK,BUILDING TYPE,BUILDING_SUBTYPE,KWH JANUARY 2010,KWH FEBRUARY 2010,KWH MARCH 2010,KWH APRIL 2010,KWH MAY 2010,KWH JUNE 2010,...,TOTAL POPULATION,TOTAL UNITS,AVERAGE STORIES,AVERAGE BUILDING AGE,AVERAGE HOUSESIZE,OCCUPIED UNITS,OCCUPIED UNITS PERCENTAGE,RENTER-OCCUPIED HOUSING UNITS,RENTER-OCCUPIED HOUSING PERCENTAGE,OCCUPIED HOUSING UNITS
0,Archer Heights,170315704001006.0,Residential,Multi < 7,,,,,,,...,89.0,24.0,2.0,71.33,3.87,23.0,0.96,9.0,0.39,23.0
1,Ashburn,170317005014004.0,Residential,Multi 7+,7334.0,7741.0,4214.0,4284.0,2518.0,4273.0,...,112.0,67.0,2.0,41.0,1.81,62.0,0.93,50.0,0.81,62.0
2,Auburn Gresham,170317105001006.0,Commercial,Multi < 7,,,,,,,...,102.0,48.0,3.0,86.0,3.0,34.0,0.71,23.0,0.68,34.0
3,Austin,170312503003003.0,Commercial,Multi < 7,,,,,,,...,121.0,56.0,2.0,84.0,2.95,41.0,0.73,32.0,0.78,41.0
4,Austin,170312504002008.0,Commercial,Multi < 7,,,,,,,...,62.0,23.0,2.0,85.0,3.26,19.0,0.83,11.0,0.58,19.0


In [4]:
# Order the rows alphabetically by community area (ascending is the default)
alpha_df = energy_df.sort_values(by="COMMUNITY AREA NAME")
alpha_df.head(5)

Unnamed: 0,COMMUNITY AREA NAME,CENSUS BLOCK,BUILDING TYPE,BUILDING_SUBTYPE,KWH JANUARY 2010,KWH FEBRUARY 2010,KWH MARCH 2010,KWH APRIL 2010,KWH MAY 2010,KWH JUNE 2010,...,TOTAL POPULATION,TOTAL UNITS,AVERAGE STORIES,AVERAGE BUILDING AGE,AVERAGE HOUSESIZE,OCCUPIED UNITS,OCCUPIED UNITS PERCENTAGE,RENTER-OCCUPIED HOUSING UNITS,RENTER-OCCUPIED HOUSING PERCENTAGE,OCCUPIED HOUSING UNITS
2307,Albany Park,170311407024006.0,Commercial,Multi < 7,1538.0,3805.0,6298.0,5147.0,6083.0,6706.0,...,169.0,48.0,1.0,0.0,3.67,46.0,0.96,35.0,0.76,46.0
2001,Albany Park,170311406011014.0,Residential,Single Family,6388.0,6039.0,4839.0,3762.0,7489.0,8974.0,...,124.0,41.0,1.22,61.22,3.65,34.0,0.83,12.0,0.35,34.0
2002,Albany Park,170311406011015.0,Residential,Multi < 7,4527.0,6254.0,6274.0,5244.0,6231.0,9146.0,...,131.0,35.0,2.5,97.75,3.97,33.0,0.94,17.0,0.52,33.0
2003,Albany Park,170311406011015.0,Residential,Single Family,7921.0,7822.0,5486.0,6554.0,9448.0,15348.0,...,131.0,35.0,1.5,66.33,3.97,33.0,0.94,17.0,0.52,33.0
2004,Albany Park,170311406011016.0,Residential,Multi < 7,3290.0,4236.0,3993.0,3762.0,4136.0,5126.0,...,164.0,38.0,2.57,92.57,4.69,35.0,0.92,23.0,0.66,35.0


In [5]:
# Show every column name so the relevant ones can be picked out
energy_df.columns.tolist()

['COMMUNITY AREA NAME',
 'CENSUS BLOCK',
 'BUILDING TYPE',
 'BUILDING_SUBTYPE',
 'KWH JANUARY 2010',
 'KWH FEBRUARY 2010',
 'KWH MARCH 2010',
 'KWH APRIL 2010',
 'KWH MAY 2010',
 'KWH JUNE 2010',
 'KWH JULY 2010',
 'KWH AUGUST 2010',
 'KWH SEPTEMBER 2010',
 'KWH OCTOBER 2010',
 'KWH NOVEMBER 2010',
 'KWH DECEMBER 2010',
 'TOTAL KWH',
 'ELECTRICITY ACCOUNTS',
 'ZERO KWH ACCOUNTS',
 'THERM JANUARY 2010',
 'THERM FEBRUARY 2010',
 'THERM MARCH 2010',
 'TERM APRIL 2010',
 'THERM MAY 2010',
 'THERM JUNE 2010',
 'THERM JULY 2010',
 'THERM AUGUST 2010',
 'THERM SEPTEMBER 2010',
 'THERM OCTOBER 2010',
 'THERM NOVEMBER 2010',
 'THERM DECEMBER 2010',
 'TOTAL THERMS',
 'GAS ACCOUNTS',
 'KWH TOTAL SQFT',
 'THERMS TOTAL SQFT',
 'KWH MEAN 2010',
 'KWH STANDARD DEVIATION 2010',
 'KWH MINIMUM 2010',
 'KWH 1ST QUARTILE 2010',
 'KWH 2ND QUARTILE 2010',
 'KWH 3RD QUARTILE 2010',
 'KWH MAXIMUM 2010',
 'KWH SQFT MEAN 2010',
 'KWH SQFT STANDARD DEVIATION 2010',
 'KWH SQFT MINIMUM 2010',
 'KWH SQFT 1ST QUARTI

In [6]:
# Keep only the columns the analysis needs: area name, total kWh and population
relevant_columns = ["COMMUNITY AREA NAME", "TOTAL KWH", "TOTAL POPULATION"]
cleaner_df = alpha_df[relevant_columns]
cleaner_df.head(5)

Unnamed: 0,COMMUNITY AREA NAME,TOTAL KWH,TOTAL POPULATION
2307,Albany Park,70328.0,169.0
2001,Albany Park,87385.0,124.0
2002,Albany Park,88369.0,131.0
2003,Albany Park,108583.0,131.0
2004,Albany Park,56383.0,164.0


In [7]:
# Discard every row that has a missing value in any of the kept columns
cleaned_df = cleaner_df.dropna(axis=0, how="any")

# Non-null count per column (all three should match after the drop)
cleaned_df.count()

COMMUNITY AREA NAME    66166
TOTAL KWH              66166
TOTAL POPULATION       66166
dtype: int64

In [8]:
# Total kWh and population per community area, one row per area
community_energy = (
    cleaned_df
    .groupby("COMMUNITY AREA NAME", as_index=False)
    .sum()
)

# Rank the areas from highest to lowest total kWh
community_sorted_df = community_energy.sort_values(by="TOTAL KWH", ascending=False)

community_sorted_df.head(5)

Unnamed: 0,COMMUNITY AREA NAME,TOTAL KWH,TOTAL POPULATION
41,Loop,2355801376.0,40990.0
47,Near North Side,1694452724.0,203227.0
49,Near West Side,1183554363.0,116630.0
48,Near South Side,521578417.0,42991.0
76,Woodlawn,482505776.0,76041.0


In [42]:
# List the distinct community area names (used to pick the names for the region filters below)
pd.unique(community_sorted_df["COMMUNITY AREA NAME"])

array(['Loop', 'Near North Side', 'Near West Side', 'Near South Side',
       'Woodlawn', 'Lakeview', 'Lincoln Park', 'West Town', 'Austin',
       'South Lawndale', 'New City', 'Logan Square', 'Belmont Cragin',
       'Edgewater', 'Portage Park', 'Uptown', 'West Ridge',
       'Lower West Side', 'Norwood Park', 'South Deering', 'Rogers Park',
       'Irving Park', 'Ashburn', 'Humboldt Park', 'Forest Glen',
       'West Lawn', 'Lincoln Square', 'Roseland', 'Dunning',
       'North Center', 'Riverdale', 'Avondale', 'Auburn Gresham',
       'Garfield Ridge', 'South Shore', 'North Lawndale', 'Albany Park',
       'Douglas', 'North Park', 'Chicago Lawn', 'Bridgeport',
       'Archer Heights', 'Brighton Park', 'Jefferson Park', 'Chatham',
       'Gage Park', 'Clearing', 'Greater Grand Crossing', 'Beverly',
       'West Englewood', 'Morgan Park', 'Washington Heights',
       'West Pullman', 'Englewood', "O'Hare", 'Hyde Park',
       'Mount Greenwood', 'South Chicago', 'Kenwood', 'Armour Squa

In [9]:
# Community areas grouped as the north side in this analysis
northside_areas = ['North Center', 'Lincoln Park', 'Avondale', 'Logan Square']

# Boolean mask of rows whose area belongs to that group
is_northside = community_sorted_df['COMMUNITY AREA NAME'].isin(northside_areas)
northside_df = community_sorted_df[is_northside]

northside_df

Unnamed: 0,COMMUNITY AREA NAME,TOTAL KWH,TOTAL POPULATION
38,Lincoln Park,396206659.0,226915.0
40,Logan Square,247950170.0,203682.0
51,North Center,144747911.0,88334.0
7,Avondale,140404587.0,99831.0


In [10]:
# Community areas grouped as downtown in this analysis
downtown_areas = ['Near North Side', 'Loop', 'Near South Side']

# Boolean mask of rows whose area belongs to that group
is_downtown = community_sorted_df['COMMUNITY AREA NAME'].isin(downtown_areas)
downtown_df = community_sorted_df[is_downtown]

downtown_df

Unnamed: 0,COMMUNITY AREA NAME,TOTAL KWH,TOTAL POPULATION
41,Loop,2355801376.0,40990.0
47,Near North Side,1694452724.0,203227.0
48,Near South Side,521578417.0,42991.0


In [11]:
# Community areas grouped as the west side in this analysis
westside_areas = [
    'Humboldt Park',
    'West Town',
    'Austin',
    'West Garfield Park',
    'East Garfield Park',
    'Near West Side',
    'North Lawndale',
    'South Lawndale',
    'Lower West Side',
]

# Boolean mask of rows whose area belongs to that group
is_westside = community_sorted_df['COMMUNITY AREA NAME'].isin(westside_areas)
westside_df = community_sorted_df[is_westside]

westside_df

Unnamed: 0,COMMUNITY AREA NAME,TOTAL KWH,TOTAL POPULATION
49,Near West Side,1183554363.0,116630.0
75,West Town,353180223.0,263716.0
5,Austin,339062823.0,247163.0
64,South Lawndale,277285305.0,206516.0
42,Lower West Side,197976142.0,103868.0
32,Humboldt Park,166727801.0,150179.0
52,North Lawndale,128486939.0,97159.0
19,East Garfield Park,63532640.0,59928.0
71,West Garfield Park,48148596.0,47183.0


In [12]:
# Community areas grouped as the south side in this analysis
southside_areas = [
    'Armour Square',
    'Douglas',
    'Oakland',
    'Fuller Park',
    'Grand Boulevard',
    'Kenwood',
    'Washington Park',
    'Hyde Park',
    'Woodlawn',
    'South Shore',
    'Bridgeport',
    'Greater Grand Crossing',
]

# Boolean mask of rows whose area belongs to that group
is_southside = community_sorted_df['COMMUNITY AREA NAME'].isin(southside_areas)
southside_df = community_sorted_df[is_southside]

southside_df

Unnamed: 0,COMMUNITY AREA NAME,TOTAL KWH,TOTAL POPULATION
76,Woodlawn,482505776.0,76041.0
65,South Shore,132033061.0,175465.0
17,Douglas,123544300.0,30126.0
10,Bridgeport,116565964.0,93742.0
29,Greater Grand Crossing,95435918.0,89488.0
33,Hyde Park,78510633.0,99788.0
36,Kenwood,69705276.0,52177.0
2,Armour Square,69310555.0,34829.0
28,Grand Boulevard,67179096.0,61088.0
68,Washington Park,27122641.0,39975.0


In [13]:
# Total energy use per region, scaled to millions of kWh
northside_sum = northside_df['TOTAL KWH'].sum() / 1_000_000
downtown_sum = downtown_df['TOTAL KWH'].sum() / 1_000_000
westside_sum = westside_df['TOTAL KWH'].sum() / 1_000_000
southside_sum = southside_df['TOTAL KWH'].sum() / 1_000_000

# One value per line, in the order north / downtown / west / south
print(northside_sum, downtown_sum, westside_sum, southside_sum, sep="\n")

929.309327
4571.832517
2757.954832
1292.127604


In [14]:
# Total population per region, scaled to thousands of people
northside_pop = northside_df['TOTAL POPULATION'].sum() / 1_000
downtown_pop = downtown_df['TOTAL POPULATION'].sum() / 1_000
westside_pop = westside_df['TOTAL POPULATION'].sum() / 1_000
southside_pop = southside_df['TOTAL POPULATION'].sum() / 1_000

# One value per line, in the order north / downtown / west / south
print(northside_pop, downtown_pop, westside_pop, southside_pop, sep="\n")

618.762
287.208
1292.342
776.692


In [29]:
# Build the per-region summary table from the totals computed in the two cells
# above instead of from numbers copied out of their printed output, so the
# table stays in sync if the CSV or the region definitions change.
# NOTE: this rebinds `data`, which held the CSV path in the load cell.
data = {'Community Area': ['North Side', 'Downtown', 'West Side', 'South Side'],
        'Total KWH / 1 Million': [northside_sum, downtown_sum, westside_sum, southside_sum],
        'Total Population / 1 Thousand': [northside_pop, downtown_pop, westside_pop, southside_pop]}
df2 = pd.DataFrame(data)

# Rank the regions from highest to lowest energy use
df_ordered2 = df2.sort_values("Total KWH / 1 Million", ascending=False)

df_ordered2

Unnamed: 0,Community Area,Total KWH / 1 Million,Total Population / 1 Thousand
1,Downtown,4571.83,287.21
2,West Side,2757.95,1292.34
3,South Side,1292.13,776.69
0,North Side,929.31,618.76


In [16]:
# Filter the DataFrame down to the region label and the energy total to chart.
# The energy column in df_ordered2 is named "Total KWH / 1 Million"; selecting
# "Total KWH" raises KeyError when the notebook is run top to bottom.
com_and_kwh = df_ordered2[["Community Area", "Total KWH / 1 Million"]]

# Set the index to be "Community Area" so the region names are used as bar labels
com_and_kwh = com_and_kwh.set_index("Community Area")

# Order the bars from lowest to highest energy use
com_and_kwh = com_and_kwh.sort_values("Total KWH / 1 Million", ascending=True)

com_and_kwh

Unnamed: 0_level_0,Total KWH
Community Area,Unnamed: 1_level_1
North Side,929.31
South Side,1292.13
West Side,2757.95
Downtown,4571.83


In [45]:
# Plot chart for total KWH usage by region.
# `rot=45` tilts the region labels on the x axis.
ax = com_and_kwh.plot(kind="bar", figsize=(5, 5), rot=45)
ax.set_title("Total KWH by Neighborhood (2010)")
ax.set_xlabel("Neighborhood")
# Values were divided by 1,000,000 when the regional totals were computed
ax.set_ylabel("Total KWH (millions)")
ax.yaxis.grid(True, linestyle='--')

# Lay out and save BEFORE showing: on backends where plt.show() finalises or
# closes the figure, a later tight_layout()/savefig() acts on a new, empty
# figure and writes a blank image.
fig = ax.get_figure()
fig.tight_layout()
fig.savefig('total_kwh.png')

plt.show()

<IPython.core.display.Javascript object>

In [46]:
# Plot chart for total KWH vs. total population, one bar group per region.
# `x="Community Area"` supplies the tick labels; the remaining numeric columns
# (KWH in millions, population in thousands) are drawn as the bars.
multi_plot = df_ordered2.plot(x="Community Area", kind="bar", figsize=(5, 5), rot=45)
multi_plot.set_title("Total KWH vs. Neighborhood Population (2010)")
multi_plot.set_xlabel("Neighborhood")
multi_plot.yaxis.grid(True, linestyle='--')

# Lay out and save BEFORE showing: on backends where plt.show() finalises or
# closes the figure, a later tight_layout()/savefig() acts on a new, empty
# figure and writes a blank image.
fig = multi_plot.get_figure()
fig.tight_layout()
fig.savefig('kwh_vs_pop.png')

plt.show()

<IPython.core.display.Javascript object>