In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Import heat and per capita income of community areas csv files

# Electricity csv import
energy_file = "HEAT_only_some_housing.csv"
heat_df = pd.read_csv(energy_file)

# Check and clean up HEAT_only_some_housing..csv
heat_df.drop(columns='Unnamed: 0', inplace=True)
heat_df.head()

Unnamed: 0,COMMUNITY AREA NAME,BUILDING TYPE,BUILDING_SUBTYPE,THERM JANUARY 2010,THERM FEBRUARY 2010,THERM MARCH 2010,TERM APRIL 2010,THERM MAY 2010,THERM JUNE 2010,THERM JULY 2010,...,THERM 1ST QUARTILE 2010,THERM 2ND QUARTILE 2010,THERM 3RD QUARTILE 2010,AVERAGE HOUSESIZE,OCCUPIED UNITS,OCCUPIED UNITS PERCENTAGE,RENTER-OCCUPIED HOUSING UNITS,RENTER-OCCUPIED HOUSING PERCENTAGE,OCCUPIED HOUSING UNITS,TOTAL POPULATION
0,Archer Heights,Residential,Multi < 7,2326.0,2131.0,1400.0,620.0,502.0,224.0,222.0,...,1334.0,1864.5,2306.0,3.87,23.0,0.9582,9.0,0.391,23.0,89.0
1,Ashburn,Residential,Multi 7+,,,,,,,,...,,,,1.81,62.0,0.9254,50.0,0.8059,62.0,112.0
2,Auburn Gresham,Commercial,Multi < 7,1561.0,1388.0,1507.0,701.0,486.0,380.0,18.0,...,94.0,3028.5,5963.0,3.0,34.0,0.7082,23.0,0.6759,34.0,102.0
3,Austin,Commercial,Multi < 7,,,,,,,,...,400.0,400.0,400.0,2.95,41.0,0.7321,32.0,0.78,41.0,121.0
4,Austin,Commercial,Multi < 7,310.0,268.0,163.0,77.0,57.0,23.0,19.0,...,949.0,949.0,949.0,3.26,19.0,0.8261,11.0,0.579,19.0,62.0


In [3]:
# Importe per capita income of community areas csv
community_income_file = "../community_and_per_capita_income.csv"
income_df = pd.read_csv(community_income_file)

# Check and clean up community_and_per_capita_income.csv
income_df.drop(['Unnamed: 0'], axis=1, inplace=True)
income_df.head()

Unnamed: 0,COMMUNITY AREA NAME,PER CAPITA INCOME
0,Rogers Park,23714
1,West Ridge,21375
2,Uptown,32355
3,Lincoln Square,35503
4,North Center,51615


In [5]:
# Recording original length of heat_df before cleaning
print(f"Original length of heat_df: {len(heat_df)}")

Original length of heat_df: 67051


In [6]:
# Clean up heat_df
# Drop commercial and industrial building types

print(f"The break down of building types: {heat_df['BUILDING TYPE'].value_counts()}")

residential_heat = heat_df.set_index('BUILDING TYPE')
residential_heat.drop(index="Commercial", inplace=True)
residential_heat.drop(index="Industrial", inplace=True)

# Record new length of residential_heat DataFrame 
print(f"----------------------------------------------")
print(f"The length of residential_heat DataFrame is {len(residential_heat)}")

The break down of building types: Residential    49747
Commercial     17185
Industrial        42
Name: BUILDING TYPE, dtype: int64
----------------------------------------------
The length of residential_heat DataFrame is 49824


In [8]:
# Continue cleaning heat data
# Reset index of residential_heat, so 'Building Type' is just a normal column
residential_heat.reset_index(inplace=True)

# Check residential_electricity, confirm dropped commercial and industrial building types
print(f"Building type in residential_heat: {residential_heat['BUILDING TYPE'].value_counts()}")
residential_heat.head()


Building type in residential_heat: Residential    49747
Name: BUILDING TYPE, dtype: int64


Unnamed: 0,index,BUILDING TYPE,COMMUNITY AREA NAME,BUILDING_SUBTYPE,THERM JANUARY 2010,THERM FEBRUARY 2010,THERM MARCH 2010,TERM APRIL 2010,THERM MAY 2010,THERM JUNE 2010,...,THERM 1ST QUARTILE 2010,THERM 2ND QUARTILE 2010,THERM 3RD QUARTILE 2010,AVERAGE HOUSESIZE,OCCUPIED UNITS,OCCUPIED UNITS PERCENTAGE,RENTER-OCCUPIED HOUSING UNITS,RENTER-OCCUPIED HOUSING PERCENTAGE,OCCUPIED HOUSING UNITS,TOTAL POPULATION
0,0,Residential,Archer Heights,Multi < 7,2326.0,2131.0,1400.0,620.0,502.0,224.0,...,1334.0,1864.5,2306.0,3.87,23.0,0.9582,9.0,0.391,23.0,89.0
1,1,Residential,Ashburn,Multi 7+,,,,,,,...,,,,1.81,62.0,0.9254,50.0,0.8059,62.0,112.0
2,2,Residential,Austin,Multi 7+,,,,,,,...,,,,2.93,27.0,0.871,27.0,1.0,27.0,79.0
3,3,Residential,Austin,Multi 7+,,,,,,,...,,,,3.82,22.0,0.6667,16.0,0.727,22.0,84.0
4,4,Residential,Austin,Multi < 7,,,,,,,...,,,,0.0,0.0,,0.0,,0.0,0.0


In [10]:
# Create new DataFrame of just community name, 'THERM MEAN 2010', total population
therm_mean_df = residential_heat[['COMMUNITY AREA NAME', 'THERM MEAN 2010', 'TOTAL POPULATION']]

print(f"Number of heat readings {len(therm_mean_df)}")

# Clean up data by dropping NaN values
therm_mean_df.dropna(inplace=True)

print(f"Number of heat readings after dropping NaN, {len(therm_mean_df)}")
print(f"Number of community areas: {therm_mean_df['COMMUNITY AREA NAME'].nunique()}")

Number of heat readings 49824
Number of heat readings after dropping NaN, 49386
Number of community areas: 77


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [11]:
# Compare community area names from income_df and therm_mean_df

# Prepare new DataFrame, income_names
income_names = income_df['COMMUNITY AREA NAME']

# Adjust income_names by 'COMMUNITY AREA NAME'
# Sort alphabetically
income_names = pd.DataFrame(income_names.sort_values())

# Check income_names DataFrame
income_names.head()

Unnamed: 0,COMMUNITY AREA NAME
13,Albany Park
56,Archer Heights
33,Armour Square
69,Ashburn
70,Auburn Gresham


In [12]:
# Reset index of income_names so it is at normal 0, 1, 2, etc
# Drop 'index' column
income_names = income_names.reset_index()
income_names = income_names.drop(columns=['index'])

# Check income_names with no index data
income_names.head()

Unnamed: 0,COMMUNITY AREA NAME
0,Albany Park
1,Archer Heights
2,Armour Square
3,Ashburn
4,Auburn Gresham


In [14]:
# Get 'COMMUNITY AREA NAME' data from therm_mean_df
therm_names = therm_mean_df['COMMUNITY AREA NAME']

# Get series of community area names, sort index (community area names) alphabetically
# Create DataFrame, therm_names from series
therm_names = pd.DataFrame(therm_names.value_counts().sort_index())

# Reset index so there is a dedicated column to community area names
therm_names = therm_names.reset_index()

# Check kwh_names DataFrame
therm_names.head()

Unnamed: 0,index,COMMUNITY AREA NAME
0,Albany Park,770
1,Archer Heights,313
2,Armour Square,177
3,Ashburn,756
4,Auburn Gresham,1151


In [15]:
# Drop column labeled 'COMMUNITY AREA NAME'
# Actual data is of number of heat readings in residential buildings
therm_names.drop(columns={'COMMUNITY AREA NAME'}, inplace=True)

# Rename 'index' to 'COMMUNITY AREA NAME' to reflect data
therm_names.rename(columns={'index': 'COMMUNITY AREA NAME'}, inplace=True)

# Check therm_names for rename and removal of columns
therm_names.head()

Unnamed: 0,COMMUNITY AREA NAME
0,Albany Park
1,Archer Heights
2,Armour Square
3,Ashburn
4,Auburn Gresham


In [16]:
# Compare income_names and therm_names 
# Determine if all COMMUNITY AREA NAMEs match up

# Adjust the display of DataFrame so all 77 rows will show 
# Will make it easier to quickly evaluate mismatched area names
pd.set_option('display.max_rows', 80)
income_names == therm_names


Unnamed: 0,COMMUNITY AREA NAME
0,True
1,True
2,True
3,True
4,True
5,True
6,True
7,True
8,True
9,True


In [17]:
# Print out the two mismatched area names at indices 37 and 44
# These results indicate where and which DataFrame to adjust
print(f"Community Area Names are mismatched at index 37") 
print(f"income: {income_names['COMMUNITY AREA NAME'][37]}")
print(f"therm: {therm_names['COMMUNITY AREA NAME'][37]}") 
print(f"-------------------------------------------")
      
print(f"Community Area Names are mismatched at index 44")
print(f"income: {income_names['COMMUNITY AREA NAME'][44]}")
print(f"therm: {therm_names['COMMUNITY AREA NAME'][44]}") 

Community Area Names are mismatched at index 37
income: Lake View
therm: Lakeview
-------------------------------------------
Community Area Names are mismatched at index 44
income: Montclaire
therm: Montclare


In [18]:
# Need to adjust therm_mean_df and income_df
# Adjustments based on City_of_Chicago_Community_Areas.pdf 

# Adjust therm_mean_df: Lakeview to Lake View
therm_mean_df.replace(to_replace='Lakeview', value='Lake View', inplace=True)

# Adjust income_df: Montclaire to Montclare
income_df.replace(to_replace='Montclaire', value='Montclare', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


In [19]:
# Merge per capita income data from income_df to therm_mean_df
income_therm_mean = income_df.merge(therm_mean_df, how='outer', on='COMMUNITY AREA NAME')

# Check merged income_kwh_mean
income_therm_mean.head()

Unnamed: 0,COMMUNITY AREA NAME,PER CAPITA INCOME,THERM MEAN 2010,TOTAL POPULATION
0,Rogers Park,23714,4656.0,198.0
1,Rogers Park,23714,1045.0,107.0
2,Rogers Park,23714,11770.0,268.0
3,Rogers Park,23714,47.0,20.0
4,Rogers Park,23714,4533.0,77.0


In [21]:
# Save merged residential electricity and income data to separate csv file
income_therm_mean.to_csv('HEAT_mean_2010_merged_income.csv')