In [432]:
import pandas as pd
from functools import reduce

In [433]:
data = pd.read_csv('2015_Greenhouse_Gas_Report-_Data.csv')
data.head()

Unnamed: 0,Source,County,Address,Total Emissions (MT CO2e),Biogenic CO2 (MT CO2),Fossil CO2 (MT CO2),Methane (MT CO2e),Nitrous Oxide (MT CO2e),Hydrofluorocarbons (MT CO2e),Perfluorocarbons (MT CO2e),Sulfur Hexafluoride (MT CO2e),Other (MT CO2e),"Location (Latitude, Longitude)",Sector,Sub Sector,Clean Air Rule,"Energy-Intensive, Trade-Exposed",Point Size- Based on Total Emissions
0,Agrium Kennewick Fertilizer Operations (KFO) -...,Benton,"227515 E. Bowles Rd Kennewick, WA 99337",155888,0,10731,1,145157,0,0,0,0,"(46.165957, -119.017218)",Chemicals,Nitric Acid Production,YES,YES,3
1,Air Liquide Hydrogen Plant - Anacortes,Skagit,"8581 South Texas Road Anacortes, WA 98221",64413,0,64413,0,0,0,0,0,0,"(48.465617, -122.556032)",Chemicals,Hydrogen Production,NO,NO,2
2,Alcoa Intalco Works - Ferndale,Whatcom,"4050 Mountain View Road Ferndale, WA 98248",1195786,0,418042,10480,34,0,767230,0,0,"(48.8455, -122.7055)",Metals,Aluminum Production,YES,YES,4
3,Alcoa Wenatchee Works - Malaga,Chelan,"6200 Malaga/Alcoa Hwy. Malaga, WA 98828",331207,0,254025,14,16,0,77152,0,0,"(47.3567, -120.1273)",Metals,Aluminum Production,YES,YES,3
4,Ardagh Glass Inc. - Seattle,King,"5801 East Marginal Way South Seattle, WA 98134",76674,0,76614,27,32,0,0,0,0,"(47.55242, -122.33739)",Minerals,Glass Production,YES,NO,2


### Some Cleanup to Start

In [434]:
#Defining new column names that do not have random spaces at the end
new_header={data[:0][i].name:data[:0][i].name[0:-1] if data[:0][i].name.endswith(" ") 
            else data[:0][i].name for i in data[:0]}

#Renaming columns
data=data.rename(columns=new_header)

In [435]:
# Add numerical IDs to rows
data["ID"]=range(data.shape[0])

### Sector & Subsector IDs

In [436]:
#Get sectors
sectors=set(data["Sector"])

#Assign sector IDs
sector_ID={s:i for s,i in zip(sectors, range(len(sectors)))}

#Add column with IDs
data["Sector ID"]=data["Sector"].map(sector_ID)

In [437]:
#Set up list of subsectors by sector
subsectors=[set()]*len(sectors)

#Make a list of all the sets of subsets, indexed by sector IDs
for subsector,sector in zip(data["Sub Sector"],data["Sector ID"]):
    subsectors[sector]=subsectors[sector].union({subsector})

#Makes sets in above list into dictionaries
temp_subsector_ID=[{s:i for s,i in zip(subs, range(len(subs)))} for subs in subsectors]


#Define function to join dictionaries because this doesn't exist by default
def dict_join(x,y):
    temp_dict=x.copy()
    temp_dict.update(y)
    return(temp_dict)

#Flatten list of dictionaries into one dictionary
subsector_ID=reduce(dict_join, temp_subsector_ID)

#Use dictionary to add column of subsector IDs by sector
data["Sub Sector ID"]=data["Sub Sector"].map(subsector_ID)

In [438]:
#Some feature combination, though of very questionable value
data["Sector, Subsector ID"]=tuple(zip(data["Sector ID"], data["Sub Sector ID"]))

### County IDs

In [439]:
#Get counties
counties=set(data["County"])

#Assign county IDs
county_ID={c:i for c,i in zip(counties, range(len(counties)))}

#Add column with IDs
data["County ID"]=data["County"].map(county_ID)

### Converting Words to Numerical Truth Values

In [440]:
#Define a dict for converting YES and NO to numerical values
truth_dict={"YES":1, "NO":0}

#Apply dict to applicable columns
data["Clean Air Rule"] = data["Clean Air Rule"].map(truth_dict)
data["Energy-Intensive, Trade-Exposed"]=data["Energy-Intensive, Trade-Exposed"].map(truth_dict)

### Splitting Company Names from Cities

In [441]:
#Get company names and city names by splitting data from Source at " - "
companies, cities=zip(*[i.split(" - ") for i in data["Source"]])

#Make new columns
data["Company"]=companies
data["City"]=cities

In [442]:
#Make sets
companies_set=set(companies)
cities_set=set(cities)

#Assign IDs
company_ID={c:i for c,i in zip(companies_set, range(len(companies_set)))}
city_ID={c:i for c,i in zip(cities_set, range(len(cities_set)))}

#Add column with IDs
data["Company ID"]=data["Company"].map(company_ID)
data["City ID"]=data["City"].map(city_ID)

In [443]:
#Another questionably useful feature combination
data["City, County ID"]=tuple(zip(data["City ID"], data["County ID"]))

In [444]:
data

Unnamed: 0,Source,County,Address,Total Emissions (MT CO2e),Biogenic CO2 (MT CO2),Fossil CO2 (MT CO2),Methane (MT CO2e),Nitrous Oxide (MT CO2e),Hydrofluorocarbons (MT CO2e),Perfluorocarbons (MT CO2e),...,ID,Sector ID,Sub Sector ID,"Sector, Subsector ID",County ID,Company,City,Company ID,City ID,"City, County ID"
0,Agrium Kennewick Fertilizer Operations (KFO) -...,Benton,"227515 E. Bowles Rd Kennewick, WA 99337",155888,0,10731,1,145157,0,0,...,0,6,0,"(6, 0)",14,Agrium Kennewick Fertilizer Operations (KFO),Kennewick,8,66,"(66, 14)"
1,Air Liquide Hydrogen Plant - Anacortes,Skagit,"8581 South Texas Road Anacortes, WA 98221",64413,0,64413,0,0,0,0,...,1,6,1,"(6, 1)",17,Air Liquide Hydrogen Plant,Anacortes,50,22,"(22, 17)"
2,Alcoa Intalco Works - Ferndale,Whatcom,"4050 Mountain View Road Ferndale, WA 98248",1195786,0,418042,10480,34,0,767230,...,2,0,2,"(0, 2)",19,Alcoa Intalco Works,Ferndale,104,74,"(74, 19)"
3,Alcoa Wenatchee Works - Malaga,Chelan,"6200 Malaga/Alcoa Hwy. Malaga, WA 98828",331207,0,254025,14,16,0,77152,...,3,0,2,"(0, 2)",21,Alcoa Wenatchee Works,Malaga,30,35,"(35, 21)"
4,Ardagh Glass Inc. - Seattle,King,"5801 East Marginal Way South Seattle, WA 98134",76674,0,76614,27,32,0,0,...,4,11,1,"(11, 1)",4,Ardagh Glass Inc.,Seattle,86,2,"(2, 4)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,Vertellus Performance Chemicals LLC - Elma,Grays Harbor,"4800 State Route 12 Elma, WA 98541",17600,0,17582,8,10,0,0,...,136,6,2,"(6, 2)",16,Vertellus Performance Chemicals LLC,Elma,5,80,"(80, 16)"
137,WaferTech LLC - Camas,Clark,"5509 NW Parker St Camas, WA 98607",172725,0,13050,6,7177,10646,78165,...,137,8,2,"(8, 2)",25,WaferTech LLC,Camas,78,37,"(37, 25)"
138,Washington State University - Pullman,Whitman,"2660 Grimes Way Pullman, WA 99164-1172",57370,0,57309,28,34,0,0,...,138,5,0,"(5, 0)",5,Washington State University,Pullman,52,30,"(30, 5)"
139,Waste Management Greater Wenatchee Regional La...,Douglas,"191 Webb Road East Wenatchee, WA 98802",31208,0,0,31208,0,0,0,...,139,4,0,"(4, 0)",27,Waste Management Greater Wenatchee Regional La...,East Wenatchee,101,72,"(72, 27)"
