# Importing Data

In [3]:
import pandas as pd
import censusgeocode as cg 
# Load the house data; the path is relative, so it resolves against the
# kernel's current working directory.
HOUSE_DATA_CSV = "green_data/kc_house_data.csv"
df = pd.read_csv(HOUSE_DATA_CSV)

# Preview the free-text street addresses.
df.loc[:, "address"]

0        2102 Southeast 21st Court, Renton, Washington ...
1        11231 Greenwood Avenue North, Seattle, Washing...
2        8504 South 113th Street, Seattle, Washington 9...
3        4079 Letitia Avenue South, Seattle, Washington...
4        2193 Northwest Talus Drive, Issaquah, Washingt...
                               ...                        
30150    4673 Eastern Avenue North, Seattle, Washington...
30151    4131 44th Avenue Southwest, Seattle, Washingto...
30152    910 Martin Luther King Jr Way, Seattle, Washin...
30153    17127 114th Avenue Southeast, Renton, Washingt...
30154    18615 7th Avenue South, Burien, Washington 981...
Name: address, Length: 30155, dtype: object

In [56]:
# Lead-hazard table: keep only the King County rows, and only the tract id
# plus the estimated lead-hazard percentage.
lead_df = pd.read_csv(
    "environment_data/census_data/Lead_Risk_from_Housing(2010_census).csv"
)
lead_in_king = lead_df["County Name"].eq("King")
lead_columns = ["Census Tract", "% Units w/ Lead Hazard (Estimated)"]
lead_king_df = lead_df.loc[lead_in_king, lead_columns]
lead_king_df


Unnamed: 0,Census Tract,% Units w/ Lead Hazard (Estimated)
296,53033000100,10.24
297,53033000200,20.11
298,53033000300,25.27
299,53033000401,9.47
300,53033000402,19.95
...,...,...
688,53033032602,2.71
689,53033032702,6.57
690,53033032703,33.52
691,53033032704,10.22


In [57]:
# Heavy-traffic proximity table: keep only the King County rows, and only the
# tract id plus the proximity measure.
traffic_df = pd.read_csv(
    "environment_data/census_data/Proximity_to_Heavy_Traffic_Roadways (2010_census).csv"
)
traffic_in_king = traffic_df["County Name"].eq("King")
traffic_columns = ["Census Tract", "Proximity to Heavy Traffic Roadways"]
traffic_king_df = traffic_df.loc[traffic_in_king, traffic_columns]

# Discard the final King County row (selected by its index label).
# NOTE(review): the reason for dropping it is not recorded in this notebook --
# presumably a trailing row that should not take part in the tract-level merge;
# confirm against the raw CSV.
traffic_last_label = traffic_king_df.index[-1]
traffic_king_df1 = traffic_king_df.drop(index=traffic_last_label)
traffic_king_df1

Unnamed: 0,Census Tract,Proximity to Heavy Traffic Roadways
302,53033000100,106265.93
303,53033000200,215000.00
304,53033000300,215000.00
305,53033000401,140603.44
306,53033000402,173851.70
...,...,...
694,53033032602,67000.00
695,53033032702,67000.00
696,53033032703,65000.00
697,53033032704,49000.00


In [58]:
# Superfund (National Priorities List) proximity table: keep only the King
# County rows, and only the tract id plus the average PNPL value.
# NOTE(review): this file name says "2021_census" while the other environment
# files say "2010_census" -- confirm the tract ids are comparable across them.
superfund_df = pd.read_csv(
    "environment_data/census_data/Proximity_to_National_Priorities_List_Facilities(superfund)(2021_census).csv"
)
superfund_in_king = superfund_df["County Name"].eq("King")
superfund_columns = ["Census Tract", "Average PNPL"]
superfund_king_df = superfund_df.loc[superfund_in_king, superfund_columns]
superfund_king_df


Unnamed: 0,Census Tract,Average PNPL
300,53033000100,0.06
301,53033000200,0.06
302,53033000300,0.06
303,53033000401,0.06
304,53033000402,0.07
...,...,...
692,53033032602,0.06
693,53033032702,0.05
694,53033032703,0.05
695,53033032704,0.05


In [61]:
# Toxic-release (RSEI model) table: keep only the King County rows, and only
# the tract id plus the average RSEI concentration.
toxic_df = pd.read_csv(
    "environment_data/census_data/Toxic_Releases_from_Facilities_(RSEI_Model)(2010_census).csv"
)
toxic_in_king = toxic_df["County Name"].eq("King")
toxic_columns = ["Census Tract", "Average RSEI Concentrations"]
toxic_king_df = toxic_df.loc[toxic_in_king, toxic_columns]

# Discard the final King County row (selected by its index label).
# NOTE(review): the reason for dropping it is not recorded in this notebook --
# presumably a trailing row that should not take part in the tract-level merge;
# confirm against the raw CSV.
toxic_last_label = toxic_king_df.index[-1]
toxic_king_df1 = toxic_king_df.drop(index=toxic_last_label)
toxic_king_df1

Unnamed: 0,Census Tract,Average RSEI Concentrations
299,53033000100,4447.00
300,53033000200,7088.62
301,53033000300,2410.15
302,53033000401,4036.39
303,53033000402,3909.42
...,...,...
691,53033032602,3191.08
692,53033032702,1271.05
693,53033032703,297.25
694,53033032704,943.36


In [62]:
# Wildfire-smoke table: keep only the King County rows, and only the tract id
# plus the smoke-day figure.
smoke_df = pd.read_csv(
    "environment_data/census_data/Wildfire_Smoke_(2010_census).csv"
)
smoke_in_king = smoke_df["County Name"].eq("King")
smoke_columns = ["Census Tract", "Smoke Days"]
smoke_king_df = smoke_df.loc[smoke_in_king, smoke_columns]
smoke_king_df

Unnamed: 0,Census Tract,Smoke Days
302,53033000100,6.29
303,53033000200,6.14
304,53033000300,6.14
305,53033000401,6.14
306,53033000402,6.14
...,...,...
694,53033032602,6.43
695,53033032702,6.86
696,53033032703,6.43
697,53033032704,6.29


In [70]:
# Fold the five tract-level tables into one wide frame keyed on "Census Tract".
# Each step is pandas' default (inner) merge, so a tract must be present in
# every table to appear in the result.
environment_merge = lead_king_df
for tract_table in (
    traffic_king_df1,
    superfund_king_df,
    toxic_king_df1,
    smoke_king_df,
):
    environment_merge = environment_merge.merge(tract_table, on="Census Tract")
environment_merge

Unnamed: 0,Census Tract,% Units w/ Lead Hazard (Estimated),Proximity to Heavy Traffic Roadways,Average PNPL,Average RSEI Concentrations,Smoke Days
0,53033000100,10.24,106265.93,0.06,4447.00,6.29
1,53033000200,20.11,215000.00,0.06,7088.62,6.14
2,53033000300,25.27,215000.00,0.06,2410.15,6.14
3,53033000401,9.47,140603.44,0.06,4036.39,6.14
4,53033000402,19.95,173851.70,0.07,3909.42,6.14
...,...,...,...,...,...,...
392,53033032602,2.71,67000.00,0.06,3191.08,6.43
393,53033032702,6.57,67000.00,0.05,1271.05,6.86
394,53033032703,33.52,65000.00,0.05,297.25,6.43
395,53033032704,10.22,49000.00,0.05,943.36,6.29


In [20]:
# Geocode one sample address and read the GEOID of the first census tract
# listed under the first match.
# NOTE(review): the GEOID comes back as a string; check its dtype against the
# "Census Tract" column before joining on it.
matches = cg.address(
    '11231 Greenwood Avenue North', city='Seattle', state='WA', zip='98133'
)
tract = matches[0]["geographies"]['Census Tracts'][0]
GEO_ID = tract["GEOID"]
GEO_ID

'53033001400'

In [21]:
# Geocode a second sample address and read the GEOID of the first census tract
# listed under the first match. This overwrites the GEO_ID set previously.
matches = cg.address(
    '4673 Eastern Avenue North', city='Seattle', state='WA', zip='98103'
)
tract = matches[0]["geographies"]['Census Tracts'][0]
GEO_ID = tract["GEOID"]
GEO_ID

'53033005100'

In [20]:
# List the column labels of the house-data frame.
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'greenbelt', 'nuisance', 'view',
       'condition', 'grade', 'heat_source', 'sewer_system', 'sqft_above',
       'sqft_basement', 'sqft_garage', 'sqft_patio', 'yr_built',
       'yr_renovated', 'address', 'lat', 'long'],
      dtype='object')

In [23]:
# Print dtype and non-null count per column; used here to spot columns with
# missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30155 entries, 0 to 30154
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             30155 non-null  int64  
 1   date           30155 non-null  object 
 2   price          30155 non-null  float64
 3   bedrooms       30155 non-null  int64  
 4   bathrooms      30155 non-null  float64
 5   sqft_living    30155 non-null  int64  
 6   sqft_lot       30155 non-null  int64  
 7   floors         30155 non-null  float64
 8   waterfront     30155 non-null  object 
 9   greenbelt      30155 non-null  object 
 10  nuisance       30155 non-null  object 
 11  view           30155 non-null  object 
 12  condition      30155 non-null  object 
 13  grade          30155 non-null  object 
 14  heat_source    30123 non-null  object 
 15  sewer_system   30141 non-null  object 
 16  sqft_above     30155 non-null  int64  
 17  sqft_basement  30155 non-null  int64  
 18  sqft_g

In [24]:
# Enumerate the distinct heat_source values; missing entries show up as nan.
df['heat_source'].unique()

array(['Gas', 'Oil', 'Electricity', 'Gas/Solar', 'Electricity/Solar',
       'Other', nan, 'Oil/Solar'], dtype=object)

In [26]:
# (rows, columns) of the house-data frame.
df.shape

(30155, 25)

# EDA

# Train/Test Split

# Training

# Test

# Check Assumptions