In [1]:
import numpy as np
import pandas as pd
import helper

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn in this notebook.
style.use('fivethirtyeight')

In [3]:
## colors
# Five-colour custom palette, installed as seaborn's active colour cycle.
colors = [
    "#FF0B04",
    "#F1BE48",
    "#B9975B",
    "#8B5B29",
    "#524727",
]
sns.set_palette(
    sns.color_palette(colors)
)

In [4]:
## open housing latlong info
# The first CSV column is used as the row index.
latlong = pd.read_csv(
    'ames_housing_latlong.csv',
    index_col=0,
    low_memory=False,
)

In [5]:
## load housing data
# The first CSV column is used as the row index.
housing = pd.read_csv(
    'Ames_Housing_Price_Data.csv',
    index_col=0,
    low_memory=False,
)

# Project helper returns the processed train/test frames. PID is kept because
# it is the merge key for the lat/long table further down.
# NOTE(review): num_to_cat_list presumably converts MSSubClass to categorical - confirm in helper.
train, test = helper.data_processing_wrapper(
    housing,
    remove_PID=False,
    num_to_cat_list=['MSSubClass'],
)

# Column-name lists split by dtype; the target SalePrice is taken out of the numeric list.
cat_feats = train.select_dtypes(include=['object', 'bool']).columns.to_list()
num_cols = train.select_dtypes(include=['float64', 'int64']).columns.to_list()
num_cols.remove('SalePrice')


In [6]:
# Keep only the identifier, address and coordinate columns of the lat/long table.
colnames = ['PID', 'Prop_Addr', 'latitude', 'longitude']
latlong = latlong.loc[:, colnames]

In [23]:
# Left-join the coordinates onto each split using PID as the key.
combined_train = train.merge(latlong, how='left', on='PID')
combined_test = test.merge(latlong, how='left', on='PID')

In [8]:
## Identify missing values in lat long
# Prints four counts in order: train latitude, train longitude, test latitude, test longitude.
for frame in (combined_train, combined_test):
    for coord in ('latitude', 'longitude'):
        print(frame[coord].isna().sum())

## get the geo cordinates and set the dictionary for housing dataset
# Called without a map, the helper returns (frame, map); that map is then
# passed in when the test split is processed.
combined_train, latlong_map = helper.geo_cords_imputing(combined_train)
combined_test = helper.geo_cords_imputing(combined_test, latlong_map)

## drop training rows that still lack coordinates, then renumber the rows from 0
combined_train = (
    combined_train
    .dropna(subset=['latitude', 'longitude'])
    .reset_index(drop=True)
)

# Check missing values
## Identify missing values in lat long
for frame in (combined_train, combined_test):
    for coord in ('latitude', 'longitude'):
        print(frame[coord].isna().sum())

15
15
4
4
0
0
0
0


In [9]:
# Missing-coordinate counts: train latitude, train longitude, test latitude, test longitude.
for frame in (combined_train, combined_test):
    for coord in ('latitude', 'longitude'):
        print(frame[coord].isna().sum())

0
0
0
0


In [10]:
# (rows, columns) of the training frame after the lat/long merge and NaN-row drop.
combined_train.shape

(1868, 84)

## Latlong

In [11]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [12]:
## Load parks data
# Raw parks table, plus the parks_latlong file whose first CSV column is the row index.
parks = pd.read_csv('parks2.csv', low_memory=False)
parks_latlong = pd.read_csv(
    'parks_latlong.csv',
    index_col=0,
    low_memory=False,
)

In [14]:
# # RUN THIS TO ADD LAT/LONG TO THE PARKS FILE (ONE-OFF GEOCODING STEP)
# THE RESULT IS ALREADY SAVED AS parks_latlong.csv IN THIS FOLDER

# for i in range(parks.shape[0]):
#     try:
#         address = parks['parkAddress'][i]
#         geolocator = Nominatim(user_agent="ames_location")
#         geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
#         loc = geolocator.geocode(address)
#         parks.loc[i,'latitude'] = loc.latitude
#         parks.loc[i,'longitude'] = loc.longitude
#         print(loc.latitude, loc.longitude)
#     except:
#         print(f'{address} didnt work')
# parks.to_csv('parks_latlong.csv')

### Find distance to nearest park for each house in housing data

In [16]:
import haversine as hs

In [24]:
# Stack the train rows on top of the test rows; original row labels are kept at this point.
combined_all = pd.concat(objs=[combined_train, combined_test], axis='index')

In [25]:
# Renumber the stacked rows 0..n-1 and discard the old labels.
combined_all = combined_all.reset_index(drop=True)

In [26]:
# Preview the first five rows of the combined frame.
combined_all.head(n=5)

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Prop_Addr,latitude,longitude
0,905101330,1296,90000,90,RL,72.0,10791,Pave,0,Reg,...,,Shed,500,10,2006,WD,Normal,3915 QUEBEC ST,42.033394,-93.671438
1,909451100,1229,137000,160,RM,24.0,1488,Pave,0,Reg,...,GdPrv,,0,10,2009,WD,Normal,1505 LITTLE BLUESTEM CT UNIT 119,42.009938,-93.648234
2,527451450,948,89000,160,RM,21.0,1680,Pave,0,Reg,...,,,0,7,2006,WD,Normal,1464 BRECKINRIDGE CT,42.051657,-93.629641
3,903232190,1040,123900,50,RM,52.0,6240,Pave,0,Reg,...,,,0,5,2010,WD,Normal,1028 MARSTON AVE,42.031644,-93.62345
4,914452120,912,156000,85,RL,61.990202,7540,Pave,0,IR1,...,MnPrv,,0,6,2007,WD,Normal,418 OPAL CIR,41.9942,-93.606414


In [27]:
# ## shortest distance to any public parkspace
# Build a houses x parks table of haversine distances in miles, keyed by PID.
# Columns: 'PID' followed by one column per park, labelled with its parkName.
parklist = parks_latlong['parkName'].to_list()
parksFrameCols = ['PID'] + parklist

# Pull the coordinates out once as plain (lat, lon) pairs. Pairing rows
# positionally avoids a label-based .loc lookup per house/park pair and does
# not depend on either frame having a 0..n-1 index.
house_coords = list(zip(combined_all['latitude'], combined_all['longitude']))
park_coords = list(zip(parks_latlong['latitude'], parks_latlong['longitude']))

distance_rows = []
for pid, house in zip(combined_all['PID'], house_coords):
    row = []
    for park_name, park in zip(parklist, park_coords):
        try:
            dist = hs.haversine(house, park, unit='mi')
        except (TypeError, ValueError):
            # Best effort: report the unusable pair and leave the cell empty.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            print(f"{pid} for {park_name} didn't work")
            dist = float('nan')
        row.append(dist)
    distance_rows.append(row)

# Assemble the frame in one step. Every park gets a column even if all of its
# distances failed, so the column labels always line up with parksFrameCols.
parksFrame = pd.DataFrame(distance_rows, index=combined_all.index, columns=parklist)
parksFrame.insert(0, 'PID', combined_all['PID'].to_numpy())

In [None]:
# ## shortest distance to any public parkspace
# parksFrame = pd.DataFrame()

# for i in range(combined_train.shape[0]):
#     for park in range(parks_latlong.shape[0]):
#         loc1 = combined_train.loc[i,['latitude', 'longitude']]
#         loc2 = parks_latlong.loc[j, ['latitude', 'longitude']]
#         dist = hs.haversine(loc1, loc2, unit = 'mi')

In [28]:
## Min distances
# Nearest-park distance per house, overall and per amenity type.
# The overall minimum is taken over the park-distance columns only. Taking it
# over the whole frame would also include PID and any derived columns
# (e.g. a parks-within-range count), which corrupts the result when this cell
# is re-run after those columns exist.
all_parks = parks_latlong['parkName'].to_list()
parksFrame['DistToPark'] = parksFrame[all_parks].min(axis=1)

# Output feature name -> 0/1 flag column in parks_latlong marking parks with that amenity.
amenity_flags = {
    'DistToRec': 'parkRecFacility',
    'DistToPicnic': 'parkPicnicArea',
    'DistToPlayground': 'parkPlayground',
}
for feature_name, flag_col in amenity_flags.items():
    # Restrict to the parks that have the amenity, then take the row-wise minimum.
    parklist = parks_latlong.loc[parks_latlong[flag_col] == 1, 'parkName'].to_list()
    parksFrame[feature_name] = parksFrame[parklist].min(axis=1)


In [29]:
## Num parks within half a mile
# Distance threshold in miles; matches the 'halfMileParks' feature name.
HALF_MILE = 0.5

parklist = parks_latlong['parkName'].to_list()
# Count, per house, the park-distance columns strictly below the threshold.
# Only the park columns are compared, so PID and the DistTo* summary columns
# are excluded explicitly. Missing distances compare as False and are not counted.
parksFrame['halfMileParks'] = (parksFrame[parklist] < HALF_MILE).sum(axis=1)

In [30]:
# Preview the first five rows of the park-distance features.
parksFrame.head(n=5)

Unnamed: 0,PID,Ada Hayden,Ames Dog Park,band shell park,Moore Memorial,brookside park,emma mccarthy lee park,river valley park,charles & June Calhoun Park,christofferson park,...,teagarden park,tom evans plaza,community center,municpal pool,munn woods,DistToPark,DistToRec,DistToPicnic,DistToPlayground,halfMileParks
0,905101330,3.002997,3.930975,3.196491,1.447277,2.870328,0.356751,2.602768,3.206397,2.87616,...,4.35818,2.982428,2.787511,2.10246,0.564243,0.356751,0.356751,0.356751,0.356751,2
1,909451100,1.320744,2.181423,2.259811,2.630135,2.018363,1.75187,2.214805,3.896283,0.899943,...,2.422524,2.024259,1.922946,2.29169,1.55438,0.164117,0.164117,1.02631,0.164117,1
2,527451450,2.931373,3.569639,2.036303,1.132865,1.845561,2.247466,1.260573,0.860669,3.806113,...,4.321133,1.998802,1.849471,0.761332,2.774948,0.407381,0.407381,0.860669,0.407381,1
3,903232190,1.526446,2.164646,0.793028,1.813343,0.496685,2.1168,0.249686,2.186515,2.566621,...,2.90325,0.650754,0.462554,0.757363,2.449661,0.249686,0.249686,0.249686,0.249686,5
4,914452120,1.345306,0.59808,2.209667,4.36585,2.313233,3.978178,2.890029,4.854778,1.897304,...,0.235077,2.169181,2.297986,3.467474,3.924008,0.172435,0.172435,1.345306,0.172435,2


In [31]:
# Persist the park features; the row index is written as the first CSV column.
parksFrame.to_csv('parksFeatures.csv', index=True)