In [5]:
import numpy as np
import pandas as pd
import helper

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')

In [7]:
## colors
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [8]:
## open housing latlong info
latlong = pd.read_csv('ames_housing_latlong.csv',index_col=0,low_memory = False)

In [9]:
## load housing data
housing = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0,low_memory = False)

train, test = helper.data_processing_wrapper(housing, remove_PID=False, 
                                               num_to_cat_list = ['MSSubClass'])
cat_feats = train.select_dtypes(['object','bool']).columns.to_list()
num_cols = train.select_dtypes(['float64','int64']).columns.to_list()
num_cols.remove('SalePrice')


In [10]:
colnames = ['PID', 'Prop_Addr', 'latitude', 'longitude']
latlong = latlong[colnames]

In [11]:
combined_train = train.merge(latlong, how = 'left', left_on = 'PID', right_on = 'PID')
combined_test = test.merge(latlong, how = 'left', left_on = 'PID', right_on = 'PID')

In [12]:
## Identify missing values in lat long 
print(combined_train.latitude.isna().sum())
print(combined_train.longitude.isna().sum())
print(combined_test.latitude.isna().sum())
print(combined_test.longitude.isna().sum())

## get the geo cordinates and set the dictionary for housing dataset
combined_train, latlong_map = helper.geo_cords_imputing(combined_train)
combined_test = helper.geo_cords_imputing(combined_test, latlong_map)

## using latlong dictionary, assign lat-long to training dataset
combined_train = combined_train.dropna(subset=['latitude', 'longitude'])
combined_train = combined_train.reset_index(drop=True)

# Check missing values
## Identify missing values in lat long 
print(combined_train.latitude.isna().sum())
print(combined_train.longitude.isna().sum())
print(combined_test.latitude.isna().sum())
print(combined_test.longitude.isna().sum())

15
15
4
4
0
0
0
0


## Latlong

In [14]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [15]:
## Load parks data
parks = pd.read_csv('parks2.csv', low_memory = False)

parks_latlong = pd.read_csv('parks_latlong.csv', index_col=0, low_memory = False)

In [14]:
# # RUN THIS TO ADD LAT/LONG TO PARKS FILE
# IT IS SAVED AS PARKS_LATLONG IN FOLDER

# for i in range(parks.shape[0]):
#     try:
#         address = parks['parkAddress'][i]
#         geolocator = Nominatim(user_agent="ames_location")
#         geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
#         loc = geolocator.geocode(address)
#         parks.loc[i,'latitude'] = loc.latitude
#         parks.loc[i,'longitude'] = loc.longitude
#         print(loc.latitude, loc.longitude)
#     except:
#         print(f'{address} didnt work')
# parks.to_csv('parks_latlong.csv')

### Find distance to nearest house for each house in housing data

In [16]:
import haversine as hs

In [17]:
combined_all = pd.concat([combined_train, combined_test], axis=0)
combined_all.reset_index(drop=True, inplace=True)

In [27]:
# ## shortest distance to any public parkspace
parksFrame = pd.DataFrame()
parksFrame['PID'] = combined_all['PID']

for i in range(combined_all.shape[0]):
    for j in range(parks_latlong.shape[0]):
        try:
            loc1 = combined_all.loc[i,['latitude', 'longitude']]
            loc2 = parks_latlong.loc[j, ['latitude', 'longitude']]
            dist = hs.haversine(loc1, loc2, unit = 'mi')
            parksFrame.loc[i,j] = dist
        except:
            print(f"{combined_all['PID'][i]} for {parks_latlong['parkName'][j]} didn't work")

            parksFrameCols = ['PID']

parksFrameCols = ['PID']
parklist = parks_latlong['parkName'].to_list()
parksFrameCols.extend(parks_latlong['parkName'].to_list())
# parksFrame.columns = ['distance to ' + s for s in parksFrameCols]
parksFrame.columns = parksFrameCols

In [28]:
## Min distances
parksFrame['DistToPark'] = parksFrame.min(axis=1)

parklist = parks_latlong.loc[parks_latlong['parkRecFacility']==1,:]['parkName'].to_list()
parksFrame['DistToRec'] = parksFrame[parklist].min(axis=1)

parklist = parks_latlong.loc[parks_latlong['parkPicnicArea']==1,:]['parkName'].to_list()
parksFrame['DistToPicnic'] = parksFrame[parklist].min(axis=1)

parklist = parks_latlong.loc[parks_latlong['parkPlayground']==1,:]['parkName'].to_list()
parksFrame['DistToPlayground'] = parksFrame[parklist].min(axis=1)


In [30]:
## Num parks under 1 mile

parklist = parks_latlong['parkName'].to_list()
parksFrame['halfMileParks'] = parksFrame[(parksFrame[parklist]<0.5)].count(axis=1)

In [29]:
## Closest park
parksFrame['closestPark'] = parksFrame[parks_latlong['parkName'].to_list()].idxmin(axis=1)

In [79]:
## Save to .csv file
# parksFrame.to_csv('parksFeatures.csv')

## Elementary Schools

In [45]:
schools = pd.read_csv('school_latlong.csv', low_memory = False)

In [46]:
# ## shortest distance to any public school
schoolsFrame = pd.DataFrame()
schoolsFrame['PID'] = combined_all['PID']

for i in range(combined_all.shape[0]):
    for j in range(schools.shape[0]):
        try:
            loc1 = combined_all.loc[i,['latitude', 'longitude']]
            loc2 = schools.loc[j, ['latitude', 'longitude']]
            dist = hs.haversine(loc1, loc2, unit = 'mi')
            schoolsFrame.loc[i,j] = dist
        except:
            print(f"{combined_all['PID'][i]} for {schools['parkName'][j]} didn't work")

schoolsFrameCols = ['PID']
schoolslist = schools['schoolName'].to_list()
schoolsFrameCols.extend(schools['schoolName'].to_list())
schoolsFrame.columns = schoolsFrameCols

In [70]:
## Min distances
schoolsFrame['DistToSchool'] = schoolsFrame.min(axis=1)

## Distance to elementary schools
schoollist = schools.loc[schools['schoolElementary']==1,:]['schoolName'].to_list()
schoolsFrame['DistToElementary'] = schoolsFrame[schoollist].min(axis=1)

## Closest school
schoolsFrame['closestSchool'] = schoolsFrame[schools['schoolName'].to_list()].idxmin(axis=1)

## Closest elementary
schoollist = schools.loc[schools['schoolElementary']==1,:]['schoolName'].to_list()
schoolsFrame['closestElementary'] = schoolsFrame[schoollist].idxmin(axis=1)

  schoolsFrame['DistToSchool'] = schoolsFrame.min(axis=1)


In [68]:
schoolsFrame['closestSchool'] = schoolsFrame[schools['schoolName'].to_list()].idxmin(axis=1)

In [73]:
## Save to 
# schoolsFrame.to_csv('schoolFeatures.csv')