In [1]:
# --- numerical / tabular stack ---
import numpy as np
import pandas as pd
from scipy import stats
# --- modelling: statsmodels formula API and scikit-learn ---
from statsmodels.formula.api import ols
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
# --- plotting; set_theme() applies seaborn's default theme to all later figures ---
import seaborn as sns
sns.set_theme()
# --- serialization and date/time helpers ---
import pickle
from datetime import datetime
import pytz
from pytz import common_timezones
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# show up to 100 columns when a DataFrame is displayed, so wide frames are not elided
pd.set_option('display.max_columns', 100)

In [None]:
# CLEANING: converts the 'date' column to datetime64 so date arithmetic and the .dt accessor work
df['date'] = pd.to_datetime(df['date'], format='%Y/%m/%d')

# Single reference point for every "years since ..." feature in this section.
# All of them are measured from the same moment so that they can be subtracted
# from one another; mixing a hard-coded year with the wall clock makes
# 'yrs_since_ren_at_sale' drift by one year for every year the notebook is re-run.
# NOTE: these features therefore depend on the run date; values match a run made in 2021.
today = pd.Timestamp.today()
current_year = today.year

# FE: whole years elapsed between the sale and today.
# Floor-dividing by an explicit Timedelta does not depend on the resolution the
# timedelta happens to be stored in (ns / us / s); 3.154e7 seconds is the length
# of one year, the same year length as the previous raw nanosecond constant.
df['yrs_since_sale'] = (today - df['date']) // pd.Timedelta(seconds=3.154e7)

# FE: number representing age of property in years
df['yrs_old'] = current_year - df['yr_built']

# FE: how many years since renovation.
# yr_renovated == 0 marks a property that has never been renovated; for those
# the age of the property is used instead.
df['yrs_since_renovation'] = np.where(df['yr_renovated'] == 0,
                                      df['yrs_old'],
                                      current_year - df['yr_renovated'])

# FE: how many years since renovation at year of sale, created from 'yrs_since_renovation' minus 'yrs_since_sale'
df['yrs_since_ren_at_sale'] = df['yrs_since_renovation'] - df['yrs_since_sale']

# FE: estimated yard size = lot size minus the building footprint, where the
# footprint is approximated as the livable sq. ft. spread evenly over the floors.
# CLEANING: a negative estimate is floored at 0.
df['sqft_yard'] = (
    df['sqft_lot']
    .sub(df['sqft_living'].div(df['floors']))
    .clip(lower=0)
)

# CLEANING: bedroom count
#   1. the single 33-bedroom entry becomes 3 (median and likely intended entry)
#   2. counts above 8 are capped at 8 (IQR max)
#   3. counts below 1 are raised to 1 (the rows are kept, not dropped)
df['bedrooms'] = (
    df['bedrooms']
    .replace(33, 3)
    .clip(lower=1, upper=8)
)

# CLEANING: bathroom count is limited to the range [.75, 4]:
# values above 4 (IQR max) become 4, values below .75 become .75
df['bathrooms'] = df['bathrooms'].clip(lower=.75, upper=4)

# FE: creates a dummy variable similar to 'waterfront': 1 when the property does NOT
# have 0 listed for the sqft of its basement, 0 when it does.
# (The condition was previously inverted, flagging exactly the properties with a 0 sqft basement.)
# NOTE: a missing sqft_basement value compares as "not zero" and is therefore flagged 1.
df['sqft_basement_not_zero'] = np.where(df['sqft_basement'] != 0,
                                        1,
                                        0)

# CLEANING: grade is limited to the range [5, 11]:
# grades below 5 become 5, grades above 11 become 11
df['grade'] = df['grade'].clip(lower=5, upper=11)

# FE: sale price per square foot of living area, derived from 'price' and 'sqft_living'
# (for data analysis only -- it is computed from the target, so not to be used in the final model)
df['price_per_sqft_living'] = df['price'].div(df['sqft_living'])

# DUMMY: 1 when the property counts as rural, i.e. lies east of longitude -121.961527, else 0
df['rural'] = df['long'].gt(-121.961527).astype(int)

# DUMMY: 1 when the property falls strictly inside the lat/long bounding box
# used here to approximate the Seattle city limits, else 0
df['within_seattle_city_limits'] = (
    df['long'].gt(-122.438230)
    & df['long'].lt(-122.251569)
    & df['lat'].gt(47.495479)
    & df['lat'].lt(47.734178)
).astype(int)

# FE: calendar month and year of the sale, taken from the datetime64 'date' column
df['sale_month'] = df['date'].dt.month
df['sale_year'] = df['date'].dt.year

# DUMMY: one 0/1 column per season; seasons are defined as calendar quarters
# (winter = Jan-Mar, spring = Apr-Jun, summer = Jul-Sep, fall = Oct-Dec)
df['sale_season_winter'] = df['sale_month'].isin([1, 2, 3]).astype(int)
df['sale_season_spring'] = df['sale_month'].isin([4, 5, 6]).astype(int)
df['sale_season_summer'] = df['sale_month'].isin([7, 8, 9]).astype(int)
df['sale_season_fall'] = df['sale_month'].isin([10, 11, 12]).astype(int)

# CLEANING: 'id' is not relevant to analysis nor predictive model
# TODO (check with Sean): should 'id' be kept here and only dropped at the
# feature selection stage instead?
# errors='ignore' keeps this cell safe to re-run once 'id' has already been removed
# (without it a second execution raises KeyError).
df.drop(columns='id', inplace=True, errors='ignore')