## Imports

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from scorer import add_binary_basement,make_season,score,k_fold_test
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import seaborn as sns
from sklearn.linear_model import Lasso,Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

## Data Init

In [2]:
# Load the raw King County housing data and run it through the two
# feature-engineering helpers imported from `scorer`:
#   * make_season         -> adds a season column derived from the date
#   * add_binary_basement -> adds a yes/no basement flag (yes when basement sqft != 0)
# (helper behaviour as described by the original author's notes; see scorer.py)
housing_csv = "kc_housing_data_for_feat_engineering_lab.csv"
df = (
    pd.read_csv(housing_csv)
    .pipe(make_season)
    .pipe(add_binary_basement)
)

# `price` is dropped because the target is `price_log`.
# `date` is dropped as well; the author notes it should ideally have been
# converted to a datetime instead.
df = df.drop(columns=["date", "price"])

## Create DataFrame with All Degree-2 Interactions & Polynomials

In [3]:
# Expand every column of `df` into its degree-2 polynomial terms
# (bias column, original columns, pairwise interactions and squares).
poly = PolynomialFeatures(degree=2).fit(df)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and only fall back on older installations
# so the notebook runs on either side of that change.
if hasattr(poly, "get_feature_names_out"):
    feature_names = poly.get_feature_names_out(df.columns)
else:
    feature_names = poly.get_feature_names(df.columns)

features = pd.DataFrame(poly.transform(df), columns=feature_names)

# Do not include interactions with price_log: drop every generated term that
# mentions "price", except the target column `price_log` itself.
price_terms = [
    name for name in features.columns
    if "price" in name and name != "price_log"
]
features = features.drop(columns=price_terms)


## Create DataFrame with Best Interaction & Best Polynomial

In [4]:
# Get the r2 for each feature independently predicting price_log.
# Each candidate column is fitted on its own with a plain LinearRegression and
# then scored with the project's `score` helper.
results = []
for col in features.columns:
    if col == "price_log":
        continue  # the target is never used as a predictor
    model = LinearRegression()
    # Selecting the single column directly is equivalent to dropping every
    # other column, without rebuilding a full column list on each iteration.
    model.fit(features[[col]], features.price_log)
    results.append((col, score(features, model, [col])))

# Sort them (highest score first) to find the best interactions.
best_interaction = sorted(results, key=lambda inst: inst[1], reverse=True)

# Index 0 is the best interaction, 6 is the best polynomial.
# NOTE(review): these positions were read off the ranking for this particular
# dataset; re-check them if the input data or the feature set changes.
selected_interactions = [best_interaction[0], best_interaction[6]]

# Keep the original features (including price_log) plus the two selected terms.
# A set is used so that a selected term which is also an original column does
# not raise, as repeated list.remove() calls would.
keep = set(df.columns) | {name for name, _ in selected_interactions}

# Drop all interactions that are not the best; column order follows `features`.
remove_these = [name for name in features.columns if name not in keep]
selected_model = features.drop(columns=remove_these)



# Test
With our two new datasets, run some basic Lasso and Ridge models (fit and scored without a train/test split).

In [5]:
# Standardise both design matrices so the Lasso/Ridge penalties treat every
# feature on the same scale.
scalar = StandardScaler()
x = features.drop(columns="price_log")
x_sel = selected_model.drop(columns="price_log")
scal_x = scalar.fit_transform(x)
# The same scaler object is re-fitted here, so after this line `scalar` holds
# the statistics of the selected-feature matrix, not of the full one.
scal_x_sel = scalar.fit_transform(x_sel)

# Targets for the two datasets.
y_all = features.price_log
y_sel = selected_model.price_log

# Create the different models to be used: four Lasso strengths and two
# default Ridge regressors (one per dataset).
las_model_2, las_model_3, las_model_4, las_model_5 = (
    Lasso(alpha=strength) for strength in (.5, .05, .0005, .00005)
)
rid_model, rid_model_sel = Ridge(), Ridge()

# Fit all interaction models. `fit` returns the estimator itself, so each
# `model*` name refers to the same object as the estimator it was fitted from.
model1b, model1c, model1d, model1f, model2 = (
    estimator.fit(scal_x, y_all)
    for estimator in (las_model_2, las_model_3, las_model_4, las_model_5, rid_model)
)
# Fit selected feature model
model3 = rid_model_sel.fit(scal_x_sel, y_sel)

# Print results: the five all-feature models first, then the selected-feature Ridge.
for fitted in (model1b, model1c, model1d, model1f, model2):
    print(score(scal_x, fitted, y=y_all))
print(score(scal_x_sel, model3, y=y_sel))




0.0
0.7306040312298105
0.793293358348929
0.7970408502439861
0.8001470546156451
0.7752115965185657




## Do the same thing, but now with a k-fold train/test split

In [6]:

# Same kinds of models as above, evaluated through the project's k_fold_test helper.
alphas = [.5, .05, .0005]
# These two names are rebound to fresh, unfitted Ridge estimators; the calls
# below pass their own new Ridge() instances rather than these objects.
rid_model = Ridge()
rid_model_sel = Ridge()

# Last positional argument to k_fold_test — presumably the number of folds;
# confirm against scorer.k_fold_test.
n_folds = 3

# Run all five models and print results: three Lasso strengths on the full
# feature matrix, then Ridge on the full and on the selected-feature matrices.
runs = [
    (f"alpha = {a}\n", Lasso(alpha=a, max_iter=100000), scal_x, features.price_log)
    for a in alphas
]
runs.append(("Ridge on all features\n", Ridge(), scal_x, features.price_log))
runs.append(("Ridge on selected feature\n", Ridge(), scal_x_sel, selected_model.price_log))

for label, estimator, design, target in runs:
    print(label, k_fold_test(estimator, design, target, n_folds))



alpha = 0.5
 -0.003850349159432683
alpha = 0.05
 0.726832775449655
alpha = 0.0005
 0.787962857426964
Ridge on all features
 0.7863567482199495
Ridge on selected feature
 0.7716325184893597
