In [1]:
# 📚 Basic libraries
import pandas as pd
import numpy as np 

# 🤖 Machine Learning
# models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error


# Project libraries
from utils.file_handler import *
from utils.plots import *
from utils import eda
from utils.linear_regression import *
from utils.common_viz import model_validation
from utils.common_viz import corr_heatmap

# ⚙️ Settings
pd.set_option('display.max_columns', None) # display all columns
import warnings
warnings.filterwarnings('ignore') # ignore warnings

# Logging
import logging
# setup logging level
logging.getLogger().setLevel(logging.DEBUG)

## Feature engineering
Load the results of the feature engineering performed in `feature_engineering.ipynb`.

In [2]:
# Location of the pickled DataFrame that this notebook reads.
df_local_path = "dataframes/df_featuretuned_linear_reg.pkl"

# Load through the project helper and keep a copy, so later cells work on
# their own object rather than on whatever the helper returned.
df = (
    read_df_pickle(df_local_path)
    .copy()
)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15,zipcode_cluster,zipcode_encoded,yr_sold,building_age,yrs_since_renovation,bath_bed_ratio,sqft_per_room,log_sqft_lot,log_sqft_lot15,log_sqft_above,log_sqft_basement,log_sqft_living,sale_count,multiple_sales,yrs_since_previous_sale
2496,2014-09-16,280000.0,6,3.0,2400,9373,2.0,0,0,3,7,2400,0,1991,0,47.3262,-122.214,2060,7316,2,1,2014,23,0,0.5,266.666667,9.145695,8.897956,7.783641,0.0,7.783641,2,1,0.0
2497,2015-04-22,300000.0,6,3.0,2400,9373,2.0,0,0,3,7,2400,0,1991,0,47.3262,-122.214,2060,7316,2,1,2015,24,0,0.5,266.666667,9.145695,8.897956,7.783641,0.0,7.783641,2,1,1.0
6735,2014-05-08,647500.0,4,1.75,2060,26036,1.0,0,0,4,8,1160,900,1947,0,47.4444,-122.351,2590,21891,1,63,2014,67,0,0.4375,358.26087,10.167274,9.993877,7.057037,6.803505,7.630947,1,0,0.0
8411,2014-08-11,400000.0,3,1.0,1460,43000,1.0,0,0,3,7,1460,0,1952,0,47.4434,-122.347,2250,20023,1,63,2014,62,0,0.333333,365.0,10.668979,9.904687,7.286876,0.0,7.286876,1,0,0.0
8809,2015-04-01,235000.0,3,1.0,1430,7599,1.5,0,0,4,6,1010,420,1930,0,47.4783,-122.265,1290,10320,1,64,2015,85,0,0.333333,357.5,8.935904,9.241936,6.918695,6.042633,7.266129,1,0,0.0


In [3]:
# Column the regression predicts.
target_variable = 'price'

# Continuous measurements: coordinates, ratios, and square-footage columns in
# both raw and log_* form.
continuous_features = [
    'bath_bed_ratio',
    'lat',
    'log_sqft_above',
    'log_sqft_basement',
    'log_sqft_living',
    'log_sqft_lot',
    'log_sqft_lot15',
    'long',
    'sqft_above',
    'sqft_basement',
    'sqft_living',
    'sqft_living15',
    'sqft_lot',
    'sqft_lot15',
    'sqft_per_room',
]

# Count-, year- and code-like columns.
discrete_features = [
    'bathrooms',
    'bedrooms',
    'building_age',
    'floors',
    'multiple_sales',
    'sale_count',
    'view',
    'yr_built',
    'yr_renovated',
    'yr_sold',
    'yrs_since_previous_sale',
    'yrs_since_renovation',
    'zipcode_cluster',
    'zipcode_encoded',
]

# Backward-compatible alias: the original (misspelled) name is still used by
# other cells, so it must keep pointing at the same list.
descrete_features = discrete_features

# Categorical columns that are already stored as ordered integer codes.
ordinal_encoded_categorical_features = ['condition', 'grade', 'waterfront']

# Combined groups consumed by the model-comparison cells.
all_numeric_features = continuous_features + descrete_features
all_features = all_numeric_features + ordinal_encoded_categorical_features

# Sanity checks: the target must never be used as a predictor, and a column
# listed twice would enter the design matrix twice.
assert target_variable not in all_features, "target variable leaked into the feature list"
assert len(set(all_features)) == len(all_features), "duplicate names in the feature list"

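## Linear Regression hyperparameter tuning
Compare feature subsets, test sizes and random states to choose the configuration for the final model.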

In [4]:
# Raise the root logger threshold to INFO for the experiment run.
logging.getLogger().setLevel(logging.INFO)

# Candidate feature groups, keyed by the label used to identify each run.
feature_subsets = dict(
    continuous=continuous_features,
    discrete=descrete_features,
    categorical=ordinal_encoded_categorical_features,
    all_numeric=all_numeric_features,
    all_features=all_features,
)

# Hold-out fractions to try.
test_sizes = [0.2, 0.3, 0.4]

# Seeds to try, so the comparison is not tied to a single split.
random_states = [15, 42, 100]

# Run the project helper over the subsets, test sizes and seeds.
# NOTE(review): the displayed result appears to hold one row per run with
# R2 / MAE / RMSE / MSE columns — confirm against utils.linear_regression.
test_results_df = linear_regression_combo_test(
    df,
    feature_subsets,
    target_variable,
    test_sizes,
    random_states,
)

test_results_df

Unnamed: 0,test_size,random_state,R2,MAE,RMSE,MSE
LR_all_features_ts0.3_rs15,0.3,15.0,0.7406,116958.0194,194242.887,37730300000.0
LR_all_features_ts0.2_rs100,0.2,100.0,0.7404,117315.9754,187250.1495,35062620000.0
LR_all_features_ts0.2_rs15,0.2,15.0,0.7401,114897.4798,193091.5915,37284360000.0
LR_all_features_ts0.3_rs100,0.3,100.0,0.7386,118840.6201,195158.2124,38086730000.0
LR_all_features_ts0.4_rs100,0.4,100.0,0.7382,118270.5533,198311.175,39327320000.0
LR_all_features_ts0.4_rs42,0.4,42.0,0.7371,117172.5888,181617.5723,32984940000.0
LR_all_features_ts0.4_rs15,0.4,15.0,0.7355,117523.6848,193580.9559,37473590000.0
LR_all_features_ts0.3_rs42,0.3,42.0,0.7339,117760.056,180816.1316,32694470000.0
LR_all_features_ts0.2_rs42,0.2,42.0,0.7209,118420.6984,182609.2029,33346120000.0
LR_all_numeric_ts0.2_rs15,0.2,15.0,0.7128,124402.4665,203006.7754,41211750000.0


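## Linear Regression
Final model, trained with the best-scoring configuration from the comparison above (R2 ≈ 0.7406):
- All features after the feature engineering transformation
- Test size: 30%
- Random state: 15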

In [12]:
# Fit the final model on every engineered feature with a 30% hold-out and
# seed 15; the helper returns the model together with a dict of results.
final_linear_regression_model, dict_test_results = linear_regression_control(
    df,
    all_features,
    target_variable,
    test_size=0.3,
    random_state=15,
)

# Print each entry of the results dict on its own line.
for key, value in dict_test_results.items():
    print(f"{key}: {value}")


test_size: 0.3
random_state: 15
R2: 0.7406148459970789
MAE: 116958.01941747658
RMSE: 194242.8870034209
MSE: 37730299151.423744


In [9]:
# Persist the fitted final model through the project's pickle helper.
save_model_pickle(
    model=final_linear_regression_model,
    filename='final_linear_regression_model',
)


INFO:root:Model file pickle is updated: /Users/bubblegum_doubledrops/Library/Mobile Documents/com~apple~CloudDocs/0prio - Important heavy backups/IronHack/mini_projects/mini-project-ironkaggle/models/final_linear_regression_model.pkl
