In [137]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer, KNNImputer

# Notebook-wide scikit-learn display option: ask estimators to render with the
# 'diagram' representation when they are the last expression of a cell.
from sklearn import set_config
set_config(display='diagram')

In [138]:
# Load the cleaned training data; the path is relative to the notebook's folder.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the
# CSV was written with its index — consider index_col=0 here (or index=False when saving).
TRAIN_CLEAN_CSV = '../datasets/train_clean.csv'
housing = pd.read_csv(TRAIN_CLEAN_CSV)

In [139]:
# Quick look at the first five rows to sanity-check the load.
housing.head(n=5)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [140]:
# removing outliers
#
# Build one boolean mask from all the bounds and apply it once.  A row is kept
# only if it passes every condition; rows where a compared value is missing
# evaluate to False and are dropped as well.
# Note that the "> 0" conditions also exclude homes with no basement, no garage
# or no second floor, not just extreme values.
within_bounds = (
    (housing['1st_flr_sf'] < 3000)
    & (housing['total_bsmt_sf'] > 0)
    & (housing['total_bsmt_sf'] < 3000)
    & (housing['gr_liv_area'] < 4000)
    & (housing['lot_area'] < 40000)
    & (housing['garage_area'] > 0)
    & (housing['2nd_flr_sf'] > 0)
    # target range: strictly between 50k and 500k
    & (housing['saleprice'] > 50_000)
    & (housing['saleprice'] < 500_000)
)

housing = housing[within_bounds]

`X` = the features the model is trained on

`y` = the target the model is trying to predict (`saleprice`)

In [141]:
# Columns used as predictors.  Candidates that were tried and left out:
# garage_cars, year_built, 1st_flr_sf, year_remod_add, fireplace_qu.
feature_cols = [
    "overall_qual",
    "gr_liv_area",
    "exter_qual",
    "kitchen_qual",
    "bsmt_qual",
    "full_bath",
    "fireplaces",
    "garage_area",
    "total_bsmt_sf",
    "neighborhood",
]

# Feature matrix (DataFrame) and target (Series) taken from the filtered frame.
X = housing[feature_cols]
y = housing["saleprice"]

Splitting data into training and testing sets

In [142]:
# Hold out a test set using train_test_split's default split proportions.
# The seed is fixed so that the split — and therefore every score reported
# further down — is reproducible on Restart & Run All; without it each run
# shuffles differently and the metrics drift.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Model

In [143]:
# column transformer that uses OneHotEncoder to encode nominal features
ct = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    verbose_feature_names_out=False
)

X_train_transform = ct.fit_transform(X_train)
X_test_transform = ct.transform(X_test)

In [144]:
# Scaling the data
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train_transform)
X_test_scaled = ss.transform(X_test_transform)

In [145]:
# fitting the model using linear regression
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

#### Scores

In [146]:
# Predictions on the held-out split, used for the error metric below.
preds = lr.predict(X_test_scaled)

# RMSE is computed as the square root of the MSE rather than through
# mean_squared_error's `squared=False` flag, which newer scikit-learn releases
# deprecated and then removed; this form gives the same value on any version.
rmse = np.sqrt(mean_squared_error(y_test, preds))

print('Train R2: ', lr.score(X_train_scaled, y_train))
print(' Test R2: ', lr.score(X_test_scaled, y_test))
print('    RMSE: ', rmse)

Train R2:  0.8892685672730294
 Test R2:  0.8740267047051126
    RMSE:  24878.377384886164
