In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw training data from train.csv (path is relative to the working directory).
train_df = pd.read_csv('train.csv')


In [3]:
# (rows, columns) of the raw frame.
train_df.shape

(1460, 81)

In [4]:
# Summary statistics of the target column.
train_df['SalePrice'].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [5]:
# Number of missing values in the target column.
train_df['SalePrice'].isna().sum()

0

No nulls in the target variable

In [6]:
# Hold out 20% of the rows as a test split; random_state pins the shuffle.
train_data, test_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=1,
)

# Feature Selection

In [7]:
# Absolute correlation of every non-object column with SalePrice,
# computed on the training split only.
non_object_cols = train_data.select_dtypes(exclude='object')
sale_price = train_data['SalePrice']
correlations = non_object_cols.corrwith(sale_price).abs()

In [8]:
# Columns whose absolute correlation with SalePrice is at most 0.1 are removed
# from both splits (the selection itself uses training data only).
low_corr_mask = correlations <= 0.1
columns_to_drop = correlations[low_corr_mask].index.tolist()
train_data = train_data.drop(columns=columns_to_drop)
test_data = test_data.drop(columns=columns_to_drop)

In [9]:
# Columns of the training split that hold a single distinct value (NaN counts as a value).
n_unique_val_columns = [
    col
    for col in train_data.columns
    if train_data[col].nunique(dropna=False) == 1
]

In [57]:
n_unique_val_columns   # No column has only one unique value (the list is empty)

[]

In [10]:
# Percentage of missing values per column in the training split.
row_count = len(train_data)
null_pcts = train_data.isna().sum().div(row_count).mul(100)

In [11]:
# Columns in which at least 80% of the training rows are missing.
null_columns = null_pcts[null_pcts.ge(80)].index.tolist()

In [12]:
# Remove the mostly-missing columns from both splits.
train_data = train_data.drop(columns=null_columns)
test_data = test_data.drop(columns=null_columns)

# Feature Engineering

In [13]:
# Names of the object-dtype (categorical) columns remaining in the training split.
cat_features = train_data.select_dtypes(include='object').columns.tolist()

In [14]:
# Snapshot both splits BEFORE the categorical cast below, so the copies keep
# their object-dtype columns (they are one-hot encoded for the linear models later).
train_data_copy, test_data_copy = train_data.copy(), test_data.copy()

# Cast every object column of the original frames to pandas 'category' dtype.
for frame in (train_data, test_data):
    for col in cat_features:
        frame[col] = frame[col].astype('category')


def _year_built_to_age(frame):
    """Overwrite 'YearBuilt' with ``2025 - YearBuilt`` and return the frame with
    that column renamed to 'Age'.

    The reference year 2025 is hard-coded. The 'YearBuilt' column of ``frame``
    is modified in place; the rename produces a new frame, which is returned.
    """
    frame['YearBuilt'] = 2025 - frame['YearBuilt']
    return frame.rename(columns={'YearBuilt': 'Age'})


train_data = _year_built_to_age(train_data)
test_data = _year_built_to_age(test_data)
train_data_copy = _year_built_to_age(train_data_copy)
test_data_copy = _year_built_to_age(test_data_copy)

In [15]:
def get_1st_2nd_total_area(df):
    """Add column 'total_area_1st_2nd_floor' (1stFlrSF + 2ndFlrSF) to ``df`` in place.

    Returns None; the frame passed in is mutated.
    """
    first_floor = df['1stFlrSF']
    second_floor = df['2ndFlrSF']
    df['total_area_1st_2nd_floor'] = first_floor.add(second_floor)

def get_1st_2nd_bsmt_total_area(df):
    """Add column 'total_area_1st_2nd_floor_bsmt' to ``df`` in place.

    The new column is 1stFlrSF + 2ndFlrSF + BsmtUnfSF. Returns None.
    """
    both_floors = df['1stFlrSF'].add(df['2ndFlrSF'])
    df['total_area_1st_2nd_floor_bsmt'] = both_floors.add(df['BsmtUnfSF'])

def get_bsmt_diff(df):
    """Add column 'bsmt_diff' (TotalBsmtSF - BsmtUnfSF) to ``df`` in place.

    Returns None; the frame passed in is mutated.
    """
    total_bsmt = df['TotalBsmtSF']
    unfinished_bsmt = df['BsmtUnfSF']
    df['bsmt_diff'] = total_bsmt.sub(unfinished_bsmt)


# Add the three derived area columns to every frame, in the same per-frame
# order as before (total area, total area incl. basement, basement difference).
for frame in (train_data, test_data, train_data_copy, test_data_copy):
    get_1st_2nd_total_area(frame)
    get_1st_2nd_bsmt_total_area(frame)
    get_bsmt_diff(frame)


In [17]:
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from lightgbm import LGBMRegressor
import mlflow
import dagshub
import mlflow.lightgbm
import mlflow.sklearn

# Configure the DagsHub tracking server BEFORE selecting the experiment.
# mlflow.set_experiment() looks up / creates the experiment in whatever
# tracking store is active at the moment it is called, so calling it first
# (as this cell previously did) resolves the experiment against the default
# store instead of the DagsHub one.
dagshub.init(repo_owner='nipkha21', repo_name='House-Prices---Advanced-Regression-Techniques', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow')
mlflow.set_experiment("house_price_prediction_regression")

2025/04/06 16:07:34 INFO mlflow.tracking.fluent: Experiment with name 'house_price_prediction_regression' does not exist. Creating a new experiment.


We will use Linear Regression (classical and Ridge) and LightGBM. Since Linear Regression requires data preprocessing (null imputation and categorical feature transformation), we are going to create a separate data set to feed to the linear regression algorithms.

In [18]:
# Number of distinct values (NaN included) per object column of the training copy.
cat_info = {
    col: train_data_copy[col].nunique(dropna=False)
    for col in train_data_copy.select_dtypes(include='object').columns
}

train_data_copy_encoded = train_data_copy.copy()
test_data_copy_encoded = test_data_copy.copy()
cat_dict = {}
for col, n_unique in cat_info.items():
    if n_unique <= 2:
        continue  # columns with at most two distinct values are encoded as they are

    # The two most frequent categories are chosen from the TRAINING data only
    # and reused for the test data; every other value (NaN included, since
    # isin() is False for it) is replaced by the literal 'other'.
    top_cats = train_data_copy[col].value_counts().nlargest(2).index
    cat_dict[col] = list(top_cats)

    keep_train = train_data_copy[col].isin(top_cats)
    keep_test = test_data_copy[col].isin(top_cats)
    train_data_copy_encoded[col] = train_data_copy[col].where(keep_train, other='other')
    test_data_copy_encoded[col] = test_data_copy[col].where(keep_test, other='other')

# One-hot encode every object column, dropping the first level of each.
dummy_columns = list(cat_info)
train_data_copy_encoded = pd.get_dummies(train_data_copy_encoded, columns=dummy_columns, drop_first=True)
test_data_copy_encoded = pd.get_dummies(test_data_copy_encoded, columns=dummy_columns, drop_first=True)

# Align the test columns to the training layout; dummies absent from test become 0.
test_data_copy_encoded = test_data_copy_encoded.reindex(
    columns=train_data_copy_encoded.columns,
    fill_value=0,
)

In [19]:
# Convert boolean columns of both encoded frames to integers.
train_bool_cols = train_data_copy_encoded.select_dtypes(include='boolean').columns
train_data_copy_encoded[train_bool_cols] = train_data_copy_encoded[train_bool_cols].astype(int)

test_bool_cols = test_data_copy_encoded.select_dtypes(include='boolean').columns
test_data_copy_encoded[test_bool_cols] = test_data_copy_encoded[test_bool_cols].astype(int)

# Training

In [20]:
with mlflow.start_run(run_name="preliminary_linear_regression"):
    # Baseline: ordinary least squares on the one-hot encoded frames.
    # Missing feature values are imputed with 0 before fitting/predicting.
    X_train_enc = train_data_copy_encoded.fillna(0).drop('SalePrice', axis=1)
    y_train_enc = train_data_copy_encoded['SalePrice']
    X_test_enc = test_data_copy_encoded.fillna(0).drop('SalePrice', axis=1)
    y_test_enc = test_data_copy_encoded['SalePrice']

    lr = LinearRegression()
    lr.fit(X_train_enc, y_train_enc)

    test_preds = lr.predict(X_test_enc)
    train_preds = lr.predict(X_train_enc)

    # MAPE and R^2 are logged as percentages, rounded to 2 decimals.
    test_mape = round(mean_absolute_percentage_error(y_test_enc, test_preds)*100,2)
    train_mape = round(mean_absolute_percentage_error(y_train_enc, train_preds)*100,2)

    train_r2 = round(r2_score(y_train_enc, train_preds)*100,2)
    test_r2 = round(r2_score(y_test_enc, test_preds)*100,2)

    # RMSE is the plain square root of the MSE, in SalePrice units.
    # The earlier version computed sqrt(MSE * 100), which inflated every
    # logged RMSE by a factor of 10.
    test_rmse = round(np.sqrt(mean_squared_error(y_test_enc, test_preds)),2)
    train_rmse = round(np.sqrt(mean_squared_error(y_train_enc, train_preds)),2)

    mlflow.log_metric("test_mape", test_mape)
    mlflow.log_metric("train_mape", train_mape)

    mlflow.log_metric("test_r2", test_r2)
    mlflow.log_metric("train_r2", train_r2)

    mlflow.log_metric("test_rmse", test_rmse)
    mlflow.log_metric("train_rmse", train_rmse)

    mlflow.sklearn.log_model(lr, "linear_regression_model_preliminary")

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



🏃 View run preliminary_linear_regression at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2/runs/35c65badcecb4cf9b466b7cb035583bd
🧪 View experiment at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2


In [21]:
# Feature matrices are loop-invariant, so build them once.
# Missing feature values are imputed with 0 before fitting/predicting.
X_train_enc = train_data_copy_encoded.fillna(0).drop('SalePrice', axis=1)
y_train_enc = train_data_copy_encoded['SalePrice']
X_test_enc = test_data_copy_encoded.fillna(0).drop('SalePrice', axis=1)
y_test_enc = test_data_copy_encoded['SalePrice']

# One MLflow run per regularisation strength.
for alpha in [1000, 100, 10, 1]:
    # run_name makes each run identifiable in the tracking UI; passing only
    # `description` leaves the run with an auto-generated name.
    with mlflow.start_run(run_name=f"ridge_regression_alpha_{alpha}", description="ridge_regression"):
        params = {"alpha":alpha}
        lr = Ridge(**params)
        lr.fit(X_train_enc, y_train_enc)

        test_preds = lr.predict(X_test_enc)
        train_preds = lr.predict(X_train_enc)

        # MAPE and R^2 are logged as percentages, rounded to 2 decimals.
        test_mape = round(mean_absolute_percentage_error(y_test_enc, test_preds)*100,2)
        train_mape = round(mean_absolute_percentage_error(y_train_enc, train_preds)*100,2)

        train_r2 = round(r2_score(y_train_enc, train_preds)*100,2)
        test_r2 = round(r2_score(y_test_enc, test_preds)*100,2)

        # RMSE is the plain square root of the MSE, in SalePrice units.
        # The earlier version computed sqrt(MSE * 100), which inflated every
        # logged RMSE by a factor of 10.
        test_rmse = round(np.sqrt(mean_squared_error(y_test_enc, test_preds)),2)
        train_rmse = round(np.sqrt(mean_squared_error(y_train_enc, train_preds)),2)

        mlflow.log_params(params)
        mlflow.log_metric("test_mape", test_mape)
        mlflow.log_metric("train_mape", train_mape)

        mlflow.log_metric("test_r2", test_r2)
        mlflow.log_metric("train_r2", train_r2)

        mlflow.log_metric("test_rmse", test_rmse)
        mlflow.log_metric("train_rmse", train_rmse)
        mlflow.set_tag("algorithm", "ridge_regression")
        mlflow.sklearn.log_model(lr, "ridge_regression")



🏃 View run powerful-wasp-722 at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2/runs/b031f59e8b7848e8bdd2182c0b59ba9c
🧪 View experiment at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2




🏃 View run salty-wren-886 at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2/runs/43505411076a4378a21e2dfe8d083909
🧪 View experiment at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2




🏃 View run thoughtful-robin-436 at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2/runs/0906e65b91b74ae587a755f20f4fba09
🧪 View experiment at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2




🏃 View run beautiful-grouse-252 at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2/runs/2f7304103e6241d19280428ba302abf8
🧪 View experiment at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2


In [29]:
# LightGBM on the category-typed frames (no imputation or one-hot encoding applied).
with mlflow.start_run(run_name="lightgbm_regression", description="lightgbm_regression"):
    params = {
        'n_estimators': 260,
        'learning_rate': 0.05,
        'max_depth': 5,
        'random_state': 42,
        'num_leaves': 23,
        'categorical_features': cat_features,
        'min_child_samples': 40,
        'lambda_l1': 0.6
    }

    X_train_lgb = train_data.drop('SalePrice',axis=1)
    y_train_lgb = train_data['SalePrice']
    X_test_lgb = test_data.drop('SalePrice',axis=1)
    y_test_lgb = test_data['SalePrice']

    model = LGBMRegressor(**params)
    model.fit(X_train_lgb, y_train_lgb)

    test_preds = model.predict(X_test_lgb)
    train_preds = model.predict(X_train_lgb)

    # MAPE and R^2 are logged as percentages, rounded to 2 decimals.
    test_mape = round(mean_absolute_percentage_error(y_test_lgb, test_preds)*100,2)
    train_mape = round(mean_absolute_percentage_error(y_train_lgb, train_preds)*100,2)

    # The earlier version assigned the test-set score to train_r2 and the
    # train-set score to test_r2, so the two logged metrics were swapped.
    train_r2 = round(r2_score(y_train_lgb, train_preds)*100,2)
    test_r2 = round(r2_score(y_test_lgb, test_preds)*100,2)

    # RMSE is the plain square root of the MSE, in SalePrice units.
    # The earlier version computed sqrt(MSE * 100), which inflated every
    # logged RMSE by a factor of 10.
    test_rmse = round(np.sqrt(mean_squared_error(y_test_lgb, test_preds)),2)
    train_rmse = round(np.sqrt(mean_squared_error(y_train_lgb, train_preds)),2)

    mlflow.log_params(params)
    mlflow.log_metric("test_mape", test_mape)
    mlflow.log_metric("train_mape", train_mape)

    mlflow.log_metric("test_r2", test_r2)
    mlflow.log_metric("train_r2", train_r2)

    mlflow.log_metric("test_rmse", test_rmse)
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.set_tag("algorithm","lightgbm")
    mlflow.lightgbm.log_model(model, "lightgbm_model")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3778
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 66
[LightGBM] [Info] Start training from score 182208.341610




🏃 View run auspicious-shrew-199 at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2/runs/74d81ba9d5e1437db72cb667655335fc
🧪 View experiment at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2


In [30]:
# Reload the LightGBM model from the most recently finished MLflow run rather
# than from a hard-coded run ID: every re-execution of the training cell
# creates a new run, so a pinned ID would silently load a stale model.
# NOTE(review): this relies on the LightGBM training cell being the last run
# executed before this cell (true for a top-to-bottom Run All) - confirm if
# cells are executed out of order.
last_run_id = mlflow.last_active_run().info.run_id
logged_model = f'runs:/{last_run_id}/lightgbm_model'
lgb_model = mlflow.lightgbm.load_model(logged_model)

importances = lgb_model.feature_importances_

In [31]:
# One row per model feature with its importance; collect the features whose importance is exactly 0.
importance_df = pd.DataFrame(
    {'Importance': lgb_model.feature_importances_},
    index=lgb_model.feature_names_in_,
)
is_unused = importance_df['Importance'] == 0
zero_importance_cols = importance_df[is_unused].index.tolist()

In [32]:
# Remove the zero-importance features from both splits.
train_data = train_data.drop(columns=zero_importance_cols)
test_data = test_data.drop(columns=zero_importance_cols)

In [33]:
# LightGBM refit on the frames after the zero-importance columns were dropped.
with mlflow.start_run(run_name="lightgbm_regression_reduced_features", description="lightgbm_regression"):
    params = {
        'n_estimators': 250,
        'learning_rate': 0.03,
        'max_depth': 6,
        'random_state': 42,
        'num_leaves': 20,
        'categorical_features': cat_features,
        'min_child_samples': 50,
        'lambda_l1': 0.6
    }

    X_train_lgb = train_data.drop('SalePrice',axis=1)
    y_train_lgb = train_data['SalePrice']
    X_test_lgb = test_data.drop('SalePrice',axis=1)
    y_test_lgb = test_data['SalePrice']

    model = LGBMRegressor(**params)
    model.fit(X_train_lgb, y_train_lgb)

    test_preds = model.predict(X_test_lgb)
    train_preds = model.predict(X_train_lgb)

    # MAPE and R^2 are logged as percentages, rounded to 2 decimals.
    test_mape = round(mean_absolute_percentage_error(y_test_lgb, test_preds)*100,2)
    train_mape = round(mean_absolute_percentage_error(y_train_lgb, train_preds)*100,2)

    # The earlier version assigned the test-set score to train_r2 and the
    # train-set score to test_r2, so the two logged metrics were swapped.
    train_r2 = round(r2_score(y_train_lgb, train_preds)*100,2)
    test_r2 = round(r2_score(y_test_lgb, test_preds)*100,2)

    # RMSE is the plain square root of the MSE, in SalePrice units.
    # The earlier version computed sqrt(MSE * 100), which inflated every
    # logged RMSE by a factor of 10.
    test_rmse = round(np.sqrt(mean_squared_error(y_test_lgb, test_preds)),2)
    train_rmse = round(np.sqrt(mean_squared_error(y_train_lgb, train_preds)),2)

    mlflow.log_params(params)
    mlflow.log_metric("test_mape", test_mape)
    mlflow.log_metric("train_mape", train_mape)

    mlflow.log_metric("test_r2", test_r2)
    mlflow.log_metric("train_r2", train_r2)

    mlflow.log_metric("test_rmse", test_rmse)
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.set_tag("algorithm","lightgbm")
    mlflow.lightgbm.log_model(model, "lightgbm_model")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000515 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3701
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 51
[LightGBM] [Info] Start training from score 182208.341610




🏃 View run nervous-stoat-922 at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2/runs/6900f6c56f934fd7a9ed73e0a68808de
🧪 View experiment at: https://dagshub.com/nipkha21/House-Prices---Advanced-Regression-Techniques.mlflow/#/experiments/2


In [53]:
# Feature names used for training: every remaining column except the target.
train_columns = train_data.columns.drop('SalePrice').tolist()

In [54]:
import pickle

In [55]:
# Persist the training feature names to train_columns.pkl.
with open('train_columns.pkl', 'wb') as columns_file:
    pickle.dump(train_columns, columns_file)

In [51]:
# Hand-maintained list of categorical column names (pickled in the next cell).
cat_columns = [
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'CentralAir', 'Condition1',
    'ExterQual', 'Exterior1st', 'Exterior2nd',
    'FireplaceQu',
    'GarageFinish', 'GarageType',
    'HeatingQC', 'HouseStyle',
    'KitchenQual',
    'LotConfig', 'LotShape',
    'MSZoning', 'MasVnrType',
    'Neighborhood',
    'PavedDrive',
    'SaleCondition', 'SaleType',
]

In [52]:
# Persist the categorical column names to cat_columns.pkl.
with open('cat_columns.pkl', 'wb') as columns_file:
    pickle.dump(cat_columns, columns_file)

In [None]:
from lightgbm import LGBMRegressor

# Fit a LightGBM regressor on the current training frame, outside any MLflow run.
# Note: this rebinds `model`, replacing the estimator fitted in the tracked runs.
final_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    categorical_feature=cat_features,
)
model = LGBMRegressor(**final_params)

final_features = train_data.drop('SalePrice',axis=1)
final_target = train_data['SalePrice']
model.fit(final_features, final_target)