# Setup and imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Read the three input CSVs from the sibling data directory.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
sample = pd.read_csv("../data/sample_submission.csv")


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors; `train` itself is not modified.
target_col = 'SalePrice'
y = train[target_col]
X = train.drop(columns=[target_col])

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Cleaning

In [None]:
# Remove the 'Id' column from both feature frames, keeping the removed values.
# Note that `test_ids` holds the Ids of the validation split (X_val).
train_ids, test_ids = X_train.pop('Id'), X_val.pop('Id')

In [None]:
# For each split, list the share of missing values of every column that has any.
with pd.option_context('display.max_rows', None):
    for frame in (X_train, X_val):
        na_share = frame.isna().mean()
        print(na_share[na_share > 0])

In [None]:
# Inspect the pool columns in both splits (the original author recorded that
# about 99.4% of the training PoolQC values are missing).
for frame in (X_train, X_val):
    print(frame['PoolArea'].value_counts(dropna=False))
    print(frame['PoolQC'].value_counts(dropna=False))
    print(frame['PoolQC'].isna().mean())

# Take PoolQC out of both feature frames; the removed columns are kept aside.
PoolQC_train, PoolQC_test = X_train.pop('PoolQC'), X_val.pop('PoolQC')

In [None]:
# Look at the value distribution of both Misc* columns (training split only).
for column in ('MiscFeature', 'MiscVal'):
    print(X_train[column].value_counts(dropna=False))

# Remove both columns from each split, keeping the removed Series.
MiscFeature_train, MiscVal_train = X_train.pop('MiscFeature'), X_train.pop('MiscVal')
MiscFeature_test, MiscVal_test = X_val.pop('MiscFeature'), X_val.pop('MiscVal')

In [None]:
# NOTE(review): a notebook only echoes a cell's last expression, so this
# frequency table is computed and discarded; wrap it in print() if it is wanted.
X_train['Alley'].value_counts(dropna=False)
# Remove Alley from both splits, keeping the removed Series.
Alley_train, Alley_val = X_train.pop('Alley'), X_val.pop('Alley')

In [None]:
# NOTE(review): not the cell's last expression, so this share is not displayed.
X_train['MasVnrType'].isna().mean()
# Drop the veneer type from both splits and treat a missing veneer area as 0.
MasVnrType_train, MasVnrType_test = X_train.pop('MasVnrType'), X_val.pop('MasVnrType')
for frame in (X_train, X_val):
    frame['MasVnrArea'] = frame['MasVnrArea'].fillna(0)

In [None]:
# Show the fireplace columns before imputation.
for column in ('Fireplaces', 'FireplaceQu'):
    print(X_train[column].value_counts(dropna=False))

# A missing FireplaceQu is recoded as the explicit level 'No Fireplace'.
for frame in (X_train, X_val):
    frame['FireplaceQu'] = frame['FireplaceQu'].fillna('No Fireplace')


In [None]:
# Re-print the fireplace columns of the training split.
for column in ('Fireplaces', 'FireplaceQu'):
    print(X_train[column].value_counts(dropna=False))

In [None]:
# Replace missing LotFrontage with 0 in each split and print the resulting counts.
for frame in (X_train, X_val):
    frame['LotFrontage'] = frame['LotFrontage'].fillna(0)
    print(frame['LotFrontage'].value_counts(dropna=False))

In [None]:
# Frequency table of Electrical in the training split, missing values included.
electrical_train = X_train['Electrical']
electrical_train.value_counts(dropna=False)

In [None]:
# Impute missing Electrical values with the mode of the training split. The
# same training-derived value is used for the validation split on purpose, so
# that nothing is learned from X_val.
most_common = X_train['Electrical'].mode()[0]

# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that form is chained assignment, which warns on pandas 2.x and
# leaves the frame unchanged when Copy-on-Write is enabled.
X_train['Electrical'] = X_train['Electrical'].fillna(most_common)
X_val['Electrical'] = X_val['Electrical'].fillna(most_common)

In [None]:
no_basement_value = "No Basement"
basement_fields = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

# Replace NaN in the basement columns with the explicit level 'No Basement'.
# The filled column is assigned back rather than using fillna(inplace=True) on
# a column selection, which is chained assignment and has no effect on the
# frame when pandas Copy-on-Write is enabled.
for field in basement_fields:
    X_train[field] = X_train[field].fillna(no_basement_value)
    X_val[field] = X_val[field].fillna(no_basement_value)

In [None]:
no_garage_value = "No Garage"

garage_cat_fields = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']

# Categorical garage columns: a missing value becomes the level 'No Garage'.
# Results are assigned back; fillna(inplace=True) on a column selection is
# chained assignment and does not update the frame under Copy-on-Write.
for field in garage_cat_fields:
    X_train[field] = X_train[field].fillna(no_garage_value)
    X_val[field] = X_val[field].fillna(no_garage_value)

# GarageYrBlt: impute with the training-split median in both splits, so the
# validation split contributes nothing to the imputed value.
median = X_train['GarageYrBlt'].median()

X_train['GarageYrBlt'] = X_train['GarageYrBlt'].fillna(median)
X_val['GarageYrBlt'] = X_val['GarageYrBlt'].fillna(median)

In [None]:
# Re-check both splits: print every column that still has missing values,
# together with its missing share.
with pd.option_context('display.max_rows', None):
    for frame in (X_train, X_val):
        na_share = frame.isna().mean()
        print(na_share[na_share > 0])

In [None]:
# Columns judged skewed / dominated by a single category are removed from both
# splits. Street and Condition2 are kept aside; RoofMatl and Heating are not.
streets_train, streets_test = X_train.pop('Street'), X_val.pop('Street')
condition_2_train, condition_2_test = X_train.pop('Condition2'), X_val.pop('Condition2')
for frame in (X_train, X_val):
    frame.pop('RoofMatl')
X_train.pop('Heating')
X_val.pop('Heating')

## Feature Engineering

In [None]:
print("Before:")
print(X_train['Fence'].value_counts(dropna=False))

# Collapse Fence to a binary flag: 1 when a value is present, 0 when missing.
for frame in (X_train, X_val):
    frame['Fence'] = frame['Fence'].notna().astype(int)

print("After: ")
print(X_train['Fence'].value_counts(dropna=False))

In [None]:
# Split the training columns by dtype in one pass: object columns are treated
# as categorical, everything else as numerical.
cat_cols, num_cols = [], []
for col in X_train.columns:
    bucket = cat_cols if X_train[col].dtype == 'object' else num_cols
    bucket.append(col)

# Second name bound to the same list object as cat_cols (not a copy).
original_cat_cols = cat_cols

print(f"Categorical columns ({len(cat_cols)}): {cat_cols}")
print(f"Numerical columns ({len(num_cols)}): {num_cols}")



In [None]:
target_encoded_fields = ['Neighborhood', 'Exterior1st', 'Exterior2nd']

# Fallback for validation rows whose category never occurs in the training
# split: without it those rows would be encoded as NaN.
global_target_mean = y_train.mean()

for field in target_encoded_fields:
    # Replace the category by the mean training target of that category. The
    # means come from the training split only and are reused for validation.
    target_mean = y_train.groupby(X_train[field]).mean()
    X_train[f'{field}_encoded'] = X_train[field].map(target_mean)
    X_val[f'{field}_encoded'] = X_val[field].map(target_mean).fillna(global_target_mean)
    # The raw categorical column is no longer needed once encoded.
    X_train.pop(field)
    X_val.pop(field)


In [None]:
# Frequency of each Condition1 level in the training split.
condition1_train = X_train['Condition1']
condition1_train.value_counts()

In [None]:
# A Condition1 level keeps its own category only when its training-split count
# is strictly greater than this value; all other levels become 'Other'.
threshold = 15

def transform_condition1(df, condition1_counts, threshold):
    """Collapse rare ``Condition1`` categories into ``'Other'``.

    Args:
        df: Frame with a ``Condition1`` column.
        condition1_counts: Mapping or Series from category to its frequency.
        threshold: A category is kept only if its count is strictly greater
            than this value.

    Returns:
        A list with one entry per row of ``df``: the original category when
        it is frequent enough, otherwise ``'Other'``.
    """
    # .get(..., 0) treats categories missing from the counts (for example
    # ones that occur only in the validation split) as rare instead of
    # raising KeyError, matching transform_saletype.
    return [
        value if condition1_counts.get(value, 0) > threshold else 'Other'
        for value in df['Condition1']
    ]

# Frequencies come from the training split only and are reused for validation.
condition1_counts = X_train['Condition1'].value_counts()

X_train['Condition1'] = transform_condition1(X_train, condition1_counts, threshold)
X_val['Condition1'] = transform_condition1(X_val, condition1_counts, threshold)

X_train_encoded = pd.get_dummies(X_train, columns=['Condition1'], drop_first=True, dtype=int)
X_val_encoded = pd.get_dummies(X_val, columns=['Condition1'], drop_first=True, dtype=int)

# The two frames are encoded independently, so a level present in only one
# split would leave them with different columns. Align them the same way the
# SaleType cell does; a dummy missing on one side is filled with 0.
X_train_encoded, X_val_encoded = X_train_encoded.align(X_val_encoded, join='outer', axis=1, fill_value=0)

X_train = X_train_encoded
X_val = X_val_encoded


In [None]:
# A SaleType level keeps its own category only when its training-split count
# is strictly greater than this value; all other levels become 'Other'.
threshold = 10

def transform_saletype(df, saletype_counts, threshold):
    """Collapse rare ``SaleType`` categories into ``'Other'``.

    A value is kept when its count in ``saletype_counts`` is strictly greater
    than ``threshold``; values absent from the counts are treated as having
    count 0. Returns a list with one entry per row of ``df``.
    """
    return [
        value if saletype_counts.get(value, 0) > threshold else 'Other'
        for value in df['SaleType']
    ]

# Frequencies come from the training split only and are reused for validation.
saletype_counts = X_train['SaleType'].value_counts()

for frame in (X_train, X_val):
    frame['SaleType'] = transform_saletype(frame, saletype_counts, threshold)

# One-hot encode each split, then give both the same set of columns; a dummy
# missing on one side is filled with 0.
dummy_options = dict(columns=['SaleType'], drop_first=True, dtype=int)
X_train_encoded, X_val_encoded = pd.get_dummies(X_train, **dummy_options).align(
    pd.get_dummies(X_val, **dummy_options), join='outer', axis=1, fill_value=0
)

X_train, X_val = X_train_encoded, X_val_encoded


In [None]:
# Print both frames with every column visible.
with pd.option_context('display.max_columns', None):
    for frame in (X_train, X_val):
        print(frame)

In [None]:
# Ordinal code tables, keyed by column name or by a shared group name.
# 'FireplaceQu' has its own entry, but that column is encoded through the
# 'Common' table below.
numericals = {
    'LandContour': {"Lvl": 3, "Bnk":2, "HLS":1, "Low": 0},
    'Utilities': {"AllPub": 3, "NoSewr": 2, "NoSeWa": 1, "ELO": 0},
    'LandSlope': {"Gtl": 2, "Mod": 1, "Sev": 0},
    'Common': {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "No Basement": 0, "No Garage": 0, "No Fireplace": 0},
    'BsmtFinType': {"GLQ":6, "ALQ":5, "BLQ":4, "Rec":3, "LwQ":2, "Unf":1, "No Basement": 0},
    'Functional': {"Typ": 7, "Min1":6, "Min2":5,"Mod":4, "Maj1":3, "Maj2":2, "Sev":1,"Sal": 0},
    'GarageFinish': {"Fin": 3, "RFn": 2, "Unf":1, "No Garage": 0},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No':1, 'No Basement': 0},
    'PavedDrive': {'Y': 2, 'P': 1, 'N': 0},
    'LotShape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},
    'FireplaceQu': {'No Fireplace': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
}

# Columns whose table is stored under their own name.
own_mapping_cols = ['LandContour', 'Utilities', 'LandSlope', 'Functional', 'GarageFinish', 'BsmtExposure', 'PavedDrive', 'LotShape']
# Columns sharing the 'Common' quality table.
common_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'KitchenQual', 'HeatingQC', 'GarageQual', 'GarageCond', "FireplaceQu"]
# Columns sharing the 'BsmtFinType' table.
bsmt_fin_cols = ['BsmtFinType1', 'BsmtFinType2']

# Resolve every column to its table, keeping the order own -> common -> basement.
column_to_mapping = {name: numericals[name] for name in own_mapping_cols}
column_to_mapping.update({name: numericals['Common'] for name in common_cols})
column_to_mapping.update({name: numericals['BsmtFinType'] for name in bsmt_fin_cols})

for name, mapping in column_to_mapping.items():
    for frame in (X_train, X_val):
        # replace() leaves values that are not in the table untouched.
        frame[name] = frame[name].replace(mapping)

# Binary
central_air_codes = {'Y': 1, 'N': 0}
for frame in (X_train, X_val):
    frame['CentralAir'] = frame['CentralAir'].map(central_air_codes)




In [None]:
# Collect the columns that are still of object dtype after the ordinal step.
cat_cols = []
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        cat_cols.append(col)

print(f"Categorical columns ({len(cat_cols)}): {cat_cols}")
# The index of nunique() is the list of those column names.
one_hot_columns = X_train[cat_cols].nunique().index.tolist()

In [None]:
# One-hot encode function
def apply_one_hot_encoding(X_train, X_val, columns, drop_first=True, dummy_na=True):
    """One-hot encode ``columns`` in both frames and give them equal columns.

    Each frame is encoded on its own with ``pd.get_dummies`` (integer
    dummies), then the two results are aligned on the union of their columns;
    a column missing from one side is added there and filled with 0.

    NOTE(review): because ``drop_first`` is applied per frame, the dropped
    level can differ between the frames when their observed levels differ.

    Returns:
        Tuple ``(encoded_train, encoded_val)``.
    """
    dummy_options = dict(columns=columns, drop_first=drop_first, dummy_na=dummy_na, dtype=int)
    encoded_train = pd.get_dummies(X_train, **dummy_options)
    encoded_val = pd.get_dummies(X_val, **dummy_options)
    return encoded_train.align(encoded_val, join='outer', axis=1, fill_value=0)

# One-hot encode every remaining categorical column in both splits.
X_train, X_val = apply_one_hot_encoding(X_train, X_val, columns=one_hot_columns)

# Print the encoded training frame with all columns visible.
with pd.option_context('display.max_columns', None):
    print(X_train)



# Feature Selection

## Correlation Filter

In [None]:
# Work on a copy so that X_train itself never gains the target column.
X_corr = X_train.copy()
# Attach the training target under the column name 'target'.
X_corr['target'] = y_train

In [None]:
# Pairwise correlations of all columns (target included), sign discarded.
signed_corr = X_corr.corr()
corr_matrix = signed_corr.abs()

In [None]:
import numpy as np
threshold = 0.8

# Only feature-vs-feature pairs are candidates. 'target' is a column of
# corr_matrix but not of X_train, so letting it into a pair would make the
# drop step fail with a KeyError (and it must never be proposed for dropping).
feature_names = [name for name in corr_matrix.columns if name != 'target']
# Absolute correlation of every column with the training target.
target_corr = corr_matrix['target']

# Walk the upper triangle so that each unordered pair is visited once.
high_corr_pairs = []
for i, feat1 in enumerate(feature_names):
    for feat2 in feature_names[i + 1:]:
        pair_corr = corr_matrix.loc[feat1, feat2]
        if pair_corr > threshold:
            high_corr_pairs.append((feat1, feat2, pair_corr))

if high_corr_pairs:
    print("Highly correlated feature pairs:")
    for feat1, feat2, corr in high_corr_pairs:
        print(f"{feat1} and {feat2}: {corr:.4f}")
else:
    print(f"No feature pairs with correlation above {threshold} found.")

# From each highly correlated pair, drop the feature that is less correlated
# with the target. The values are read from corr_matrix, which already holds
# absolute correlations with the training target.
features_to_drop = []
for feat1, feat2, _ in high_corr_pairs:
    if target_corr[feat1] < target_corr[feat2]:
        features_to_drop.append(feat1)
    else:
        features_to_drop.append(feat2)

# Remove duplicates; sorted so that the printed list is stable between runs.
features_to_drop = sorted(set(features_to_drop))
print(f"Features to drop due to high correlation: {features_to_drop}")

In [None]:
# NOTE(review): this literal rebinds features_to_drop, replacing whatever the
# correlation filter computed for that name.
features_to_drop = [
    'SaleType_New', 'GarageArea', 'Exterior1st_encoded', 'MSZoning_RL',
    'HouseStyle_2Story', '1stFlrSF', 'RoofStyle_Gable', 'GarageType_No Garage',
    'GarageCond', 'Fireplaces', 'TotRmsAbvGrd',
]

# Drop the same columns from both splits and report the training shape change.
X_filtered = X_train.drop(columns=features_to_drop)
X_val_filtered = X_val.drop(columns=features_to_drop)
print(f"Original shape: {X_train.shape}, New shape: {X_filtered.shape}")

X_train, X_val = X_filtered, X_val_filtered

## RFE example

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import  matplotlib.pyplot as plt

# Standardise the training features, then run recursive feature elimination
# around a linear regression, keeping 20 features and removing one per step.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(scaled_values, columns=X_train.columns)

model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=20, step=1)
rfe.fit(X_train_scaled, y_train)

# support_ is a boolean mask over the columns marking the retained features.
rfe_selected_features = X_train.columns[rfe.support_].tolist()

print("Features selected by RFE: ")
for position, name in enumerate(rfe_selected_features, start=1):
    print(f"{position}. {name}")

# Bar chart of the RFE rank of every feature, sorted ascending.
feature_ranking = pd.Series(rfe.ranking_, index=X_train.columns)
plt.figure(figsize=(12, 8))
feature_ranking.sort_values().plot(kind='bar')
plt.title('Top 20 Features by RFE Ranking (lower is better)')
plt.ylabel('Ranking')
plt.tight_layout()
plt.show()

## RFE on different models with experiment tracking

In [None]:

import os
from getpass import getpass

# SECURITY: an access token used to be hard-coded on this line. A secret that
# has been committed must be treated as compromised: revoke it and issue a new
# one. The token is now taken from the environment, with an interactive prompt
# as a fallback, so it never appears in the notebook source.
os.environ['MLFLOW_TRACKING_USERNAME'] = 'lmamu21'
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('MLflow tracking password / access token: ')
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/lmamu21/House-Prices.mlflow'

In [None]:
# DagsHub setup: initialise the client for the lmamu21/House-Prices repository
# with its MLflow integration switched on (mlflow=True).
import dagshub
dagshub.init(repo_owner='lmamu21', repo_name='House-Prices', mlflow=True)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
import  matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd

# Select the tracking server before touching experiments or runs, so that both
# are resolved against it (it used to be set after the run had started).
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/House-Prices.mlflow")
mlflow.set_experiment("House Prices - Advanced Regression Techniques")

# Single fit on the training split, scored once on the hold-out split.
with mlflow.start_run(run_name="LinearRegression_RFE_Scaling"):
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("random_state", 42)
    mlflow.log_param("n_features_to_select", 20)
    mlflow.log_param("rfe_step", 1)
    mlflow.log_param("scaling_method", "StandardScaler")

    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

    # Summarise the fitted scaler as two scalars: mean of the per-feature
    # means and mean of the per-feature variances.
    scale_mean = scaler.mean_.tolist()
    scale_var = scaler.var_.tolist()
    mlflow.log_param("scaler_mean_avg", np.mean(scale_mean))
    mlflow.log_param("scaler_var_avg", np.mean(scale_var))

    model = LinearRegression()
    rfe = RFE(estimator=model, n_features_to_select=20, step=1)
    rfe.fit(X_train_scaled, y_train)

    # Print and log the retained features (numbered from 1).
    rfe_selected_features = X_train.columns[rfe.support_].tolist()
    print("Features selected by RFE:")
    for i, feature in enumerate(rfe_selected_features, 1):
        print(f"{i}. {feature}")
        mlflow.log_param(f"selected_feature_{i}", feature)

    # Log the RFE rank of every feature as "rank_<feature>".
    feature_ranking = pd.Series(rfe.ranking_, index=X_train.columns)
    feature_ranking_dict = feature_ranking.to_dict()
    mlflow.log_params({f"rank_{feat}": rank for feat, rank in feature_ranking_dict.items()})

    # Log the fitted selector and scaler for future reference.
    mlflow.sklearn.log_model(rfe, "rfe_selector")
    mlflow.sklearn.log_model(scaler, "standard_scaler")

    # The validation split reuses the scaler and selector fitted on training data.
    X_train_selected = X_train_scaled.loc[:, rfe.support_]
    X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
    X_val_selected = X_val_scaled.loc[:, rfe.support_]

    # Train on the selected features and evaluate on the hold-out split.
    model.fit(X_train_selected, y_train)
    y_val_pred = model.predict(X_val_selected)
    val_rmse = root_mean_squared_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)

    mlflow.log_metric("val_rmse", val_rmse)
    mlflow.log_metric("val_r2", val_r2)

    mlflow.sklearn.log_model(model, "regression_model")
    # No explicit mlflow.end_run(): leaving the with-block ends the run.



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
import mlflow
import numpy as np
import pandas as pd

# Select the tracking server first, so the experiment is resolved against it.
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/House-Prices.mlflow")
mlflow.set_experiment("House Prices - Advanced Regression Techniques")

# 5-fold CV on the training split; scaling and RFE are refit inside each fold.
with mlflow.start_run(run_name="LinearRegression_RFE_Scaling_KFold"):

    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("random_state", 42)
    mlflow.log_param("n_features_to_select", 20)
    mlflow.log_param("rfe_step", 1)
    mlflow.log_param("scaling_method", "StandardScaler")
    mlflow.log_param("cv_folds", 5)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []
    r2_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), 1):
        X_tr, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Fit the scaler on the fold's training part only.
        scaler = StandardScaler()
        X_tr_scaled = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns)
        X_val_scaled = pd.DataFrame(scaler.transform(X_val_fold), columns=X_val_fold.columns)

        # Feature selection, also fitted on the fold's training part only.
        model = LinearRegression()
        rfe = RFE(estimator=model, n_features_to_select=20, step=1)
        rfe.fit(X_tr_scaled, y_tr)

        X_tr_sel = X_tr_scaled.loc[:, rfe.support_]
        X_val_sel = X_val_scaled.loc[:, rfe.support_]

        # Train and evaluate.
        model.fit(X_tr_sel, y_tr)
        preds = model.predict(X_val_sel)

        rmse = root_mean_squared_error(y_val_fold, preds)
        r2 = r2_score(y_val_fold, preds)

        print(f"Fold {fold}: RMSE = {rmse:.2f}, R² = {r2:.4f}")
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    # Log mean and standard deviation of the per-fold metrics.
    mlflow.log_metric("cv_rmse_mean", np.mean(rmse_scores))
    mlflow.log_metric("cv_rmse_std", np.std(rmse_scores))
    mlflow.log_metric("cv_r2_mean", np.mean(r2_scores))
    mlflow.log_metric("cv_r2_std", np.std(r2_scores))
    # No explicit mlflow.end_run(): leaving the with-block ends the run.


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
import mlflow
import numpy as np
import pandas as pd

# Select the tracking server first, so the experiment is resolved against it.
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/House-Prices.mlflow")
mlflow.set_experiment("House Prices - Advanced Regression Techniques")

# 5-fold CV on the training split with the MSSubClass column removed.
with mlflow.start_run(run_name="LinearRegression_RFE_Scaling_KFold_W/O_MSSubClass"):

    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("random_state", 42)
    mlflow.log_param("n_features_to_select", 20)
    mlflow.log_param("rfe_step", 1)
    mlflow.log_param("scaling_method", "StandardScaler")
    mlflow.log_param("cv_folds", 5)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []
    r2_scores = []
    # Feature matrix for this experiment; drop() returns a new frame, so
    # X_train is left unchanged.
    X_train_experiment = X_train.drop(columns=['MSSubClass'])

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_experiment), 1):
        # Slice the reduced frame. Slicing X_train here, as before, silently
        # put MSSubClass back into every fold.
        X_tr, X_val_fold = X_train_experiment.iloc[train_idx], X_train_experiment.iloc[val_idx]
        y_tr, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Fit the scaler on the fold's training part only.
        scaler = StandardScaler()
        X_tr_scaled = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns)
        X_val_scaled = pd.DataFrame(scaler.transform(X_val_fold), columns=X_val_fold.columns)

        # Feature selection, also fitted on the fold's training part only.
        model = LinearRegression()
        rfe = RFE(estimator=model, n_features_to_select=20, step=1)
        rfe.fit(X_tr_scaled, y_tr)

        X_tr_sel = X_tr_scaled.loc[:, rfe.support_]
        X_val_sel = X_val_scaled.loc[:, rfe.support_]

        # Train and evaluate.
        model.fit(X_tr_sel, y_tr)
        preds = model.predict(X_val_sel)

        rmse = root_mean_squared_error(y_val_fold, preds)
        r2 = r2_score(y_val_fold, preds)

        print(f"Fold {fold}: RMSE = {rmse:.2f}, R² = {r2:.4f}")
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    # Log mean and standard deviation of the per-fold metrics.
    mlflow.log_metric("cv_rmse_mean", np.mean(rmse_scores))
    mlflow.log_metric("cv_rmse_std", np.std(rmse_scores))
    mlflow.log_metric("cv_r2_mean", np.mean(r2_scores))
    mlflow.log_metric("cv_r2_std", np.std(r2_scores))
    # No explicit mlflow.end_run(): leaving the with-block ends the run.


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
import mlflow
import numpy as np
import pandas as pd

# Select the tracking server first, so the experiment is resolved against it.
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/House-Prices.mlflow")
mlflow.set_experiment("House Prices - Advanced Regression Techniques")

# 10-fold CV on the training split, keeping 15 features per fold.
# NOTE(review): the run name says "W/O_MSSubClass" but this cell uses all
# columns of X_train; confirm whether the name or the data is what was meant.
with mlflow.start_run(run_name="LinearRegression_RFE_Scaling_KFold_W/O_MSSubClass"):

    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("random_state", 42)
    mlflow.log_param("n_features_to_select", 15)
    mlflow.log_param("rfe_step", 1)
    mlflow.log_param("scaling_method", "StandardScaler")
    mlflow.log_param("cv_folds", 10)

    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    rmse_scores = []
    r2_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), 1):
        X_tr, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Fit the scaler on the fold's training part only.
        scaler = StandardScaler()
        X_tr_scaled = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns)
        X_val_scaled = pd.DataFrame(scaler.transform(X_val_fold), columns=X_val_fold.columns)

        # Feature selection, also fitted on the fold's training part only.
        model = LinearRegression()
        rfe = RFE(estimator=model, n_features_to_select=15, step=1)
        rfe.fit(X_tr_scaled, y_tr)

        X_tr_sel = X_tr_scaled.loc[:, rfe.support_]
        X_val_sel = X_val_scaled.loc[:, rfe.support_]

        # Train and evaluate.
        model.fit(X_tr_sel, y_tr)
        preds = model.predict(X_val_sel)

        rmse = root_mean_squared_error(y_val_fold, preds)
        r2 = r2_score(y_val_fold, preds)

        print(f"Fold {fold}: RMSE = {rmse:.2f}, R² = {r2:.4f}")
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    # Log mean and standard deviation of the per-fold metrics.
    mlflow.log_metric("cv_rmse_mean", np.mean(rmse_scores))
    mlflow.log_metric("cv_rmse_std", np.std(rmse_scores))
    mlflow.log_metric("cv_r2_mean", np.mean(r2_scores))
    mlflow.log_metric("cv_r2_std", np.std(r2_scores))
    # No explicit mlflow.end_run(): leaving the with-block ends the run.


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
import mlflow
import numpy as np
import pandas as pd

# Select the tracking server first, so the experiment is resolved against it.
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/House-Prices.mlflow")
mlflow.set_experiment("House Prices - Advanced Regression Techniques")

# 5-fold CV with MSSubClass removed, keeping 10 features per fold.
# NOTE(review): the run name does not mention that MSSubClass is dropped.
with mlflow.start_run(run_name="LinearRegression_RFE_Scaling_KFold"):

    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("random_state", 42)
    mlflow.log_param("n_features_to_select", 10)
    mlflow.log_param("rfe_step", 1)
    mlflow.log_param("scaling_method", "StandardScaler")
    mlflow.log_param("cv_folds", 5)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []
    r2_scores = []
    # Feature matrix for this experiment; drop() returns a new frame, so
    # X_train is left unchanged.
    X_train_experiment = X_train.drop(columns=['MSSubClass'])

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_experiment), 1):
        # Slice the reduced frame. Slicing X_train here, as before, silently
        # put MSSubClass back into every fold.
        X_tr, X_val_fold = X_train_experiment.iloc[train_idx], X_train_experiment.iloc[val_idx]
        y_tr, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Fit the scaler on the fold's training part only.
        scaler = StandardScaler()
        X_tr_scaled = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns)
        X_val_scaled = pd.DataFrame(scaler.transform(X_val_fold), columns=X_val_fold.columns)

        # Feature selection, also fitted on the fold's training part only.
        model = LinearRegression()
        rfe = RFE(estimator=model, n_features_to_select=10, step=1)
        rfe.fit(X_tr_scaled, y_tr)

        X_tr_sel = X_tr_scaled.loc[:, rfe.support_]
        X_val_sel = X_val_scaled.loc[:, rfe.support_]

        # Train and evaluate.
        model.fit(X_tr_sel, y_tr)
        preds = model.predict(X_val_sel)

        rmse = root_mean_squared_error(y_val_fold, preds)
        r2 = r2_score(y_val_fold, preds)

        print(f"Fold {fold}: RMSE = {rmse:.2f}, R² = {r2:.4f}")
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    # Log mean and standard deviation of the per-fold metrics.
    mlflow.log_metric("cv_rmse_mean", np.mean(rmse_scores))
    mlflow.log_metric("cv_rmse_std", np.std(rmse_scores))
    mlflow.log_metric("cv_r2_mean", np.mean(r2_scores))
    mlflow.log_metric("cv_r2_std", np.std(r2_scores))
    # No explicit mlflow.end_run(): leaving the with-block ends the run.


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
import mlflow
import numpy as np
import pandas as pd

# Select the tracking server first, so the experiment is resolved against it.
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/House-Prices.mlflow")
mlflow.set_experiment("House Prices - Advanced Regression Techniques")

# 10-fold CV with MinMax scaling and MSSubClass removed, 15 features per fold.
with mlflow.start_run(run_name="LinearRegression_RFE_Scaling_KFold_W/O_MSSubClass"):

    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("random_state", 42)
    mlflow.log_param("n_features_to_select", 15)
    mlflow.log_param("rfe_step", 1)
    mlflow.log_param("scaling_method", "MinMaxScaler")
    mlflow.log_param("cv_folds", 10)

    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    rmse_scores = []
    r2_scores = []
    # Feature matrix for this experiment; drop() returns a new frame, so
    # X_train is left unchanged.
    X_train_experiment = X_train.drop(columns=['MSSubClass'])

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_experiment), 1):
        # Slice the reduced frame. Slicing X_train here, as before, silently
        # put MSSubClass back into every fold.
        X_tr, X_val_fold = X_train_experiment.iloc[train_idx], X_train_experiment.iloc[val_idx]
        y_tr, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Fit the scaler on the fold's training part only.
        scaler = MinMaxScaler()
        X_tr_scaled = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns)
        X_val_scaled = pd.DataFrame(scaler.transform(X_val_fold), columns=X_val_fold.columns)

        # Feature selection, also fitted on the fold's training part only.
        model = LinearRegression()
        rfe = RFE(estimator=model, n_features_to_select=15, step=1)
        rfe.fit(X_tr_scaled, y_tr)

        X_tr_sel = X_tr_scaled.loc[:, rfe.support_]
        X_val_sel = X_val_scaled.loc[:, rfe.support_]

        # Train and evaluate.
        model.fit(X_tr_sel, y_tr)
        preds = model.predict(X_val_sel)

        rmse = root_mean_squared_error(y_val_fold, preds)
        r2 = r2_score(y_val_fold, preds)

        print(f"Fold {fold}: RMSE = {rmse:.2f}, R² = {r2:.4f}")
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    # Log mean and standard deviation of the per-fold metrics.
    mlflow.log_metric("cv_rmse_mean", np.mean(rmse_scores))
    mlflow.log_metric("cv_rmse_std", np.std(rmse_scores))
    mlflow.log_metric("cv_r2_mean", np.mean(r2_scores))
    mlflow.log_metric("cv_r2_std", np.std(r2_scores))
    # No explicit mlflow.end_run(): leaving the with-block ends the run.


In [None]:

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
import  matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd

# Select the tracking server before touching experiments or runs, so that both
# are resolved against it (it used to be set after the run had started).
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/House-Prices.mlflow")
mlflow.set_experiment("House Prices - Advanced Regression Techniques")

# Single fit on the training split with MinMax scaling, scored on the hold-out split.
with mlflow.start_run(run_name="LinearRegression_RFE_Scaling"):
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("random_state", 42)
    mlflow.log_param("n_features_to_select", 20)
    mlflow.log_param("rfe_step", 1)
    # Record the scaler that is actually used below (this was logged as
    # "StandardScaler", which mislabelled the run).
    mlflow.log_param("scaling_method", "MinMaxScaler")

    scaler = MinMaxScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

    model = LinearRegression()
    rfe = RFE(estimator=model, n_features_to_select=20, step=1)
    rfe.fit(X_train_scaled, y_train)

    # Print and log the retained features (numbered from 1).
    rfe_selected_features = X_train.columns[rfe.support_].tolist()
    print("Features selected by RFE:")
    for i, feature in enumerate(rfe_selected_features, 1):
        print(f"{i}. {feature}")
        mlflow.log_param(f"selected_feature_{i}", feature)

    # Log the RFE rank of every feature as "rank_<feature>".
    feature_ranking = pd.Series(rfe.ranking_, index=X_train.columns)
    feature_ranking_dict = feature_ranking.to_dict()
    mlflow.log_params({f"rank_{feat}": rank for feat, rank in feature_ranking_dict.items()})

    # Log the fitted selector and scaler for future reference.
    # NOTE(review): the artifact path "standard_scaler" is kept unchanged, but
    # the object stored under it in this run is a MinMaxScaler.
    mlflow.sklearn.log_model(rfe, "rfe_selector")
    mlflow.sklearn.log_model(scaler, "standard_scaler")

    # The validation split reuses the scaler and selector fitted on training data.
    X_train_selected = X_train_scaled.loc[:, rfe.support_]
    X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
    X_val_selected = X_val_scaled.loc[:, rfe.support_]

    # Train on the selected features and evaluate on the hold-out split.
    model.fit(X_train_selected, y_train)
    y_val_pred = model.predict(X_val_selected)
    val_rmse = root_mean_squared_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)

    mlflow.log_metric("val_rmse", val_rmse)
    mlflow.log_metric("val_r2", val_r2)

    mlflow.sklearn.log_model(model, "regression_model")
    # No explicit mlflow.end_run(): leaving the with-block ends the run.



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
import  matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd

# Select the tracking server before touching experiments or runs, so that both
# are resolved against it (it used to be set after the run had started).
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/House-Prices.mlflow")
mlflow.set_experiment("House Prices - Advanced Regression Techniques")

# Single fit on the training split with standard scaling, scored on the
# hold-out split.
with mlflow.start_run(run_name="LinearRegression_RFE_Scaling"):
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("random_state", 42)
    mlflow.log_param("n_features_to_select", 20)
    mlflow.log_param("rfe_step", 1)
    mlflow.log_param("scaling_method", "StandardScaler")

    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

    # Summarise the fitted scaler as two scalars: mean of the per-feature
    # means and mean of the per-feature variances.
    scale_mean = scaler.mean_.tolist()
    scale_var = scaler.var_.tolist()
    mlflow.log_param("scaler_mean_avg", np.mean(scale_mean))
    mlflow.log_param("scaler_var_avg", np.mean(scale_var))

    model = LinearRegression()
    rfe = RFE(estimator=model, n_features_to_select=20, step=1)
    rfe.fit(X_train_scaled, y_train)

    # Print and log the retained features (numbered from 1).
    rfe_selected_features = X_train.columns[rfe.support_].tolist()
    print("Features selected by RFE:")
    for i, feature in enumerate(rfe_selected_features, 1):
        print(f"{i}. {feature}")
        mlflow.log_param(f"selected_feature_{i}", feature)

    # Log the RFE rank of every feature as "rank_<feature>".
    feature_ranking = pd.Series(rfe.ranking_, index=X_train.columns)
    feature_ranking_dict = feature_ranking.to_dict()
    mlflow.log_params({f"rank_{feat}": rank for feat, rank in feature_ranking_dict.items()})

    # Log the fitted selector and scaler for future reference.
    mlflow.sklearn.log_model(rfe, "rfe_selector")
    mlflow.sklearn.log_model(scaler, "standard_scaler")

    # The validation split reuses the scaler and selector fitted on training data.
    X_train_selected = X_train_scaled.loc[:, rfe.support_]
    X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
    X_val_selected = X_val_scaled.loc[:, rfe.support_]

    # Train on the selected features and evaluate on the hold-out split.
    model.fit(X_train_selected, y_train)
    y_val_pred = model.predict(X_val_selected)
    val_rmse = root_mean_squared_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)

    mlflow.log_metric("val_rmse", val_rmse)
    mlflow.log_metric("val_r2", val_r2)

    mlflow.sklearn.log_model(model, "regression_model")
    # No explicit mlflow.end_run(): leaving the with-block ends the run.



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
import mlflow
import numpy as np
import pandas as pd

import joblib
# Best prediction

# Select the tracking server first, so the experiment is resolved against it.
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/House-Prices.mlflow")
mlflow.set_experiment("House Prices - Advanced Regression Techniques")

# NOTE(review): the scaler and the selector are fitted on all of X_train
# before the folds are formed, so each fold's validation rows have already
# influenced both; the fold RMSE values are therefore likely optimistic.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=20)
X_selected = rfe.fit_transform(X_scaled, y_train)

kf = KFold(n_splits=10, shuffle=True, random_state=42)

rmse_scores = []

with mlflow.start_run(run_name="LinearRegression_Final_KFold"):
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_selected)):
        X_train_fold, X_val_fold = X_selected[train_idx], X_selected[val_idx]
        # Fold indices are row positions within X_selected, whose rows follow
        # X_train, so the targets must be taken positionally from y_train.
        # Indexing the full target `y` with them, as before, looked the values
        # up by index label and paired rows with the wrong prices.
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_train_fold, y_train_fold)
        preds = model.predict(X_val_fold)
        rmse = root_mean_squared_error(y_val_fold, preds)
        rmse_scores.append(rmse)

        mlflow.log_metric(f"rmse_fold_{fold}", rmse)

    avg_rmse = np.mean(rmse_scores)
    mlflow.log_metric("avg_kfold_rmse", avg_rmse)

    # Refit on every training row before saving: otherwise the registered
    # model is simply whatever the last fold happened to fit.
    model.fit(X_selected, y_train)

    # Save artifacts: the model goes to the registry, scaler and selector are
    # pickled locally and uploaded as run artifacts.
    mlflow.sklearn.log_model(model, artifact_path="model", registered_model_name="BestHousePriceModel")
    joblib.dump(scaler, "standard_scaler.pkl")
    joblib.dump(rfe, "rfe_selector.pkl")
    mlflow.log_artifact("standard_scaler.pkl")
    mlflow.log_artifact("rfe_selector.pkl")

    mlflow.log_param("k_folds", 10)
    mlflow.log_param("rfe_features", 20)