In [44]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of each file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [45]:
!pip install mlflow dagshub



# Imports

In [46]:
import warnings

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from mlflow.models.signature import infer_signature
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')


### DagsHub Login

In [47]:
# `dagshub` is pip-installed in an earlier cell but was never imported, so on a
# fresh kernel the call below raised NameError. Import it where it is used.
import dagshub

# Point MLflow at the DagsHub-hosted tracking server for this repository.
dagshub.init(repo_owner='lkata22', repo_name='house-prices-regression', mlflow=True)
# Select the experiment that the runs started in this notebook are recorded under.
mlflow.set_experiment("HousePriceRegression-5")

<Experiment: artifact_location='mlflow-artifacts:/1f4dc3c1e00947b9bf0d971135d60da7', creation_time=1744319294493, experiment_id='9', last_update_time=1744319294493, lifecycle_stage='active', name='HousePriceRegression-5', tags={}>

# Prepare Data

In [None]:
# Load the training data and discard the Id column
train = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
).drop(columns='Id')

# Target variable (log-transformed for better modeling).
# pop() returns the column and removes it from the feature frame in one step.
y = np.log1p(train.pop('SalePrice'))

# Hold out 20% of the rows for validation, with a fixed seed for reproducibility
X_train, X_val, y_train, y_val = train_test_split(
    train, y, random_state=42, test_size=0.2
)

# Feature Engineering

## Class - Feature Engineering

In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds aggregate features and drops their sources.

    ``transform`` adds ``TotalSF``, ``TotalBath``, ``TotalPorch``, ``Age``,
    ``RemodAge``, ``IsRemod`` and ``IsNew`` to a copy of the input frame, then
    removes every column named in ``columns_to_drop`` that is present.
    """

    def __init__(self):
        # Raw columns that are combined into the derived features and then removed.
        self.columns_to_drop = [
            'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
            'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath',
            'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
            'YearBuilt', 'YearRemodAdd', 'YrSold'
        ]

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the derived columns added and the source columns dropped.

        Raises ``KeyError`` if any column needed to build a derived feature is missing.
        """
        frame = X.copy()
        sold = frame['YrSold']
        built = frame['YearBuilt']
        remodelled = frame['YearRemodAdd']

        derived = {
            'TotalSF': frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF'],
            # Half baths count as half a bathroom.
            'TotalBath': (
                frame['FullBath'] + 0.5 * frame['HalfBath']
                + frame['BsmtFullBath'] + 0.5 * frame['BsmtHalfBath']
            ),
            'TotalPorch': (
                frame['OpenPorchSF'] + frame['EnclosedPorch']
                + frame['3SsnPorch'] + frame['ScreenPorch']
            ),
            'Age': sold - built,
            'RemodAge': sold - remodelled,
            'IsRemod': (built != remodelled).astype(int),
            'IsNew': (sold == built).astype(int),
        }
        frame = frame.assign(**derived)

        # Drop original columns (only those actually present in the frame)
        present = [name for name in self.columns_to_drop if name in frame.columns]
        return frame.drop(columns=present)


# Cleaning

In [None]:
class HandleMissingValues(BaseEstimator, TransformerMixin):
    """Fill missing values in a DataFrame.

    Numeric (``int64``/``float64``) columns whose name contains ``'SF'``,
    ``'Area'`` or ``'MiscVal'`` are filled with 0; all other numeric columns
    are filled with a median. Object columns are filled with the string
    ``'None'``.

    The medians are learned in ``fit`` so that the same values are applied to
    training and validation/test data. If ``transform`` is called on an
    unfitted instance, or for a column that was not seen in ``fit``, the median
    of the frame being transformed is used instead.
    """

    # Substrings that mark numeric columns where a missing value is filled with 0.
    _ZERO_FILL_TOKENS = ('SF', 'Area', 'MiscVal')
    _NUMERIC_DTYPES = ['int64', 'float64']

    def fit(self, X, y=None):
        """Record the median of every numeric column of ``X`` and return ``self``."""
        self.medians_ = {}
        if isinstance(X, pd.DataFrame):
            numeric = X.select_dtypes(include=self._NUMERIC_DTYPES)
            self.medians_ = numeric.median().to_dict()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values filled; ``X`` itself is not modified."""
        X = X.copy()
        medians = getattr(self, 'medians_', {})

        # Fill numerical features
        for col in X.select_dtypes(include=self._NUMERIC_DTYPES).columns:
            if not X[col].isnull().any():
                continue
            if any(token in col for token in self._ZERO_FILL_TOKENS):
                fill_value = 0
            else:
                fill_value = medians.get(col)
                # No usable fitted median (unfitted, unseen column, or all-NaN
                # during fit): fall back to the median of the current frame.
                if fill_value is None or pd.isna(fill_value):
                    fill_value = X[col].median()
            # Assign the result back: `X[col].fillna(..., inplace=True)` operates on
            # a temporary object and does not update X under pandas Copy-on-Write.
            X[col] = X[col].fillna(fill_value)

        # Fill categorical features
        for col in X.select_dtypes(include=['object']).columns:
            if X[col].isnull().any():
                X[col] = X[col].fillna('None')

        return X

# Feature Transformation

In [None]:
class OutlierHandler(BaseEstimator, TransformerMixin):
    """Clip numeric columns that contain outliers to ``mean ± threshold * std``.

    ``fit`` finds the ``int64``/``float64`` columns with at least one value
    whose absolute z-score exceeds ``threshold`` and stores the clipping bounds
    for each of them. ``transform`` clips those columns to the stored bounds,
    so the same limits are applied to training and validation/test data.

    Parameters
    ----------
    threshold : float, default=3
        Number of standard deviations from the mean beyond which a value is
        treated as an outlier.

    Attributes
    ----------
    outlier_cols : list
        Columns found to contain outliers during the most recent ``fit``.
    bounds_ : dict
        Maps each column in ``outlier_cols`` to its ``(lower, upper)`` bounds.
    """

    def __init__(self, threshold=3):
        self.threshold = threshold
        self.outlier_cols = []

    def fit(self, X, y=None):
        """Detect outlier columns in ``X`` and learn their clipping bounds."""
        # Start from a clean state so repeated fits do not accumulate duplicates.
        self.outlier_cols = []
        self.bounds_ = {}
        num_cols = X.select_dtypes(include=['int64', 'float64']).columns
        for col in num_cols:
            mean = X[col].mean()
            std = X[col].std()
            z_scores = (X[col] - mean) / std
            if (z_scores.abs() > self.threshold).any():
                self.outlier_cols.append(col)
                self.bounds_[col] = (
                    mean - (self.threshold * std),
                    mean + (self.threshold * std),
                )
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every outlier column clipped to its bounds."""
        X = X.copy()
        bounds = getattr(self, 'bounds_', {})
        for col in self.outlier_cols:
            if col in bounds:
                lower_bound, upper_bound = bounds[col]
            else:
                # `outlier_cols` was populated without fit(): keep the previous
                # behaviour of deriving the bounds from the frame at hand.
                mean = X[col].mean()
                std = X[col].std()
                lower_bound = mean - (self.threshold * std)
                upper_bound = mean + (self.threshold * std)
            X[col] = np.where(X[col] > upper_bound, upper_bound,
                              np.where(X[col] < lower_bound, lower_bound, X[col]))
        return X

In [None]:
# First apply feature engineering to get correct columns
# (the engineered frames are used here only to discover the post-engineering schema)
feature_engineer = FeatureEngineer()
X_train_eng = feature_engineer.fit_transform(X_train)
X_val_eng = feature_engineer.transform(X_val)

# Get numerical and categorical columns after feature engineering
num_cols = X_train_eng.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train_eng.select_dtypes(include=['object']).columns

# Numerical pipeline: median imputation followed by standardisation
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: constant imputation followed by one-hot encoding
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
    # 1.4, where passing `sparse=False` raises TypeError.
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Full preprocessing pipeline: route each column group to its own sub-pipeline
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Main pipeline: custom transformers run first so that the column names selected
# above exist by the time the ColumnTransformer is applied
main_pipeline = Pipeline([
    ('feature_engineer', FeatureEngineer()),
    ('handle_missing', HandleMissingValues()),
    ('outlier_handler', OutlierHandler()),
    ('preprocessor', preprocessor)
])

# Candidate estimators and their hyper-parameter grids.
# Grid keys carry the `model__` prefix because each estimator is installed as the
# 'model' step of the complete pipeline.
models = dict(
    LinearRegression=dict(
        model=LinearRegression(),
        params=dict(),
    ),
    RandomForest=dict(
        model=RandomForestRegressor(random_state=42),
        params=dict(
            model__n_estimators=[100, 200],
            model__max_depth=[None, 10, 20],
            model__min_samples_split=[2, 5],
        ),
    ),
    XGBoost=dict(
        model=XGBRegressor(random_state=42, enable_categorical=True),
        params=dict(
            model__n_estimators=[100, 200],
            model__learning_rate=[0.01, 0.1],
            model__max_depth=[3, 6, 9],
            model__subsample=[0.8, 1.0],
            model__colsample_bytree=[0.8, 1.0],
        ),
    ),
)

# Run one MLflow-tracked grid search per candidate and remember the best performer
best_score, best_model = float('inf'), None

for model_name, config in models.items():
    with mlflow.start_run(run_name=f"{model_name}_experiment"):
        # Chain the shared preprocessing with this candidate's estimator
        complete_pipeline = Pipeline(steps=[
            ('preprocessing', main_pipeline),
            ('model', config['model']),
        ])

        # 5-fold grid search scored by negative MSE, using every available core
        grid_search = GridSearchCV(
            estimator=complete_pipeline,
            param_grid=config['params'],
            scoring='neg_mean_squared_error',
            cv=5,
            verbose=1,
            n_jobs=-1,
        )
        grid_search.fit(X_train, y_train)

        # Best pipeline found by the search
        model = grid_search.best_estimator_

        # Predict on both splits with the selected pipeline
        y_pred_train, y_pred_val = model.predict(X_train), model.predict(X_val)

        # RMSE on the log1p-transformed target
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

        # Record the winning hyper-parameters and both scores on the run
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metrics({'train_rmse': train_rmse, 'val_rmse': val_rmse})

        # Store the fitted pipeline together with its input/output signature
        signature = infer_signature(X_train, y_pred_train)
        mlflow.sklearn.log_model(model, model_name, signature=signature)

        print(f"{model_name} - Train RMSE: {train_rmse:.4f}, Val RMSE: {val_rmse:.4f}")
        print(f"Best params: {grid_search.best_params_}")

        # Keep the candidate with the lowest validation RMSE seen so far
        if val_rmse < best_score:
            best_score, best_model, best_model_name = val_rmse, model, model_name
            
# Register best model
if best_model is not None:
    # Keep a handle on the run: once the `with` block exits the run is ended and
    # mlflow.active_run() returns None, so the run id must be read from this object.
    with mlflow.start_run(run_name="best_model") as best_run:
        mlflow.log_metric("best_val_rmse", best_score)
        mlflow.sklearn.log_model(best_model, "best_model")
        mlflow.set_tag("best_model", best_model_name)

    # Save best model to registry, pointing at the artifact logged in the run above
    model_uri = f"runs:/{best_run.info.run_id}/best_model"
    registered_model = mlflow.register_model(model_uri, "HousePricePredictor")

    print(f"Best model: {best_model_name} with validation RMSE: {best_score:.4f}")