In [62]:
import os
import sys
# Add src/ to sys.path (relative to current notebook)
sys.path.append(os.path.abspath("../../src"))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
import mlflow

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import  MLPRegressor
from xgboost import XGBRegressor
from tqdm import tqdm 


from utils.VariableAnalysis import UnivariateAnalysis,BivariateAnalysis

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

In [63]:
# Raw string: keeps the Windows backslashes literal instead of relying on
# unrecognised escape sequences, which newer Python versions warn about.
# NOTE(review): machine-specific absolute path — consider a configurable data directory.
file_path = pathlib.Path(r'D:\Capstone Project\dataset\House_Price_dataset')
# Join with the Path operator so the separator is handled by pathlib.
df = pd.read_csv(file_path / "10.gurgaon_properties_post_feature_selection_v2.csv")
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3,2,2,New Property,850.0,0,0,0,Low,Low Floor
1,flat,sector 89,0.95,2,2,2,New Property,1226.0,1,0,0,Low,Mid Floor
2,flat,sohna road,0.32,2,2,1,New Property,1000.0,0,0,0,Low,High Floor
3,flat,sector 92,1.6,3,4,3+,Relatively New,1615.0,1,0,1,High,Mid Floor
4,flat,sector 102,0.48,2,2,1,Relatively New,582.0,0,1,0,High,Mid Floor


In [64]:
## Count how many rows carry each furnishing_type code
df['furnishing_type'].value_counts()

furnishing_type
0    2349
1    1018
2     187
Name: count, dtype: int64

In [65]:
## Label decoding: swap the numeric furnishing codes for readable category names
#   0 -> unfurnished
#   1 -> semifurnished
#   2 -> furnished
furnishing_labels = {0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'}
df['furnishing_type'] = df['furnishing_type'].replace(furnishing_labels)

# Show the counts again to confirm the mapping kept the same distribution
df['furnishing_type'].value_counts()

furnishing_type
unfurnished      2349
semifurnished    1018
furnished         187
Name: count, dtype: int64

In [66]:
## Separate the dependent variable (price) from the independent variables
y = df['price']
X = df.drop(columns=['price'])

In [67]:
# Apply log1p (log(1 + y)) to the target so its distribution is closer to normal.
# Predictions made on this scale are converted back with np.expm1 in later cells.
y_log_tran = np.log1p(y)

## 1. Ordinal Encoding Approach for Categorical Values
- Pipeline preparation and test run

In [68]:
# Categorical feature columns handed to the categorical encoder
columns_to_encode = [
    'property_type',
    'sector',
    'balcony',
    'agePossession',
    'furnishing_type',
    'luxury_category',
    'floor_category',
]

In [69]:
# Preprocessing: standardise the numeric features and integer-encode the categorical ones
numeric_columns = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_transform', StandardScaler(), numeric_columns),
        ('categorical_tranform', OrdinalEncoder(), columns_to_encode),
    ],
    remainder='passthrough',
)

# Baseline pipeline: preprocessing followed by a plain linear regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# 10-fold cross-validated R2, computed against the log-transformed target
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_log_tran, cv=kfold, scoring='r2')
print(f"Score Mean:{scores.mean()}, Score std:{scores.std()}")

## Hold-out evaluation: 80/20 split, fit on the training part
X_train, X_test, y_train, y_test = train_test_split(X, y_log_tran, test_size=0.2, random_state=42)
print(pipeline.fit(X_train, y_train))

# Predict on the log scale, then undo log1p with expm1 so the error is in original price units
y_pred = np.expm1(pipeline.predict(X_test))
mabe = mean_absolute_error(np.expm1(y_test), y_pred)
print(f"mean_absolute_error:  {mabe}")

Score Mean:0.7363096633436828, Score std:0.03238005754429938
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numerical_transform',
                                                  StandardScaler(),
                                                  ['bedRoom', 'bathroom',
                                                   'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('categorical_tranform',
                                                  OrdinalEncoder(),
                                                  ['property_type', 'sector',
                                                   'balcony', 'agePossession',
                                                   'furnishing_type',
                                                   'luxury_c

In [70]:
# ### Original Script 
# def scorer(model_name, model, y_transformed=y_log_tran):
#     """Function For Different Models """
#     output = []
#     output.append(model_name)

#     pipeline = Pipeline([
#         ('preprocessor', preprocessor),
#         ('regressor', model)
#     ])

#     # K-fold cross-validation (Model Evaluation)
#     kfold = KFold(n_splits=10, shuffle=True, random_state=42)
#     scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
#     output.append(round(scores.mean(),4))
    
#     X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)   ## Train Test Split
#     pipeline.fit(X_train,y_train) ## Model Training 
#     y_pred = pipeline.predict(X_test) ## y^ Calulation 
#     y_pred = np.expm1(y_pred) ## Reversing Log normal tranformation into Original Scale in the Target Feature
#     output.append(mean_absolute_error(np.expm1(y_test),y_pred))
#     return output

# ### Models Dictionary
# model_dict = {
#     'LinearRegression':LinearRegression(),
#     'SVR':SVR(),
#     'Ridge':Ridge(),
#     'Lasso':Lasso(),
#     'DecisionTreeRegressor': DecisionTreeRegressor(),
#     'RandomForestRegressor':RandomForestRegressor(),
#     'ExtraTreesRegressor': ExtraTreesRegressor(),
#     'GradientBoostingRegressor': GradientBoostingRegressor(),
#     'AdaBoostRegressor': AdaBoostRegressor(),
#     'MLPRegressor': MLPRegressor(),
#     'XGBRegressor':XGBRegressor()
# }


# # Creating a column transformer for preprocessing
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('numerical_transform', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
#         ('categorical_tranform', OrdinalEncoder(), columns_to_encode)
#     ], 
#     remainder='passthrough'
# )

# ## Function Call for all Linear Models in dictionary 
# model_output = []
# for model_name,model in tqdm(model_dict.items()):
#     model_output.append(scorer(model_name, model))

# model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
# model_df.sort_values(['mae'])


In [71]:
## Modified Script For MLFLOW Tracking 

def scorer(model_name, model, y_transformed=y_log_tran):
    """Evaluate one regressor and collect the details to log to MLflow.

    The model is wrapped in a pipeline behind the notebook-level ``preprocessor``
    and scored two ways: mean R2 over a 10-fold shuffled cross-validation, and
    mean absolute error on a 20% hold-out split. The MAE is computed after
    undoing the log1p target transform with ``np.expm1``.

    Note: ``preprocessor`` and ``X`` are read from the notebook globals when the
    function is *called*, so re-defining ``preprocessor`` in a later cell changes
    what this function evaluates. The default for ``y_transformed`` is bound when
    this cell is executed.

    Parameters
    ----------
    model_name : str
        Label stored as the first element of the result row.
    model : estimator
        Regressor placed in the 'regressor' step; it is fitted in place.
    y_transformed : array-like
        Target passed to cross-validation and the train/test split; the
        ``np.expm1`` back-transform assumes it is on the log1p scale.

    Returns
    -------
    tuple
        ``(output, Mlflow_info)`` where ``output`` is ``[model_name, r2_mean, mae]``
        and ``Mlflow_info`` is a dict with the keys ``'transformers'``,
        ``'kfold_params'`` and ``'metric'``.
    """
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # Record which transformer object sits behind each named preprocessing step
    transformer_info = {name: transformer for name, transformer, _ in preprocessor.transformers}

    # K-fold cross-validation (model evaluation)
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    r2_mean = scores.mean()

    # Hold-out evaluation on a fixed 80/20 split
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    pipeline.fit(X_train, y_train)
    # Reverse the log1p transform so the error is reported on the original target scale
    y_pred = np.expm1(pipeline.predict(X_test))
    # Computed once and reused for both the result row and the MLflow payload
    mae = mean_absolute_error(np.expm1(y_test), y_pred)

    output = [model_name, r2_mean, mae]

    Mlflow_info = {
        'transformers': transformer_info,
        'kfold_params': {
            'n_splits': kfold.n_splits,
            'shuffle': kfold.shuffle,
            'random_state': kfold.random_state,
        },
        'metric': {'R2': r2_mean, 'MAE': mae},
    }

    return output, Mlflow_info

### Models Dictionary
# Every estimator that draws random numbers is seeded so that re-running the
# notebook reproduces the same scores. The seed matches the one already used
# for KFold and train_test_split in this notebook.
model_dict = {
    'LinearRegression':LinearRegression(),
    'SVR':SVR(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'RandomForestRegressor':RandomForestRegressor(random_state=42),
    'ExtraTreesRegressor': ExtraTreesRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=42),
    'MLPRegressor': MLPRegressor(random_state=42),
    'XGBRegressor':XGBRegressor(random_state=42)
}

## Score every model in the dictionary.
# Table rows and MLflow payloads are collected separately: rows in a list,
# payloads in a dict keyed by model name.
model_output = []
mlflow_experiment = {}
for model_name, model in tqdm(model_dict.items()):
    result_row, tracking_info = scorer(model_name, model)
    mlflow_experiment[model_name] = tracking_info
    model_output.append(result_row)

model_output

100%|██████████| 11/11 [00:20<00:00,  1.83s/it]


[['LinearRegression', np.float64(0.7363096633436828), 0.9463822160089355],
 ['SVR', np.float64(0.7642012011196353), 0.8472636473483927],
 ['Ridge', np.float64(0.7363125343993554), 0.9463387741853388],
 ['Lasso', np.float64(0.05943378064493572), 1.528905986892753],
 ['DecisionTreeRegressor', np.float64(0.775395040113438), 0.7202349594530469],
 ['RandomForestRegressor', np.float64(0.8821967569733975), 0.5429834375873436],
 ['ExtraTreesRegressor', np.float64(0.8681156462627382), 0.545822855179476],
 ['GradientBoostingRegressor',
  np.float64(0.8725977667416565),
  0.5758344449032665],
 ['AdaBoostRegressor', np.float64(0.759757778536479), 0.8339565332912158],
 ['MLPRegressor', np.float64(0.810798997190707), 0.6813618603305466],
 ['XGBRegressor', np.float64(0.8894876835260124), 0.5040475141482346]]

## For MLflow Experiment Tracking Only

In [72]:
import mlflow
from mlflow.tracking import MlflowClient

## MLflow tracking setup
# The tracking URI is set BEFORE the client is created: a client built without an
# explicit URI uses whatever tracking URI is configured at construction time, so
# creating it first would list experiments from the default store on a fresh kernel.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")  # if using remote/local server

client = MlflowClient()
# Works across most versions
experiments = client.search_experiments()

## Print the ID and name of every experiment known to the tracking server
for exp in experiments:
    print(f"{exp.experiment_id}: {exp.name}")

mlflow.set_experiment("House Price Prediction")       # will create if doesn't exist

## One MLflow run per model, named after the model
for key, val in mlflow_experiment.items():
    with mlflow.start_run(run_name=key):
        for k, v in val.items():
            if k in ('transformers', 'kfold_params'):
                # Logged as a single parameter holding the whole dict
                mlflow.log_param(k, v)
            elif k == 'metric':
                for metric_name, metric_value in v.items():
                    mlflow.log_metric(metric_name, metric_value)

2025/04/24 13:26:25 INFO mlflow.tracking.fluent: Experiment with name 'House Price Prediction' does not exist. Creating a new experiment.


0: Default
🏃 View run LinearRegression at: http://127.0.0.1:5000/#/experiments/631641031006492260/runs/39e213d249954cf4acd7f486bf288e01
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/631641031006492260
🏃 View run SVR at: http://127.0.0.1:5000/#/experiments/631641031006492260/runs/d468b0830a094debbf3dc2e36f31cebe
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/631641031006492260
🏃 View run Ridge at: http://127.0.0.1:5000/#/experiments/631641031006492260/runs/d19e1d1702cf430b8d6501d671300e1c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/631641031006492260
🏃 View run Lasso at: http://127.0.0.1:5000/#/experiments/631641031006492260/runs/4b240b0a56b249f596751a5c60c9775e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/631641031006492260
🏃 View run DecisionTreeRegressor at: http://127.0.0.1:5000/#/experiments/631641031006492260/runs/d86f2c91ded6455e975e3200ca1eab5b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/631641031006492260
🏃 View ru

In [None]:

# Step 1: Function to run the experiment
def run_experiment(vectorizer_type, ngram_range, vectorizer_max_features, vectorizer_name):
    """Train a RandomForestClassifier on vectorized text and log the run to MLflow.

    NOTE(review): this function reads ``df['clean_comment']`` and ``df['category']``
    from the notebook globals. The ``df`` loaded earlier in this notebook does not
    show those columns, so this cell looks like it belongs to a different
    (text-classification) dataset — confirm before running.

    Parameters
    ----------
    vectorizer_type : str
        ``"BoW"`` selects CountVectorizer; any other value selects TfidfVectorizer.
    ngram_range : tuple
        Passed to the vectorizer as ``ngram_range``.
    vectorizer_max_features : int
        Passed to the vectorizer as ``max_features``.
    vectorizer_name : str
        Label used in the run name, description, plot title and model artifact name.

    Returns
    -------
    None
        Parameters, metrics, the confusion-matrix image and the fitted model are
        logged to the active MLflow tracking location.
    """
    # Imported here because the notebook's import cell does not provide these names;
    # without them the function fails with NameError on first use.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    # Step 2: Vectorization
    if vectorizer_type == "BoW":
        vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)
    else:
        # Every value other than "BoW" falls back to TF-IDF
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)

    # Step 3: Stratified 80/20 split on the label column
    X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category'])

    # Fit the vocabulary on the training text only, then reuse it for the test text
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    # Step 4: Define and train a Random Forest model
    with mlflow.start_run() as run:
        # Set tags for the experiment and run
        mlflow.set_tag("mlflow.runName", f"{vectorizer_name}_{ngram_range}_RandomForest")
        mlflow.set_tag("experiment_type", "feature_engineering")
        mlflow.set_tag("model_type", "RandomForestClassifier")

        # Add a description
        mlflow.set_tag("description", f"RandomForest with {vectorizer_name}, ngram_range={ngram_range}, max_features={vectorizer_max_features}")

        # Log vectorizer parameters
        mlflow.log_param("vectorizer_type", vectorizer_type)
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("vectorizer_max_features", vectorizer_max_features)

        # Log Random Forest parameters
        n_estimators = 200
        max_depth = 15

        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)

        # Initialize and train the model
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(X_train, y_train)

        # Step 5: Make predictions and log metrics
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report: only the dict-valued entries are logged,
        # one metric per "<label>_<metric>" pair; scalar entries are skipped.
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log confusion matrix as an image artifact (written to the working directory first)
        conf_matrix = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title(f"Confusion Matrix: {vectorizer_name}, {ngram_range}")
        plt.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.close()

        # Log the model
        mlflow.sklearn.log_model(model, f"random_forest_model_{vectorizer_name}_{ngram_range}")

# Step 6: Sweep both vectorizers over unigram, bigram and trigram ranges
ngram_ranges = [(1, 1), (1, 2), (1, 3)]
max_features = 5000  # Example max feature size

for ngram_range in ngram_ranges:
    # For each n-gram range the BoW run is launched first, then the TF-IDF run;
    # the same label serves as both the vectorizer type and its display name.
    for vectorizer_label in ("BoW", "TF-IDF"):
        run_experiment(vectorizer_label, ngram_range, max_features, vectorizer_name=vectorizer_label)



In [61]:
# NOTE(review): this repeats the logging loop from the experiment-tracking cell above;
# running both cells creates a second set of runs for the same results.
for key, val in mlflow_experiment.items():
    with mlflow.start_run(run_name=key):
        for k, v in val.items():
            if k in ('transformers', 'kfold_params'):
                # Each of these dicts is logged as a single parameter
                mlflow.log_param(k, v)
            elif k == 'metric':
                for metric_name in v:
                    mlflow.log_metric(metric_name, v[metric_name])

🏃 View run LinearRegression at: http://127.0.0.1:5000/#/experiments/160764845707065275/runs/2c4077eb70da44c1bc5c4397dba189e3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/160764845707065275
🏃 View run SVR at: http://127.0.0.1:5000/#/experiments/160764845707065275/runs/c4bf623fcd3e4d0a8cbc27b262f75f01
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/160764845707065275
🏃 View run Ridge at: http://127.0.0.1:5000/#/experiments/160764845707065275/runs/c616c4fa513a4deeacb2b85d85b43f39
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/160764845707065275
🏃 View run Lasso at: http://127.0.0.1:5000/#/experiments/160764845707065275/runs/1135f207ce304613bf7a0e83f52f05a1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/160764845707065275
🏃 View run DecisionTreeRegressor at: http://127.0.0.1:5000/#/experiments/160764845707065275/runs/bf8ed477bd5c48b9ab0847bcb84d8f27
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/160764845707065275
🏃 View run RandomFor

In [6]:
# with mlflow.start_run(run_name="My_Run_1"):
#     mlflow.log_param("learning_rate", 0.01)
#     mlflow.log_metric("rmse", 3.22)


## 2. One-Hot Encoding Approach for Categorical Values
- Pipeline preparation and test run with regression models

In [12]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_log_tran, cv=kfold, scoring='r2')
print(f"Score Mean:{scores.mean()}, Score std:{scores.std()}")


X_train, X_test, y_train, y_test = train_test_split(X,y_log_tran,test_size=0.2,random_state=42)
print(pipeline.fit(X_train,y_train))

y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred) ## Reversing Log normal tranformation into Original Scale applied in the Target Feature
mabe = mean_absolute_error(np.expm1(y_test),y_pred)
print(f"mean_absolute_error:  {mabe}")

Score Mean:0.8546054073648314, Score std:0.01599847663314007
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['bedRoom', 'bathroom',
                                                   'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('cat', OrdinalEncoder(),
                                                  ['property_type', 'sector',
                                                   'balcony', 'agePossession',
                                                   'furnishing_type',
                                                   'luxury_category',
                                                   'floor_category']),
                                                 ('c

In [None]:
## Function Call for all Models in dictionary
# scorer returns a (result_row, mlflow_info) pair. Only the result row
# ([name, r2, mae]) is kept here, so model_output stays a list of 3-item rows
# that fits the ['name', 'r2', 'mae'] table built in the next cell.
model_output = []
for model_name, model in tqdm(model_dict.items()):
    result_row, _ = scorer(model_name, model)
    model_output.append(result_row)

model_output

100%|██████████| 11/11 [00:47<00:00,  4.35s/it]


[['linear_reg', np.float64(0.8547329323594415), 0.6491718597838251],
 ['svr', np.float64(0.7697814568996155), 0.8350527446470336],
 ['ridge', np.float64(0.8549844636010894), 0.653065857510922],
 ['LASSO', np.float64(0.05943378064493572), 1.528905986892753],
 ['decision tree', np.float64(0.803333407866767), 0.674313359361995],
 ['random forest', np.float64(0.8921681580717135), 0.49848474651435337],
 ['extra trees', np.float64(0.8953747469382535), 0.46663546513503557],
 ['gradient boosting', np.float64(0.8767580494487396), 0.5694740518859313],
 ['adaboost', np.float64(0.7546900545226712), 0.8498375466780826],
 ['mlp', np.float64(0.8720867129993891), 0.5197428958931],
 ['xgboost', np.float64(0.8941972876061056), 0.4914336243343084]]

In [14]:
# Tabulate the per-model scores and show them ordered by ascending MAE
model_df = pd.DataFrame(data=model_output, columns=['name', 'r2', 'mae'])
model_df.sort_values(by='mae')

Unnamed: 0,name,r2,mae
6,extra trees,0.894933,0.474956
10,xgboost,0.89585,0.493456
5,random forest,0.890276,0.498054
9,mlp,0.869908,0.55343
7,gradient boosting,0.876804,0.568952
0,linear_reg,0.854605,0.649738
2,ridge,0.854678,0.652914
4,decision tree,0.806491,0.70411
8,adaboost,0.751974,0.833694
1,svr,0.769741,0.834124
