In [10]:
import os
import sys
# Add src/ to sys.path (relative to current notebook)
sys.path.append(os.path.abspath("../../src"))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
import mlflow

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import  MLPRegressor
from xgboost import XGBRegressor
from tqdm import tqdm 


from utils.VariableAnalysis import UnivariateAnalysis,BivariateAnalysis

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

In [11]:
file_path = pathlib.Path('D:\Capstone Project\dataset\House_Price_dataset')
df = pd.read_csv(f"{file_path}/gurgaon_properties_post_feature_selection_v2.csv")
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [12]:
## Furniture Type Unique Value
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [13]:
## Level Incoding 
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})
df['furnishing_type'].value_counts()

furnishing_type
unfurnished      2349
semifurnished    1018
furnished         187
Name: count, dtype: int64

In [14]:
## Data split into dependent variable and Independent variable 
X = df.drop(columns=['price'])
y = df['price']

In [15]:
# Applying the log1p transformation to the target variable for normal distribution
y_log_tran = np.log1p(y)

In [16]:
## Categorical Column 
categorical_col = X.select_dtypes(include=(object)).columns.tolist()
categorical_col

['property_type',
 'sector',
 'balcony',
 'agePossession',
 'furnishing_type',
 'luxury_category',
 'floor_category']

In [17]:
## Numerical Column 
numerical_cols = X.select_dtypes(include=(int,float)).columns.tolist()
numerical_cols

['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

## 1. Ordinal Encoding Approach for Categoriacal Value
- Pipe Line Preparation and Test Run

In [18]:
## Single Model Test Run
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_transform', StandardScaler(), numerical_cols),
        ('categorical_tranform', OrdinalEncoder(), categorical_col)
    ], 
    remainder='passthrough'
)

# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

scores = cross_val_score(pipeline, X, y_log_tran, cv=kfold, scoring='r2')
print(f"Score Mean:{scores.mean()}, Score std:{scores.std()}")

## Train and Test split and Model train 
X_train, X_test, y_train, y_test = train_test_split(X,y_log_tran,test_size=0.2,random_state=42)
print(pipeline.fit(X_train,y_train))

y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred) ## Reversing Log normal tranformation into Original Scale applied in the Target Feature
mabe = mean_absolute_error(np.expm1(y_test),y_pred)
print(f"mean_absolute_error:  {mabe}")

Score Mean:0.7363096633436828, Score std:0.03238005754429936
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numerical_transform',
                                                  StandardScaler(),
                                                  ['bedRoom', 'bathroom',
                                                   'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('categorical_tranform',
                                                  OrdinalEncoder(),
                                                  ['property_type', 'sector',
                                                   'balcony', 'agePossession',
                                                   'furnishing_type',
                                                   'luxury_c

In [66]:
# ### Original Script 
# def scorer(model_name, model, y_transformed=y_log_tran):
#     """Function For Different Models """
#     output = []
#     output.append(model_name)

#     pipeline = Pipeline([
#         ('preprocessor', preprocessor),
#         ('regressor', model)
#     ])

#     # K-fold cross-validation (Model Evaluation)
#     kfold = KFold(n_splits=10, shuffle=True, random_state=42)
#     scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
#     output.append(round(scores.mean(),4))
    
#     X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)   ## Train Test Split
#     pipeline.fit(X_train,y_train) ## Model Training 
#     y_pred = pipeline.predict(X_test) ## y^ Calulation 
#     y_pred = np.expm1(y_pred) ## Reversing Log normal tranformation into Original Scale in the Target Feature
#     output.append(mean_absolute_error(np.expm1(y_test),y_pred))
#     return output

# ### Models Dictionary
# model_dict = {
#     'LinearRegression':LinearRegression(),
#     'SVR':SVR(),
#     'Ridge':Ridge(),
#     'Lasso':Lasso(),
#     'DecisionTreeRegressor': DecisionTreeRegressor(),
#     'RandomForestRegressor':RandomForestRegressor(),
#     'ExtraTreesRegressor': ExtraTreesRegressor(),
#     'GradientBoostingRegressor': GradientBoostingRegressor(),
#     'AdaBoostRegressor': AdaBoostRegressor(),
#     'MLPRegressor': MLPRegressor(),
#     'XGBRegressor':XGBRegressor()
# }


# # Creating a column transformer for preprocessing
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('numerical_transform', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
#         ('categorical_tranform', OrdinalEncoder(), columns_to_encode)
#     ], 
#     remainder='passthrough'
# )

# ## Function Call for all Linear Models in dictionary 
# model_output = []
# for model_name,model in tqdm(model_dict.items()):
#     model_output.append(scorer(model_name, model))

# model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
# model_df.sort_values(['mae'])


In [19]:
## Modified Script For MLFLOW Tracking 
def scorer(model_name, model, y_transformed=y_log_tran):
    """Function For Different Models """
    Mlflow_info = {}
    transformer_info = {}
    cross_val_info = {}
    metric_info = {}
    output = []
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    for name, transformer, columns in preprocessor.transformers:
        transformer_info[name]=transformer
       
    # K-fold cross-validation (Model Evaluation)
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    metric_info['R2'] = scores.mean() ## mlflow info
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)   ## Train Test Split
    pipeline.fit(X_train,y_train) ## Model Training 
    y_pred = pipeline.predict(X_test) ## y^ Calulation 
    y_pred = np.expm1(y_pred) ## Reversing Log normal tranformation into Original Scale in the Target Feature
    metric_info['MAE'] = mean_absolute_error(np.expm1(y_test),y_pred) ## mlflow info
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    ## Mlflow Parameter 
    # Mlflow_info[model_name] = model
    Mlflow_info['transformers'] = transformer_info
    Mlflow_info['kfold_params'] = {
    'n_splits': kfold.n_splits,
    'shuffle': kfold.shuffle,
    'random_state': kfold.random_state}
    Mlflow_info['metric'] = metric_info
    return output, Mlflow_info




In [21]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_transform', StandardScaler(), numerical_cols),
        ('categorical_tranform', OrdinalEncoder(), categorical_col)
    ], 
    remainder='passthrough'
)

### Models Dictionary
model_dict = {
    'LinearRegression':LinearRegression(),
    'SVR':SVR(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor':RandomForestRegressor(),
    'ExtraTreesRegressor': ExtraTreesRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'MLPRegressor': MLPRegressor(),
    'XGBRegressor':XGBRegressor()
}


## Function Call for all Linear Models in dictionary 
model_output = []
mlflow_experiment = {}
for model_name,model in tqdm(model_dict.items()):
    a,b = scorer(model_name, model)
    model_output.append(a)
    mlflow_experiment[model_name] = b
model_output

100%|██████████| 11/11 [00:48<00:00,  4.40s/it]


[['LinearRegression',
  np.float64(0.7363096633436828),
  np.float64(0.9463822160089356)],
 ['SVR', np.float64(0.7642012011196353), np.float64(0.8472636473483922)],
 ['Ridge', np.float64(0.7363125343993554), np.float64(0.9463387741853386)],
 ['Lasso', np.float64(0.05943378064493573), np.float64(1.528905986892753)],
 ['DecisionTreeRegressor',
  np.float64(0.776257367527909),
  np.float64(0.7371208379419942)],
 ['RandomForestRegressor',
  np.float64(0.8816775922540476),
  np.float64(0.5331711984647937)],
 ['ExtraTreesRegressor',
  np.float64(0.8685615236174661),
  np.float64(0.5478896151594743)],
 ['GradientBoostingRegressor',
  np.float64(0.8724876758923734),
  np.float64(0.5761038048226718)],
 ['AdaBoostRegressor',
  np.float64(0.7608828303693477),
  np.float64(0.8599131971338162)],
 ['MLPRegressor',
  np.float64(0.8013124403079356),
  np.float64(0.7471418164706045)],
 ['XGBRegressor',
  np.float64(0.8894876835260124),
  np.float64(0.5040475141482346)]]

In [22]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,XGBRegressor,0.889488,0.504048
5,RandomForestRegressor,0.881678,0.533171
6,ExtraTreesRegressor,0.868562,0.54789
7,GradientBoostingRegressor,0.872488,0.576104
4,DecisionTreeRegressor,0.776257,0.737121
9,MLPRegressor,0.801312,0.747142
1,SVR,0.764201,0.847264
8,AdaBoostRegressor,0.760883,0.859913
2,Ridge,0.736313,0.946339
0,LinearRegression,0.73631,0.946382


## 2. One Hot encoding Approach for Categoriacal Value
- Pipe Line Preparation and Test Run in Regression Models

In [23]:
### Single Modle Test Run

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OrdinalEncoder(), categorical_col),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_log_tran, cv=kfold, scoring='r2')
print(f"Score Mean:{scores.mean()}, Score std:{scores.std()}")


X_train, X_test, y_train, y_test = train_test_split(X,y_log_tran,test_size=0.2,random_state=42)
print(pipeline.fit(X_train,y_train))

y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred) ## Reversing Log normal tranformation into Original Scale applied in the Target Feature
mabe = mean_absolute_error(np.expm1(y_test),y_pred)
print(f"mean_absolute_error:  {mabe}")

Score Mean:0.8546094810971422, Score std:0.015997422908695623
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['bedRoom', 'bathroom',
                                                   'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('cat', OrdinalEncoder(),
                                                  ['property_type', 'sector',
                                                   'balcony', 'agePossession',
                                                   'furnishing_type',
                                                   'luxury_category',
                                                   'floor_category']),
                                                 ('

In [24]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OrdinalEncoder(), categorical_col),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

### Models Dictionary
model_dict = {
    'LinearRegression':LinearRegression(),
    'SVR':SVR(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor':RandomForestRegressor(),
    'ExtraTreesRegressor': ExtraTreesRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'MLPRegressor': MLPRegressor(),
    'XGBRegressor':XGBRegressor()
}

## Function Call for all Models in dictionary 
model_output = []
mlflow_experiment = {}
for model_name,model in tqdm(model_dict.items()):
    a,b = scorer(model_name, model)
    model_output.append(a)
    mlflow_experiment[model_name] = b
model_output

100%|██████████| 11/11 [05:38<00:00, 30.78s/it]


[['LinearRegression',
  np.float64(0.8546094810971422),
  np.float64(0.6497514315131458)],
 ['SVR', np.float64(0.7697413260547326), np.float64(0.8341243500492146)],
 ['Ridge', np.float64(0.8547390737480411), np.float64(0.652914841218881)],
 ['Lasso', np.float64(0.05943378064493578), np.float64(1.528905986892753)],
 ['DecisionTreeRegressor',
  np.float64(0.8098774130601006),
  np.float64(0.7052316629237838)],
 ['RandomForestRegressor',
  np.float64(0.8909465915934238),
  np.float64(0.4989780739508377)],
 ['ExtraTreesRegressor',
  np.float64(0.8941069955303564),
  np.float64(0.46800925191530446)],
 ['GradientBoostingRegressor',
  np.float64(0.8766218029766483),
  np.float64(0.5703191298997323)],
 ['AdaBoostRegressor',
  np.float64(0.756778589942707),
  np.float64(0.8443680114684254)],
 ['MLPRegressor',
  np.float64(0.8710600566095487),
  np.float64(0.540886755109473)],
 ['XGBRegressor',
  np.float64(0.8958499681743852),
  np.float64(0.4934562667923469)]]

In [25]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,ExtraTreesRegressor,0.894107,0.468009
10,XGBRegressor,0.89585,0.493456
5,RandomForestRegressor,0.890947,0.498978
9,MLPRegressor,0.87106,0.540887
7,GradientBoostingRegressor,0.876622,0.570319
0,LinearRegression,0.854609,0.649751
2,Ridge,0.854739,0.652915
4,DecisionTreeRegressor,0.809877,0.705232
1,SVR,0.769741,0.834124
8,AdaBoostRegressor,0.756779,0.844368
