In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score



In [2]:
# Load the dataset
df = pd.read_csv('../data/processed/cleaned_data_Namrata.csv')


In [3]:
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('/Users/ayushyapare/Desktop/Ayushyas_Life/Work/Projects/Snippets')

from DataFrame_Analysis import analyze_dataframe

In [4]:
# Perform EDA now
#analyze_dataframe(df)

In [5]:
df.columns

Index(['sq_mt_built', 'n_rooms', 'n_bathrooms', 'buy_price',
       'buy_price_by_area', 'has_central_heating', 'has_individual_heating',
       'has_ac', 'has_fitted_wardrobes', 'has_lift', 'is_exterior',
       'energy_certificate', 'has_parking', 'neighborhood', 'District',
       'HouseType'],
      dtype='object')

In [6]:
# Replace '8.0+' with '8'
#df['n_rooms'] = df['n_rooms'].str.replace('8.0+', '8', regex=False)
#df['n_bathrooms'] = df['n_bathrooms'].str.replace('8.0+', '8', regex=False)

In [7]:
# Assuming the dataset is already loaded into df
# Define features and target variable
X = df[['sq_mt_built', 'n_rooms', 'n_bathrooms', 'buy_price',
        'has_central_heating', 'has_individual_heating',
       'has_ac', 'has_fitted_wardrobes', 'has_lift', 'is_exterior',
       'energy_certificate', 'has_parking', 'neighborhood', 'District',
       'HouseType']]
y = df['buy_price_by_area']



In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Identify numerical and categorical columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Define preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('scaler', StandardScaler())  # Standardize numerical features
])

# Define preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])



In [9]:

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso, ElasticNet


In [11]:
# Initialize models
logistic_regression_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression())
])

ridge_classifier_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RidgeClassifier())
])

lasso_logistic_regression_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(penalty='l1', solver='saga'))
])

elastic_net_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', ElasticNet())
])

# Train and evaluate


In [12]:
#conda install -c conda-forge pycaret

In [13]:
from pycaret.regression import *

In [14]:
df_ = pd.concat([X_train, y_train], axis=1)

In [15]:
from pycaret.regression import *

In [16]:
# Setup the environment in PyCaret
regression_setup = setup(
    data=df_,
    target = 'buy_price_by_area',
    session_id=9,
    )

Unnamed: 0,Description,Value
0,Session id,9
1,Target,buy_price_by_area
2,Target type,Regression
3,Original data shape,"(16708, 16)"
4,Transformed data shape,"(16708, 56)"
5,Transformed train set shape,"(11695, 56)"
6,Transformed test set shape,"(5013, 56)"
7,Numeric features,4
8,Categorical features,8
9,Preprocess,True


In [17]:
# Log the parameters and metrics with MLflow
best_model = compare_models()


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,113.8583,43346.2469,206.4894,0.9882,0.0438,0.028,0.318
rf,Random Forest Regressor,92.8188,56916.207,236.8643,0.9845,0.0456,0.0211,0.646
dt,Decision Tree Regressor,166.6809,128389.6525,356.585,0.9651,0.0666,0.0376,0.042
et,Extra Trees Regressor,238.3897,202125.3458,448.5772,0.9449,0.0969,0.059,0.703
gbr,Gradient Boosting Regressor,325.783,217181.3547,465.5819,0.9408,0.1152,0.0879,0.262
lr,Linear Regression,537.3217,611783.9619,781.6123,0.8331,0.2119,0.1461,0.251
ridge,Ridge Regression,537.2929,611791.2516,781.6159,0.8331,0.2119,0.146,0.028
br,Bayesian Ridge,537.2586,612054.3946,781.7784,0.8331,0.2125,0.146,0.034
lasso,Lasso Regression,537.8116,613855.1616,782.9279,0.8326,0.2149,0.1462,0.057
llar,Lasso Least Angle Regression,545.8576,629079.5917,792.7743,0.8283,0.2222,0.1488,0.029


Processing:   0%|          | 0/77 [00:00<?, ?it/s]