In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

In [3]:
# Read the dataset from its pickle snapshot (presumably the encoded feature
# table, judging by the file name -- confirm against the notebook that wrote it).
# NOTE(review): unpickling can execute arbitrary code; only load a file that
# this project produced itself.
df = pd.read_pickle("encoded_data.pkl")

In [4]:
# Separate the regression target ('price') from the predictor columns.
y = df['price']
X = df.drop(columns=['price'])

In [5]:
# Hold-out scheme: 70% of the rows go to training; the remaining 30% is then
# halved into validation (15%) and test (15%). The test part stays untouched
# until the final model has been chosen.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    random_state=21,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=101,
)

In [6]:
# Candidate regressors to compare, keyed by the display name that ends up in
# the 'Model' column of the results table.
# Every estimator keeps its library defaults, except that the stochastic ones
# are given a fixed seed so that Restart & Run All reproduces the same metrics.
# NOTE(review): SVR and K-Nearest Neighbors are sensitive to feature scale; if
# the columns of X are not already standardised, consider scaling before
# fitting those two.
MODEL_SEED = 42

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(random_state=MODEL_SEED),
    'Random Forest': RandomForestRegressor(random_state=MODEL_SEED),
    'Gradient Boosting': GradientBoostingRegressor(random_state=MODEL_SEED),
    'XGBoost': xgb.XGBRegressor(random_state=MODEL_SEED),
    'AdaBoost': AdaBoostRegressor(random_state=MODEL_SEED),
    'Support Vector Machine': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor()
}

In [None]:
# Fit every candidate on the training split and score it on the validation
# split. The five parallel lists below hold one entry per model, in the
# iteration order of `models`.
model_names = []
r2_scores = []
adjusted_r2_scores = []
mae_scores = []
rmse_scores = []

# The validation-set size (n) and feature count (p) are identical for every
# model, so compute them once instead of on each iteration.
n = len(X_val)
p = X_val.shape[1]
# Denominator of the adjusted-R2 correction (residual degrees of freedom).
dof = n - p - 1

# Train and evaluate each model
for name, model in models.items():
    # Train on the training split only; the validation rows stay unseen.
    model.fit(X_train, y_train)

    # Predict on validation set
    y_pred_val = model.predict(X_val)

    # Calculate evaluation metrics
    r2 = r2_score(y_val, y_pred_val)
    mae = mean_absolute_error(y_val, y_pred_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1).
    # It is undefined when n <= p + 1 (zero or negative denominator), so
    # record NaN rather than dividing by zero or reporting a meaningless value.
    if dof > 0:
        adjusted_r2 = 1 - ((1 - r2) * (n - 1) / dof)
    else:
        adjusted_r2 = np.nan

    # Append results to lists
    model_names.append(name)
    r2_scores.append(r2)
    adjusted_r2_scores.append(adjusted_r2)
    mae_scores.append(mae)
    rmse_scores.append(rmse)

In [None]:
# Collect the per-model validation metrics into one comparison table,
# one row per model and one column per metric, then display it.
results_df = pd.DataFrame(
    data={
        'Model': model_names,
        'R2': r2_scores,
        'Adj R2': adjusted_r2_scores,
        'MAE': mae_scores,
        'RMSE': rmse_scores,
    },
    columns=['Model', 'R2', 'Adj R2', 'MAE', 'RMSE'],
)
results_df

In [15]:
# NOTE(review): commented-out copy of the results-table cell above; nothing in
# this cell executes. The saved output under this cell has no 'Adj R2' column
# although the code below would produce one, so that output appears to come
# from an earlier version of the cell -- confirm before relying on it.
# results_df = pd.DataFrame({
#     'Model': model_names,
#     'R2': r2_scores,
#     'Adj R2':adjusted_r2_scores,
#     'MAE': mae_scores,
#     'RMSE': rmse_scores
# })
# results_df

Unnamed: 0,Model,R2,MAE,RMSE
0,Linear Regression,0.567813,3199.739289,4855.756177
1,Ridge Regression,0.567813,3199.733778,4855.756555
2,Lasso Regression,0.56781,3199.556576,4855.772436
3,Decision Tree,0.73897,2163.948944,3773.688313
4,Random Forest,0.842461,1761.914123,2931.669774
5,Gradient Boosting,0.825321,1905.881984,3087.03367
6,XGBoost,0.849078,1737.282932,2869.44596
7,AdaBoost,0.629479,3466.005339,4496.013748
8,Support Vector Machine,0.080311,4506.214451,7083.400188
9,K-Nearest Neighbors,0.761003,2173.571385,3610.914147


In [6]:
# NOTE(review): commented-out earlier version of the train/evaluate loop (no
# adjusted R2); nothing in this cell executes. The saved output under this cell
# has nine rows and no XGBoost entry, whereas the current `models` dict defines
# ten models -- presumably the output predates adding XGBoost; confirm before
# comparing it with newer results.
# # Initialize lists to store results
# model_names = []
# r2_scores = []
# mae_scores = []
# rmse_scores = []

# # Train and evaluate each model
# for name, model in models.items():
#     # Train model
#     model.fit(X_train, y_train)
    
#     # Predict on validation set
#     y_pred_val = model.predict(X_val)
    
#     # Calculate evaluation metrics
#     r2 = r2_score(y_val, y_pred_val)
#     mae = mean_absolute_error(y_val, y_pred_val)
#     rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    
#     # Append results to lists
#     model_names.append(name)
#     r2_scores.append(r2)
#     mae_scores.append(mae)
#     rmse_scores.append(rmse)

# # Create DataFrame to store results
# results_df = pd.DataFrame({
#     'Model': model_names,
#     'R2': r2_scores,
#     'MAE': mae_scores,
#     'RMSE': rmse_scores
# })

# # Display results
# print(results_df)

                    Model        R2          MAE         RMSE
0       Linear Regression  0.676977  2768.401085  4197.957833
1        Ridge Regression  0.676977  2768.398652  4197.958370
2        Lasso Regression  0.676972  2768.291161  4197.987258
3           Decision Tree  0.755393  2058.921616  3653.048141
4           Random Forest  0.861642  1621.129684  2747.406871
5       Gradient Boosting  0.837842  1806.662695  2974.337957
6                AdaBoost  0.630332  3414.915576  4490.837747
7  Support Vector Machine  0.149449  4482.018033  6811.950402
8     K-Nearest Neighbors  0.733575  2272.209754  3812.487629
