# REGRESSION

### GETTING SUMMARY STATISTICS ON COLUMNS

In [None]:
######################## GETTING SUMMARY STATISTICS ############################

# import libraries
import pandas as pd
from src.config import Config
import seaborn as sns
import matplotlib.pyplot as plt


# Read the for-sale listings into df_main.
df_main = pd.read_csv("./data/data_forsale_new.csv")

# NaN inspection (Config.expand_dataframe per column) and NaN filling
# (Config.fill_nan on "plot_area") were tried here and are currently disabled.

# Feature selection: look at each numeric column's correlation with the price.
display(df_main.columns)

numeric_columns = [
    "price", "price_per_sqmeter", "plot_area", "habitable_surface",
    "land_surface", "bedroom_count", "room_count",
]
correlation_with_price = df_main[numeric_columns].corr()["price"]
Config.expand_display(x=correlation_with_price)

# Keep the target together with the three candidate predictors.
selected_columns = ["price", "bedroom_count", "room_count", "habitable_surface"]
df_set_one = df_main[selected_columns]









Based on the correlation values calculated above, we pick three variables that show a moderate positive correlation with the variable **price**:
1. **habitable_surface** : 0.344623
2. **room_count**  : 0.376763
3. **bedroom_count** : 0.382781

In [1]:
#################################################### LOAD AND PREPROCESS DATA ####################################################

# Import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.config import Config, ModelConfig
from sklearn.model_selection import  train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import  StandardScaler

# Pipeline is needed to chain the preprocessing steps sequentially (see below).
from sklearn.pipeline import Pipeline

TARGET_COLUMN = "price"
FEATURE_COLUMNS = ["room_count", "bedroom_count", "habitable_surface"]

# Load only the target and the three selected feature columns.
df_set_one = ModelConfig.load_data(filepath='./data/data_forsale_new.csv', file_type="csv", usecols=[TARGET_COLUMN, *FEATURE_COLUMNS])

# Sanity check: median of the target column.
Config.expand_display(df_set_one[TARGET_COLUMN].median())

# Missing values are not filled here with Config.fill_nan; they are imputed
# inside the preprocessing pipeline so the imputer is fitted on training folds only.

# Extract the features (X) and target (y).
# NOTE(review): assumes feature_target_config returns X as a DataFrame holding
# only FEATURE_COLUMNS (no "price") - confirm against ModelConfig.
X, y = ModelConfig.feature_target_config(df=df_set_one)

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# ColumnTransformer runs its transformers side by side on the *original*
# columns and concatenates the outputs; it does not feed one into the next.
# To get impute -> polynomial expansion -> scaling, the three steps are chained
# in a Pipeline and that pipeline is applied to the feature columns.
# The target is deliberately not part of the transformed columns.
# NOTE(review): step names are now nested under 'numeric'; any keys in
# ModelConfig.XGB_ParamGrid that address preprocessing steps must use
# '<columntransformer>__numeric__<step>__<param>'.
numeric_preprocessing = Pipeline(
    steps=[
        ('num_imputer', KNNImputer(n_neighbors=5)),
        ('poly_features', ModelConfig.poly_features_config(degree=5)),
        ('std_scaler', StandardScaler(with_mean=False, with_std=True)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_preprocessing, FEATURE_COLUMNS),
    ],
    remainder='passthrough'
)



359000.0

None

None

In [2]:
#################################################### SETTING UP PIPELINE PARTS, INITIALIZING MODEL PIPELINE ####################################################
from src.config import Config, ModelConfig
from sklearn.pipeline import make_pipeline


# Regressor as configured by ModelConfig.XGBREGRConfig().
xgb_reg = ModelConfig.XGBREGRConfig()

# Chain the preprocessing and the regressor into one estimator so both are
# fitted together; step names are generated by make_pipeline.
pipeline_steps = [preprocessor, xgb_reg]
model_pipeline = make_pipeline(*pipeline_steps)


In [3]:
#################################################### HYPERPARAMETER TUNING USING GRIDSEARCHCV ####################################################
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_percentage_error, explained_variance_score

# Hyperparameter grid taken from ModelConfig.XGB_ParamGrid.
param_grid = ModelConfig.XGB_ParamGrid

# Metric used to pick and refit the best candidate.
refit_metric = 'explained_variance'

# Metrics recorded for every candidate.
# MAPE is an error (lower is better), so the scorer is built with
# greater_is_better=False. make_scorer then negates the value, which keeps
# "higher score = better" consistent across all three metrics; the 'mape'
# entries in cv_results_ are therefore negative MAPE values.
scoring = {
    'explained_variance': 'explained_variance',
    'mape': make_scorer(mean_absolute_percentage_error, greater_is_better=False),
    'r2': 'r2',
}

# Build the 5-fold grid search via the project helper.
grid_search = ModelConfig.XGBGridSearchCV(
    estimator=model_pipeline,
    param=param_grid,
    cv_fold=5,
    scoring=scoring,
    refit=refit_metric,
)


In [4]:
#################################################### FIT AND TEST MODEL, SORT INDICES  AND EVALUATE THE MODEL ####################################################
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import explained_variance_score, mean_absolute_percentage_error, r2_score

# Run the grid search on the training data.
# NOTE(review): the recorded output of this cell reports 39,690,000 candidates
# x 5 folds = 198,450,000 fits, which is unlikely to finish in practice;
# consider shrinking ModelConfig.XGB_ParamGrid or using a randomized search.
grid_search.fit(X_train, y_train)

# Predict on the held-out data with the refitted best estimator.
y_pred = grid_search.predict(X_test)


def _take_rows(data, positions):
    """Return the rows of `data` at the given integer positions.

    Uses `.iloc` for pandas objects (so integers are never interpreted as index
    labels) and plain positional indexing for anything array-like.
    """
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return np.asarray(data)[positions]


# Sort the test points by the value of their first feature column.
# X_test / y_test may be pandas objects (the preprocessor selects columns by
# name), where `X_test[:, 0]` raises and `y_test[sort_indices]` would be a
# label lookup, so everything below is strictly positional.
first_feature = X_test.iloc[:, 0] if hasattr(X_test, "iloc") else np.asarray(X_test)[:, 0]
sort_indices = np.argsort(np.asarray(first_feature))
X_test_sorted = _take_rows(X_test, sort_indices)
y_test_sorted = _take_rows(y_test, sort_indices)
y_pred_sorted = y_pred[sort_indices]

Fitting 5 folds for each of 39690000 candidates, totalling 198450000 fits


In [None]:
#################################################### EVALUATE THE MODEL ####################################################
from src.model_config import ModelConfig
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, explained_variance_score

# RMSE (Root Mean Squared Error).
# The `squared=False` argument of mean_squared_error is deprecated and removed
# in recent scikit-learn releases, so the root is taken explicitly; this gives
# the same value for a single-target regression on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5

# MAPE (Mean Absolute Percentage Error), computed by the project helper.
mape_score = ModelConfig.MAPE(y_test, y_pred)


In [None]:
## Subset df_main to the target and the three selected predictors.
df_set_one = df_main[["price", "bedroom_count", "room_count", "habitable_surface"]]

# Summary statistics and medians before any filtering.
Config.expand_display(x=df_set_one.describe())
Config.expand_display(pd.DataFrame(df_set_one.median()))

## Remove extreme prices.
# Both bounds are arbitrary cut-offs chosen by eye, not derived from the data.
PRICE_MIN = 35000
PRICE_MAX = 5500000

# Build the filtered frame with a boolean mask instead of drop(inplace=True):
# no in-place mutation of a frame derived from df_main, and `.copy()` makes the
# result an independent DataFrame. Rows with a missing price are kept, exactly
# as with the original `<` / `>` drops (comparisons with NaN are False).
price = df_set_one["price"]
is_extreme = (price < PRICE_MIN) | (price > PRICE_MAX)
df_set_one = df_set_one.loc[~is_extreme].copy()

## Check whether the correlations improve after removing the extremes.
Config.expand_display(df_set_one.corr())

