In [130]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV, cross_validate
import pickle

# Ask scikit-learn transformers to return pandas objects from transform()
from sklearn import set_config
set_config(transform_output='pandas')

# ignore warnings
# NOTE(review): this filter has no category or module restriction, so it hides
# every warning raised in this process — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the cleaned dataset; the path is relative to the notebook's working directory
df = pd.read_csv('../data/csv/cleaned_data.csv')

## Train-Test Split

In [3]:
# Model inputs, grouped by the preprocessing each group receives
numerical_features = [
    'Floor Area',
    'Lot Area',
]
categorical_features = [
    'Town/City',
    'Region',
    'Bedrooms',
]

# Numerical columns first, then the categorical ones
features = [*numerical_features, *categorical_features]

# Column the model learns to predict
target_variable = 'Price'

In [4]:
# Feature matrix (model inputs) and target vector, selected from the loaded frame
X = df[features]
y = df[target_variable]

In [5]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,Floor Area,Lot Area,Town/City,Region,Bedrooms
0,222,96,Las Piñas,Metro Manila,4
1,189,120,Las Piñas,Metro Manila,4
2,128,120,Las Piñas,Metro Manila,4
3,216,105,Las Piñas,Metro Manila,4
4,120,85,Las Piñas,Metro Manila,4


In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

# Show the (rows, columns) shape of each split
X_train.shape, X_test.shape

((1152, 5), (289, 5))

## Feature Engineering
- Categorical features: convert to binary columns using one-hot encoding (OneHotEncoder)
- Numerical features: standardize with StandardScaler, then expand into polynomial terms with PolynomialFeatures

### Baseline Model

In [82]:
# Numerical branch: standardize first, then expand into degree-2 polynomial terms
numerical_transformer = Pipeline([
    ('scaler', StandardScaler()),
    ('polynomial', PolynomialFeatures(degree=2)),
])

In [8]:
# Categorical branch: dense one-hot encoding with the first level of each feature
# dropped; handle_unknown='ignore' keeps transform() from raising on categories
# that were not seen during fit
categorical_transformer = Pipeline([
    (
        'ohe',
        OneHotEncoder(
            drop='first',
            handle_unknown='ignore',
            sparse_output=False,
        ),
    ),
])

In [83]:
# Route each group of columns through its own transformer
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])

In [67]:
# Baseline pipeline: preprocessing followed by ordinary least squares.
# The final step is a regressor, but it is named 'classifier'; the grid-search
# parameter keys used later in this notebook rely on that step name.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearRegression()),
])

In [68]:
# transform price to log to make it more normally distributed
# (np.log1p computes log(1 + y); the model is trained on this transformed target)
y_train_log = np.log1p(y_train)

In [69]:
# Fit the baseline pipeline (preprocessing + linear regression) on the log-price target
model.fit(X_train, y_train_log)

In [70]:
# Log-transform the held-out target the same way as the training target
y_test_log = np.log1p(y_test)

# R^2 on the log-price scale for both splits - the closer to 1, the better
training_score = model.score(X_train, y_train_log)
test_score = model.score(X_test, y_test_log)

print('Training score: ', training_score)
print('Test score: ', test_score)

Training score:  0.8984838842540035
Test score:  0.9017470572322711


In [14]:
### Predict train ###
# The model predicts log1p(price). np.expm1 is the exact inverse of np.log1p
# and avoids the precision loss of np.exp(x) - 1 for values of x close to zero.
y_pred_log = model.predict(X_train)

# Transform the predictions back to the original scale
y_pred = np.expm1(y_pred_log)

### Predict test ###
y_test_pred_log = model.predict(X_test)

# Transform the predictions back to the original scale
y_test_pred = np.expm1(y_test_pred_log)


In [15]:
# Root mean squared log error (RMSLE) between actual and predicted prices
# for the train and test data - the lower the better
train_mean_sq_log_error_val = np.sqrt(
    mean_squared_log_error(y_train, y_pred)
)
test_mean_sq_log_error_val = np.sqrt(
    mean_squared_log_error(y_test, y_test_pred)
)

print("Train RMSLE: %.2f" % train_mean_sq_log_error_val)
print("Test RMSLE: %.2f" % test_mean_sq_log_error_val)

Train RMSLE: 0.31
Test RMSLE: 0.31


## Cross validation
Cross-validation provides a more reliable assessment of how well a model generalizes to new data. \
It helps detect issues such as overfitting (when a model performs well on the training data but poorly on new data) and gives a more realistic estimate of the model's performance.

In [16]:
# Use the same log1p transform that produced y_train_log / y_test_log, so the
# cross-validated scores are measured on the same target as the train/test scores.
y_log = np.log1p(y)

# 5-fold cross-validation of the baseline pipeline over the full dataset,
# scored with R^2; the returned dictionary of lists is converted to a DataFrame
cv = pd.DataFrame(
    cross_validate(
        model,
        X,
        y_log,
        cv=5,
        scoring='r2',
        return_train_score=True,
    )
)
print(cv)

   fit_time  score_time  test_score  train_score
0  0.058560    0.016636    0.787055     0.911990
1  0.173185    0.018689    0.904349     0.894338
2  0.067660    0.020354    0.893766     0.898964
3  0.074362    0.018877    0.892325     0.895879
4  0.033078    0.015413    0.806232     0.907127


## Grid Search CV

**Define Models**

In [17]:
# Candidate regressors for the grid search; a fixed seed is passed wherever
# the estimator is constructed with random_state
m_lr = LinearRegression()
m_ridge = Ridge(random_state=42)
m_lasso = Lasso(random_state=42)
m_rf = RandomForestRegressor(random_state=42)

In [18]:
# Create a function to get the parameters set in each model
def get_parameters(model):
    # Common parameters for all models
    common_params = {
        'preprocessor__num__polynomial__degree': [1, 2, 3, 4],
        'preprocessor__num__polynomial__interaction_only': [False, True]
    }
    match model:
        case 'lasso':
            return {
                **common_params,
                'classifier__alpha' : [1.,0.001,0.1,0.01,0.05,10],
                'classifier__max_iter' : [1_000,500,100,10],
                'classifier' : [m_lasso]
            }
        case 'ridge':
            return {
                **common_params,
                'classifier__alpha' : [1.,0.001,0.1,0.01,0.05,10],
                'classifier__max_iter' : [1_000,500,100,10],
                'classifier' : [m_ridge]
            }
        case 'random_forest':
            return{
                **common_params,
                'classifier__max_depth' : [5,10],
                'classifier__n_estimators' : [5,10,20,50,100],
                'classifier' : [m_rf]
            }
        case 'linear_regression':
            return {
                **common_params,
                'classifier' : [m_lr]
            }
        case default:
            return {
                **common_params
            }

In [19]:
# Build one parameter grid per candidate model name
model_names = ['lasso', 'ridge', 'random_forest', 'linear_regression']
param_list = list(map(get_parameters, model_names))

In [20]:
# Grid search over every grid in param_list, starting from the baseline pipeline:
# 5-fold cross-validation scored with R^2, n_jobs=-1 for parallel fits
gscv = GridSearchCV(
    model,
    param_list,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

**Grid-search cross validation**

In [21]:
import time

# initial time
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted mid-run.
ti = time.perf_counter()

# grid-search cross-validation
gscv.fit(X_train,y_train_log)

# final time
tf = time.perf_counter()

# time taken
print(f"time taken: {round(tf-ti,2)} sec")

Fitting 5 folds for each of 472 candidates, totalling 2360 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

time taken: 81.66 sec


**Cross validation results**

In [22]:
def get_cv_results(cv_results):
    """Summarise grid-search results as a DataFrame ordered by rank.

    Parameters
    ----------
    cv_results : dict
        Mapping of column name to values, such as ``GridSearchCV.cv_results_``.

    Returns
    -------
    pandas.DataFrame
        The parameter and score columns listed below that are present in
        ``cv_results``, with each column renamed to the part after its last
        ``'__'``, sorted by ``rank_test_score`` ascending (ties keep their
        original order) and re-indexed from 0.

    Raises
    ------
    KeyError
        If ``rank_test_score`` is not among the columns of ``cv_results``.
    """
    # list of columns to show
    column_list = ['param_classifier',
                   'param_preprocessor__num__polynomial__degree',
                   'param_preprocessor__num__polynomial__interaction_only',
                   'param_classifier__alpha',
                   'param_classifier__max_iter',
                   'param_classifier__max_depth',
                   'param_classifier__n_estimators',
                   'mean_test_score',
                   'std_test_score',
                   'rank_test_score'
                  ]

    all_results = pd.DataFrame(cv_results)

    # A search that omits a model family has no column for that family's
    # parameters, so select only the requested columns that actually exist
    # instead of failing with a KeyError.
    present_columns = [name for name in column_list if name in all_results.columns]

    # Chain instead of mutating in place: select, shorten names, order by rank
    return (
        all_results[present_columns]
        .rename(columns=lambda name: name.split('__')[-1])
        .sort_values(
            by='rank_test_score',
            ascending=True,
            kind='stable',
            ignore_index=True,
        )
    )

In [23]:
# Rank-ordered summary of the grid search; display up to the first 50 rows
df_gscv_result = get_cv_results(gscv.cv_results_)
df_gscv_result.head(50)

Unnamed: 0,param_classifier,degree,interaction_only,alpha,max_iter,max_depth,n_estimators,mean_test_score,std_test_score,rank_test_score
0,"RandomForestRegressor(max_depth=10, random_sta...",4,True,,,10.0,100.0,0.900954,0.012143,1
1,"RandomForestRegressor(max_depth=10, random_sta...",3,True,,,10.0,100.0,0.900954,0.012143,1
2,"RandomForestRegressor(max_depth=10, random_sta...",2,True,,,10.0,100.0,0.900954,0.012143,1
3,"RandomForestRegressor(max_depth=10, random_sta...",1,False,,,10.0,100.0,0.900536,0.013731,4
4,"RandomForestRegressor(max_depth=10, random_sta...",1,True,,,10.0,100.0,0.900536,0.013731,4
5,"RandomForestRegressor(max_depth=10, random_sta...",2,False,,,10.0,100.0,0.900432,0.01212,6
6,"RandomForestRegressor(max_depth=10, random_sta...",1,False,,,10.0,50.0,0.899458,0.013664,7
7,"RandomForestRegressor(max_depth=10, random_sta...",1,True,,,10.0,50.0,0.899458,0.013664,7
8,"RandomForestRegressor(max_depth=10, random_sta...",2,False,,,10.0,50.0,0.898767,0.012042,9
9,"RandomForestRegressor(max_depth=10, random_sta...",4,True,,,10.0,50.0,0.898685,0.012401,10


**Best hyperparameters and model**

In [24]:
# Parameter combination selected by the grid search
gscv.best_params_

{'classifier': RandomForestRegressor(max_depth=10, random_state=42),
 'classifier__max_depth': 10,
 'classifier__n_estimators': 100,
 'preprocessor__num__polynomial__degree': 2,
 'preprocessor__num__polynomial__interaction_only': True}

In [27]:
# Keep a handle on the pipeline selected by the grid search and display it
best_model = gscv.best_estimator_
best_model

**Build model**

In [28]:
# NOTE(review): gscv was built without a `refit` argument; if the library default
# (refit the best estimator on the data passed to gscv.fit) applies, best_model is
# already fitted on X_train and this call only repeats that fit — confirm.
best_model.fit(X_train,y_train_log)

In [29]:
# R^2 of the tuned pipeline on the log-price scale for the train and test splits
training_score = best_model.score(X_train,y_train_log)
test_score = best_model.score(X_test,y_test_log)

# Rounded to 6 decimals for display
print(f'Train score: {round(training_score,6)}')
print(f'Test score : {round(test_score,6)}')

Train score: 0.971831
Test score : 0.91136


The results suggest that this model might be overfitting, so I tried other models and compared their results.

## Selected model

The `best_estimator_` (a RandomForestRegressor with n_estimators=100 and max_depth=10) \
gave a train score of 0.97 and a test score of 0.91, \
with cross-validation mean scores of 0.97 (train) and 0.87 (test). \
I felt that this gap was too big.

In [123]:
# Create a Pipeline that includes the preprocessor and the selected regressor:
# a small random forest (15 trees, max depth 5).
# NOTE: `preprocessor` is the same object passed to the baseline `model` pipeline,
# not a separate copy.
sel_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        # Alternative final steps that were tried (kept for reference):
        # ('classifier', Ridge(alpha=0.1, max_iter=10,random_state=42))
        # ('classifier', Lasso(alpha=0.01, max_iter=1000,random_state=42))
        # ('classifier', LinearRegression())
        ('classifier', RandomForestRegressor(n_estimators=15, max_depth=5,random_state=42))
    ]
)

In [124]:
# Fit the selected pipeline on the training split, using the log-price target
sel_model.fit(X_train, y_train_log)

In [125]:
# R^2 of the selected pipeline on the log-price scale - the closer to 1, the better
training_score = sel_model.score(X_train, y_train_log)
test_score = sel_model.score(X_test, y_test_log)

print('Training score: ', training_score)
print('Test score: ', test_score)

Training score:  0.9124886315299038
Test score:  0.8781243262727407


In [126]:
# 5-fold cross-validation of the selected pipeline over the full dataset,
# scored with R^2; the dictionary of lists is turned into a DataFrame directly
cv = pd.DataFrame(
    cross_validate(
        sel_model,
        X,
        y_log,
        cv=5,
        scoring='r2',
        return_train_score=True,
    )
)

# Average of each column (timings and train/test scores) across the folds
print(cv.mean())

fit_time       0.188001
score_time     0.042485
test_score     0.848562
train_score    0.910937
dtype: float64


In [127]:
### Predict train ###
# The model predicts log1p(price). np.expm1 is the exact inverse of np.log1p
# and avoids the precision loss of np.exp(x) - 1 for values of x close to zero.
y_pred_log = sel_model.predict(X_train)

# Transform the predictions back to the original scale
y_pred = np.expm1(y_pred_log)

### Predict test ###
y_test_pred_log = sel_model.predict(X_test)

# Transform the predictions back to the original scale
y_test_pred = np.expm1(y_test_pred_log)

In [128]:
# Root mean squared log error (RMSLE) between actual and predicted prices
# for the train and test data - the lower the better
train_mean_sq_log_error_val = np.sqrt(
    mean_squared_log_error(y_train, y_pred)
)
test_mean_sq_log_error_val = np.sqrt(
    mean_squared_log_error(y_test, y_test_pred)
)

print("Train RMSLE: %.2f" % train_mean_sq_log_error_val)
print("Test RMSLE: %.2f" % test_mean_sq_log_error_val)

Train RMSLE: 0.29
Test RMSLE: 0.35


### Export model

In [131]:
# Serialize the fitted `sel_model` pipeline (preprocessing + random forest) to disk
with open('../models/rf_model.pkl', 'wb') as model_file:
    pickle.dump(sel_model, model_file)
