In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold

In [2]:
# Read the train and test CSVs into DataFrames, then preview the first
# training rows as the cell's displayed output.
train_csv, test_csv = '../split_data/train.csv', '../split_data/test.csv'
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)
train_df.head()

Unnamed: 0,street,ward,district,city,size,property_legal_document,bed_rooms,toilets,floors,house_type,price
0,-1.108516,-1.497725,-1.841108,1.750851,0.107497,0.123381,-0.090157,0.152288,0.485824,0.832956,3.1
1,0.196962,1.479461,0.819971,-0.221011,0.512415,1.041508,-0.090157,-0.695881,-0.920688,0.832956,5.8
2,0.704575,1.431798,0.404731,-0.605881,-0.263678,0.123381,1.768352,1.848626,1.892337,0.832956,17.5
3,0.084307,-0.709135,-1.717542,-0.605881,1.457224,0.123381,1.768352,2.696795,0.485824,0.832956,6.1
4,-1.30732,-0.360524,-0.929962,-0.221011,-0.904799,0.123381,-0.090157,0.152288,-0.920688,-1.196906,2.6


In [3]:
# Split each frame into a feature matrix (X) and the 'price' target vector (y).
#
# The feature list is derived once from the training frame and reused for the
# test frame, so both matrices are guaranteed to have the same columns in the
# same order (a missing column in test_df raises KeyError instead of silently
# misaligning features).
#
# NOTE(review): the head() preview lists an 'Unnamed: 0' column, which looks
# like a row index written out by DataFrame.to_csv -- TODO confirm. If it is
# really present it is not a property attribute and must not be fed to the
# model, so it is excluded here; when the column is absent this is a no-op.
non_feature_cols = ('price', 'Unnamed: 0')
feature_cols = [col for col in train_df.columns if col not in non_feature_cols]

train_X, train_y = train_df[feature_cols].to_numpy(), train_df['price'].to_numpy()
test_X, test_y = test_df[feature_cols].to_numpy(), test_df['price'].to_numpy()

In [4]:
def run_gridsearch(X, y, param_grids, model, cv=9, scorer=['r2', 'neg_mean_squared_error']):
    """Grid-search ``model`` over ``param_grids`` using shuffled K-fold CV.

    Parameters
    ----------
    X, y
        Training features and targets, passed unchanged to ``GridSearchCV.fit``.
    param_grids
        Parameter grid forwarded as ``param_grid`` to ``GridSearchCV``.
    model
        The estimator to tune.
    cv : int
        Number of folds for ``KFold`` (shuffled, ``random_state=42``).
    scorer : str, list/tuple of str, or dict
        Metric name(s) to evaluate. A single string is treated as a
        one-element list. The default list is only read, never mutated.

    Returns
    -------
    dict
        ``'best_param'``: the best parameter setting,
        ``'scores'``: mean CV score of each metric at the best setting,
        ``'best_model'``: the estimator refit with the best setting.

        In ``'scores'``, R2 is stored under ``'best_r2_score'`` and
        ``neg_mean_squared_error`` is sign-flipped and stored under
        ``'best_mse_score'``; any other metric ``m`` is stored unmodified
        under ``'best_<m>_score'``.

    Raises
    ------
    ValueError
        If ``scorer`` names no metric at all.

    Notes
    -----
    The best setting is chosen by R2 when R2 is among the metrics, otherwise
    by the first metric given. Previously the refit metric and the reported
    keys were hard-coded, so any ``scorer`` lacking ``'r2'`` or
    ``'neg_mean_squared_error'`` raised.
    """
    # Normalise to the multi-metric form so cv_results_ always exposes
    # 'mean_test_<name>' keys, whatever shape the caller passed in.
    if isinstance(scorer, str):
        scoring = [scorer]
    elif isinstance(scorer, dict):
        scoring = dict(scorer)
    else:
        scoring = list(scorer)

    metric_names = list(scoring)
    if not metric_names:
        raise ValueError("scorer must name at least one metric")

    # Fail-safe choice of the model-selection metric: keep the historical 'r2'
    # whenever it is available, otherwise fall back to the first metric.
    refit_metric = 'r2' if 'r2' in metric_names else metric_names[0]

    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids,
        cv=kf,
        scoring=scoring,
        refit=refit_metric,
        n_jobs=-1,
    )
    grid_search.fit(X, y)

    results = grid_search.cv_results_
    best_index = grid_search.best_index_

    best_scores = {}
    for name in metric_names:
        mean_score = results[f'mean_test_{name}'][best_index]
        if name == 'r2':
            best_scores['best_r2_score'] = mean_score
        elif name == 'neg_mean_squared_error':
            # The scorer is negated MSE; flip the sign to report plain MSE.
            best_scores['best_mse_score'] = -mean_score
        else:
            best_scores[f'best_{name}_score'] = mean_score

    return {
        'best_param': grid_search.best_params_,
        'scores': best_scores,
        'best_model': grid_search.best_estimator_,
    }

In [5]:
# Tune Ridge's regularisation strength with 10-fold CV, pickle the refit
# model, then print hold-out metrics followed by the cross-validated scores.
model_path = Path('../saved_models/ridge.pkl')

# Create the output directory up front: open(..., 'wb') does not create
# missing parent directories, and failing here is cheaper than failing after
# the whole grid search has already run.
model_path.parent.mkdir(parents=True, exist_ok=True)

model = Ridge()
# Candidate alphas span several orders of magnitude, with a denser set of
# values between 70 and 100.
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 70, 80, 85, 90, 100]}

res = run_gridsearch(train_X, train_y, param_grid, model, cv = 10)
with open(model_path, 'wb') as f:
    pickle.dump(res['best_model'], f)

# NOTE: despite its name, this is the in-memory estimator returned by the
# search, not a copy re-read from the pickle file.
loaded_model = res['best_model']
pred_test_y = loaded_model.predict(test_X)

# Hold-out (test set) metrics.
mse = mean_squared_error(test_y, pred_test_y)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test_y, pred_test_y)
r2 = r2_score(test_y, pred_test_y)

print(f"R2-score on test set: {r2}")
print(f"MSE on test set: {mse}")
print(f"RMSE on test set: {rmse}")
print(f"MAE on test set: {mae}")

# The "Best ..." figures below are mean cross-validation scores on the
# training data for the selected alpha, not test-set scores.
print(f"Best alpha: {res['best_param']}")
print(f"Best R2-score: {res['scores']['best_r2_score']}")
print(f"Best MSE score: {res['scores']['best_mse_score']}")

R2-score on test set: 0.5140931015023682
MSE on test set: 7.362133730531936
RMSE on test set: 2.7133252165068487
MAE on test set: 1.8157422766797138
Best alpha: {'alpha': 85}
Best R2-score: 0.5999516506004122
Best MSE score: 6.0538091479487735
