In [13]:
import os
import pandas as pd

df = pd.read_csv(os.path.join(os.getcwd(), 'CrabAgePrediction.csv'))
df['Sex'] = df['Sex'].map({'F':0, 'I':2, 'M':3})
df.head()

X = df.drop('Age', axis=1)
y = df['Age']

In [14]:
from sklearn.linear_model import LinearRegression # rmse: 2.2399
from sklearn.pipeline import Pipeline   
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
import numpy as np

pipeline_linear = Pipeline([
    ('sc', StandardScaler()),
    ('lin', LinearRegression())
])

classification_result_lr = cross_validate(pipeline_linear, X, y, cv=5, scoring=['neg_mean_squared_error', 'r2'])

mse_lr = -classification_result_lr['test_neg_mean_squared_error']
rmse_lr = np.sqrt(mse_lr)
r2_lr = classification_result_lr['test_r2']

print("LinearRegression Results:")
print("MSE:", mse_lr.mean())
print("RMSE:", rmse_lr.mean())
print("R^2:", r2_lr.mean())

LinearRegression Results:
MSE: 5.02315541531175
RMSE: 2.2399815138647434
R^2: 0.514307535901118


In [15]:
from sklearn.ensemble import RandomForestRegressor #  2.2075
pipeline_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42))
])

classification_result_rf = cross_validate(pipeline_rf, X, y, cv=5, scoring=['neg_mean_squared_error', 'r2'])

mse_rf = -classification_result_rf['test_neg_mean_squared_error']
rmse_rf = np.sqrt(mse_rf)
r2_rf = classification_result_rf['test_r2']

print("RandomForestRegressor Results:")
print("MSE:", mse_rf.mean())
print("RMSE:", rmse_rf.mean())
print("R^2:", r2_rf.mean())

RandomForestRegressor Results:
MSE: 4.8841122407278466
RMSE: 2.2075865567504414
R^2: 0.5274717069400165


In [16]:
from xgboost import XGBRegressor #  2.3017

pipeline_xgb = Pipeline([
    ('iskaler', StandardScaler()),
    ('xgb', XGBRegressor(random_state='42'))
])

classification_result_xgb = cross_validate(pipeline_xgb, X, y, cv=5, scoring=['neg_mean_squared_error', 'r2'])

mse = - classification_result_xgb['test_neg_mean_squared_error']
rmse = np.sqrt(mse)
r2 = classification_result_xgb['test_r2']

mse.mean(), rmse.mean(), r2.mean()

(5.308877096496005, 2.3017602992461748, 0.4868880715147833)

In [17]:
from catboost import CatBoostRegressor #  2.1880

pipeline_cat = Pipeline([
    ('scaler', StandardScaler()),
    ('catboost', CatBoostRegressor(random_state=42, silent=True))
])

classification_result_cat = cross_validate(pipeline_cat, X, y, cv=5, scoring=['neg_mean_squared_error', 'r2'])

mse_cat = -classification_result_cat['test_neg_mean_squared_error']
rmse_cat = np.sqrt(mse_cat)
r2_cat = classification_result_cat['test_r2']

mse_cat.mean(), rmse_cat.mean(), r2_cat.mean()

(4.796621308172593, 2.188085955647151, 0.5362061207728999)

In [18]:
from lightgbm import LGBMRegressor # 2.2063

pipeline_lgb = Pipeline([
    ('scaler', StandardScaler()),
    ('lgbm', LGBMRegressor(random_state=42))
])

classification_result_lgb = cross_validate(pipeline_lgb, X, y, cv=5, scoring=['neg_mean_squared_error', 'r2'])

mse_lgb = -classification_result_lgb['test_neg_mean_squared_error']
rmse_lgb = np.sqrt(mse_lgb)
r2_lgb = classification_result_lgb['test_r2']

mse_lgb.mean(), rmse_lgb.mean(), r2_lgb.mean()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1296
[LightGBM] [Info] Number of data points in the train set: 3114, number of used features: 8
[LightGBM] [Info] Start training from score 9.986191
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1292
[LightGBM] [Info] Number of data points in the train set: 3114, number of used features: 8
[LightGBM] [Info] Start training from score 9.941875
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1302
[LightGBM] [Info] Number of data points in the train set: 3114, number of used features: 8
[LightGBM] [Info] Start training 

(4.876315718725886, 2.2063601258901135, 0.5283077062913072)

In [19]:
from sklearn.svm import SVR # 2.1830

pipeline_svr = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

classification_result_svr = cross_validate(pipeline_svr, X, y, cv=5, scoring=['neg_mean_squared_error', 'r2'])

mse_svr = -classification_result_svr['test_neg_mean_squared_error']
rmse_svr = np.sqrt(mse_svr)
r2_svr = classification_result_svr['test_r2']

print("SVR Results:")
print("MSE:", mse_svr.mean())
print("RMSE:", rmse_svr.mean())
print("R^2:", r2_svr.mean())


SVR Results:
MSE: 4.770499137272676
RMSE: 2.1830999552756216
R^2: 0.5390335730695541


In [20]:
from sklearn.neighbors import KNeighborsRegressor # 2.2891

pipeline_knn = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor())
])

classification_result_knn = cross_validate(pipeline_knn, X, y, cv=5, scoring=['neg_mean_squared_error', 'r2'])

mse_knn = -classification_result_knn['test_neg_mean_squared_error']
rmse_knn = np.sqrt(mse_knn)
r2_knn = classification_result_knn['test_r2']

print("KNeighborsRegressor Results:")
print("MSE:", mse_knn.mean())
print("RMSE:", rmse_knn.mean())
print("R^2:", r2_knn.mean())

KNeighborsRegressor Results:
MSE: 5.245559800812458
RMSE: 2.2891812288822253
R^2: 0.49321511452136235


In [21]:
from sklearn.ensemble import VotingRegressor # 2.1522

catboost = CatBoostRegressor(random_state=42, silent=True)
svr = SVR()
knn = KNeighborsRegressor()

ensemble = VotingRegressor([
    ('catboost', catboost),
    ('svr', svr),
    ('knn', knn)
])

pipeline_ensemble = Pipeline([
    ('scaler', StandardScaler()),
    ('ensemble', ensemble)
])

classification_result_ensemble = cross_validate(pipeline_ensemble, X, y, cv=5, scoring=['neg_mean_squared_error', 'r2'])

mse_ensemble = -classification_result_ensemble['test_neg_mean_squared_error']
rmse_ensemble = np.sqrt(mse_ensemble)
r2_ensemble = classification_result_ensemble['test_r2']

print("Ensemble Results:")
print("MSE:", mse_ensemble.mean())
print("RMSE:", rmse_ensemble.mean())
print("R^2:", r2_ensemble.mean())


Ensemble Results:
MSE: 4.638073032709989
RMSE: 2.152211822843827
R^2: 0.551703196172996


In [23]:

# Define the base models
catboost = CatBoostRegressor(random_state=42, silent=True)
svr = SVR()
knn = KNeighborsRegressor()
xgboost = XGBRegressor(random_state=42)
lightgbm = LGBMRegressor(random_state=42)

ensemble = VotingRegressor([
    ('catboost', catboost),
    ('svr', svr),
    ('knn', knn),
    ('xgboost', xgboost),
    ('lightgbm', lightgbm)
])

pipeline_ensemble = Pipeline([
    ('scaler', StandardScaler()),
    ('ensemble', ensemble)
])

classification_result_ensemble = cross_validate(pipeline_ensemble, X, y, cv=5, scoring=['neg_mean_squared_error', 'r2'])

mse_ensemble = -classification_result_ensemble['test_neg_mean_squared_error']
rmse_ensemble = np.sqrt(mse_ensemble)
r2_ensemble = classification_result_ensemble['test_r2']

print("Ensemble Results:")
print("MSE:", mse_ensemble.mean())
print("RMSE:", rmse_ensemble.mean())
print("R^2:", r2_ensemble.mean())


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1296
[LightGBM] [Info] Number of data points in the train set: 3114, number of used features: 8
[LightGBM] [Info] Start training from score 9.986191
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1292
[LightGBM] [Info] Number of data points in the train set: 3114, number of used features: 8
[LightGBM] [Info] Start training from score 9.941875
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1302
[LightGBM] [Info] Number of data points in the train set: 31