In [1]:
import arrow
import pandas as pd

COLUMNS = ['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight',]
TARGET = 'Rings'

TEST = '/kaggle/input/playground-series-s4e4/test.csv'
TRAIN = '/kaggle/input/playground-series-s4e4/train.csv'

time_start = arrow.now()
test_df = pd.read_csv(filepath_or_buffer=TEST, index_col=['id']).drop(columns=['Sex'])
train_df = pd.read_csv(filepath_or_buffer=TRAIN, index_col=['id']).drop(columns=['Sex'])
print('{} data load complete.'.format(arrow.now() - time_start))

0:00:00.348223 data load complete.


What does our target variable look like?

In [2]:
import warnings
from plotly import express

warnings.filterwarnings(action='ignore', category=FutureWarning)
express.histogram(data_frame=train_df, x='Rings',)

Let's make a split and start comparing models.

In [3]:
import arrow
from sklearn.model_selection import train_test_split

time_start = arrow.now()
X_train, X_test, y_train, y_test = train_test_split(train_df[COLUMNS], train_df[TARGET], test_size=0.2, random_state=2024, stratify=train_df[TARGET])

print('{} train/test split complete.'.format(arrow.now() - time_start))

0:00:00.061711 train/test split complete.


Let's build and compare a bunch of regressors. We believe that our test split RMSLE is a good approximation to the actual RMSLE for the 

In [4]:
import arrow
import math
# we don't have the RMSLE directly, so we have to use the MSLE
# and post-process to get the RMSLE
from sklearn.metrics import mean_squared_log_error

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

MODELS = {
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True, precompute=False, max_iter=10000, copy_X=True, tol=1e-12,
                             warm_start=False, positive=True, random_state=2024, selection='cyclic'),
    'Lasso': Lasso(alpha=1.0, fit_intercept=True, precompute=False, copy_X=True, max_iter=10000, tol=1e-12, warm_start=False, positive=True, 
                   random_state=2024, selection='cyclic'),
    'Linear Regression': LinearRegression(fit_intercept=True, copy_X=True, n_jobs=None, positive=True),
    'Ridge': Ridge(alpha=1.0, fit_intercept=True, copy_X=True, max_iter=None, tol=1e-12, solver='auto', positive=True, random_state=2024),
}

for penalty in ['l2', None]:   #['l1', 'l2', 'elasticnet', None]:
    MODELS['Logistic Regression {}'.format(penalty)] = LogisticRegression(penalty=penalty, max_iter=10000, tol=1e-12)

for n_neighbors in range(78, 81): 
    for weights in ['uniform', 'distance']:
        for algorithm in ['auto', 'ball_tree', 'kd_tree', 'brute']:
            MODELS['KNN k = {}, weights = {}'.format(n_neighbors, weights)] = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, 
                                                                                          algorithm=algorithm, leaf_size=30,
                                                                   p=2, metric='minkowski', metric_params=None, n_jobs=None)

for criterion in ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']:
    MODELS['Decision Tree {}'.format(criterion)] = DecisionTreeRegressor(criterion=criterion, splitter='best', max_depth=None, 
                                                                         min_samples_split=2, min_samples_leaf=1,
                                           min_weight_fraction_leaf=0.0, max_features=None, random_state=2024, max_leaf_nodes=None,
                                           min_impurity_decrease=0.0, ccp_alpha=0.0, )

scores = []
for name, regressor in MODELS.items():
    time_start = arrow.now()
    regressor.fit(X=X_train, y=y_train)
    score = math.sqrt(mean_squared_log_error(y_true=y_test, y_pred=regressor.predict(X=X_test)))
    if hasattr(regressor, 'n_iter_'):
        print('{:6.5f} : {} iterations: {} {}'.format(score, name, regressor.n_iter_, arrow.now() - time_start))
    else:
        print('{:6.5f} : {} {}'.format(score, name, arrow.now() - time_start))
    scores.append((score, name))

print(sorted(scores)[0])

0.27298 : ElasticNet iterations: 2 0:00:00.026490
0.28847 : Lasso iterations: 1 0:00:00.034176
0.18135 : Linear Regression 0:00:00.092640
0.18146 : Ridge iterations: None 0:00:00.051607
0.18384 : Logistic Regression l2 iterations: [542] 0:01:06.159448
0.17351 : Logistic Regression None iterations: [5463] 0:11:01.885249
0.15308 : KNN k = 78, weights = uniform 0:00:03.110770
0.15298 : KNN k = 78, weights = distance 0:00:03.014934
0.15308 : KNN k = 79, weights = uniform 0:00:02.907789
0.15297 : KNN k = 79, weights = distance 0:00:03.000561
0.15308 : KNN k = 80, weights = uniform 0:00:02.937147
0.15297 : KNN k = 80, weights = distance 0:00:02.953854
0.21689 : Decision Tree squared_error 0:00:00.557758
0.21632 : Decision Tree friedman_mse 0:00:00.555285
0.21913 : Decision Tree absolute_error 0:01:59.020067
0.21153 : Decision Tree poisson 0:00:00.642773
(0.15297206917390624, 'KNN k = 80, weights = distance')


In [5]:
import arrow
from sklearn.neighbors import KNeighborsRegressor

time_start = arrow.now()
knn = KNeighborsRegressor(n_neighbors=79, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
knn.fit(X=train_df[COLUMNS], y=train_df[TARGET])
knn_df = pd.DataFrame(data={'id' : test_df.index.tolist(), 'Rings': knn.predict(X=test_df[COLUMNS])})
print('{} model done'.format(arrow.now() - time_start))

0:00:08.533394 model done


In [6]:
SCORE = 0.153081
if SCORE >  math.sqrt(mean_squared_log_error(y_true=y_test, y_pred=knn.predict(X=X_test))):
    knn_df.to_csv(path_or_buf='/kaggle/working/submission.csv', index=False)