In [1]:
import arrow
import pandas as pd

# Feature columns handed to every model, and the column being predicted.
COLUMNS = [
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Whole weight.1',
    'Whole weight.2',
    'Shell weight',
]
TARGET = 'Rings'

# Competition input files.
TEST = '/kaggle/input/playground-series-s4e4/test.csv'
TRAIN = '/kaggle/input/playground-series-s4e4/train.csv'


def load_frame(csv_path):
    """Read one competition CSV indexed by 'id' and discard its 'Sex' column."""
    frame = pd.read_csv(filepath_or_buffer=csv_path, index_col=['id'])
    return frame.drop(columns=['Sex'])


time_start = arrow.now()
test_df = load_frame(TEST)
train_df = load_frame(TRAIN)
print(f'{arrow.now() - time_start} data load complete.')

0:00:00.281227 data load complete.


What does our target variable look like?

In [2]:
import warnings
from plotly import express

# Suppress FutureWarning messages before any plotting call is made.
warnings.filterwarnings('ignore', category=FutureWarning)
# Histogram of the target column; as the cell's last expression it is rendered as output.
express.histogram(train_df, x='Rings')

In [3]:
# Plot a fixed-seed sample of 1000 rows (not the whole frame) with an OLS trend line.
height_sample = train_df.sample(n=1000, random_state=2024)
express.scatter(data_frame=height_sample, x='Height', y='Rings', trendline='ols')

Our height data is low resolution and has outliers.

Let's make a split and start comparing models.

In [4]:
import arrow
from sklearn.model_selection import train_test_split

time_start = arrow.now()
# Hold out 20% of the labelled rows, stratified on the target, with a fixed seed
# so the split is reproducible.
all_features = train_df[COLUMNS]
all_targets = train_df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    all_targets,
    test_size=0.2,
    random_state=2024,
    stratify=all_targets,
)

print(f'{arrow.now() - time_start} train/test split complete.')

0:00:00.060533 train/test split complete.


Let's build and compare a bunch of regressors. We believe that the RMSLE on our held-out test split is a good approximation to the actual RMSLE for the competition test set.

In [5]:
import arrow
import math
import numpy as np
# we don't have the RMSLE directly, so we have to use the MSLE
# and post-process to get the RMSLE
from sklearn.metrics import mean_squared_log_error

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Fixed-configuration baseline models, keyed by the label shown in the score table.
# Entries are added in the order they are later fitted and reported.
MODELS = {}

MODELS['ElasticNet'] = ElasticNet(
    alpha=1.0, l1_ratio=0.5, fit_intercept=True, precompute=False, max_iter=10000,
    copy_X=True, tol=1e-12, warm_start=False, positive=True, random_state=2024,
    selection='cyclic',
)

MODELS['Lasso'] = Lasso(
    alpha=1.0, fit_intercept=True, precompute=False, copy_X=True, max_iter=10000,
    tol=1e-12, warm_start=False, positive=True, random_state=2024, selection='cyclic',
)

MODELS['Linear Regression'] = LinearRegression(
    fit_intercept=True, copy_X=True, n_jobs=None, positive=True,
)

# NOTE(review): Perceptron is a scikit-learn classifier, unlike the regressors around it.
MODELS['Perceptron'] = Perceptron(
    penalty=None, alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=10000,
    tol=1e-12, shuffle=True, verbose=0, eta0=1.0, n_jobs=None, random_state=2024,
    early_stopping=True, validation_fraction=0.25, n_iter_no_change=5,
    class_weight=None, warm_start=False,
)

MODELS['Ridge'] = Ridge(
    alpha=1.0, fit_intercept=True, copy_X=True, max_iter=None, tol=1e-12,
    solver='auto', positive=True, random_state=2024,
)

# NOTE(review): the label reads 'Singular Value' but the estimator is sklearn.svm.SVR
# (support vector regression); the label is kept unchanged here.
MODELS['Singular Value'] = SVR(
    kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-12, C=1.0, epsilon=0.1,
    shrinking=True, cache_size=200, verbose=False, max_iter=-1,
)

# TODO: build out more Logistic Regression options.
# Only the 'l2' penalty is registered; penalty=None is left out because it is too expensive.
for logistic_penalty in ('l2',):
    label = 'Logistic Regression {}'.format(logistic_penalty)
    MODELS[label] = LogisticRegression(penalty=logistic_penalty, max_iter=10000, tol=1e-12)

# K-nearest-neighbour regressors: every combination of k in 75..84, neighbour weighting
# and search algorithm, all using the Minkowski metric with p=2.
MODELS.update({
    'KNN k = {}, weights = {} algorithm = {}'.format(k, weighting, search): KNeighborsRegressor(
        n_neighbors=k,
        weights=weighting,
        algorithm=search,
        leaf_size=30,
        p=2,
        metric='minkowski',
        metric_params=None,
        n_jobs=None,
    )
    for k in range(75, 85)
    for weighting in ('uniform', 'distance')
    for search in ('auto', 'ball_tree', 'kd_tree', 'brute')
})

# Decision trees with no depth or leaf-count limit, one per split criterion.
# Everything except the criterion is shared between the variants.
tree_options = dict(
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=2024,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    ccp_alpha=0.0,
)
for split_criterion in ('squared_error', 'friedman_mse', 'absolute_error', 'poisson'):
    tree_label = 'Decision Tree {}'.format(split_criterion)
    MODELS[tree_label] = DecisionTreeRegressor(criterion=split_criterion, **tree_options)
    
# Linear discriminant analysis, one entry per solver, without shrinkage.
MODELS.update({
    'LDA {}'.format(lda_solver): LinearDiscriminantAnalysis(
        solver=lda_solver,
        shrinkage=None,
        priors=None,
        n_components=None,
        store_covariance=False,
        tol=1e-12,
        covariance_estimator=None,
    )
    for lda_solver in ('svd', 'lsqr', 'eigen')
})
    
# SGD regressors: one entry for every combination of loss, penalty and learning-rate
# schedule (4 x 4 x 4 = 64 configurations).
for loss in ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']:
    for penalty in ['elasticnet', 'l1', 'l2', None]:
        for learning_rate in ['constant', 'optimal', 'invscaling', 'adaptive']:
            name = 'SGD loss = {} penalty = {} learning rate = {}'.format(loss, penalty, learning_rate)
            # Key by the formatted name, not the literal string 'name': with the literal,
            # every configuration overwrote the same entry and only the last one was kept.
            MODELS[name] = SGDRegressor(loss=loss, penalty=penalty, alpha=0.0001, l1_ratio=0.15, fit_intercept=True,
                                        max_iter=10000, tol=1e-12, shuffle=True, verbose=0, epsilon=0.1, random_state=2024,
                                        learning_rate=learning_rate, eta0=0.01, power_t=0.25, early_stopping=True,
                                        validation_fraction=0.25, n_iter_no_change=5, warm_start=False, average=False)

# Fit every candidate on the training split, score it on the held-out split with RMSLE,
# and remember the pipeline with the lowest score.
best = None
best_name = ''
# Start from +inf so the lowest-scoring model is always selected, even if no model
# scores below 1.0 (a fixed 1.0 threshold could leave `best` as None).
best_score = math.inf
scores = []
for name, regressor in MODELS.items():
    time_start = arrow.now()
    pipeline = make_pipeline(StandardScaler(), regressor)
    pipeline.fit(X=X_train, y=y_train)
    # mean_squared_log_error rejects negative values, so the sign of any negative
    # prediction is dropped before scoring; the square root turns MSLE into RMSLE.
    score = math.sqrt(mean_squared_log_error(y_true=y_test, y_pred=np.abs(pipeline.predict(X=X_test))))
    scores.append((score, name))
    # The Pipeline object itself has no n_iter_ attribute; look on the fitted final
    # estimator instead so iteration counts are actually reported.
    fitted_estimator = pipeline.steps[-1][1]
    if hasattr(fitted_estimator, 'n_iter_'):
        print('{:6.5f} : {} iterations: {} {}'.format(score, name, fitted_estimator.n_iter_, arrow.now() - time_start))
    else:
        print('{:6.5f} : {} {}'.format(score, name, arrow.now() - time_start))

    # Strict comparison: on an exact tie the earlier model in MODELS is kept.
    if score < best_score:
        best_score = score
        best_name = name
        best = pipeline

# Report the model that was actually selected as `best`.
print((best_score, best_name))

0.20267 : ElasticNet 0:00:00.087175
0.21572 : Lasso 0:00:00.122800
0.18135 : Linear Regression 0:00:00.111045
0.19940 : Perceptron 0:00:04.742910
0.18135 : Ridge 0:00:00.054383
0.15367 : Singular Value 0:05:09.849101
0.17408 : Logistic Regression l2 0:01:35.025582
0.15374 : KNN k = 75, weights = uniform algorithm = auto 0:00:02.434346
0.15374 : KNN k = 75, weights = uniform algorithm = ball_tree 0:00:09.834118
0.15374 : KNN k = 75, weights = uniform algorithm = kd_tree 0:00:02.640502
0.15374 : KNN k = 75, weights = uniform algorithm = brute 0:00:02.729619
0.15365 : KNN k = 75, weights = distance algorithm = auto 0:00:02.429591
0.15365 : KNN k = 75, weights = distance algorithm = ball_tree 0:00:08.093871
0.15365 : KNN k = 75, weights = distance algorithm = kd_tree 0:00:02.411283
0.15365 : KNN k = 75, weights = distance algorithm = brute 0:00:02.736433
0.15374 : KNN k = 76, weights = uniform algorithm = auto 0:00:02.720335
0.15374 : KNN k = 76, weights = uniform algorithm = ball_tree 0:0

Use the best regressor from the list above to build the submission.

In [6]:
import arrow

time_start = arrow.now()
# Refit the selected pipeline on every labelled row, then predict the competition test set.
# NOTE: fit() retrains `best` in place, so from here on it has also seen the rows in X_test.
best.fit(X=train_df[COLUMNS], y=train_df[TARGET])
test_predictions = best.predict(X=test_df[COLUMNS])
submission_columns = {'id': test_df.index.tolist(), 'Rings': test_predictions}
best_df = pd.DataFrame(data=submission_columns)
print(f'{arrow.now() - time_start} model done')

0:00:31.899432 model done


In [7]:
# Benchmark RMSLE to beat before a new submission file is written.
SCORE = 0.15308
# Compare against `best_score`, the hold-out RMSLE recorded while `best` was still fitted
# on the training split only. By this point `best` has been refit on all of train_df,
# which contains the X_test rows, so re-scoring it on X_test would be optimistically
# biased and the check would pass almost unconditionally.
if SCORE > best_score:
    print('writing new submission')
    best_df.to_csv(path_or_buf='/kaggle/working/submission.csv', index=False)
else:
    # Report the score that was compared, not the score of the last model in the loop.
    print('score: {:6.5f} benchmark: {}'.format(best_score, SCORE))

writing new submission
