In [1]:
import pandas as pd

# Competition input files (playground-series-s4e4).
TRAIN = '/kaggle/input/playground-series-s4e4/train.csv'
TEST = '/kaggle/input/playground-series-s4e4/test.csv'
SAMPLE = '/kaggle/input/playground-series-s4e4/sample_submission.csv'

# Train and test frames are indexed by their 'id' column;
# the sample submission is read with the default integer index.
train_df = pd.read_csv(TRAIN, index_col=['id'])
test_df = pd.read_csv(TEST, index_col=['id'])
sample_df = pd.read_csv(SAMPLE)

# Preview the first rows of the training data.
train_df.head()

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [2]:
import warnings
from plotly import express

# Silence FutureWarning messages from here on.
warnings.filterwarnings('ignore', category=FutureWarning)

# Distribution of the target, one facet per value of the 'Sex' column.
express.histogram(train_df, x='Rings', facet_col='Sex')

In [3]:
import arrow
from sklearn.linear_model import LogisticRegression

# Numeric feature columns used by the models below; the 'Sex' column is not included.
COLUMNS = ['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight',]
TARGET = 'Rings'
# Convergence tolerance handed to the solver.
TOL = 1e-12

time_start = arrow.now()

# Each distinct Rings value is treated as a class label by the classifier.
# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases and 'auto' was the default behaviour anyway.
logreg = LogisticRegression(penalty='l2', dual=False, tol=TOL, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None,
                           random_state=2024, solver='lbfgs', max_iter=10000, verbose=1, warm_start=False, n_jobs=None,
                           l1_ratio=None)
logreg.fit(X=train_df[COLUMNS], y=train_df[TARGET])

# n_iter_ is an array; take its first entry so the message shows a plain
# number (e.g. "590 iterations") instead of the array repr "[590]".
print('{} model fit complete in {} iterations'.format(arrow.now() - time_start, logreg.n_iter_[0]))

# Predictions for the competition test set, keyed by the test frame's id index.
logreg_df = pd.DataFrame(data={'id' : test_df.index.tolist(), 'Rings': logreg.predict(X=test_df[COLUMNS])})
print('{} model done'.format(arrow.now() - time_start))

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          224     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  3.01948D+05    |proj g|=  1.42288D+04


 This problem is unconstrained.



At iterate   50    f=  1.68723D+05    |proj g|=  5.47756D+02

At iterate  100    f=  1.64920D+05    |proj g|=  2.06417D+02

At iterate  150    f=  1.63888D+05    |proj g|=  1.65748D+02

At iterate  200    f=  1.63687D+05    |proj g|=  5.92529D+01

At iterate  250    f=  1.63609D+05    |proj g|=  2.80957D+01

At iterate  300    f=  1.63584D+05    |proj g|=  1.42689D+01

At iterate  350    f=  1.63575D+05    |proj g|=  3.05080D+01

At iterate  400    f=  1.63573D+05    |proj g|=  3.97145D+00

At iterate  450    f=  1.63572D+05    |proj g|=  2.80380D+00

At iterate  500    f=  1.63571D+05    |proj g|=  2.02689D+00

At iterate  550    f=  1.63571D+05    |proj g|=  1.92321D+00

0:01:23.215515 model fit complete in [590] iterations
           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy p

In [4]:
# Only row 0 of the coefficient matrix is plotted, i.e. the weights of a
# single class (the first entry of logreg.classes_), one value per feature.
first_class_weights = logreg.coef_[0]
express.histogram(x=COLUMNS, y=first_class_weights)

Let's hold out 20% of the training data as a stratified validation split so we can start comparing models.

In [5]:
import arrow
import math
from sklearn.linear_model import LogisticRegression
# we don't have the RMSLE directly, so we have to use the MSLE
# and post-process to get the RMSLE
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

time_start = arrow.now()

# Hold out 20% of the labelled rows, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    train_df[COLUMNS],
    train_df[TARGET],
    test_size=0.2,
    random_state=2024,
    stratify=train_df[TARGET],
)

# Baseline: logistic regression fitted on the training part of the split.
regression = LogisticRegression(max_iter=1000, tol=1e-12)
regression.fit(X_train, y_train)
print('fit complete after {} iterations.'.format(regression.n_iter_[0]))

# RMSLE is the square root of the MSLE computed on the held-out rows.
holdout_predictions = regression.predict(X_test)
holdout_msle = mean_squared_log_error(y_test, holdout_predictions)
print('RMSLE: {:6.5f}'.format(math.sqrt(holdout_msle)))
print('done in {}'.format(arrow.now() - time_start))

fit complete after 542 iterations.
RMSLE: 0.18384
done in 0:01:01.320523


Let's fit several candidate models on the training split and compare their RMSLE on the held-out split.

In [6]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor

# Candidate models, keyed by the label printed in the score report.
MODELS = {'Logistic Regression': LogisticRegression(max_iter=1000, tol=1e-12)}

# Add one KNN regressor for each neighbourhood size 78, 79 and 80.
for n_neighbors in (78, 79, 80):
    MODELS['KNN k = {}'.format(n_neighbors)] = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights='uniform',
        algorithm='auto',
        leaf_size=30,
        p=2,
        metric='minkowski',
        metric_params=None,
        n_jobs=None,
    )

# Fit each candidate on the training split and score it on the held-out split.
scores = []
for name, regressor in MODELS.items():
    time_start = arrow.now()
    regressor.fit(X_train, y_train)
    holdout_predictions = regressor.predict(X_test)
    score = math.sqrt(mean_squared_log_error(y_test, holdout_predictions))
    elapsed = arrow.now() - time_start
    print('{:6.5f} : {} {}'.format(score, name, elapsed))
    scores.append((score, name))

# Lowest RMSLE wins; the tuples compare on score first, then on the label.
print(min(scores))

0.18384 : Logistic Regression 0:01:00.563437
0.15309 : KNN k = 78 0:00:03.203300
0.15308 : KNN k = 79 0:00:03.198431
0.15308 : KNN k = 80 0:00:02.286122
(0.15308106850508654, 'KNN k = 79')


In [7]:
import arrow
from sklearn.neighbors import KNeighborsRegressor

time_start = arrow.now()

# Refit KNN with k = 79 on all labelled rows, not just the training split.
knn = KNeighborsRegressor(
    n_neighbors=79,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
    metric_params=None,
    n_jobs=None,
)
knn.fit(train_df[COLUMNS], train_df[TARGET])

# Predict Rings for the competition test rows and pair each prediction with its id.
test_predictions = knn.predict(test_df[COLUMNS])
knn_df = pd.DataFrame({'id': test_df.index.tolist(), 'Rings': test_predictions})
print('{} model done'.format(arrow.now() - time_start))

0:00:14.297358 model done


In [8]:
# Save the KNN predictions as the submission file; the DataFrame index is not written.
knn_df.to_csv('/kaggle/working/submission.csv', index=False)