In [1]:
# =====================
# Core Libraries
# =====================
import numpy as np
import pandas as pd
import tensorflow as tf
# =====================
# Visualization
# =====================
import seaborn as sns
import matplotlib.pyplot as plt

# =====================
# Preprocessing
# =====================
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler, PolynomialFeatures
from sklearn.impute import KNNImputer

# =====================
# Model Selection & Tuning
# =====================
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# =====================
# Regression Models
# =====================
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neural_network import MLPRegressor

# =====================
# Classification Models
# =====================
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# =====================
# Pipelines
# =====================
from sklearn.pipeline import Pipeline

# =====================
# Metrics
# =====================
from sklearn.metrics import (confusion_matrix, roc_curve, precision_recall_curve,
                             roc_auc_score, precision_score,
                             recall_score, f1_score , accuracy_score)

# =====================
# Other Useful Tools
# =====================
from sklearn.datasets import make_regression
from numpy import log1p


In [4]:
data = pd.read_csv("train.csv")

In [None]:
data

In [2]:
def fix_data(data):
    data.drop("id" , inplace=True,axis = 1)
    data = pd.get_dummies(data, columns=['Sex'])
    return data

In [5]:
fix_data = fix_data(data)

In [20]:
fix_data

Unnamed: 0,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11,True,False,False
1,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11,True,False,False
2,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6,False,True,False
3,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10,False,False,True
4,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
90610,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,6,False,False,True
90611,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,9,False,False,True
90612,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,6,False,True,False
90613,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700,6,False,True,False


In [6]:
train = fix_data.drop("Rings",axis=1)
test = fix_data["Rings"]
x_train,x_test,y_train,y_test = train_test_split(train,test,random_state=42,test_size=.2)

RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'max_features': ['auto', 'sqrt']
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

In [25]:
rf_model = RandomForestRegressor(
    max_depth=20,
    max_features='sqrt',
    min_samples_split=5,
    n_estimators=300,
    random_state=42  # For reproducibility
)
rf_model.fit(x_train , y_train)
y_pred = rf_model.predict(x_test)
from sklearn.metrics import mean_squared_error, r2_score

print("RMSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))

RMSE: 3.494924320492685
R²: 0.6597738304829663


In [44]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import root_mean_squared_log_error

model = XGBRegressor()
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.7, 0.8, 0.9, 1.0]
}
random_search = RandomizedSearchCV(model, param_dist, n_iter=20, cv=5, scoring='neg_mean_squared_log_error')
#fit model
random_search.fit(x_train, y_train)
best_model = random_search.best_estimator_
y_pred = best_model.predict(x_test)
# evaluate
print("RMSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))
print("RMSLE" , root_mean_squared_log_error(y_test,y_pred))


RMSE: 3.457280158996582
R²: 0.6634384393692017
RMSLE 0.15163256227970123


In [48]:
test = pd.read_csv("test.csv")


In [46]:
test_fix = fix_data(test)

In [47]:
y_pred = best_model.predict(test_fix)

In [49]:
submission = pd.DataFrame({'id': test['id'], 'Rings': y_pred})
submission.to_csv("my_submission.csv", index=False)