In [None]:
!pip install --upgrade scikit-learn
import sklearn
print(sklearn.__version__)

1.5.1


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_log_error

import pickle

In [None]:
from google.colab import drive

# Left disabled; uncomment to unmount the drive first (e.g. before remounting).
#drive.flush_and_unmount()

# Mount Google Drive at /content/drive; the file paths used by later cells live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the raw train/test CSV files on the mounted drive.
train_csv_path = '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Dataset/train.csv'
test_csv_path = '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Dataset/test.csv'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Preview the first five rows of the raw training frame.
train.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [None]:
# Inspect the dtype pandas inferred for each training column.
train.dtypes

In [None]:
# Number of missing entries in each training column.
null_values = train.isna().sum()
print(null_values)

# Preprocessing


In [None]:

def preprocess(df):
    """Return a copy of ``df`` with the ``Sex`` column encoded as integers.

    The mapping is ``'I' -> 0``, ``'M' -> 1``, ``'F' -> 2``. The input frame
    is not modified. Missing ``Sex`` values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Sex`` column with the labels ``'M'``, ``'F'``
        and ``'I'``.

    Returns
    -------
    pandas.DataFrame
        New frame identical to ``df`` except for the encoded ``Sex`` column.

    Raises
    ------
    ValueError
        If ``Sex`` holds a non-missing label outside the mapping, for example
        when the frame has already been encoded. ``Series.map`` would otherwise
        turn such values into NaN without any warning.
    """
    sex_codes = {'M': 1, 'F': 2, 'I': 0}

    # Fail loudly instead of letting unmapped labels become NaN.
    unexpected = set(df['Sex'].dropna().unique()) - set(sex_codes)
    if unexpected:
        raise ValueError(
            f"Unexpected values in 'Sex' column: {sorted(unexpected, key=str)}; "
            f"expected only {sorted(sex_codes)}"
        )

    return df.assign(Sex=df['Sex'].map(sex_codes))


# Run both splits through the same encoder so their `Sex` codes agree.
train_encoded = train.pipe(preprocess)
test_encoded = test.pipe(preprocess)

In [None]:
# Sanity-check the encoding: `Sex` should now hold integer codes instead of letters.
train_encoded.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,2,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,2,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,0,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,1,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,0,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [None]:
# Persist the encoded frames (without the row index) next to the raw data.
encoded_outputs = [
    (train_encoded, '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Dataset/Encoded dataset/train_encoded.csv'),
    (test_encoded, '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Dataset/Encoded dataset/test_encoded.csv'),
]
for encoded_frame, csv_path in encoded_outputs:
    encoded_frame.to_csv(csv_path, index=False)

Importing preprocessed dataset

In [None]:

# Reload the encoded frames from the CSV files on the mounted drive.
encoded_train_csv = '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Dataset/Encoded dataset/train_encoded.csv'
encoded_test_csv = '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Dataset/Encoded dataset/test_encoded.csv'

train_encoded = pd.read_csv(encoded_train_csv)
test_encoded = pd.read_csv(encoded_test_csv)


# Descriptive statistics

In [None]:

# Per-column summary statistics (count, mean, std, min, quartiles, max) of the encoded training data.
summary_stats = train_encoded.describe()

print(summary_stats)

                 id           Sex        Length      Diameter        Height  \
count  90615.000000  90615.000000  90615.000000  90615.000000  90615.000000   
mean   45307.000000      0.927186      0.517098      0.401679      0.135464   
std    26158.441658      0.807651      0.118217      0.098026      0.038008   
min        0.000000      0.000000      0.075000      0.055000      0.000000   
25%    22653.500000      0.000000      0.445000      0.345000      0.110000   
50%    45307.000000      1.000000      0.545000      0.425000      0.140000   
75%    67960.500000      2.000000      0.600000      0.470000      0.160000   
max    90614.000000      2.000000      0.815000      0.650000      1.130000   

       Whole weight  Whole weight.1  Whole weight.2  Shell weight  \
count  90615.000000    90615.000000    90615.000000  90615.000000   
mean       0.789035        0.340778        0.169422      0.225898   
std        0.457671        0.204428        0.100909      0.130203   
min        0

Correlation analysis

In [None]:

# Pairwise Pearson correlation (the pandas default, spelled out here) between all columns.
correlation_matrix = train_encoded.corr(method='pearson')
print(correlation_matrix)


In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
plt.show()

# Visualizations

In [None]:
# Histogram (with KDE overlay) of the target variable.
ax = sns.histplot(train_encoded['Rings'], kde=True)
ax.set(title='Distribution frequency of rings', xlabel='Rings', ylabel='Frequency')
plt.show()


In [None]:
# Side-by-side box plots of the selected measurement columns.
measurement_columns = ['Height', 'Length', 'Diameter', 'Whole weight', 'Shell weight']
ax = sns.boxplot(data=train_encoded[measurement_columns])
ax.set_title('Box plot on physical measurements')
plt.show()

In [None]:
# Number of rows per encoded `Sex` code.
ax = sns.countplot(data=train_encoded, x='Sex')
ax.set(title='Sex distribution', xlabel='Sex', ylabel='Count')
plt.show()


In [None]:

# Scatter of shell length against ring count; alpha=0.5 keeps overlapping points readable.
fig, ax = plt.subplots()
ax.scatter(train_encoded['Length'], train_encoded['Rings'], alpha=0.5)
ax.set_title('Length vs. Rings')
ax.set_xlabel('Length')
ax.set_ylabel('Rings')
plt.show()


In [None]:

# Distribution of ring counts within each encoded `Sex` code.
ax = sns.violinplot(data=train_encoded, x='Sex', y='Rings')
ax.set_title('Rings distribution by sex category')
plt.show()

In [None]:


# One histogram (with KDE overlay) per weight column, laid out row by row on a 2x2 grid.
weight_columns = ['Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']

fig, axs = plt.subplots(2, 2, figsize=(12, 10))
for ax, column in zip(axs.flat, weight_columns):
    sns.histplot(train_encoded[column], kde=True, ax=ax)
plt.show()


# Modelling

In [None]:

# Target is the ring count; features are every column except the identifier and the target.
y_train = train_encoded['Rings']
X_train = train_encoded.drop(['id', 'Rings'], axis=1)


In [None]:

# Baseline model: a decision tree regressor with default hyper-parameters; random_state is fixed for repeatable fits.
decision_tree = DecisionTreeRegressor(random_state=42)


# Fit on every training row; no validation data is held out in this cell.
decision_tree.fit(X_train, y_train)

In [None]:

# Root mean squared log error on the rows the tree was fitted on.
# The root is taken with NumPy instead of passing `squared=False`, which
# scikit-learn deprecated in 1.4 and removed in 1.6.
train_predictions = decision_tree.predict(X_train)
rmsle_train = np.sqrt(mean_squared_log_error(y_train, train_predictions))

print(f'RMSLE on training data: {rmsle_train}')

# A score on the fitting data says nothing about generalisation, so also score
# a tree with the same hyper-parameters fitted on 80% of the rows against the
# remaining 20%. `decision_tree` itself is left untouched.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
validation_tree = DecisionTreeRegressor(**decision_tree.get_params()).fit(X_fit, y_fit)
rmsle_validation = np.sqrt(
    mean_squared_log_error(y_val, validation_tree.predict(X_val))
)

print(f'RMSLE on hold-out validation data: {rmsle_validation}')

RMSLE on training data: 0.0




In [None]:

# Drop only the identifier column, then predict with the fitted tree.
X_test = test_encoded.drop('id', axis=1)
test_predictions = decision_tree.predict(X_test)


# Exporting

In [None]:

# Two-column submission frame: the test ids next to the predicted `Rings` values.
submission = test_encoded[['id']].assign(Rings=test_predictions)

submission_path = '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Submissions/dt_basic.csv'
submission.to_csv(submission_path, index=False)

print('Submission file created: dt_basic.csv')

Submission file created: dt_basic.csv


In [None]:

# Serialise the fitted tree to the mounted drive so it can be reloaded later.
model_path = '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Pickle/dt_basic.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(decision_tree, model_file)

print('Model saved as: dt_basic.pkl')

Model saved as: dt_basic.pkl
