In [9]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_log_error

In [2]:
from google.colab import drive

# Directory inside the Colab VM where Google Drive is attached.
drive_mount_point = '/content/drive'

# If a previous mount has gone stale, run drive.flush_and_unmount() before mounting again.
drive.mount(drive_mount_point)

Mounted at /content/drive


In [3]:

# Read the raw train and test CSVs from the mounted Drive folder in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Dataset/train.csv',
        '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Dataset/test.csv',
    )
)


In [4]:
# Preview the first rows of the raw training frame to check column names and sample values.
train.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [None]:
# Inspect column dtypes; any non-numeric column must be encoded before fitting a linear model.
train.dtypes

Unnamed: 0,0
id,int64
Sex,object
Length,float64
Diameter,float64
Height,float64
Whole weight,float64
Whole weight.1,float64
Whole weight.2,float64
Shell weight,float64
Rings,int64


# Preprocessing


In [5]:

# Columns removed before modelling.
# NOTE(review): the train.head() preview shows different values in these columns
# than in 'Whole weight', so they are not literal duplicates -- confirm that
# dropping them (rather than renaming and keeping them) is intended.
duplicate_columns = ['Whole weight.1', 'Whole weight.2']

# preprocessing function (drop columns & encode)
def preprocess(df):
    """Return a cleaned copy of ``df`` ready for modelling.

    Steps:
      1. Drop every column listed in ``duplicate_columns`` that is present.
      2. Encode the 'Sex' column as integers: 'I' -> 0, 'M' -> 1, 'F' -> 2.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least a 'Sex' column.

    Returns
    -------
    pandas.DataFrame
        New frame with the listed columns removed and 'Sex' integer-encoded.
        Missing 'Sex' values stay missing.

    Raises
    ------
    KeyError
        If ``df`` has no 'Sex' column.
    ValueError
        If 'Sex' holds a non-missing label other than 'M', 'F' or 'I'.
    """
    sex_codes = {'M': 1, 'F': 2, 'I': 0}

    cols_to_drop = [col for col in duplicate_columns if col in df.columns]
    cleaned = df.drop(columns=cols_to_drop)

    sex = cleaned['Sex']
    # Series.map would silently turn unknown labels into NaN, which only fails
    # later inside the estimator; fail here with the offending labels instead.
    unknown = sex[sex.notna() & ~sex.isin(list(sex_codes))]
    if not unknown.empty:
        raise ValueError(
            f"Unrecognised 'Sex' value(s): {unknown.unique().tolist()}; "
            f"expected one of {list(sex_codes)}"
        )

    cleaned['Sex'] = sex.map(sex_codes)

    return cleaned


# Run the same cleaning step on both splits so their columns and encodings match.
train_cleaned, test_cleaned = map(preprocess, (train, test))


In [6]:
# Check the cleaned frame: the dropped columns should be gone and 'Sex' integer-coded.
train_cleaned.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Shell weight,Rings
0,0,2,0.55,0.43,0.15,0.7715,0.24,11
1,1,2,0.63,0.49,0.145,1.13,0.32,11
2,2,0,0.16,0.11,0.025,0.021,0.005,6
3,3,1,0.595,0.475,0.15,0.9145,0.25,10
4,4,0,0.555,0.425,0.13,0.782,0.1975,9


In [None]:

# Write the cleaned train & test frames to Drive (row index omitted).
cleaned_outputs = (
    (train_cleaned, '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Dataset/Cleaned dataset/cleaned_train.csv'),
    (test_cleaned, '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Dataset/Cleaned dataset/cleaned_test.csv'),
)
for cleaned_frame, cleaned_path in cleaned_outputs:
    cleaned_frame.to_csv(cleaned_path, index=False)


# Modelling

In [7]:


# Column to predict, and the predictor columns fed to the model.
target = 'Rings'
features = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shell weight',
]

# Training design matrix and labels.
X_train, y_train = train_cleaned[features], train_cleaned[target]

# Same predictor columns taken from the test data (no target is selected here).
X_test = test_cleaned[features]

In [8]:


# Ordinary least-squares baseline fitted on the full training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)


In [10]:
# In-sample predictions: this score is measured on the data the model was
# fitted on, so it is an optimistic estimate rather than a validation score.
train_predictions = model.predict(X_train)

# RMSLE = sqrt(MSLE). The square root is taken explicitly because the
# `squared=False` argument of mean_squared_log_error is deprecated in
# scikit-learn 1.4 and removed in 1.6.
# Predictions are floored at 0 for the metric only (the same adjustment applied
# to the test predictions), since MSLE rejects negative values and a linear
# model can produce them.
rmsle_train = np.sqrt(
    mean_squared_log_error(y_train, np.maximum(0, train_predictions))
)

print(f'RMSLE on training data: {rmsle_train}')

RMSLE on training data: 0.17190268557285057


In [None]:


# Raw model output for the test features.
y_test_pred = model.predict(X_test)

# Floor predictions at zero so no negative ring count reaches the submission.
prediction_floor = 0
y_test_pred_adjusted = np.maximum(prediction_floor, y_test_pred)

In [None]:


# Destination of the submission file on the mounted Drive.
submission_path = '/content/drive/My Drive/Colab Notebooks/Abalone Prediction/Submissions/submission1.csv'

# Two-column frame: the test ids alongside the clipped predictions.
submission = test_cleaned[['id']].assign(Rings=y_test_pred_adjusted)
submission.to_csv(submission_path, index=False)

print('Submission file saved to location')

Submission file saved to location
