In [152]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [137]:
# Read the abalone dataset from the working directory and preview the first rows
df = pd.read_csv(filepath_or_buffer='abalone.csv')
df.head(n=5)

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [138]:
# Drop the `id` column: in the preview rows it simply mirrors the row index,
# so it is not used as a feature.
# errors='ignore' keeps this cell idempotent -- re-running it after `id` has
# already been removed would otherwise raise KeyError.
df = df.drop(columns=['id'], errors='ignore')
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [139]:
# Count missing values per column (isnull is the pandas alias of isna)
df.isnull().sum()

Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Whole weight.1    0
Whole weight.2    0
Shell weight      0
Rings             0
dtype: int64

In [140]:
# Print the column dtypes, non-null counts and memory footprint of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90615 entries, 0 to 90614
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             90615 non-null  object 
 1   Length          90615 non-null  float64
 2   Diameter        90615 non-null  float64
 3   Height          90615 non-null  float64
 4   Whole weight    90615 non-null  float64
 5   Whole weight.1  90615 non-null  float64
 6   Whole weight.2  90615 non-null  float64
 7   Shell weight    90615 non-null  float64
 8   Rings           90615 non-null  int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 6.2+ MB


In [141]:
# Collect the names of the object-dtype (string) columns that need encoding
categorical = df.select_dtypes(include='object').columns
categorical

Index(['Sex'], dtype='object')

In [142]:
# Integer-encode every categorical column in place.
# A separate encoder is fitted per column and kept in `encoders`, so each
# column's string -> integer mapping stays available (e.g. for
# inverse_transform or for encoding new data). Refitting one shared encoder
# would retain only the mapping of the last column processed.
encoders = {}
le = LabelEncoder()  # stays bound even when there are no categorical columns
for col in categorical:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
# After the loop `le` is the encoder of the last categorical column, as before.

In [143]:
# Preview the frame after encoding: `Sex` should now hold integer codes
df.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,0,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,1,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,2,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,1,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [144]:
# The target is the ring count; every remaining column is a feature
y = df['Rings']
X = df.drop(columns=['Rings'])

In [145]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [146]:
# Standardise the features. The scaler learns its statistics from the training
# rows only, and the same fitted transform is then applied to both splits so
# nothing from the test set leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [147]:
# Fit a random-forest regressor on the scaled training data (seeded for reproducibility)
model = RandomForestRegressor(random_state=123).fit(X_train_scaled, y_train)
model  # fit() returns the estimator itself, so this shows the same repr as before

In [1]:
# Predict ring counts for the held-out rows.
# Requires `model` and `X_test_scaled` from the training and scaling cells; running
# this cell on a fresh kernel before them fails with NameError, so execute the
# notebook in order ("Restart Kernel -> Run All").
y_pred = model.predict(X_test_scaled)

# Sanity check: exactly one prediction per test row.
assert len(y_pred) == len(y_test)

# Show a short preview as the cell's output instead of printing the whole array.
y_pred[:10]

NameError: name 'model' is not defined

In [149]:
# Score the predictions against the held-out targets:
# mean absolute error, mean squared error and coefficient of determination.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [150]:
# Report the three metrics, one per line, rounded to two decimals
print(
    f"MAE: {mae:.2f}",
    f"MSE: {mse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)

MAE: 1.28
MSE: 3.59
R² Score: 0.65


In [153]:
# Persist every fitted artifact needed to score raw data later:
#   - the trained random-forest model,
#   - the label encoder (maps the raw `Sex` strings to the integer codes the model was trained on),
#   - the scaler (standardises features exactly as during training).
# Without the encoder, the saved model and scaler cannot be applied to raw rows
# whose `Sex` column is still a string.
# The scaler is dumped last so the cell's displayed return value is unchanged.
joblib.dump(model, 'model.joblib')
joblib.dump(le, 'label_encoder.joblib')
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']