# Abalone Rings Regression using Classical Machine Learning Algorithms

## Comparison on Linear Regression, Ridge Regression, Lasso Regression, Random Forest and Decision Tree

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [30]:
data = pd.read_csv('data/abalone.csv')

In [31]:
data['Sex'] = data['Sex'].map({'M': 0, 'F': 1, 'I': 2})

X = data.drop('Rings', axis=1)
y = data['Rings']

In [32]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [33]:
models = {
    'SVM': SVC(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier()
}

param_grids = {
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
    'Logistic Regression': {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']},
    'Random Forest': {'n_estimators': [100, 200], 'max_depth': [10, 20, None]},
    'Decision Tree': {'max_depth': [10, 20, None], 'min_samples_split': [2, 5, 10]},
    'KNN': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
}

In [34]:
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    
    grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    
    best_model = grid_search.best_estimator_
    
    y_val_pred = best_model.predict(X_val_scaled)
    
    val_accuracy = accuracy_score(y_val, y_val_pred)
    
    y_test_pred = best_model.predict(X_test_scaled)
    
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=5)
    
    results[name] = {
        'Best Parameters': grid_search.best_params_,
        'Validation Accuracy': val_accuracy,
        'Test Accuracy': test_accuracy,
        'Cross-Validation Mean': cv_scores.mean(),
        'Cross-Validation Std': cv_scores.std()
    }
    
    print(f"{name} - Validation Accuracy: {val_accuracy:.4f}")
    print(f"{name} - Test Accuracy: {test_accuracy:.4f}")
    print(f"{name} - CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"{name} - Best Parameters: {grid_search.best_params_}")
    print("Classification Report (Test Set):")
    print(classification_report(y_test, y_test_pred))
    print("\n" + "="*50 + "\n")

Training SVM...
SVM - Validation Accuracy: 0.2632
SVM - Test Accuracy: 0.2791
SVM - CV Accuracy: 0.2639 (+/- 0.0398)
SVM - Best Parameters: {'C': 1, 'kernel': 'rbf'}
Classification Report (Test Set):
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.33      0.37      0.35        19
           5       0.30      0.07      0.11        45
           6       0.30      0.34      0.32        77
           7       0.34      0.35      0.34       129
           8       0.33      0.41      0.37       164
           9       0.28      0.42      0.34       212
          10       0.22      0.43      0.29       191
          11       0.30      0.21      0.25       137
          12       0.00      0.00      0.00        84
          13       0.00      0.00      0.00        53
          14       0.00      0.00      0.00        34
          15       0.00      0.00      0.00        31
          16       0.00      0.00      0.00

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression - Validation Accuracy: 0.2650
Logistic Regression - Test Accuracy: 0.2751
Logistic Regression - CV Accuracy: 0.2767 (+/- 0.0184)
Logistic Regression - Best Parameters: {'C': 10, 'solver': 'lbfgs'}
Classification Report (Test Set):
              precision    recall  f1-score   support

           3       0.33      0.33      0.33         3
           4       0.35      0.37      0.36        19
           5       0.43      0.22      0.29        45
           6       0.31      0.42      0.36        77
           7       0.35      0.29      0.32       129
           8       0.32      0.46      0.38       164
           9       0.29      0.37      0.33       212
          10       0.24      0.34      0.28       191
          11       0.27      0.19      0.22       137
          12       0.10      0.02      0.04        84
          13       0.06      0.06      0.06        53
          14       0.06      0.03      0.04        34
          15       0.09      0.03      0.05   

In [35]:
print("Summary of Results:")
for name, result in results.items():
    print(f"{name}:")
    print(f"\tValidation Accuracy: {result['Validation Accuracy']:.4f}")
    print(f"\tTest Accuracy: {result['Test Accuracy']:.4f}")
    print(f"\tCV Accuracy: {result['Cross-Validation Mean']:.4f} (+/- {result['Cross-Validation Std'] * 2:.4f})")
    print(f"\tBest Parameters: {result['Best Parameters']}")

Summary of Results:
SVM:
	Validation Accuracy: 0.2632
	Test Accuracy: 0.2791
	CV Accuracy: 0.2639 (+/- 0.0398)
	Best Parameters: {'C': 1, 'kernel': 'rbf'}
Logistic Regression:
	Validation Accuracy: 0.2650
	Test Accuracy: 0.2751
	CV Accuracy: 0.2767 (+/- 0.0184)
	Best Parameters: {'C': 10, 'solver': 'lbfgs'}
Random Forest:
	Validation Accuracy: 0.2752
	Test Accuracy: 0.2799
	CV Accuracy: 0.2596 (+/- 0.0236)
	Best Parameters: {'max_depth': 10, 'n_estimators': 200}
Decision Tree:
	Validation Accuracy: 0.2274
	Test Accuracy: 0.2456
	CV Accuracy: 0.2382 (+/- 0.0309)
	Best Parameters: {'max_depth': 10, 'min_samples_split': 10}
KNN:
	Validation Accuracy: 0.2274
	Test Accuracy: 0.2448
	CV Accuracy: 0.2438 (+/- 0.0216)
	Best Parameters: {'n_neighbors': 7, 'weights': 'uniform'}
