# Ship Hull Resistance Prediction - Interactive Analysis

Complete ML pipeline with EDA, training, and evaluation.

**Dataset**: calm_water_resistance_data_CORRECTED-1.csv  
**Task**: Regression (predicting Rt_N)  
**Random Seed**: 42

In [None]:
# Setup
import sys
sys.path.append('../scripts')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import joblib
import warnings
warnings.filterwarnings('ignore')

from utils import RANDOM_STATE, load_dataset, preprocess_data

# Plot defaults for the notebook: seaborn 'whitegrid' theme, 100-dpi figures.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.dpi': 100})
print('✓ Setup complete')

In [None]:
# Load Data
# load_dataset comes from the project-local utils module; it is handed the CSV
# path and the target column name, and its two return values are unpacked here.
df, target_col = load_dataset(
    '../data/calm_water_resistance_data_CORRECTED-1.csv',
    'Rt_N',
)
print(f'\nDataset shape: {df.shape}\nTarget column: {target_col}')
# Final expression of the cell, so the notebook renders the first rows.
df.head()

In [None]:
# EDA - Dataset Info
# Prints a structural summary and per-column missing-value counts, then leaves
# the descriptive statistics as the cell's displayed value.
print('Dataset Info:')
print('=' * 50)
# pandas' DataFrame.info() writes its report to stdout itself and returns
# None, so it is called directly; wrapping it in print() appended a stray
# "None" line to the output.
df.info()
print('\nMissing values:')
print(df.isnull().sum())
print('\nBasic statistics:')
df.describe()

In [None]:
# EDA - Target Distribution
# Histogram of the target column, followed by its summary statistics.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df[target_col], bins=30, color='steelblue', edgecolor='black', alpha=0.7)
ax.set_xlabel('Total Resistance (N)', fontsize=12, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=12)
# The title names the column that is actually plotted instead of repeating a
# hard-coded 'Rt_N', so it stays correct if load_dataset returns another name.
ax.set_title(f'Distribution of Target Variable ({target_col})', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Plain literal: nothing is interpolated here, so no f-prefix is needed.
print('Target statistics:')
print(df[target_col].describe())

In [None]:
# EDA - Hull Types
# Bar chart of how many rows carry each distinct value of the 'hull' column.
hull_counts = df['hull'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
hull_counts.plot(ax=ax, kind='bar', edgecolor='black', color='coral')
ax.set_title('Samples per Hull Type', fontsize=14, fontweight='bold')
ax.set(xlabel='Hull Type', ylabel='Count')
ax.grid(axis='y', alpha=0.3)
# Tilt the category labels on the current axes.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Preprocess
# preprocess_data is a project-local helper (utils). It is called with
# test_size=0.2 (presumably the held-out fraction - confirm in utils) and its
# five return values are unpacked in order.
X_train, X_test, y_train, y_test, feature_names = preprocess_data(
    df, target_col, test_size=0.2
)
for split_label, split_features in (('Training', X_train), ('Test', X_test)):
    print(f'{split_label} features: {split_features.shape}')
print(f'Feature names: {feature_names}')

In [None]:
# Train Models
# Three regressors are fitted on the training split. Both tree ensembles are
# built from the same keyword arguments so their settings cannot diverge.
ensemble_kwargs = dict(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(**ensemble_kwargs),
    'XGBoost': xgb.XGBRegressor(**ensemble_kwargs),
}

print('Training models...')
for name, model in models.items():
    # fit() trains the estimator in place; the dict keeps the fitted object.
    model.fit(X_train, y_train)
    print(f'  ✓ {name} trained')

In [None]:
# Cross-Validation
# 5-fold shuffled CV on the training split. The scorer reports negated MSE
# per fold, so each score is sign-flipped and square-rooted into an RMSE.
kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_results = {}

for name, model in models.items():
    cv_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=kfold,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    cv_rmse = np.sqrt(-cv_scores)
    cv_results[name] = cv_rmse
    fold_mean, fold_std = cv_rmse.mean(), cv_rmse.std()
    print(f'{name}: CV RMSE = {fold_mean:.2f} ± {fold_std:.2f}')

In [None]:
# Test Set Evaluation
# Scores every fitted model on the held-out split and stores the metrics plus
# the raw predictions in `results`, keyed by model name.
print('\nTest Set Performance:\n' + '=' * 60)

results = {}
for name, model in models.items():
    y_pred = model.predict(X_test)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = dict(RMSE=rmse, MAE=mae, R2=r2, predictions=y_pred)

    # One write per model; the trailing '\n' produces the blank separator line.
    print(f'{name}:\n  RMSE: {rmse:.2f} N\n  MAE:  {mae:.2f} N\n  R²:   {r2:.4f}\n')

In [None]:
# Results Table
# One row per model with its test-set metrics, shown sorted by ascending RMSE
# so the best-scoring model comes first.
results_df = pd.DataFrame({
    'Model': list(results),
    'RMSE (N)': [metrics['RMSE'] for metrics in results.values()],
    'MAE (N)': [metrics['MAE'] for metrics in results.values()],
    'R²': [metrics['R2'] for metrics in results.values()],
})
# Sort by the column's real label: the frame has 'RMSE (N)', and sorting by
# the non-existent 'RMSE' raised KeyError.
results_df.sort_values('RMSE (N)')

In [None]:
# Predicted vs Actual
# Scatter of test-set predictions against true values for the model with the
# lowest test RMSE; the dashed diagonal marks where prediction equals truth.
best_model_name = min(results, key=lambda k: results[k]['RMSE'])
best_entry = results[best_model_name]
best_predictions = best_entry['predictions']
r2, rmse = best_entry['R2'], best_entry['RMSE']

fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    y_test, best_predictions,
    c=y_test, cmap='viridis', s=100, alpha=0.6,
    edgecolors='black', linewidth=0.5,
)

# The diagonal spans the combined range of actual and predicted values.
min_val = min(y_test.min(), best_predictions.min())
max_val = max(y_test.max(), best_predictions.max())
diagonal = [min_val, max_val]
ax.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')

# Metric box anchored to the top-left corner in axes coordinates.
stats_text = f'R² = {r2:.4f}\nRMSE = {rmse:.2f} N'
ax.text(
    0.05, 0.95, stats_text,
    transform=ax.transAxes, fontsize=12, verticalalignment='top',
    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8),
)

ax.set_title(f'Best Model: {best_model_name}', fontsize=16, fontweight='bold')
ax.set_xlabel('Actual Resistance (N)', fontsize=14, fontweight='bold')
ax.set_ylabel('Predicted Resistance (N)', fontsize=14, fontweight='bold')
ax.legend(fontsize=12)
ax.grid(alpha=0.3)
plt.colorbar(scatter, ax=ax, label='Actual (N)')
plt.tight_layout()
plt.show()

In [None]:
# Feature Importance
# Horizontal bar chart of the Random Forest's largest feature importances,
# most important feature at the top.
top_n = 15  # maximum number of features to display
rf_model = models['Random Forest']
importances = rf_model.feature_importances_
# argsort is ascending, so reverse it and keep at most top_n positions.
indices = np.argsort(importances)[::-1][:top_n]

fig, ax = plt.subplots(figsize=(10, 8))
y_pos = np.arange(len(indices))
ax.barh(y_pos, importances[indices], color='teal', edgecolor='black')
ax.set_yticks(y_pos)
ax.set_yticklabels([feature_names[i] for i in indices])
# barh draws the first entry at the bottom; invert so rank 1 is on top.
ax.invert_yaxis()
ax.set_xlabel('Importance', fontsize=14, fontweight='bold')
# Report the number of bars actually drawn: with fewer than top_n features
# the slice is shorter, and a fixed "Top 15" title would be wrong.
ax.set_title(f'Top {len(indices)} Features - Random Forest', fontsize=16, fontweight='bold')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Summary
# Recaps the run from values computed earlier in the notebook. The sample
# count and the seed are read from `df` and RANDOM_STATE instead of being
# hard-coded, so the text cannot drift from the data or the configuration.
print('=' * 60)
print('SUMMARY')
print('=' * 60)
# len(df) is the row count of the loaded frame (the same frame whose shape is
# printed right after loading).
print(f'✓ Dataset: {len(df)} samples, {len(feature_names)} features')
print(f'✓ Best Model: {best_model_name}')
print(f'✓ Test RMSE: {results[best_model_name]["RMSE"]:.2f} N')
print(f'✓ Test R²: {results[best_model_name]["R2"]:.4f}')
print(f'✓ Random Seed: {RANDOM_STATE}')
print('=' * 60)
print(f'\n🎉 Analysis complete! All outputs reproducible with seed {RANDOM_STATE}.')