## üì¶ Import Libraries

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import warnings
import time

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV, mutual_info_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# LightGBM
import lightgbm as lgb

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')

print("‚úÖ Libraries loaded successfully!")

‚úÖ Libraries loaded successfully!


## üìÇ Auto-Load Excel Files

In [11]:
# Auto-detect all Excel files in dataset-type-2 folder
excel_files = glob.glob('dataset-type-2/*.xlsx')

print(f"üîç Found {len(excel_files)} Excel files:")
for f in excel_files:
    print(f"  üìÑ {f}")

if len(excel_files) == 0:
    raise FileNotFoundError("‚ùå No Excel files found in dataset-type-2/ folder!")

# Load and combine all Excel files
dfs = []
for file in excel_files:
    try:
        df_temp = pd.read_excel(file)
        dfs.append(df_temp)
        print(f"‚úÖ Loaded {file}: {len(df_temp)} rows, {len(df_temp.columns)} columns")
    except Exception as e:
        print(f"‚ùå Error loading {file}: {e}")

# Combine all dataframes
df_raw = pd.concat(dfs, ignore_index=True)
print(f"\nüìä Total combined data: {len(df_raw):,} rows, {len(df_raw.columns)} columns")
print(f"\nüìã Columns: {list(df_raw.columns)}")
df_raw.head()

üîç Found 6 Excel files:
  üìÑ dataset-type-2\Gelombang (1).xlsx
  üìÑ dataset-type-2\Gelombang (2).xlsx
  üìÑ dataset-type-2\Gelombang (3).xlsx
  üìÑ dataset-type-2\Gelombang (4).xlsx
  üìÑ dataset-type-2\Gelombang (5).xlsx
  üìÑ dataset-type-2\Gelombang (6).xlsx
‚úÖ Loaded dataset-type-2\Gelombang (1).xlsx: 8741 rows, 24 columns
‚úÖ Loaded dataset-type-2\Gelombang (2).xlsx: 8740 rows, 24 columns
‚úÖ Loaded dataset-type-2\Gelombang (3).xlsx: 17548 rows, 24 columns
‚úÖ Loaded dataset-type-2\Gelombang (4).xlsx: 8740 rows, 24 columns
‚úÖ Loaded dataset-type-2\Gelombang (5).xlsx: 8740 rows, 24 columns
‚úÖ Loaded dataset-type-2\Gelombang (6).xlsx: 8740 rows, 24 columns

üìä Total combined data: 61,249 rows, 30 columns

üìã Columns: ['bandar agung_Andrean Syahrezi', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Un

Unnamed: 0,bandar agung_Andrean Syahrezi,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Location:,Teluk betung,Panjang,Kota Jawa,Kota agung,kalianda
0,Location:,bandar agung,,,,,,,,,...,,,,,,,,,,
1,Latitude:,-5.604619,,,,,,,,,...,,,,,,,,,,
2,Longitude:,105.838853,,,,,,,,,...,,,,,,,,,,
3,Time(UTC/GMT),Hsig(m),Hsig(Scale),Hmax(m),Hmax(Scale),WaveDir(deg),WaveDir(compass),PrimSwell(m),PrimSwell(Scale),PrimSwellDir(deg),...,SeaSurfaceSalinity(PSU),WindSpeed(knots),WindDir(deg),WindDir(compass),,,,,,
4,2024-07-29 00:00:00,0.08409,Smooth,0.16817,Smooth,334,NNW,0.07923,Smooth,270,...,31.0715,1.30439,62,ENE,,,,,,


## ü§ñ Auto-Detect Features & Target

In [12]:
# The data has metadata rows at the top, we need to extract the actual data
# Row 3 contains the column names, row 4+ contains the data
# We need to reload the data properly

# Find the header row (row with "Time(UTC/GMT)")
header_row = None
for idx, row in df_raw.iterrows():
    if 'Time(UTC/GMT)' in str(row.values):
        header_row = idx
        break

if header_row is None:
    raise ValueError("‚ùå Could not find header row with 'Time(UTC/GMT)'")

# Extract column names from header row
columns = df_raw.iloc[header_row].values.tolist()
# Extract data starting from the row after header
data = df_raw.iloc[header_row + 1:].values

# Create clean dataframe
df_clean = pd.DataFrame(data, columns=columns)

# Remove 'Time(UTC/GMT)' column and text columns (Scale, compass)
text_cols = ['Time(UTC/GMT)', 'Hsig(Scale)', 'Hmax(Scale)', 'WaveDir(compass)', 
             'PrimSwell(Scale)', 'PrimSwellDir(compass)', 'WindSea(Scale)', 
             'WindSeaDir(compass)', 'SurfCurrentDir(compass)', 'WindDir(compass)']
df_clean = df_clean.drop(columns=[col for col in text_cols if col in df_clean.columns], errors='ignore')

# Convert all remaining columns to numeric
for col in df_clean.columns:
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

# Remove rows with NaN values
df_clean = df_clean.dropna()

# Auto-detect numeric columns
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()

print(f"üî¢ Found {len(numeric_cols)} numeric columns:")
for col in numeric_cols:
    print(f"  ‚Ä¢ {col}")

if len(numeric_cols) < 2:
    raise ValueError("‚ùå Need at least 2 numeric columns (1 target + 1 feature)")

# Auto-assign target (last numeric column) and features (all others)
target_col = numeric_cols[-1]
feature_cols = numeric_cols[:-1]

print(f"\nüéØ Auto-detected Target: {target_col}")
print(f"üìä Auto-detected Features ({len(feature_cols)}): {feature_cols}")

# Extract features and target
df = df_clean[numeric_cols].copy()

print(f"\n‚úÖ Clean data: {len(df):,} rows (after removing NaN)")

# Display basic statistics
print(f"\nüìä Dataset Statistics:")
df.describe()

TypeError: arg must be a list, tuple, 1-d array, or Series

## üéØ Prepare Train/Test Data

In [13]:
# Split features and target
X = df[feature_cols]
y = df[target_col]

# Train/test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

print(f"üì¶ Train set: {len(X_train):,} rows")
print(f"üì¶ Test set: {len(X_test):,} rows")
print(f"\nüìä Target ({target_col}) statistics:")
print(f"  - Mean: {y_train.mean():.4f}")
print(f"  - Std: {y_train.std():.4f}")
print(f"  - Min: {y_train.min():.4f}")
print(f"  - Max: {y_train.max():.4f}")

# Optional: Standardize features for better performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for easier handling
X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_cols)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_cols)

print("\n‚úÖ Data standardized (mean=0, std=1)")

NameError: name 'df' is not defined

## üî¨ Experiment 1: RFECV Feature Selection

In [None]:
print("=" * 60)
print("üî¨ EXPERIMENT 1: RFECV + LightGBM")
print("=" * 60)

# Base estimator for RFECV
base_estimator = lgb.LGBMRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    verbose=-1
)

# RFECV - automatically finds optimal number of features
print("\n‚è≥ Running RFECV... (this may take a few minutes)")
start_time = time.time()

rfecv = RFECV(
    estimator=base_estimator,
    step=1,
    cv=5,  # 5-fold cross-validation
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=0
)

rfecv.fit(X_train_scaled, y_train)
rfecv_time = time.time() - start_time

# Get selected features
rfecv_features = X_train_scaled.columns[rfecv.support_].tolist()
print(f"\n‚úÖ RFECV completed in {rfecv_time:.2f} seconds")
print(f"üéØ Optimal number of features: {rfecv.n_features_}")
print(f"üìä Selected features: {rfecv_features}")

# Transform data
X_train_rfecv = rfecv.transform(X_train_scaled)
X_test_rfecv = rfecv.transform(X_test_scaled)

In [None]:
# Train final model with selected features
print("\nüèãÔ∏è Training LightGBM with RFECV features...")
start_time = time.time()

model_rfecv = lgb.LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    random_state=42,
    verbose=-1
)

model_rfecv.fit(X_train_rfecv, y_train)
train_time_rfecv = time.time() - start_time

# Predictions
y_pred_rfecv = model_rfecv.predict(X_test_rfecv)

# Metrics
rmse_rfecv = np.sqrt(mean_squared_error(y_test, y_pred_rfecv))
mae_rfecv = mean_absolute_error(y_test, y_pred_rfecv)
r2_rfecv = r2_score(y_test, y_pred_rfecv)

print(f"\nüìä RFECV Results:")
print(f"  ‚è±Ô∏è  Total time: {rfecv_time + train_time_rfecv:.2f}s")
print(f"  üéØ Features used: {rfecv.n_features_}")
print(f"  üìâ RMSE: {rmse_rfecv:.4f}")
print(f"  üìâ MAE: {mae_rfecv:.4f}")
print(f"  üìà R¬≤: {r2_rfecv:.4f}")

## üî¨ Experiment 2: Mutual Information Feature Selection

In [None]:
print("=" * 60)
print("üî¨ EXPERIMENT 2: Mutual Information + LightGBM")
print("=" * 60)

# Calculate mutual information scores
print("\n‚è≥ Calculating Mutual Information scores...")
start_time = time.time()

mi_scores = mutual_info_regression(X_train_scaled, y_train, random_state=42, n_jobs=-1)
mi_time = time.time() - start_time

# Create feature importance dataframe
mi_df = pd.DataFrame({
    'feature': X_train_scaled.columns,
    'mi_score': mi_scores
}).sort_values('mi_score', ascending=False)

print(f"\n‚úÖ MI calculation completed in {mi_time:.2f} seconds")
print(f"\nüìä All features ranked by MI score:")
print(mi_df.to_string(index=False))

# Select top k features (same number as RFECV for fair comparison)
top_k = rfecv.n_features_
mi_features = mi_df.head(top_k)['feature'].tolist()

print(f"\nüéØ Selected top {top_k} features: {mi_features}")

# Transform data
X_train_mi = X_train_scaled[mi_features]
X_test_mi = X_test_scaled[mi_features]

In [None]:
# Train final model with MI features
print("\nüèãÔ∏è Training LightGBM with MI features...")
start_time = time.time()

model_mi = lgb.LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    random_state=42,
    verbose=-1
)

model_mi.fit(X_train_mi, y_train)
train_time_mi = time.time() - start_time

# Predictions
y_pred_mi = model_mi.predict(X_test_mi)

# Metrics
rmse_mi = np.sqrt(mean_squared_error(y_test, y_pred_mi))
mae_mi = mean_absolute_error(y_test, y_pred_mi)
r2_mi = r2_score(y_test, y_pred_mi)

print(f"\nüìä Mutual Information Results:")
print(f"  ‚è±Ô∏è  Total time: {mi_time + train_time_mi:.2f}s")
print(f"  üéØ Features used: {top_k}")
print(f"  üìâ RMSE: {rmse_mi:.4f}")
print(f"  üìâ MAE: {mae_mi:.4f}")
print(f"  üìà R¬≤: {r2_mi:.4f}")

## üìä Comparison & Results

In [None]:
# Create comparison dataframe
results = pd.DataFrame({
    'Method': ['RFECV', 'Mutual Information'],
    'RMSE': [rmse_rfecv, rmse_mi],
    'MAE': [mae_rfecv, mae_mi],
    'R2': [r2_rfecv, r2_mi],
    'Time (s)': [rfecv_time + train_time_rfecv, mi_time + train_time_mi],
    'Num Features': [rfecv.n_features_, top_k]
})

print("\n" + "="*80)
print("üìä FINAL COMPARISON: RFECV vs Mutual Information")
print("="*80)
print(results.to_string(index=False))

# Determine winner
print("\nüèÜ WINNER ANALYSIS:")
if rmse_rfecv < rmse_mi:
    print(f"  ü•á RFECV wins on RMSE ({rmse_rfecv:.4f} < {rmse_mi:.4f})")
    diff = ((rmse_mi - rmse_rfecv) / rmse_mi) * 100
    print(f"     ‚Üí {diff:.2f}% better than MI")
else:
    print(f"  ü•á Mutual Information wins on RMSE ({rmse_mi:.4f} < {rmse_rfecv:.4f})")
    diff = ((rmse_rfecv - rmse_mi) / rmse_rfecv) * 100
    print(f"     ‚Üí {diff:.2f}% better than RFECV")

if r2_rfecv > r2_mi:
    print(f"  ü•á RFECV wins on R¬≤ ({r2_rfecv:.4f} > {r2_mi:.4f})")
else:
    print(f"  ü•á Mutual Information wins on R¬≤ ({r2_mi:.4f} > {r2_rfecv:.4f})")

if (rfecv_time + train_time_rfecv) < (mi_time + train_time_mi):
    print(f"  ‚ö° RFECV is faster ({rfecv_time + train_time_rfecv:.2f}s < {mi_time + train_time_mi:.2f}s)")
else:
    ratio = (rfecv_time + train_time_rfecv) / (mi_time + train_time_mi)
    print(f"  ‚ö° Mutual Information is faster ({mi_time + train_time_mi:.2f}s < {rfecv_time + train_time_rfecv:.2f}s)")
    print(f"     ‚Üí {ratio:.1f}x faster than RFECV")

# Save results
results.to_csv('results_dataset2_wave.csv', index=False)
print("\nüíæ Results saved to: results_dataset2_wave.csv")

## üìà Visualization

In [None]:
# Create comparison plots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle(f'Dataset 2: Feature Selection Comparison (Target: {target_col})', 
             fontsize=16, fontweight='bold')

methods = ['RFECV', 'MI']
colors = ['#9b59b6', '#f39c12']

# Plot 1: RMSE
ax1 = axes[0, 0]
ax1.bar(methods, [rmse_rfecv, rmse_mi], color=colors, alpha=0.7, edgecolor='black')
ax1.set_ylabel('RMSE', fontsize=12, fontweight='bold')
ax1.set_title('Root Mean Squared Error (Lower is Better)', fontsize=12)
ax1.grid(axis='y', alpha=0.3)
for i, v in enumerate([rmse_rfecv, rmse_mi]):
    ax1.text(i, v, f'{v:.4f}', ha='center', va='bottom', fontweight='bold')

# Plot 2: R¬≤
ax2 = axes[0, 1]
ax2.bar(methods, [r2_rfecv, r2_mi], color=colors, alpha=0.7, edgecolor='black')
ax2.set_ylabel('R¬≤ Score', fontsize=12, fontweight='bold')
ax2.set_title('R¬≤ Score (Higher is Better)', fontsize=12)
ax2.grid(axis='y', alpha=0.3)
ax2.set_ylim([min(r2_rfecv, r2_mi) * 0.95, max(r2_rfecv, r2_mi) * 1.05])
for i, v in enumerate([r2_rfecv, r2_mi]):
    ax2.text(i, v, f'{v:.4f}', ha='center', va='bottom', fontweight='bold')

# Plot 3: Training Time
ax3 = axes[1, 0]
times = [rfecv_time + train_time_rfecv, mi_time + train_time_mi]
ax3.bar(methods, times, color=colors, alpha=0.7, edgecolor='black')
ax3.set_ylabel('Time (seconds)', fontsize=12, fontweight='bold')
ax3.set_title('Total Processing Time (Lower is Better)', fontsize=12)
ax3.grid(axis='y', alpha=0.3)
for i, v in enumerate(times):
    ax3.text(i, v, f'{v:.2f}s', ha='center', va='bottom', fontweight='bold')

# Plot 4: MAE
ax4 = axes[1, 1]
ax4.bar(methods, [mae_rfecv, mae_mi], color=colors, alpha=0.7, edgecolor='black')
ax4.set_ylabel('MAE', fontsize=12, fontweight='bold')
ax4.set_title('Mean Absolute Error (Lower is Better)', fontsize=12)
ax4.grid(axis='y', alpha=0.3)
for i, v in enumerate([mae_rfecv, mae_mi]):
    ax4.text(i, v, f'{v:.4f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('comparison_dataset2_wave.png', dpi=300, bbox_inches='tight')
print("\nüìä Visualization saved to: comparison_dataset2_wave.png")
plt.show()

## üîç Feature Importance Analysis

In [None]:
# Compare selected features
print("\nüìä FEATURE SELECTION COMPARISON:")
print("\nüîπ RFECV Selected Features:")
for i, feat in enumerate(rfecv_features, 1):
    print(f"  {i}. {feat}")

print("\nüîπ Mutual Information Top Features:")
for i, row in mi_df.head(top_k).iterrows():
    print(f"  {i+1}. {row['feature']} (score: {row['mi_score']:.4f})")

# Find common features
common_features = set(rfecv_features) & set(mi_features)
print(f"\nü§ù Common Features ({len(common_features)}):")
for feat in common_features:
    print(f"  ‚úì {feat}")

# Unique features
rfecv_only = set(rfecv_features) - set(mi_features)
mi_only = set(mi_features) - set(rfecv_features)

if rfecv_only:
    print(f"\nüîπ RFECV-only features ({len(rfecv_only)}):")
    for feat in rfecv_only:
        print(f"  ‚Ä¢ {feat}")
        
if mi_only:
    print(f"\nüîπ MI-only features ({len(mi_only)}):")
    for feat in mi_only:
        print(f"  ‚Ä¢ {feat}")

## üìà Prediction Visualization

In [None]:
# Plot actual vs predicted
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle(f'Actual vs Predicted: {target_col}', fontsize=16, fontweight='bold')

# RFECV
ax1 = axes[0]
ax1.scatter(y_test, y_pred_rfecv, alpha=0.5, color='#9b59b6', edgecolors='black', linewidth=0.5)
ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2, label='Perfect prediction')
ax1.set_xlabel('Actual', fontsize=12, fontweight='bold')
ax1.set_ylabel('Predicted', fontsize=12, fontweight='bold')
ax1.set_title(f'RFECV (R¬≤={r2_rfecv:.4f})', fontsize=12)
ax1.legend()
ax1.grid(alpha=0.3)

# Mutual Information
ax2 = axes[1]
ax2.scatter(y_test, y_pred_mi, alpha=0.5, color='#f39c12', edgecolors='black', linewidth=0.5)
ax2.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2, label='Perfect prediction')
ax2.set_xlabel('Actual', fontsize=12, fontweight='bold')
ax2.set_ylabel('Predicted', fontsize=12, fontweight='bold')
ax2.set_title(f'Mutual Information (R¬≤={r2_mi:.4f})', fontsize=12)
ax2.legend()
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('predictions_dataset2_wave.png', dpi=300, bbox_inches='tight')
print("\nüìä Prediction plot saved to: predictions_dataset2_wave.png")
plt.show()

## üìù Conclusion & Recommendations

In [None]:
print("\n" + "="*80)
print("üìù KESIMPULAN & REKOMENDASI - DATASET 2 (WAVE)")
print("="*80)

# Calculate percentage differences
rmse_diff = abs(rmse_rfecv - rmse_mi) / min(rmse_rfecv, rmse_mi) * 100
r2_diff = abs(r2_rfecv - r2_mi) / max(r2_rfecv, r2_mi) * 100
time_diff = abs((rfecv_time + train_time_rfecv) - (mi_time + train_time_mi)) / min(rfecv_time + train_time_rfecv, mi_time + train_time_mi) * 100

print(f"\nüìä Performance Differences:")
print(f"  ‚Ä¢ RMSE difference: {rmse_diff:.2f}%")
print(f"  ‚Ä¢ R¬≤ difference: {r2_diff:.2f}%")
print(f"  ‚Ä¢ Time difference: {time_diff:.2f}%")

print(f"\nüéØ Dataset Characteristics:")
print(f"  ‚Ä¢ Target variable: {target_col}")
print(f"  ‚Ä¢ Total features available: {len(feature_cols)}")
print(f"  ‚Ä¢ Features selected: {top_k}")
print(f"  ‚Ä¢ Selection rate: {(top_k/len(feature_cols)*100):.1f}%")

print(f"\nüèÜ Recommendations:")

# Overall winner
if rmse_rfecv < rmse_mi and r2_rfecv > r2_mi:
    print("  ‚úÖ RFECV is CLEARLY BETTER for this dataset")
    print("     ‚Üí Better accuracy on both RMSE and R¬≤")
    print("     ‚Üí Use RFECV for final model deployment")
elif rmse_mi < rmse_rfecv and r2_mi > r2_rfecv:
    print("  ‚úÖ Mutual Information is CLEARLY BETTER for this dataset")
    print("     ‚Üí Better accuracy on both RMSE and R¬≤")
    print("     ‚Üí Use MI for final model deployment")
else:
    print("  ‚öñÔ∏è Mixed results - choose based on priority:")
    if rmse_rfecv < rmse_mi:
        print("     ‚Üí RFECV better for minimizing prediction errors (RMSE)")
    else:
        print("     ‚Üí MI better for minimizing prediction errors (RMSE)")
    if r2_rfecv > r2_mi:
        print("     ‚Üí RFECV better for explaining variance (R¬≤)")
    else:
        print("     ‚Üí MI better for explaining variance (R¬≤)")

# Speed consideration
if (mi_time + train_time_mi) < (rfecv_time + train_time_rfecv):
    speed_ratio = (rfecv_time + train_time_rfecv) / (mi_time + train_time_mi)
    print(f"\n  ‚ö° Speed Advantage: MI is {speed_ratio:.1f}x FASTER")
    if speed_ratio > 2:
        print("     ‚Üí Consider MI if speed is critical")

# Feature agreement
agreement = len(common_features) / top_k * 100
print(f"\n  ü§ù Feature Agreement: {agreement:.1f}%")
if agreement > 70:
    print("     ‚Üí Both methods largely agree on important features")
elif agreement > 40:
    print("     ‚Üí Moderate agreement - features have different importance perspectives")
else:
    print("     ‚Üí Low agreement - methods see importance very differently")

print(f"\nüí° Best Practice for Wave Data:")
print("  1Ô∏è‚É£ Start with MI for quick feature exploration")
print("  2Ô∏è‚É£ Validate with RFECV for robust selection")
print("  3Ô∏è‚É£ Use ensemble of both selections for maximum robustness")
print("  4Ô∏è‚É£ Consider domain knowledge to validate selected features")

print("\n‚úÖ Analysis Complete for Dataset 2!")