In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Data Augmentation
from imblearn.over_sampling import SMOTE
from scipy import stats
from sklearn.neighbors import NearestNeighbors

# Model Persistence
import joblib
import pickle


In [2]:
try:
    df = pd.read_excel("Book1.xlsx")
    print(f"✅ Data berhasil dimuat: {len(df)} baris, {len(df.columns)} kolom")
except FileNotFoundError:
    print("❌ File tidak ditemukan")

    np.random.seed(42)

✅ Data berhasil dimuat: 68 baris, 9 kolom


In [3]:
print(f"Data awal: {len(df)} baris")
print(f"Columns: {list(df.columns)}")

Data awal: 68 baris
Columns: ['Tanggal', 'Jam_Mulai', 'Durasi_Menit', 'Elevasi', 'Jarak', 'Kec_Rata_Rata', 'Kec_Maksimal', 'Curah_Hujan', 'Jam_Tidur']


</H1> DATA CLEANING DAN FEATURE ENGINEERING </H1>

In [4]:
# Handling anomali curah hujan (888.0 mm)
print(f"\nAnomali curah hujan ditemukan: {df['Curah_Hujan'].max()} mm")
df.loc[df['Curah_Hujan'] == 888.0, 'Curah_Hujan'] = df['Curah_Hujan'].median()
print(f"Diganti dengan median: {df['Curah_Hujan'].median()} mm")


Anomali curah hujan ditemukan: 888.0 mm
Diganti dengan median: 0.05 mm


In [5]:
# Convert datetime features
df['Tanggal'] = pd.to_datetime(df['Tanggal'], format='%d/%m/%Y')
df['Jam_Mulai_Datetime'] = pd.to_datetime(df['Jam_Mulai'], format='%H:%M:%S').dt.time


In [6]:
# Extract time-based features
df['Hour'] = df['Jam_Mulai'].apply(lambda t: t.hour)
df['Is_Morning'] = (df['Hour'] < 12).astype(int)
df['Day_of_Week'] = df['Tanggal'].dt.dayofweek
df['Is_Weekend'] = (df['Day_of_Week'].isin([5, 6])).astype(int)

In [7]:
# Create categorical features
df['Time_Category'] = df['Hour'].apply(lambda x: 'Early_Morning' if x < 8 
                                       else 'Morning' if x < 12 
                                       else 'Afternoon' if x < 17 
                                       else 'Evening')

In [8]:
# Elevation categories
df['Elevation_Category'] = pd.cut(df['Elevasi'], 
                                  bins=[0, 150, 250, 400, float('inf')],
                                  labels=['Flat', 'Rolling', 'Hilly', 'Mountainous'])

In [9]:
# Rain categories  
df['Rain_Category'] = pd.cut(df['Curah_Hujan'],
                             bins=[-0.1, 0, 10, 30, float('inf')],
                             labels=['No_Rain', 'Light', 'Moderate', 'Heavy'])

In [10]:
# Distance categories
df['Distance_Category'] = pd.cut(df['Jarak'],
                                 bins=[0, 20, 30, 40, float('inf')],
                                 labels=['Short', 'Medium', 'Long', 'Ultra'])

In [11]:
# Sleep quality categories
df['Sleep_Quality'] = pd.cut(df['Jam_Tidur'],
                             bins=[0, 4, 6, 8, float('inf')],
                             labels=['Poor', 'Moderate', 'Good', 'Excellent'])

In [12]:
# Advanced feature engineering
df['Speed_per_Elevation'] = df['Kec_Rata_Rata'] / (df['Elevasi'] + 1)
df['Distance_per_Duration'] = df['Jarak'] / df['Durasi_Menit']
df['Elevation_per_Distance'] = df['Elevasi'] / df['Jarak']
df['Rest_Factor'] = df['Jam_Tidur'] / df['Durasi_Menit'] * 100

In [14]:
# Weather impact factor
df['Weather_Impact'] = df['Curah_Hujan'].apply(lambda x: 1 if x == 0 else 0.8 if x <= 10 else 0.6 if x <= 30 else 0.4)

print(f"\nFeature engineering completed. New features:")
new_features = [col for col in df.columns if col not in df.keys()]
for feature in new_features:
    print(f"  - {feature}")


Feature engineering completed. New features:


</H2> EXPLORATORY DATA ANALYSIS </H2>

In [15]:
# Basic statistics
print("\nStatistik Dasar Target Variable (Kec_Rata_Rata):")
print(f"Mean: {df['Kec_Rata_Rata'].mean():.2f} km/h")
print(f"Std: {df['Kec_Rata_Rata'].std():.2f} km/h")
print(f"Min: {df['Kec_Rata_Rata'].min():.2f} km/h")
print(f"Max: {df['Kec_Rata_Rata'].max():.2f} km/h")


Statistik Dasar Target Variable (Kec_Rata_Rata):
Mean: 21.36 km/h
Std: 3.63 km/h
Min: 15.20 km/h
Max: 38.50 km/h


In [16]:
# Correlation analysis
numeric_features = ['Elevasi', 'Jarak', 'Curah_Hujan', 'Jam_Tidur', 'Hour', 'Day_of_Week',
                    'Speed_per_Elevation', 'Distance_per_Duration', 'Elevation_per_Distance', 
                    'Rest_Factor', 'Weather_Impact']

print(f"\nKorelasi dengan target variable:")
correlations = df[numeric_features + ['Kec_Rata_Rata']].corr()['Kec_Rata_Rata'].sort_values(key=abs, ascending=False)
for feature, corr in correlations.items():
    if feature != 'Kec_Rata_Rata':
        print(f"  {feature}: {corr:.3f}")


Korelasi dengan target variable:
  Jarak: 0.435
  Elevasi: 0.373
  Distance_per_Duration: 0.325
  Hour: -0.211
  Jam_Tidur: 0.172
  Elevation_per_Distance: 0.108
  Weather_Impact: -0.085
  Rest_Factor: 0.041
  Curah_Hujan: 0.036
  Day_of_Week: 0.029
  Speed_per_Elevation: 0.008


DATA PREPARATION

In [17]:
# Select features for modeling
categorical_features = ['Time_Category', 'Elevation_Category', 'Rain_Category', 
                       'Distance_Category', 'Sleep_Quality']

In [18]:
# Encode categorical variables
le_encoders = {}
for cat_feature in categorical_features:
    le = LabelEncoder()
    df[f'{cat_feature}_encoded'] = le.fit_transform(df[cat_feature].astype(str))
    le_encoders[cat_feature] = le

In [19]:
# Final feature selection
feature_columns = (numeric_features + 
                  [f'{cat}_encoded' for cat in categorical_features] +
                  ['Is_Morning', 'Is_Weekend'])

X = df[feature_columns].copy()
y = df['Kec_Rata_Rata'].copy()

print(f"Features selected: {len(feature_columns)}")
print(f"Dataset shape: {X.shape}")

Features selected: 18
Dataset shape: (68, 18)


DATA AUGMENTATION

In [20]:
class AdvancedDataAugmentation:
    def __init__(self, X, y, random_state=42):
        self.X = X
        self.y = y
        self.random_state = random_state
        np.random.seed(random_state)
    
    def gaussian_noise_augmentation(self, noise_factor=0.1, n_samples=50):
        """ Gaussian noise untuk sample baru"""
        X_noise = []
        y_noise = []
        
        for _ in range(n_samples):
            idx = np.random.randint(0, len(self.X))
            sample = self.X.iloc[idx].values.copy()
            target = self.y.iloc[idx]
            
            # menambah noise untuk fitur kontinu
            continuous_features = ['Elevasi', 'Jarak', 'Curah_Hujan', 'Jam_Tidur', 'Hour',
                                 'Speed_per_Elevation', 'Distance_per_Duration', 
                                 'Elevation_per_Distance', 'Rest_Factor', 'Weather_Impact']
            
            for i, feature in enumerate(self.X.columns):
                if feature in continuous_features:
                    noise = np.random.normal(0, noise_factor * np.std(self.X[feature]))
                    sample[i] += noise
                    sample[i] = max(0, sample[i])  # memastikan non-negative values
            
            X_noise.append(sample)
            # memastikan target
            y_adjustment = np.random.normal(0, 0.5)
            y_noise.append(max(10, target + y_adjustment))  # Min speed 10 km/h
        
        return np.array(X_noise), np.array(y_noise)
    
    def interpolation_augmentation(self, n_samples=30):
        """Create new samples by interpolating between existing ones"""
        X_interp = []
        y_interp = []
        
        for _ in range(n_samples):
            idx1, idx2 = np.random.choice(len(self.X), 2, replace=False)
            alpha = np.random.uniform(0.2, 0.8)
            
            sample1 = self.X.iloc[idx1].values
            sample2 = self.X.iloc[idx2].values
            target1 = self.y.iloc[idx1]
            target2 = self.y.iloc[idx2]
            
            # Interpolate
            new_sample = alpha * sample1 + (1 - alpha) * sample2
            new_target = alpha * target1 + (1 - alpha) * target2
            
            X_interp.append(new_sample)
            y_interp.append(new_target)
        
        return np.array(X_interp), np.array(y_interp)
    
    def seasonal_variation_augmentation(self, n_samples=20):
        """ Membuat cariasi berdasarkan musim/pola cuaca"""
        X_seasonal = []
        y_seasonal = []
        
        for _ in range(n_samples):
            idx = np.random.randint(0, len(self.X))
            sample = self.X.iloc[idx].values.copy()
            target = self.y.iloc[idx]
            
            # Simulasi cuaca
            weather_variation = np.random.choice(['sunny', 'windy', 'humid'])
            
            if weather_variation == 'windy':
                target *= np.random.uniform(0.95, 1.0)
            elif weather_variation == 'humid':
                target *= np.random.uniform(0.9, 0.98)
            else: 
                target *= np.random.uniform(1.0, 1.05)
            
            X_seasonal.append(sample)
            y_seasonal.append(target)
        
        return np.array(X_seasonal), np.array(y_seasonal)

# Apply augmentation
augmenter = AdvancedDataAugmentation(X, y)

In [21]:
# Generate augmented data
X_noise, y_noise = augmenter.gaussian_noise_augmentation(n_samples=60)
X_interp, y_interp = augmenter.interpolation_augmentation(n_samples=40)
X_seasonal, y_seasonal = augmenter.seasonal_variation_augmentation(n_samples=30)

In [22]:
# Combine all data
X_augmented = np.vstack([X.values, X_noise, X_interp, X_seasonal])
y_augmented = np.hstack([y.values, y_noise, y_interp, y_seasonal])

print(f"Original data: {len(X)} samples")
print(f"Augmented data: {len(X_augmented)} samples")
print(f"Augmentation ratio: {len(X_augmented)/len(X):.1f}x")

# Convert kembali ke df untuk consistency
X_augmented = pd.DataFrame(X_augmented, columns=X.columns)
y_augmented = pd.Series(y_augmented)

Original data: 68 samples
Augmented data: 198 samples
Augmentation ratio: 2.9x


MODEL TRAINING DAN OPTIMIZATION

In [23]:

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_augmented, y_augmented, test_size=0.2, random_state=42, stratify=None
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

Training set: 158 samples
Test set: 40 samples


In [24]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
# XGBoost parameter tuning
print("\nOptimizing XGBoost hyperparameters...")

# Initial XGBoost model
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1)

# GridSearch
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}



Optimizing XGBoost hyperparameters...


In [26]:
# GridSearch with cross-validation
grid_search = GridSearchCV(
    xgb_model, param_grid, cv=5, scoring='neg_mean_absolute_error',
    n_jobs=-1, verbose=1
)

grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters found:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

# Train final model with best parameters
final_model = grid_search.best_estimator_

Fitting 5 folds for each of 2187 candidates, totalling 10935 fits
Best parameters found:
  colsample_bytree: 0.8
  learning_rate: 0.2
  max_depth: 3
  n_estimators: 300
  reg_alpha: 0.1
  reg_lambda: 0.1
  subsample: 0.8


MODEL EVALUATION

In [27]:
# Predictions
y_pred_train = final_model.predict(X_train_scaled)
y_pred_test = final_model.predict(X_test_scaled)


In [28]:
# Calculate metrics
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

In [29]:
print(f"Training Performance:")
print(f"  MAE: {train_mae:.3f} km/h")
print(f"  R²: {train_r2:.3f}")
print(f"  RMSE: {train_rmse:.3f} km/h")

print(f"\nTest Performance:")
print(f"  MAE: {test_mae:.3f} km/h")
print(f"  R²: {test_r2:.3f}")
print(f"  RMSE: {test_rmse:.3f} km/h")

Training Performance:
  MAE: 0.128 km/h
  R²: 0.995
  RMSE: 0.240 km/h

Test Performance:
  MAE: 0.993 km/h
  R²: 0.800
  RMSE: 1.240 km/h


In [30]:
# Cross-validation
cv_scores = cross_val_score(final_model, X_train_scaled, y_train, 
                           cv=5, scoring='neg_mean_absolute_error')
print(f"\nCross-validation MAE: {-cv_scores.mean():.3f} ± {cv_scores.std():.3f}")


Cross-validation MAE: 1.078 ± 0.318


In [31]:
# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': final_model.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\nTop 10 Most Important Features:")
for i, (_, row) in enumerate(feature_importance.head(10).iterrows()):
    print(f"  {i+1}. {row['feature']}: {row['importance']:.3f}")


Top 10 Most Important Features:
  1. Jarak: 0.182
  2. Is_Morning: 0.137
  3. Elevasi: 0.129
  4. Speed_per_Elevation: 0.111
  5. Elevation_per_Distance: 0.094
  6. Distance_per_Duration: 0.084
  7. Rain_Category_encoded: 0.057
  8. Sleep_Quality_encoded: 0.046
  9. Weather_Impact: 0.027
  10. Elevation_Category_encoded: 0.023


MODEL PERSISTENCE

In [32]:

# Create model package
model_package = {
    'model': final_model,
    'scaler': scaler,
    'label_encoders': le_encoders,
    'feature_columns': feature_columns,
    'feature_importance': feature_importance,
    'model_performance': {
        'test_mae': test_mae,
        'test_r2': test_r2,
        'test_rmse': test_rmse,
        'cv_mae_mean': -cv_scores.mean(),
        'cv_mae_std': cv_scores.std()
    },
    'training_info': {
        'original_samples': len(X),
        'augmented_samples': len(X_augmented),
        'best_params': grid_search.best_params_,
        'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }
}

In [33]:
# Save model
joblib.dump(model_package, 'cycling_speed_prediction_model_v2.joblib')
print("Model saved as 'cycling_speed_prediction_model_v2.joblib'")

Model saved as 'cycling_speed_prediction_model_v2.joblib'
