In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
import joblib

# Load data
trip1_potholes = pd.read_csv('trip1_potholes.csv')
trip1_sensors = pd.read_csv('trip1_sensors.csv')
trip2_potholes = pd.read_csv('trip2_potholes.csv')
trip2_sensors = pd.read_csv('trip2_sensors.csv')
trip3_potholes = pd.read_csv('trip3_potholes.csv')
trip3_sensors = pd.read_csv('trip3_sensors.csv')
trip4_potholes = pd.read_csv('trip4_potholes.csv')
trip4_sensors = pd.read_csv('trip4_sensors.csv')
trip5_potholes = pd.read_csv('trip5_potholes.csv')
trip5_sensors = pd.read_csv('trip5_sensors.csv')

# Convert timestamps
for df in [trip1_sensors, trip1_potholes]:
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
for df in [trip2_sensors, trip2_potholes]:
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
for df in [trip3_sensors, trip3_potholes]:
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
for df in [trip4_sensors, trip4_potholes]:
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
for df in [trip5_sensors, trip5_potholes]:
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

# Label pothole events for trip 1
trip1_sensors['pothole'] = 0
for pothole_time in trip1_potholes['timestamp']:
    mask = (trip1_sensors['timestamp'] >= pothole_time - pd.Timedelta(seconds=0.5)) & \
           (trip1_sensors['timestamp'] <= pothole_time + pd.Timedelta(seconds=0.5))
    trip1_sensors.loc[mask, 'pothole'] = 1

# Label pothole events for trip 2
trip2_sensors['pothole'] = 0
for pothole_time in trip2_potholes['timestamp']:
    mask = (trip2_sensors['timestamp'] >= pothole_time - pd.Timedelta(seconds=0.5)) & \
           (trip2_sensors['timestamp'] <= pothole_time + pd.Timedelta(seconds=0.5))
    trip2_sensors.loc[mask, 'pothole'] = 1

# Label pothole events for trip 3
trip3_sensors['pothole'] = 0
for pothole_time in trip3_potholes['timestamp']:
    mask = (trip3_sensors['timestamp'] >= pothole_time - pd.Timedelta(seconds=0.5)) & \
           (trip3_sensors['timestamp'] <= pothole_time + pd.Timedelta(seconds=0.5))
    trip3_sensors.loc[mask, 'pothole'] = 1

# Label pothole events for trip 4
trip4_sensors['pothole'] = 0
for pothole_time in trip4_potholes['timestamp']:
    mask = (trip4_sensors['timestamp'] >= pothole_time - pd.Timedelta(seconds=0.5)) & \
           (trip4_sensors['timestamp'] <= pothole_time + pd.Timedelta(seconds=0.5))
    trip4_sensors.loc[mask, 'pothole'] = 1

# Label pothole events for trip 5
trip5_sensors['pothole'] = 0
for pothole_time in trip5_potholes['timestamp']:
    mask = (trip5_sensors['timestamp'] >= pothole_time - pd.Timedelta(seconds=0.5)) & \
           (trip5_sensors['timestamp'] <= pothole_time + pd.Timedelta(seconds=0.5))
    trip5_sensors.loc[mask, 'pothole'] = 1

# Add trip IDs
trip1_sensors['trip_id'] = 1
trip2_sensors['trip_id'] = 2
trip3_sensors['trip_id'] = 3
trip4_sensors['trip_id'] = 4
trip5_sensors['trip_id'] = 5

# Combine all trips
all_trips = pd.concat([
    trip1_sensors, trip2_sensors, trip3_sensors,
    trip4_sensors, trip5_sensors
], ignore_index=True)

print("Data loaded successfully!")
print(f"Total samples: {len(all_trips)}")
print(f"Missing values:\n{all_trips.isna().sum()}")

# Remove outliers
print("\nRemoving outliers...")
all_trips = all_trips[(all_trips['accelerometerX'] < 10) & (all_trips['accelerometerX'] > -10)]
all_trips = all_trips[(all_trips['accelerometerY'] < 10) & (all_trips['accelerometerY'] > -10)]
all_trips = all_trips[(all_trips['accelerometerZ'] < 10) & (all_trips['accelerometerZ'] > -10)]
all_trips = all_trips[(all_trips['gyroX'] < 10) & (all_trips['gyroX'] > -10)]
all_trips = all_trips[(all_trips['gyroY'] < 10) & (all_trips['gyroY'] > -10)]
all_trips = all_trips[(all_trips['gyroZ'] < 10) & (all_trips['gyroZ'] > -10)]
print(f"Samples after outlier removal: {len(all_trips)}")

# Feature engineering
all_trips['acclermeter_magnitude'] = (all_trips['accelerometerX']**2 + all_trips['accelerometerY']**2 + all_trips['accelerometerZ']**2)**0.5
all_trips['gyro_magnitude'] = (all_trips['gyroX']**2 + all_trips['gyroY']**2 + all_trips['gyroZ']**2)**0.5

features = ['latitude', 'longitude', 'speed', 'accelerometerX',
           'accelerometerY', 'accelerometerZ', 'gyroX', 'gyroY', 'gyroZ',
           'acclermeter_magnitude', 'gyro_magnitude']

# Standardize features
print("\nStandardizing features...")
scaler = StandardScaler()
all_trips[features] = scaler.fit_transform(all_trips[features])

# Check class distribution
print(f"\nOriginal class distribution:\n{all_trips['pothole'].value_counts()}")

# Handle class imbalance
print("\nBalancing classes with RandomOverSampler...")
X = all_trips[features]
y = all_trips['pothole']
ros = RandomOverSampler(random_state=42)
X, y = ros.fit_resample(X, y)
print(f"Balanced class distribution:\n{y.value_counts()}")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(f"\nTrain set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# ==================== ADVANCED MODELS ====================

print("\n" + "="*70)
print("TRAINING ADVANCED MODELS FOR POTHOLE DETECTION")
print("="*70)

# 1. XGBoost
print("\n[1/5] Training XGBoost...")
import xgboost as xgb
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss'
)

# 2. LightGBM
print("[2/5] Training LightGBM...")
import lightgbm as lgb
lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    verbose=-1
)

# 3. CatBoost
print("[3/5] Training CatBoost...")
from catboost import CatBoostClassifier
catboost_model = CatBoostClassifier(
    iterations=100,
    depth=5,
    learning_rate=0.1,
    random_state=42,
    verbose=0
)

# 4. Bagging Classifier
print("[4/5] Training Bagging Classifier...")
bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=10),
    n_estimators=50,
    max_samples=0.8,
    max_features=0.8,
    random_state=42,
    n_jobs=-1
)

# 5. 1D-CNN
print("[5/5] Training 1D-CNN...")
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def create_1d_cnn(input_shape):
    model = keras.Sequential([
        layers.Input(shape=input_shape),
        layers.Reshape((input_shape[0], 1)),
        layers.Conv1D(64, 3, activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(2),
        layers.Conv1D(128, 3, activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

cnn_model = create_1d_cnn((X_train.shape[1],))

# Train and evaluate all models
models = {
    'XGBoost': xgb_model,
    'LightGBM': lgb_model,
    'CatBoost': catboost_model,
    'Bagging': bagging_model,
}

print("\n" + "="*70)
print("MODEL EVALUATION RESULTS")
print("="*70)

results = {}

for model_name, model in models.items():
    print(f"\n{model_name}:")
    print("-" * 50)
    model.fit(X_train, y_train)

    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Metrics
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)

    print(f"  Training Accuracy: {train_acc:.4f}")
    print(f"  Test Accuracy:     {test_acc:.4f}")
    print(f"  Test F1 Score:     {test_f1:.4f}")
    print(f"  Overfitting Gap:   {(train_acc - test_acc):.4f}")

    results[model_name] = {
        'model': model,
        'train_acc': train_acc,
        'test_acc': test_acc,
        'f1': test_f1,
        'predictions': y_pred_test
    }

# Train 1D-CNN
print(f"\n1D-CNN:")
print("-" * 50)
history = cnn_model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=32,
    validation_split=0.2,
    verbose=0
)

# CNN Predictions
y_pred_train_cnn = (cnn_model.predict(X_train, verbose=0) > 0.5).astype(int).flatten()
y_pred_test_cnn = (cnn_model.predict(X_test, verbose=0) > 0.5).astype(int).flatten()

train_acc_cnn = accuracy_score(y_train, y_pred_train_cnn)
test_acc_cnn = accuracy_score(y_test, y_pred_test_cnn)
test_f1_cnn = f1_score(y_test, y_pred_test_cnn)

print(f"  Training Accuracy: {train_acc_cnn:.4f}")
print(f"  Test Accuracy:     {test_acc_cnn:.4f}")
print(f"  Test F1 Score:     {test_f1_cnn:.4f}")
print(f"  Overfitting Gap:   {(train_acc_cnn - test_acc_cnn):.4f}")

results['1D-CNN'] = {
    'model': cnn_model,
    'train_acc': train_acc_cnn,
    'test_acc': test_acc_cnn,
    'f1': test_f1_cnn,
    'predictions': y_pred_test_cnn
}

# Find best model
print("\n" + "="*70)
print("FINAL SUMMARY")
print("="*70)

# Sort by F1 score
sorted_results = sorted(results.items(), key=lambda x: x[1]['f1'], reverse=True)

print("\nModel Rankings (by F1 Score):")
print("-" * 50)
for i, (model_name, metrics) in enumerate(sorted_results, 1):
    print(f"{i}. {model_name:15} - F1: {metrics['f1']:.4f}, Accuracy: {metrics['test_acc']:.4f}")

best_model_name = sorted_results[0][0]
best_metrics = sorted_results[0][1]

print(f"\nüèÜ BEST MODEL: {best_model_name}")
print("-" * 50)
print(f"Test Accuracy:  {best_metrics['test_acc']:.4f}")
print(f"Test F1 Score:  {best_metrics['f1']:.4f}")
print(f"Training Acc:   {best_metrics['train_acc']:.4f}")

# Show confusion matrix for best model
print(f"\nConfusion Matrix ({best_model_name}):")
cm = confusion_matrix(y_test, best_metrics['predictions'])
print(cm)

print(f"\nClassification Report ({best_model_name}):")
print(classification_report(y_test, best_metrics['predictions'], target_names=['No Pothole', 'Pothole']))

# Save the best model
print("\n" + "="*70)
print("SAVING MODELS")
print("="*70)

if best_model_name == '1D-CNN':
    cnn_model.save('best_pothole_detection_model_cnn.h5')
    print(f"‚úì Best model saved as 'best_pothole_detection_model_cnn.h5'")
else:
    joblib.dump(results[best_model_name]['model'], f'best_pothole_detection_model_{best_model_name.lower()}.joblib')
    print(f"‚úì Best model saved as 'best_pothole_detection_model_{best_model_name.lower()}.joblib'")

# Save all models
for model_name, model_data in results.items():
    if model_name != '1D-CNN':
        filename = f'pothole_model_{model_name.lower()}.joblib'
        joblib.dump(model_data['model'], filename)
        print(f"‚úì {model_name} saved as '{filename}'")

# Save the scaler
joblib.dump(scaler, 'scaler.joblib')
print("‚úì Scaler saved as 'scaler.joblib'")

print("\n" + "="*70)
print("TRAINING COMPLETE!")
print("="*70)

Data loaded successfully!
Total samples: 9859
Missing values:
timestamp         0
latitude          0
longitude         0
speed             0
accelerometerX    0
accelerometerY    0
accelerometerZ    0
gyroX             0
gyroY             0
gyroZ             0
pothole           0
trip_id           0
dtype: int64

Removing outliers...
Samples after outlier removal: 9859

Standardizing features...

Original class distribution:
pothole
0    9348
1     511
Name: count, dtype: int64

Balancing classes with RandomOverSampler...
Balanced class distribution:
pothole
0    9348
1    9348
Name: count, dtype: int64

Train set size: 14956
Test set size: 3740

TRAINING ADVANCED MODELS FOR POTHOLE DETECTION

[1/5] Training XGBoost...
[2/5] Training LightGBM...
[3/5] Training CatBoost...
[4/5] Training Bagging Classifier...
[5/5] Training 1D-CNN...

MODEL EVALUATION RESULTS

XGBoost:
--------------------------------------------------
  Training Accuracy: 0.9702
  Test Accuracy:     0.9604
  Test F1 S

In [3]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
