In [None]:
# Let's add a model with different class handling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

print("Original class distribution in training set:")
print(y_train.value_counts())

# 1. Apply SMOTE (Synthetic Minority Over-sampling Technique).
# fit_resample() returns a new feature matrix and label vector in which the
# minority class has been topped up with synthetic examples.
# Only the training data is resampled here; X_train / y_train themselves are
# left unmodified.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Build a class-weighted random forest behind a standard scaler.
# This model handles imbalance through class_weight and is fitted on the
# original (non-resampled) training data.
pipe_balanced = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "clf",
            RandomForestClassifier(
                random_state=42,
                n_estimators=100,
                class_weight={0: 1, 1: 5},  # Put 5x weight on transit class
                max_depth=4,  # Slightly deeper trees
                min_samples_split=5,  # Reduced from 10
                min_samples_leaf=5,  # Reduced from 10
            ),
        ),
    ]
)

# Fit on the training data, then predict labels for X_test.
pipe_balanced.fit(X_train, y_train)
y_pred_balanced = pipe_balanced.predict(X_test)

print("\nClassification Report with Transit-Weighted Model:")
print(classification_report(y_test, y_pred_balanced))

# Confusion matrix: rows are true labels, columns are predicted labels.
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from
# both y_test and the predictions; without it the matrix would shrink and
# the cm[1, ...] lookups below would raise IndexError.
cm = confusion_matrix(y_test, y_pred_balanced, labels=[0, 1])
print("\nConfusion Matrix:")
print(cm)

# Name the cells used more than once (class 1 is the transit class).
transits_detected = cm[1, 1]  # true transit, predicted transit
false_alerts = cm[0, 1]       # true non-transit, predicted transit

# Calculate the actual number of transits detected
print(f"\nTransits detected: {transits_detected} out of {cm[1, :].sum()} true transits")
print(f"False transit alerts: {false_alerts} out of {cm[0, :].sum()} non-transit samples")

# Precision for just the transit class; falls back to 0 when the model
# predicted no transits at all, avoiding a division by zero.
predicted_transits = transits_detected + false_alerts
transit_precision = transits_detected / predicted_transits if predicted_transits > 0 else 0
print(f"Transit detection precision: {transit_precision:.4f}")