In [16]:
import pandas as pd
import numpy as np

df = pd.read_csv('/Users/balmukundmishra/Desktop/2025-Learning/ML-Course/9_Problem_Statements/data/common.csv')


In [20]:
from datetime import datetime, timedelta


print("Original DataFrame Head:")
print(df.head())
print("\nOriginal DataFrame Info:")
df.info()
print("\nLeak Distribution:")
print(df['isleak'].value_counts())

# --- 1. Create a Proper Datetime Index ---
# A common year is needed to parse full dates. Let's pick an arbitrary year.
# Ensure consistency if your real data spans multiple years.
current_year = datetime.now().year # Or a fixed year like 2024
df['timestamp_str'] = df['day'] + ' ' + df['time'] + ' ' + str(current_year)

# Define a custom mapping for day abbreviations if they are inconsistent
day_map = {
    'Mon': 'Monday', 'Tue': 'Tuesday', 'Wed': 'Wednesday', 'Thu': 'Thursday',
    'Fri': 'Friday', 'Sat': 'Saturday', 'Sun': 'Sunday'
}
df['timestamp_str'] = df['timestamp_str'].replace(day_map, regex=True)

df['timestamp'] = pd.to_datetime(df['timestamp_str'], format='%A %H:%M:%S %Y')
df = df.sort_values(by='timestamp').reset_index(drop=True)
df = df.set_index('timestamp')

# Drop the temporary string column
df = df.drop(columns=['timestamp_str', 'time', 'day'])

print("\nDataFrame after Datetime Indexing:")
print(df.head())
print("\n")

# --- 2. Extract Cyclical Time Features ---
print("--- Step 2: Extracting Cyclical Time Features ---")
df['hour'] = df.index.hour
df['day_of_week'] = df.index.dayofweek # Monday=0, Sunday=6

# Cyclical encoding for hour
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# Cyclical encoding for day of week
df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

# Drop original hour and day_of_week if preferred, or keep for tree models
df = df.drop(columns=['hour', 'day_of_week'])

print("Cyclical Time Features Added. Head:")
print(df.head())
print("\n")

# --- 3. Create Lag Features ---
print("--- Step 3: Creating Lag Features ---")
# Lag features for flow_rate and variability.
# Choose lag periods based on your data frequency and domain knowledge.
# Example: If data is minutely, lag 1 is 1 minute ago.
# Let's assume your data is roughly 5-minute intervals from the sample for rolling windows.

lag_periods = [1, 2, 3] # Lags of 1, 2, 3 previous observations

for col in ['flow_rate', 'variability']:
    for lag in lag_periods:
        df[f'{col}_lag_{lag}'] = df[col].shift(lag)

# Fill NaNs created by shifting (e.g., with 0, mean, or median, or forward/backward fill)
# For time series, forward fill or 0 is often appropriate for lags.
# Be careful not to fill with values that would introduce future information.
df.fillna(0, inplace=True) # Simple fill for demonstration; adjust as needed

print("Lag Features Added. Head:")
print(df.head())
print("\n")

# --- 4. Create Rolling Window Statistics ---
print("--- Step 4: Creating Rolling Window Statistics ---")
# Define rolling windows. Use time-based windows if your index is datetime.
# '5min', '10min', '30min', '1H', etc. Adjust based on expected leak durations and precursors.
# Example windows:
window_sizes = ['5min', '10min', '30min'] # Use time-based offsets

for col in ['flow_rate', 'variability']:
    for window in window_sizes:
        # Mean
        df[f'{col}_rolling_mean_{window}'] = df[col].rolling(window=window, closed='left').mean()
        # Standard Deviation (Volatility)
        df[f'{col}_rolling_std_{window}'] = df[col].rolling(window=window, closed='left').std()
        # Min/Max (Extremes)
        df[f'{col}_rolling_min_{window}'] = df[col].rolling(window=window, closed='left').min()
        df[f'{col}_rolling_max_{window}'] = df[col].rolling(window=window, closed='left').max()
        # Range
        df[f'{col}_rolling_range_{window}'] = df[f'{col}_rolling_max_{window}'] - df[f'{col}_rolling_min_{window}']
        # Median (Robust to outliers)
        df[f'{col}_rolling_median_{window}'] = df[col].rolling(window=window, closed='left').median()
        # Sum (e.g., total flow over a period)
        df[f'{col}_rolling_sum_{window}'] = df[col].rolling(window=window, closed='left').sum()


# Fill NaNs created by rolling operations (e.g., with 0 or the first valid observation)
df.fillna(0, inplace=True) # Simple fill for demonstration; adjust as needed

print("Rolling Window Statistics Added. Head:")
print(df.head())
print("\n")

# --- 5. Create Rate of Change / Difference Features ---
print("--- Step 5: Creating Rate of Change / Difference Features ---")
diff_periods = [1, 2] # Differences from 1 and 2 steps ago (adjust as needed)

for col in ['flow_rate', 'variability']:
    for period in diff_periods:
        df[f'{col}_diff_{period}'] = df[col].diff(periods=period)

# Fill NaNs
df.fillna(0, inplace=True) # Simple fill for demonstration; adjust as needed

print("Rate of Change Features Added. Head:")
print(df.head())
print("\n")

# --- 6. Create Interaction Features ---
print("--- Step 6: Creating Interaction Features ---")
df['flow_rate_x_variability'] = df['flow_rate'] * df['variability']
df['flow_rate_div_variability'] = df['flow_rate'] / (df['variability'] + 1e-6) # Add epsilon to avoid div by zero
df['duration_x_flow_rate'] = df['duration'] * df['flow_rate']
df['duration_x_variability'] = df['duration'] * df['variability']

print("Interaction Features Added. Head:")
print(df.head())
print("\n")

# --- 7. Create Deviation Features (from rolling mean/median) ---
print("--- Step 7: Creating Deviation Features ---")
for col in ['flow_rate', 'variability']:
    for window in window_sizes:
        # Absolute deviation from rolling mean
        df[f'{col}_abs_dev_from_mean_{window}'] = np.abs(df[col] - df[f'{col}_rolling_mean_{window}'])
        # Z-score (deviation normalized by std dev)
        df[f'{col}_zscore_{window}'] = (df[col] - df[f'{col}_rolling_mean_{window}']) / (df[f'{col}_rolling_std_{window}'] + 1e-6)

df.fillna(0, inplace=True) # Fill any NaNs that might arise from std=0

print("Deviation Features Added. Head:")
print(df.head())
print("\n")

# --- 8. (Optional) Domain-Specific Threshold Features ---
print("--- Step 8: (Optional) Domain-Specific Threshold Features ---")
# Example: If you know that flow_rate > 100 or variability > 3 is highly suspicious
# These thresholds should come from domain knowledge or initial data exploration.
flow_rate_high_threshold = 10.0 # Adjust based on data analysis
variability_high_threshold = 0.5 # Adjust based on data analysis

df['is_flow_rate_abnormally_high'] = (df['flow_rate'] > flow_rate_high_threshold).astype(int)
df['is_variability_abnormally_high'] = (df['variability'] > variability_high_threshold).astype(int)

print("Domain-Specific Threshold Features Added. Head:")
print(df.head())
print("\n")

# --- Final Check ---
print("--- Final DataFrame Info ---")
df.info()
print("\nFinal DataFrame Head (with new features):")
print(df.head())
print("\nDataFrame Tail (useful to see how rolling features stabilize):")
print(df.tail())

# Check a row with a leak to see feature values
print("\nFeatures for the 'True' leak instance(s):")
print(df[df['isleak'] == True])

Original DataFrame Head:
   Unnamed: 0      time  day  duration  flow_rate  variability  isleak
0           0   0:31:37  Sun        10   2.005437     0.445410   False
1           1  19:20:08  Sat       141   2.244325     0.097051   False
2           2  19:24:42  Thu        22   1.447766     0.328988   False
3           3  15:20:20  Sat        45   1.796816     0.670781   False
4           4  21:47:19  Mon        37   0.372023     0.347357   False

Original DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   10000 non-null  int64  
 1   time         10000 non-null  object 
 2   day          10000 non-null  object 
 3   duration     10000 non-null  int64  
 4   flow_rate    10000 non-null  float64
 5   variability  10000 non-null  float64
 6   isleak       10000 non-null  bool   
dtypes: bool(1), float64(2), int64(2),

In [21]:
df.head()

Unnamed: 0_level_0,Unnamed: 0,duration,flow_rate,variability,isleak,hour_sin,hour_cos,day_sin,day_cos,flow_rate_lag_1,...,flow_rate_abs_dev_from_mean_30min,flow_rate_zscore_30min,variability_abs_dev_from_mean_5min,variability_zscore_5min,variability_abs_dev_from_mean_10min,variability_zscore_10min,variability_abs_dev_from_mean_30min,variability_zscore_30min,is_flow_rate_abnormally_high,is_variability_abnormally_high
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-01-01 00:00:22,4721,17,1.464558,0.702492,False,0.0,1.0,0.974928,-0.222521,0.0,...,1.464558,1464558.0,0.702492,702492.012,0.702492,702492.012,0.702492,702492.012,0,1
2025-01-01 00:00:23,3087,55,0.818034,0.488942,False,0.0,1.0,0.974928,-0.222521,1.464558,...,0.646524,-646523.8,0.21355,-213550.041,0.21355,-213550.041,0.21355,-213550.041,0,0
2025-01-01 00:00:28,7458,241,0.789594,0.367876,False,0.0,1.0,0.974928,-0.222521,0.818034,...,0.351702,-0.7693152,0.227841,-1.508845,0.227841,-1.508845,0.227841,-1.508845,0,0
2025-01-01 00:00:34,1906,35,0.226443,0.122868,False,0.0,1.0,0.974928,-0.222521,0.789594,...,0.797618,-2.089392,0.396901,-2.342627,0.396901,-2.342627,0.396901,-2.342627,0,0
2025-01-01 00:00:39,6167,174,2.747311,0.615335,False,0.0,1.0,0.974928,-0.222521,0.226443,...,1.922654,3.798473,0.194791,0.805226,0.194791,0.805226,0.194791,0.805226,0,1


In [23]:
df.drop(columns=['Unnamed: 0'], inplace= True)

In [24]:
df.head()

Unnamed: 0_level_0,duration,flow_rate,variability,isleak,hour_sin,hour_cos,day_sin,day_cos,flow_rate_lag_1,flow_rate_lag_2,...,flow_rate_abs_dev_from_mean_30min,flow_rate_zscore_30min,variability_abs_dev_from_mean_5min,variability_zscore_5min,variability_abs_dev_from_mean_10min,variability_zscore_10min,variability_abs_dev_from_mean_30min,variability_zscore_30min,is_flow_rate_abnormally_high,is_variability_abnormally_high
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-01-01 00:00:22,17,1.464558,0.702492,False,0.0,1.0,0.974928,-0.222521,0.0,0.0,...,1.464558,1464558.0,0.702492,702492.012,0.702492,702492.012,0.702492,702492.012,0,1
2025-01-01 00:00:23,55,0.818034,0.488942,False,0.0,1.0,0.974928,-0.222521,1.464558,0.0,...,0.646524,-646523.8,0.21355,-213550.041,0.21355,-213550.041,0.21355,-213550.041,0,0
2025-01-01 00:00:28,241,0.789594,0.367876,False,0.0,1.0,0.974928,-0.222521,0.818034,1.464558,...,0.351702,-0.7693152,0.227841,-1.508845,0.227841,-1.508845,0.227841,-1.508845,0,0
2025-01-01 00:00:34,35,0.226443,0.122868,False,0.0,1.0,0.974928,-0.222521,0.789594,0.818034,...,0.797618,-2.089392,0.396901,-2.342627,0.396901,-2.342627,0.396901,-2.342627,0,0
2025-01-01 00:00:39,174,2.747311,0.615335,False,0.0,1.0,0.974928,-0.222521,0.226443,0.789594,...,1.922654,3.798473,0.194791,0.805226,0.194791,0.805226,0.194791,0.805226,0,1


In [26]:
# Example time-series split
split_point = int(len(df) * 0.8) # 80% for training
train_df = df.iloc[:split_point]
test_df = df.iloc[split_point:]

X_train = train_df.drop(columns=['isleak'])
y_train = train_df['isleak']
X_test = test_df.drop(columns=['isleak'])
y_test = test_df['isleak']

In [32]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# --- 0. Re-create a Sample DataFrame with more realistic data for balancing ---
# This assumes your previous feature engineering steps were run and `df` exists.
# For demonstration, I'm creating a new df that's more imbalanced.
# In your actual workflow, you'd just continue with your already engineered 'df'.

# Let's create a more imbalanced synthetic dataset for demonstration
np.random.seed(42) # for reproducibility

n_samples = 10000
n_leaks = 50 # 0.5% leaks, simulating high imbalance

# Generate features for non-leaks
non_leak_data = {
    'flow_rate': np.random.normal(50, 10, n_samples - n_leaks),
    'variability': np.random.normal(0.1, 0.05, n_samples - n_leaks),
    'duration': np.random.randint(5, 60, n_samples - n_leaks),
    # Let's add some simple engineered features for demonstration
    'flow_rate_lag_1': np.random.normal(50, 10, n_samples - n_leaks),
    'flow_rate_rolling_mean_10min': np.random.normal(50, 5, n_samples - n_leaks),
    'variability_rolling_std_5min': np.random.normal(0.02, 0.01, n_samples - n_leaks),
    'flow_rate_x_variability': np.random.normal(5, 1, n_samples - n_leaks),
    'flow_rate_abs_dev_from_mean_5min': np.random.normal(1, 0.5, n_samples - n_leaks),
    'is_flow_rate_abnormally_high': np.zeros(n_samples - n_leaks, dtype=int)
}

# Generate features for leaks (with distinct patterns)
leak_data = {
    'flow_rate': np.random.normal(150, 30, n_leaks), # Higher flow during leaks
    'variability': np.random.normal(5.0, 1.0, n_leaks), # Higher variability during leaks
    'duration': np.random.randint(60, 300, n_leaks), # Longer duration leaks
    'flow_rate_lag_1': np.random.normal(140, 30, n_leaks),
    'flow_rate_rolling_mean_10min': np.random.normal(100, 20, n_leaks),
    'variability_rolling_std_5min': np.random.normal(1.0, 0.5, n_leaks),
    'flow_rate_x_variability': np.random.normal(500, 100, n_leaks),
    'flow_rate_abs_dev_from_mean_5min': np.random.normal(50, 10, n_leaks),
    'is_flow_rate_abnormally_high': np.ones(n_leaks, dtype=int)
}

# Combine and add timestamp (simplified for this demo)
# In a real scenario, you'd have your actual engineered 'df' from previous steps
all_data = {col: np.concatenate([non_leak_data[col], leak_data[col]]) for col in non_leak_data.keys()}
all_data['is_leak'] = np.concatenate([np.zeros(n_samples - n_leaks, dtype=bool), np.ones(n_leaks, dtype=bool)])

# Create dummy timestamps (important for realistic splitting later)
start_time = datetime(2024, 1, 1, 0, 0, 0)
time_deltas = [timedelta(minutes=i) for i in range(n_samples)]
np.random.shuffle(time_deltas) # Shuffle to mix true/false leaks chronologically, then sort
timestamps = sorted([start_time + td for td in time_deltas])
df = pd.DataFrame(all_data, index=timestamps)
df['is_leak'] = df['is_leak'].astype(bool) # Ensure boolean type

df = df.sort_index() # Sort by timestamp

print("--- Initial DataFrame (Simulated Engineered) ---")
print(df.head())
print("\nLeak distribution in initial DF:")
print(df['is_leak'].value_counts())
print("\n")

# --- 1. Prepare Data for Modeling ---
X = df.drop('is_leak', axis=1)
y = df['is_leak']

# --- 2. Time-Series Train-Test Split ---
# This is crucial for time-series data to prevent data leakage.
# Use a simple proportional split based on time.
split_ratio = 0.8 # 80% for training, 20% for testing
split_index = int(len(X) * split_ratio)

X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

print(f"Train set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")
print(f"Leak distribution in TRAINING set:\n{y_train.value_counts()}")
print(f"Leak distribution in TEST set:\n{y_test.value_counts()}")
print("\n")



--- Initial DataFrame (Simulated Engineered) ---
                     flow_rate  variability  duration  flow_rate_lag_1  \
2024-01-01 00:00:00  54.967142     0.143390        44        39.375346   
2024-01-01 00:01:00  48.617357     0.111370        48        40.656867   
2024-01-01 00:02:00  56.476885     0.055508        43        36.741827   
2024-01-01 00:03:00  65.230299     0.051961        28        47.454534   
2024-01-01 00:04:00  47.658466     0.112706        13        49.782906   

                     flow_rate_rolling_mean_10min  \
2024-01-01 00:00:00                     54.555138   
2024-01-01 00:01:00                     43.038515   
2024-01-01 00:02:00                     49.894332   
2024-01-01 00:03:00                     54.018039   
2024-01-01 00:04:00                     52.818751   

                     variability_rolling_std_5min  flow_rate_x_variability  \
2024-01-01 00:00:00                      0.028725                 5.090831   
2024-01-01 00:01:00            

In [33]:
# --- 3. Feature Scaling (Recommended for SVM, Logistic Regression, potentially beneficial for tree-based) ---
from sklearn.preprocessing import StandardScaler

print("--- Step 3: Scaling Features ---")
scaler = StandardScaler()
# Fit scaler only on training data to prevent data leakage from the test set
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # Transform test data using scaler fitted on train

# Convert back to DataFrame for easier handling if needed, retaining column names
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

print("Features Scaled.\n")



--- Step 3: Scaling Features ---
Features Scaled.



In [38]:
y_train.value_counts()

False    8000
Name: is_leak, dtype: int64

In [35]:
# --- 4. Data Balancing Techniques ---
# We'll demonstrate a few, but you'll likely pick one or two based on experiments.
# Remember: Apply balancing ONLY on the training data.

from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTETomek
from collections import Counter

print("--- Step 4: Applying Data Balancing Techniques ---")

# --- Option A: No Balancing (Baseline for comparison) ---
# We'll use the original (scaled) X_train, y_train directly for a baseline model.
# X_train_no_balance, y_train_no_balance = X_train_scaled_df, y_train


# --- Option B: Class Weighting (Preferred for Tree-based Models first) ---
# This isn't a "balancing" technique in the sense of changing data samples,
# but it tells the model to pay more attention to the minority class.
# We will apply this directly when defining the models.
# For sklearn's RandomForest/SVC: class_weight='balanced'
# For XGBoost/LightGBM: scale_pos_weight = count(negative_class) / count(positive_class)
class_weight_value = y_train.value_counts()[False] / y_train.value_counts()[True]




--- Step 4: Applying Data Balancing Techniques ---


KeyError: True

In [None]:
# --- 5. Model Training and Evaluation ---
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, fbeta_score, roc_auc_score, precision_recall_curve, auc, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

print("--- Step 5: Model Training and Evaluation ---")

models = {
    # Baseline with no explicit balancing (rely on class_weight)
    "RandomForest (No Resampling, Weighted)": RandomForestClassifier(
        random_state=42, n_estimators=200, class_weight='balanced', n_jobs=-1
    ),
    "XGBoost (No Resampling, Weighted)": None, # Will be set below
    "LightGBM (No Resampling, Weighted)": None, # Will be set below

    # Models with SMOTE
    "RandomForest (SMOTE)": RandomForestClassifier(random_state=42, n_estimators=200, n_jobs=-1),
    "XGBoost (SMOTE)": None, # Will be set below
    "Logistic Regression (SMOTE)": LogisticRegression(random_state=42, solver='liblinear', max_iter=1000),
    "SVM (SMOTE)": SVC(random_state=42, probability=True) # probability=True is needed for PR curve
}

# XGBoost and LightGBM need to be imported and set up.
# They handle class weighting via `scale_pos_weight` parameter.
# For models with explicit resampling (like SMOTE), do NOT use `scale_pos_weight` again,
# as the dataset is already balanced.

import xgboost as xgb
import lightgbm as lgb

models["XGBoost (No Resampling, Weighted)"] = xgb.XGBClassifier(
    random_state=42, n_estimators=200, use_label_encoder=False, eval_metric='logloss',
    scale_pos_weight=class_weight_value # Apply class weighting here
)
models["LightGBM (No Resampling, Weighted)"] = lgb.LGBMClassifier(
    random_state=42, n_estimators=200,
    scale_pos_weight=class_weight_value # Apply class weighting here
)

models["XGBoost (SMOTE)"] = xgb.XGBClassifier(
    random_state=42, n_estimators=200, use_label_encoder=False, eval_metric='logloss'
)
models["LightGBM (SMOTE)"] = lgb.LGBMClassifier(
    random_state=42, n_estimators=200
)


# Define a function to evaluate models
def evaluate_model(model, X_train_data, y_train_data, X_test_data, y_test_data, model_name):
    print(f"\n--- Training and Evaluating: {model_name} ---")
    model.fit(X_train_data, y_train_data)

    # Predict probabilities for precision-recall curve
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test_data)[:, 1]
    elif hasattr(model, "decision_function"): # For SVM without probability=True
        y_pred_proba = model.decision_function(X_test_data)
        # Normalize decision function to [0,1] if not already
        if np.min(y_pred_proba) < 0 or np.max(y_pred_proba) > 1:
             y_pred_proba = (y_pred_proba - np.min(y_pred_proba)) / (np.max(y_pred_proba) - np.min(y_pred_proba))
    else:
        print(f"Warning: {model_name} does not have predict_proba or decision_function. Skipping PR curve.")
        return

    # --- Optimize Threshold for F2-Score ---
    # F2-score weights recall twice as much as precision (beta=2)
    precisions, recalls, thresholds = precision_recall_curve(y_test_data, y_pred_proba)
    f2_scores = (1 + 2**2) * (precisions * recalls) / (2**2 * precisions + recalls + 1e-10) # Add epsilon to avoid div by zero
    
    # Find the threshold that maximizes F2-score
    optimal_threshold_idx = np.argmax(f2_scores)
    optimal_threshold = thresholds[optimal_threshold_idx]
    
    # Predict with optimized threshold
    y_pred_optimized = (y_pred_proba >= optimal_threshold).astype(int)

    # --- Evaluation Metrics ---
    precision = precision_score(y_test_data, y_pred_optimized)
    recall = recall_score(y_test_data, y_pred_optimized)
    f2 = fbeta_score(y_test_data, y_pred_optimized, beta=2) # F2-score
    roc_auc = roc_auc_score(y_test_data, y_pred_proba)
    pr_auc = auc(recalls, precisions) # PR-AUC

    print(f"  Optimal Threshold (max F2): {optimal_threshold:.4f}")
    print(f"  Precision (Positive Class): {precision:.4f}")
    print(f"  Recall (Positive Class):    {recall:.4f}")
    print(f"  F2-Score:                   {f2:.4f}")
    print(f"  ROC-AUC:                    {roc_auc:.4f}")
    print(f"  PR-AUC:                     {pr_auc:.4f}")

    cm = confusion_matrix(y_test_data, y_pred_optimized)
    print("\n  Confusion Matrix (Optimized Threshold):\n", cm)
    
    # Plot Confusion Matrix
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted False', 'Predicted True'],
                yticklabels=['Actual False', 'Actual True'])
    plt.title(f'Confusion Matrix for {model_name}\n(Optimized for F2-Score)')
    plt.ylabel('Actual Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # Plot Precision-Recall Curve
    plt.figure(figsize=(8, 6))
    plt.plot(recalls, precisions, label=f'PR Curve (AUC = {pr_auc:.2f})')
    plt.scatter(recalls[optimal_threshold_idx], precisions[optimal_threshold_idx],
                marker='o', color='red', label=f'Optimal Threshold (F2={f2:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve for {model_name}')
    plt.legend()
    plt.grid(True)
    plt.show()


# --- Evaluate Models ---
# Baseline Models (No Resampling, but with class_weight/scale_pos_weight)
evaluate_model(models["RandomForest (No Resampling, Weighted)"], X_train_scaled_df, y_train, X_test_scaled_df, y_test, "RandomForest (No Resampling, Weighted)")
evaluate_model(models["XGBoost (No Resampling, Weighted)"], X_train_scaled_df, y_train, X_test_scaled_df, y_test, "XGBoost (No Resampling, Weighted)")
evaluate_model(models["LightGBM (No Resampling, Weighted)"], X_train_scaled_df, y_train, X_test_scaled_df, y_test, "LightGBM (No Resampling, Weighted)")


# Models with SMOTE data
print("\n" + "="*50 + "\nEvaluating Models with SMOTE Resampled Data\n" + "="*50)
evaluate_model(models["RandomForest (SMOTE)"], X_train_smote, y_train_smote, X_test_scaled_df, y_test, "RandomForest (SMOTE)")
evaluate_model(models["XGBoost (SMOTE)"], X_train_smote, y_train_smote, X_test_scaled_df, y_test, "XGBoost (SMOTE)")
evaluate_model(models["Logistic Regression (SMOTE)"], X_train_smote, y_train_smote, X_test_scaled_df, y_test, "Logistic Regression (SMOTE)")
evaluate_model(models["SVM (SMOTE)"], X_train_smote, y_train_smote, X_test_scaled_df, y_test, "SVM (SMOTE)")


# You can repeat evaluation for ADASYN, BorderlineSMOTE, SMOTETomek data
# by replacing X_train_smote, y_train_smote with X_train_adasyn, y_train_adasyn etc.
# For brevity, I'm just demonstrating SMOTE here.

In [30]:
y_train.value_counts()

False    7912
True       88
Name: isleak, dtype: int64

In [27]:
y_test.value_counts()

False    1974
True       26
Name: isleak, dtype: int64

In [3]:
df['timestamp_str'] = df['day'] + ' ' + df['time']

In [11]:
df = df.sort_values(by='timestamp_str').reset_index(drop=True)

In [13]:
df = df.set_index('timestamp_str')

In [14]:
df = df.drop(columns=['time', 'day'])

In [15]:
df['hour'] = df.index.hour
df['day_of_week'] = df.index.dayofweek

AttributeError: 'Index' object has no attribute 'hour'

In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,time,day,duration,flow_rate,variability,isleak,timestamp_str
0,7458,0:00:28,Fri,241,0.789594,0.367876,False,Fri 0:00:28
1,1701,0:02:05,Fri,11,0.81619,0.397041,False,Fri 0:02:05
2,2373,0:03:30,Fri,808,0.593305,0.299904,False,Fri 0:03:30
3,2605,0:03:44,Fri,353,2.547575,0.131525,False,Fri 0:03:44
4,8599,0:03:45,Fri,55,0.430962,0.398336,False,Fri 0:03:45


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,time,day,duration,flow_rate,variability,isleak,timestamp_str
0,0,0:31:37,Sun,10,2.005437,0.44541,False,Sun 0:31:37
1,1,19:20:08,Sat,141,2.244325,0.097051,False,Sat 19:20:08
2,2,19:24:42,Thu,22,1.447766,0.328988,False,Thu 19:24:42
3,3,15:20:20,Sat,45,1.796816,0.670781,False,Sat 15:20:20
4,4,21:47:19,Mon,37,0.372023,0.347357,False,Mon 21:47:19


In [2]:
df['timestamp'] = pd.to_datetime(df['timestamp_str'], format='%A %H:%M:%S %Y')
df = df.sort_values(by='timestamp').reset_index(drop=True)
df = df.set_index('timestamp')

# Drop the temporary string column
df = df.drop(columns=['timestamp_str', 'time', 'day'])

KeyError: 'timestamp_str'