In [27]:
import os
import json

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# We drop XGBoost and LightGBM for now to avoid libomp issues on macOS
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier

# Output location: make sure the results directory exists before any later
# cell writes into it (a no-op when it is already there).
RESULTS_DIR = os.path.join("..", "results")
os.makedirs(RESULTS_DIR, exist_ok=True)

# Input location: the per-transient feature table inside the plasticc data folder.
DATA_DIR = os.path.join("..", "data", "plasticc")
FEATURES_PATH = os.path.join(DATA_DIR, "transient_features.csv")

# Echo of the resolved paths; Jupyter only renders a bare expression when it is
# the last statement of a cell.
FEATURES_PATH, RESULTS_DIR

# Load the feature table and preview its first rows.
df = pd.read_csv(FEATURES_PATH)
df.head()

Unnamed: 0,transient_id,n_points,mag_min,mag_max,mag_mean,mag_std,mag_range,flux_max,flux_mean,flux_std,time_span,rise_time,decline_time,rise_decline_ratio,mean_rise_slope,mean_decline_slope,max_slope,label
0,215282,21,-3.539098,-1.349829,-2.679177,0.594565,2.189269,26.039886,13.479886,6.350749,113.5933,1.9995,111.5938,0.017918,-12.304446,-3.625747,178.404545,SNIa
1,92999561,5,-5.858902,-3.968508,-5.185998,0.688865,1.890394,220.57724,140.131747,67.895501,59.0308,8.0055,51.0253,0.156893,-0.16561,0.101884,0.25974,SNIa
2,19866,85,-6.08006,-1.458976,-4.326849,1.078168,4.621084,270.410736,82.137994,70.261176,104.7842,16.787,87.9972,0.190767,5.602911,-4.89604,260.035027,SNIa
3,34971934,5,-5.973134,-5.222988,-5.64041,0.27876,0.750146,245.049469,186.256429,46.025077,17.0175,8.014,9.0035,0.890098,-0.079878,0.085137,0.146091,SNIa
4,106057072,6,-5.404697,-4.004794,-4.790354,0.498553,1.399903,145.1707,91.019598,38.254045,40.0,23.0221,16.9779,1.356004,-0.1802,0.054138,0.311822,SNIa


In [28]:
# Read the feature table into `features_df`, the frame the modelling cells use.
features_df = pd.read_csv(FEATURES_PATH)

# Table dimensions as (rows, columns).
print("Shape:", features_df.shape)

# Row preview; only rendered when it is the last expression of a cell.
features_df.head()

# Number of samples per class label -- this is the value the cell displays.
features_df.loc[:, "label"].value_counts()

Shape: (339, 18)


label
SNII    181
SNIa    158
Name: count, dtype: int64

In [29]:
# Class encoding: SNIa is the positive class (1), SNII the negative class (0).
label_mapping = {"SNIa": 1, "SNII": 0}

# Model inputs; the column order of X follows this list.
feature_cols = [
    "n_points",
    # Magnitude features
    "mag_min",
    "mag_max",
    "mag_mean",
    "mag_std",
    "mag_range",
    # Flux features
    "flux_max",
    "flux_mean",
    "flux_std",
    # Time features
    "time_span",
    "rise_time",
    "decline_time",
    "rise_decline_ratio",
    # Slope features
    "mean_rise_slope",
    "mean_decline_slope",
    "max_slope",
]

# Feature matrix and string labels pulled out of the frame as arrays.
X = features_df[feature_cols].values
y_str = features_df["label"].values

# Integer targets; a label missing from `label_mapping` raises KeyError.
y = np.array([label_mapping[class_name] for class_name in y_str])

# Quick shape / class-balance check (only rendered as a cell's last expression).
X.shape, y.shape, np.unique(y, return_counts=True)

# 80/20 split, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Learn the standardization on the training rows only, then apply it to both
# splits so no test-set statistics enter the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [30]:
# Three base classifiers plus a soft-voting ensemble built from them.
# Every estimator is seeded with random_state=42.

# 1. Logistic Regression
log_reg = LogisticRegression(solver="lbfgs", C=2.0, max_iter=2000, random_state=42)

# 2. Random Forest
rf = RandomForestClassifier(
    n_estimators=600,
    max_depth=20,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)

# 3. CatBoost (boosting for tabular)
cat_model = CatBoostClassifier(
    loss_function="Logloss",
    iterations=800,
    depth=8,
    learning_rate=0.05,
    random_state=42,
    verbose=False,
)

# 4. Voting Ensemble: soft voting over the three models above, with weights
#    1 / 2 / 3 for logistic regression / random forest / CatBoost.
ensemble = VotingClassifier(
    estimators=[("log_reg", log_reg), ("rf", rf), ("cat", cat_model)],
    weights=[1, 2, 3],
    voting="soft",
)

# Registry iterated by the training cell; insertion order is the training order.
models = {
    "logistic_regression": log_reg,
    "random_forest": rf,
    "catboost": cat_model,
    "ensemble": ensemble,
}

list(models)

['logistic_regression', 'random_forest', 'catboost', 'ensemble']

In [31]:
from sklearn.metrics import roc_auc_score

def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``. ``predict_proba`` is used
        for the AUC when the estimator provides it.
    X_train, y_train
        Training features and labels handed to ``model.fit``.
    X_test, y_test
        Held-out features and labels every metric is computed on. Labels are
        expected to be encoded as 0 (SNII) and 1 (SNIa), matching the names
        used in the classification report.

    Returns
    -------
    dict
        ``accuracy`` (float), ``auc`` (float, or ``None`` when no AUC could be
        computed), ``confusion_matrix`` (nested list, label order ``[0, 1]``)
        and ``classification_report`` (dict produced with ``output_dict=True``).
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # AUC is best-effort, but only for the failure modes that are expected:
    #   AttributeError - the estimator has no predict_proba
    #   IndexError     - predict_proba returned a single column
    #   ValueError     - roc_auc_score rejected the input (e.g. one class only)
    # Anything else (including KeyboardInterrupt) propagates instead of being
    # silently turned into "no AUC".
    try:
        y_proba = model.predict_proba(X_test)[:, 1]
        auc = float(roc_auc_score(y_test, y_proba))
    except (AttributeError, IndexError, ValueError):
        auc = None

    # Pin the label order so the confusion matrix is always 2x2 and the report
    # always lines up with the two target names, even if a class is absent
    # from this particular test fold.
    class_labels = [0, 1]

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    report = classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=["SNII (0)", "SNIa (1)"],
        output_dict=True,
        zero_division=0,
    )

    return {
        "accuracy": float(acc),
        "auc": auc,
        "confusion_matrix": cm.tolist(),
        "classification_report": report,
    }

In [32]:
# Train and score every model in `models`, collecting the metrics by name.
results = {}

for name, model in models.items():
    print(f"\n=== Training {name} ===")

    # These models use the scaled features; any model not listed here would
    # be trained on the raw matrices instead.
    if name in ("logistic_regression", "random_forest", "catboost", "ensemble"):
        X_tr, X_te = X_train_scaled, X_test_scaled
    else:
        X_tr, X_te = X_train, X_test

    metrics = train_and_evaluate(model, X_tr, y_train, X_te, y_test)
    results[name] = metrics
    print(f"{name} accuracy: {metrics['accuracy']:.3f}")
    # Compare against None explicitly: a plain truthiness test would also
    # skip a legitimate AUC of 0.0.
    if metrics['auc'] is not None:
        print(f"{name} AUC: {metrics['auc']:.3f}")

# Display confusion matrices
print("\n=== Confusion Matrices ===")
for name, res in results.items():
    print(f"\n{name}:")
    print(np.array(res['confusion_matrix']))


=== Training logistic_regression ===
logistic_regression accuracy: 0.706
logistic_regression AUC: 0.777

=== Training random_forest ===
random_forest accuracy: 0.676
random_forest AUC: 0.819

=== Training catboost ===
catboost accuracy: 0.750
catboost AUC: 0.843

=== Training ensemble ===
ensemble accuracy: 0.750
ensemble AUC: 0.844

=== Confusion Matrices ===

logistic_regression:
[[26 10]
 [10 22]]

random_forest:
[[22 14]
 [ 8 24]]

catboost:
[[24 12]
 [ 5 27]]

ensemble:
[[26 10]
 [ 7 25]]


In [1]:
import os
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)

# ============================================================================
# LOAD FEATURES
# ============================================================================

# The feature table lives in the plasticc data folder, relative to the
# notebook's working directory.
DATA_DIR = os.path.join("..", "data", "plasticc")
features_csv = os.path.join(DATA_DIR, "transient_features.csv")
features_df = pd.read_csv(features_csv)

# Report the table size, then the number of rows per class label.
print(f"Loaded {len(features_df)} samples with {len(features_df.columns)} columns")
print(f"\nClass distribution:")
class_counts = features_df['label'].value_counts()
print(class_counts)

# ============================================================================
# PREPARE DATA - USE ALL 16 FEATURES
# ============================================================================

# Class encoding: SNII -> 0, SNIa -> 1.
label_map = {'SNII': 0, 'SNIa': 1}

# The 16 model inputs; the column order of X follows this list.
feature_cols = [
    # Magnitude features
    "mag_min",
    "mag_max",
    "mag_mean",
    "mag_std",
    "mag_range",
    # Flux features
    "flux_max",
    "flux_mean",
    "flux_std",
    # Time features
    "time_span",
    "rise_time",
    "decline_time",
    "rise_decline_ratio",
    # Slope features
    "mean_rise_slope",
    "mean_decline_slope",
    "max_slope",
    # Metadata
    "n_points",
]

# Feature matrix and string labels as arrays.
X = features_df[feature_cols].values
y_str = features_df['label'].values

# Integer targets; a label missing from `label_map` raises KeyError.
y = np.array([label_map[class_name] for class_name in y_str])

print(f"\nFeature matrix: {X.shape}")
print(f"Labels: {y.shape}, distribution: {np.unique(y, return_counts=True)}")

# ============================================================================
# TRAIN-TEST SPLIT
# ============================================================================

# 80/20 split, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"\nTrain: {X_train.shape[0]}, Test: {X_test.shape[0]}")

# ============================================================================
# SCALE FEATURES
# ============================================================================

# Standardization statistics come from the training rows only; the same
# transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ============================================================================
# TRAIN MODELS
# ============================================================================

print("\n" + "=" * 70)
print("TRAINING CLASSICAL MODELS")
print("=" * 70)

# Build every estimator up front (construction does no training).
lr = LogisticRegression(max_iter=2000, C=2.0, random_state=42)
rf = RandomForestClassifier(n_estimators=600, max_depth=20, random_state=42, n_jobs=-1)
cb = CatBoostClassifier(
    iterations=800,
    depth=8,
    learning_rate=0.05,
    random_state=42,
    verbose=False,
)
# Soft-voting ensemble over the three models, weighted 1 / 2 / 3.
# Per scikit-learn's VotingClassifier, fitting the ensemble trains its own
# clones of the listed estimators -- here on the scaled matrix -- so the
# standalone rf / cb (fitted on raw features) are separate from the members
# that vote inside the ensemble.
ensemble = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('cb', cb)],
    voting='soft',
    weights=[1, 2, 3],
)

# One row per training step:
# (progress message, key in `models`, estimator, training matrix).
# Logistic regression and the ensemble get standardized inputs; the tree
# models get the raw features.
training_plan = [
    ("\n1. Logistic Regression...", 'Logistic Regression', lr, X_train_scaled),
    ("2. Random Forest...", 'Random Forest', rf, X_train),
    ("3. CatBoost...", 'CatBoost', cb, X_train),
    ("4. Ensemble (Voting)...", 'Ensemble', ensemble, X_train_scaled),
]

models = {}
for progress_msg, model_key, estimator, train_matrix in training_plan:
    print(progress_msg)
    estimator.fit(train_matrix, y_train)
    models[model_key] = estimator

# ============================================================================
# EVALUATE ALL MODELS
# ============================================================================

print("\n" + "=" * 70)
print("CLASSICAL MODEL RESULTS")
print("=" * 70)

# Metrics per model name, in the form later written to JSON.
results = {}

for name, model in models.items():
    print(f"\n{name}:")
    print("-" * 50)

    # Feed each model the same representation it was trained on:
    # scaled for LR and Ensemble, raw for RF and CB.
    X_test_input = X_test_scaled if name in ('Logistic Regression', 'Ensemble') else X_test

    # Hard predictions, plus the probability of class 1 (SNIa) for the AUC.
    y_pred = model.predict(X_test_input)
    y_proba = model.predict_proba(X_test_input)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    cm = confusion_matrix(y_test, y_pred)

    print(f"Accuracy: {acc:.1%} ({acc:.3f})")
    print(f"AUC:      {auc:.1%} ({auc:.3f})")

    # Rows are the actual class, columns the predicted class.
    print("\nConfusion Matrix:")
    print(f"              SNII  SNIa")
    print(f"Actual SNII   {cm[0,0]:3d}   {cm[0,1]:3d}")
    print(f"       SNIa   {cm[1,0]:3d}   {cm[1,1]:3d}")

    # Plain Python types only, so the dict can be dumped to JSON.
    results[name] = {
        'accuracy': float(acc),
        'auc': float(auc),
        'confusion_matrix': cm.tolist()
    }

# ============================================================================
# SAVE RESULTS
# ============================================================================

# Make sure the output directory exists (a no-op when it already does).
RESULTS_DIR = os.path.join("..", "results")
os.makedirs(RESULTS_DIR, exist_ok=True)

# Write the per-model metrics collected in `results` as indented JSON.
results_path = os.path.join(RESULTS_DIR, "plasticc_classical_results_2k.json")
with open(results_path, 'w') as f:
    json.dump(results, f, indent=2)

# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252);
# restored to the intended character.
print(f"\n✓ Results saved: {results_path}")

print("\n" + "=" * 70)
print("CLASSICAL TRAINING COMPLETE!")
print("=" * 70)
print("Next step: Train quantum model with top 3 features")

Loaded 1072 samples with 18 columns

Class distribution:
label
SNII    549
SNIa    523
Name: count, dtype: int64

Feature matrix: (1072, 16)
Labels: (1072,), distribution: (array([0, 1]), array([549, 523]))

Train: 857, Test: 215

TRAINING CLASSICAL MODELS

1. Logistic Regression...
2. Random Forest...
3. CatBoost...
4. Ensemble (Voting)...

CLASSICAL MODEL RESULTS

Logistic Regression:
--------------------------------------------------
Accuracy: 71.2% (0.712)
AUC:      77.0% (0.770)

Confusion Matrix:
              SNII  SNIa
Actual SNII    75    35
       SNIa    27    78

Random Forest:
--------------------------------------------------
Accuracy: 75.8% (0.758)
AUC:      84.5% (0.845)

Confusion Matrix:
              SNII  SNIa
Actual SNII    81    29
       SNIa    23    82

CatBoost:
--------------------------------------------------
Accuracy: 74.4% (0.744)
AUC:      84.6% (0.846)

Confusion Matrix:
              SNII  SNIa
Actual SNII    83    27
       SNIa    28    77

Ensemble:

In [2]:
# ============================================================================
# FEATURE CORRELATION ANALYSIS - FIND TOP 3 FOR QUANTUM
# ============================================================================

from scipy.stats import pointbiserialr

print("\n" + "=" * 70)
print("FEATURE CORRELATION WITH LABEL (SNIa vs SNII)")
print("=" * 70)

# Numeric encoding: SNII=0, SNIa=1
label_numeric = y

# Point-biserial correlation of every feature column with the binary label.
# NOTE(review): this uses the full X / y, i.e. it includes the rows held out
# as the test split above. If the downstream model is evaluated on that same
# split, ranking features here leaks test information -- confirm whether the
# ranking should be restricted to the training rows.
correlations = []
for i, feat in enumerate(feature_cols):
    corr, pval = pointbiserialr(label_numeric, X[:, i])
    correlations.append({
        'feature': feat,
        'correlation': abs(corr),  # absolute value for ranking
        'correlation_signed': corr,
        'p_value': pval
    })

# Strongest absolute correlation first.
corr_df = pd.DataFrame(correlations).sort_values('correlation', ascending=False)

print("\nFeatures ranked by correlation strength:")
print(corr_df.to_string(index=False))

# Select TOP 3
top_3_features = corr_df.head(3)['feature'].tolist()

# The emoji, sigma and check mark in the messages below were stored as
# mojibake (UTF-8 bytes decoded as cp1252); restored to the intended characters.
print("\n" + "=" * 70)
print(f"🎯 TOP 3 FEATURES FOR QUANTUM: {top_3_features}")
print("=" * 70)

# Show class separation for these features
print("\nClass separation check:")
for feat in top_3_features:
    feat_idx = feature_cols.index(feat)
    feat_data = X[:, feat_idx]

    snia_vals = feat_data[y == 1]
    snii_vals = feat_data[y == 0]

    # Gap between the class means, also expressed in units of the SNII
    # standard deviation (computed once instead of twice inside the f-string).
    separation = abs(snia_vals.mean() - snii_vals.mean())
    separation_sigma = separation / snii_vals.std()

    print(f"\n{feat}:")
    print(f"  SNIa: mean={snia_vals.mean():.3f}, std={snia_vals.std():.3f}")
    print(f"  SNII: mean={snii_vals.mean():.3f}, std={snii_vals.std():.3f}")
    print(f"  Separation: {separation:.3f} ({separation_sigma:.2f} σ)")

# Save for quantum notebook
top_3_path = os.path.join(RESULTS_DIR, "top_3_features.json")
with open(top_3_path, 'w') as f:
    json.dump({'top_3_features': top_3_features}, f, indent=2)

print(f"\n✓ Saved top 3 features to: {top_3_path}")


FEATURE CORRELATION WITH LABEL (SNIa vs SNII)

Features ranked by correlation strength:
           feature  correlation  correlation_signed      p_value
         time_span     0.280059           -0.280059 9.061588e-21
      decline_time     0.268915           -0.268915 3.248802e-19
           mag_max     0.150529            0.150529 7.382729e-07
           mag_std     0.142485            0.142485 2.818433e-06
          mag_mean     0.133558            0.133558 1.146827e-05
         mag_range     0.104832            0.104832 5.864396e-04
         rise_time     0.096462           -0.096462 1.567008e-03
mean_decline_slope     0.063565            0.063565 3.744478e-02
         flux_mean     0.041182           -0.041182 1.778611e-01
           mag_min     0.033363            0.033363 2.750981e-01
         max_slope     0.030928           -0.030928 3.116943e-01
          n_points     0.021755            0.021755 4.767484e-01
          flux_max     0.020245           -0.020245 5.078841e-01
r