First model: random forest

In [None]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    balanced_accuracy_score,
    f1_score
)
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

# Load the full table; point the path at your own CSV if it is named differently.
data = pd.read_csv("Dataset.csv")

# Positional layout: the column at position 30 is read as the raw label text
# (cast to str, surrounding whitespace removed) and the 30 columns before it
# are copied out as the feature table.
y_raw = data.iloc[:, 30].astype(str)
y_raw = y_raw.str.strip()
X = data.iloc[:, 0:30].copy()

def simplify_flare(label):
    """Collapse a raw flare label string into a coarse category.

    Args:
        label: Label text (already a ``str``).

    Returns:
        ``"C"``, ``"M"`` or ``"X"`` when the label starts with that
        (upper-case) letter, ``"0"`` when the label is exactly ``"0"`` or
        ``"0.0"``, and ``"Unknown"`` for anything else.
    """
    for flare_class in ("C", "M", "X"):
        if label.startswith(flare_class):
            return flare_class
    return "0" if label in ("0", "0.0") else "Unknown"

# Collapse the raw label strings to the coarse classes produced by simplify_flare.
y_simplified = y_raw.apply(simplify_flare)

print("=== Overall class counts (before split) ===")
overall_counts = y_simplified.value_counts().to_frame("count")
overall_counts["percent"] = 100 * overall_counts["count"] / len(y_simplified)
print(overall_counts, "\n")

# Force every feature to numeric; entries that cannot be parsed become NaN
# and are filled in by the imputer below.
X = X.apply(pd.to_numeric, errors='coerce')

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y_simplified)

print("Encoded class mapping:", dict(zip(encoder.classes_, encoder.transform(encoder.classes_))), "\n")

# Split BEFORE imputing. Fitting the imputer on the full table would let the
# medians see the held-out rows (data leakage) and bias the test metrics.
# Seed, stratification and row count are unchanged, so the same rows land in
# each split as when the split was done on the imputed table.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Learn the per-column medians from the training rows only, then apply those
# same medians to both splits.
imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

def counts_pct(y, classes):
    """Tabulate how often each encoded class id occurs in ``y``.

    Args:
        y: Iterable of integer class ids (positions into ``classes``).
        classes: Sequence of class names; position ``i`` names id ``i``.

    Returns:
        DataFrame indexed by class name (index name ``"label"``) with a
        ``"count"`` column and a ``"percent"`` column (share of the total
        count, 0-100). Ids with no occurrences get a count of 0.
    """
    id_to_name = dict(enumerate(classes))
    # Reindex over every possible id so absent classes still show up as 0.
    per_id = pd.Series(y).value_counts().reindex(range(len(classes)), fill_value=0)

    table = pd.DataFrame({"label_id": per_id.index, "count": per_id.values})
    table["label"] = table["label_id"].map(id_to_name)
    table["percent"] = 100 * table["count"] / table["count"].sum()
    return table.set_index("label")[["count", "percent"]]

# Class balance of each split, as counts and percentages per class name.
print("=== Train class distribution ===")
train_distribution = counts_pct(y_train, encoder.classes_)
print(train_distribution, "\n")

print("=== Test class distribution ===")
test_distribution = counts_pct(y_test, encoder.classes_)
print(test_distribution, "\n")

# Reference point: a classifier that always predicts the most frequent
# training class, scored with the same metrics used for the real model.
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)

print("=== Majority-class baseline (DummyClassifier) ===")
baseline_acc = accuracy_score(y_test, y_pred_dummy)
print("Baseline accuracy:", baseline_acc)
baseline_bal_acc = balanced_accuracy_score(y_test, y_pred_dummy)
print("Baseline balanced accuracy:", baseline_bal_acc)
baseline_macro_f1 = f1_score(y_test, y_pred_dummy, average="macro", zero_division=0)
print("Baseline macro F1:", baseline_macro_f1, "\n")

# Forest settings gathered in one place: 300 trees, fixed seed, per-bootstrap
# class re-weighting, and all available cores.
rf_params = {
    "n_estimators": 300,
    "random_state": 42,
    "class_weight": 'balanced_subsample',
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# Hard class predictions for the held-out split.
y_pred = rf.predict(X_test)

# Headline scores; the F1 averages treat undefined per-class values as 0.
acc = accuracy_score(y_test, y_pred)
bal_acc = balanced_accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
weighted_f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

print("=== Random Forest performance (Test set) ===")
for metric_label, metric_value in (
    ("Accuracy:", acc),
    ("Balanced accuracy:", bal_acc),
    ("Macro F1:", macro_f1),
):
    print(metric_label, metric_value)
print("Weighted F1:", weighted_f1, "\n")

# List every encoded class id explicitly. Without `labels`, classification_report
# infers the classes from y_test/y_pred and raises when a rare class is missing
# from both, because the inferred count no longer matches `target_names`.
# When every class is present the output is unchanged.
label_ids = np.arange(len(encoder.classes_))

report = classification_report(
    y_test, y_pred, labels=label_ids, target_names=encoder.classes_,
    zero_division=0, output_dict=True
)
# Recall per class name, pulled from the dict form of the report.
per_class_recall = {label: report[label]["recall"] for label in encoder.classes_ if label in report}
print("Per-class recall (focus on C/M/X vs 0):")
for lbl in encoder.classes_:
    print(f"  {lbl}: {per_class_recall.get(lbl, np.nan):.3f}")
print()

print("Classification Report:")
print(classification_report(
    y_test, y_pred, labels=label_ids, target_names=encoder.classes_,
    zero_division=0, digits=4
))

# How the model's predictions are spread across classes (counts and percentages).
print("\n=== Predicted class distribution on Test ===")
pred_df = counts_pct(y_pred, encoder.classes_)
print(pred_df, "\n")

# Pin both matrices to every encoded class id so they always have one
# row/column per entry in encoder.classes_. Without `labels`, a class missing
# from both y_test and y_pred shrinks the matrix and the tick labels below
# no longer line up with it.
label_ids = np.arange(len(encoder.classes_))
cm = confusion_matrix(y_test, y_pred, labels=label_ids)
cm_norm = confusion_matrix(y_test, y_pred, labels=label_ids, normalize='true')  # row-normalized: recall per class

# Raw counts.
plt.figure(figsize=(7,6))
sns.heatmap(cm, annot=True, fmt="d",
            xticklabels=encoder.classes_, yticklabels=encoder.classes_, cmap="Blues")
plt.title("Confusion Matrix (counts) - Flare Strength Categories")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()

# Each row divided by its total, so the diagonal reads as per-class recall.
plt.figure(figsize=(7,6))
sns.heatmap(cm_norm, annot=True, fmt=".2f",
            xticklabels=encoder.classes_, yticklabels=encoder.classes_, cmap="Blues")
plt.title("Confusion Matrix (row-normalized) - Recall per Class")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()

# --- Feature Importance (Top 10) ---
# Pair each of the forest's importance scores with its feature column name,
# rank them from most to least important, and chart the ten highest.
feature_scores = pd.Series(rf.feature_importances_, index=X.columns)
importances = feature_scores.sort_values(ascending=False)
top_ten = importances.head(10)
top_ten.plot(kind='bar', figsize=(8,5))
plt.title("Top 10 Important Features")
plt.ylabel("Importance")
plt.tight_layout()
plt.show()


In [None]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    balanced_accuracy_score,
    f1_score
)
import matplotlib.pyplot as plt
import seaborn as sns

# Single seed passed to both train_test_split calls and to LinearSVC below,
# so the splits and the fitted model are repeatable from run to run.
RANDOM_STATE = 42

def counts_pct(y, classes):
    """Summarise the class make-up of ``y`` as counts and percentages.

    Args:
        y: Iterable of integer class ids, where id ``i`` refers to ``classes[i]``.
        classes: Sequence of class names ordered by id.

    Returns:
        DataFrame with one row per class name (index named ``"label"``) and
        columns ``"count"`` and ``"percent"`` (0-100 share of all counted
        items). Classes that never occur are reported with a count of 0.
    """
    n_classes = len(classes)
    # Count occurrences per id, forcing a row for every id in 0..n_classes-1.
    tally = pd.Series(y).value_counts().reindex(range(n_classes), fill_value=0)
    names = pd.Series(tally.index).map(dict(enumerate(classes)))

    summary = pd.DataFrame({"label": names, "count": tally.values})
    summary["percent"] = 100 * summary["count"] / summary["count"].sum()
    return summary.set_index("label")

# Read the dataset from disk (change the file name to match your own CSV).
data = pd.read_csv("Dataset.csv")

# Features: the first 30 columns by position, copied so later edits do not
# touch `data`.
feature_columns = data.iloc[:, :30]
X = feature_columns.copy()

# Labels: the flare_strength column at position 30, as whitespace-trimmed text.
label_text = data.iloc[:, 30].astype(str)
y_raw = label_text.str.strip()

def simplify_flare(label):
    """Map a raw flare label string onto one of the coarse classes.

    Args:
        label: Label text (a ``str``).

    Returns:
        The leading letter when the label begins with upper-case ``C``,
        ``M`` or ``X``; ``"0"`` when the label is exactly ``"0"`` or
        ``"0.0"``; otherwise ``"Unknown"``.
    """
    if label.startswith(("C", "M", "X")):
        # A successful prefix match guarantees the string is non-empty.
        return label[0]
    if label == "0" or label == "0.0":
        return "0"
    return "Unknown"

# Reduce each raw label to its coarse class.
y_simplified = y_raw.apply(simplify_flare)

# Optionally drop 'Unknown' if any: keep only rows whose label was recognised
# and renumber both objects from 0 so they stay aligned.
mask_known = y_simplified.ne("Unknown")
X = X[mask_known].reset_index(drop=True)
y_simplified = y_simplified[mask_known].reset_index(drop=True)

print("=== Overall class counts (before split) ===")
n_rows = len(y_simplified)
overall_counts = y_simplified.value_counts().to_frame("count")
overall_counts["percent"] = 100 * overall_counts["count"] / n_rows
print(overall_counts, "\n")

# Coerce features to numbers; values that cannot be parsed turn into NaN.
X = X.apply(pd.to_numeric, errors='coerce')

# Integer-encode the class names and keep the id -> name order for reporting.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y_simplified)
class_names = list(encoder.classes_)
print("Encoded class mapping:", dict(zip(class_names, encoder.transform(class_names))), "\n")


# Two stratified splits: hold out 20% as the test set, then take 25% of the
# remaining 80% as validation, i.e. 60% / 20% / 20% overall.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y_encoded,
    test_size=0.20, stratify=y_encoded, random_state=RANDOM_STATE,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.25, stratify=y_temp, random_state=RANDOM_STATE,
)

# Per-class counts and percentages for each of the three splits.
print("=== Train class distribution ===")
train_distribution = counts_pct(y_train, class_names)
print(train_distribution, "\n")

print("=== Validation class distribution ===")
val_distribution = counts_pct(y_val, class_names)
print(val_distribution, "\n")

print("=== Test class distribution ===")
test_distribution = counts_pct(y_test, class_names)
print(test_distribution, "\n")

# Baseline that always predicts the most frequent training class.
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)

# Score the baseline on the validation split first, then on the test split.
baseline_splits = {
    "Validation": (X_val, y_val),
    "Test": (X_test, y_test),
}
for split_name, (X_split, y_split) in baseline_splits.items():
    y_pred_dummy = dummy.predict(X_split)
    print(f"=== Majority-class baseline (DummyClassifier) on {split_name} ===")
    baseline_acc = accuracy_score(y_split, y_pred_dummy)
    print("Accuracy:", baseline_acc)
    baseline_bal_acc = balanced_accuracy_score(y_split, y_pred_dummy)
    print("Balanced accuracy:", baseline_bal_acc)
    baseline_macro_f1 = f1_score(y_split, y_pred_dummy, average="macro", zero_division=0)
    print("Macro F1:", baseline_macro_f1, "\n")

# Linear SVM with class re-weighting, a fixed seed, and a raised iteration cap.
svm = LinearSVC(C=1.0, class_weight='balanced', random_state=RANDOM_STATE, max_iter=5000)

# Median imputation and standardisation run inside the pipeline, so their
# statistics are learned only from the data handed to `fit`.
preprocessing_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
]
pipe = Pipeline(steps=preprocessing_steps + [("svm", svm)])

# Fit every step on the training split only.
pipe.fit(X_train, y_train)

# List every encoded class id explicitly for the reports below. Without
# `labels`, classification_report infers the classes from the split being
# scored and raises when a rare class is absent from both the true and the
# predicted labels, because the inferred count no longer matches
# `target_names`. When every class is present the output is unchanged.
label_ids = np.arange(len(class_names))

# Training-set scores, to compare against validation for signs of overfitting.
y_train_pred = pipe.predict(X_train)
print("=== SVM performance (Train set) ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_train, y_train_pred))
print("Macro F1:", f1_score(y_train, y_train_pred, average="macro", zero_division=0), "\n")

y_val_pred = pipe.predict(X_val)
print("=== SVM performance (Validation set) ===")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_val, y_val_pred))
print("Macro F1:", f1_score(y_val, y_val_pred, average="macro", zero_division=0))
print("Weighted F1:", f1_score(y_val, y_val_pred, average="weighted", zero_division=0), "\n")

print("Validation Classification Report:")
print(classification_report(
    y_val, y_val_pred, labels=label_ids, target_names=class_names,
    zero_division=0, digits=4
))

y_test_pred = pipe.predict(X_test)
print("=== SVM performance (Test set) ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_test_pred))
print("Macro F1:", f1_score(y_test, y_test_pred, average="macro", zero_division=0))
print("Weighted F1:", f1_score(y_test, y_test_pred, average="weighted", zero_division=0), "\n")

print("Test Classification Report:")
print(classification_report(
    y_test, y_test_pred, labels=label_ids, target_names=class_names,
    zero_division=0, digits=4
))

# How the predictions themselves are spread across the classes.
print("\n=== Predicted class distribution on Validation ===")
print(counts_pct(y_val_pred, class_names), "\n")
print("=== Predicted class distribution on Test ===")
print(counts_pct(y_test_pred, class_names), "\n")

def plot_confusions(y_true, y_pred, classes, title_suffix):
    """Show two confusion-matrix heatmaps: raw counts, then row-normalized.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        classes: Tick labels used on both heatmap axes.
        title_suffix: Text appended to each figure title (e.g. the split name).
    """
    # Both matrices are computed up front; each tuple carries the annotation
    # format and the title that go with its matrix.
    panels = (
        (
            confusion_matrix(y_true, y_pred),
            "d",
            f"Confusion Matrix (counts) - {title_suffix}",
        ),
        (
            confusion_matrix(y_true, y_pred, normalize='true'),
            ".2f",
            f"Confusion Matrix (row-normalized) - {title_suffix}",
        ),
    )

    for matrix, number_format, heading in panels:
        plt.figure(figsize=(7,6))
        sns.heatmap(matrix, annot=True, fmt=number_format, cmap="Blues",
                    xticklabels=classes, yticklabels=classes)
        plt.title(heading)
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.tight_layout()
        plt.show()

# Draw the pair of confusion-matrix figures for validation, then for test.
for split_truth, split_pred, split_title in (
    (y_val, y_val_pred, "Validation"),
    (y_test, y_test_pred, "Test"),
):
    plot_confusions(split_truth, split_pred, class_names, split_title)
