In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# --- Steps 1 & 2: Read the two input tables ---
# Both locations are relative paths, so they resolve against the kernel's
# current working directory.
clustering_results_path = "data/kmeans_cosine_clustering_results_5_1-1_threshold_0.3.csv"  # Adjust path if needed
file_path = "data/notable_state_correlation_trends_filtered.csv"

# Clustering output; later steps read its "State" and "Cluster" columns.
df_clusters = pd.read_csv(filepath_or_buffer=clustering_results_path)

# Long-format correlation table; later steps read its "State", "Metric 1",
# "Metric 2" and "Correlation" columns.
df_input = pd.read_csv(filepath_or_buffer=file_path)

# --- Step 3: Pivot Data for Classification ---
# Reshape the long table into one row per state and one column per
# (Metric 1, Metric 2) pair. Pairs that are absent for a state become 0.
# NOTE: DataFrame.pivot raises ValueError if a (State, Metric 1, Metric 2)
# combination occurs more than once in df_input.
df_pivot = df_input.pivot(index='State', columns=['Metric 1', 'Metric 2'], values='Correlation').fillna(0)

# ✅ Flatten MultiIndex Column Names
# Each column label is a (metric_1, metric_2) tuple; join it into one string.
# map(str, ...) keeps this working even if a metric label is not a string.
df_pivot.columns = ['_'.join(map(str, col)).strip() for col in df_pivot.columns.values]
df_pivot = df_pivot.reset_index()  # Make "State" a normal column for merging

# --- Step 4: Merge with Cluster Labels ---
# Only the join key and the label are taken from the clustering results, so any
# other column stored in that file cannot end up in the feature matrix (and
# leak information about the label). The default inner join drops states that
# are missing from either table.
df_merged = df_pivot.merge(df_clusters[["State", "Cluster"]], on="State")

# Extract features (X) and labels (y)
# X: every pivoted correlation column; y: the cluster assigned to each state.
X = df_merged.drop(columns=["State", "Cluster"]).values
y = df_merged["Cluster"].values

# --- Step 5: Train-Test Split ---
# Fraction of rows held out for evaluation. The split is seeded (random_state)
# and is not stratified by cluster.
test_size = 0.4  # Experiment with 0.4 or 0.5
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
)

# --- Step 6: Standardize Features ---
# Scaling statistics are learned from the training rows only and then applied
# unchanged to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Step 7: Train a Random Forest Classifier ---
# fit() returns the estimator itself, so construction and training can be chained.
clf = RandomForestClassifier(
    n_estimators=200,  # Increased n_estimators for stability
    random_state=42,
).fit(X_train, y_train)

# --- Step 8: Evaluate Model ---
# Predict the cluster of every held-out row.
y_pred = clf.predict(X_test)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy:.4f}")

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Display confusion matrix
# Pin rows/columns to every cluster label in the full label vector (sorted).
# Without `labels`, the matrix only covers classes that appear in y_test or
# y_pred, so its shape and the meaning of each row could change from one split
# to the next when a small cluster is absent from the held-out rows.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=np.unique(y)))




Classification Accuracy: 0.9412

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00         2
         1.0       1.00      1.00      1.00         5
         2.0       1.00      1.00      1.00         5
         3.0       0.50      1.00      0.67         1
         4.0       1.00      0.75      0.86         4

    accuracy                           0.94        17
   macro avg       0.90      0.95      0.90        17
weighted avg       0.97      0.94      0.95        17


Confusion Matrix:
[[2 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 1 0]
 [0 0 0 1 3]]
