# In [None]:
import json
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE  # If needed
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# 1. Data Loading and Preparation
# JSON text is UTF-8; naming the encoding keeps the load independent of the
# platform's default locale encoding.
with open("../datasets/ground_truth.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Keys of the five turning points read from each narrative's 'turning_points'.
tp_keys = ['tp1', 'tp2', 'tp3', 'tp4', 'tp5']

# Extract turning points (features) and source labels (targets)
x_ds = []
y_ds = []

for narrative in tqdm(data, desc="Processing narratives"):
    turning_points = narrative['turning_points']
    # A turning point that is absent from the record is encoded as 0.
    tps = [turning_points.get(key, 0) for key in tp_keys]
    x_ds.append(tps)

    # Narratives without a "source" get a placeholder label that is
    # filtered out below.
    y_ds.append(narrative.get("source", "Unknown"))

# Convert to pandas DataFrame
df = pd.DataFrame(x_ds, columns=tp_keys)
df['source'] = y_ds

# Drop rows whose source is unknown. The explicit copy makes `df` an
# independent frame, so the feature columns added to it later are not
# chained assignments on a filtered selection.
df = df[df['source'] != 'Unknown'].copy()

# Feature Engineering
tp_cols = ['tp1', 'tp2', 'tp3', 'tp4', 'tp5']

# Differences between every pair of turning points (later minus earlier),
# generated in the order tp1-tp2, tp1-tp3, ..., tp4-tp5.
diff_cols = []
for i, earlier in enumerate(tp_cols):
    for later in tp_cols[i + 1:]:
        col = f'diff_{earlier}_{later}'
        df[col] = df[later] - df[earlier]
        diff_cols.append(col)

# Ratios.
# A denominator of 0 would produce +/-inf, which the StandardScaler applied
# to these features rejects with a ValueError. Mapping zero denominators to
# NaN yields NaN instead, which the scaler passes through and LightGBM
# treats as a missing value.
ratio_pairs = [('tp1', 'tp5'), ('tp2', 'tp4')]
ratio_cols = []
for numerator, denominator in ratio_pairs:
    col = f'ratio_{numerator}_{denominator}'
    df[col] = df[numerator] / df[denominator].replace(0, np.nan)
    ratio_cols.append(col)

# Select Features: raw turning points, then differences, then ratios
features = tp_cols + diff_cols + ratio_cols
X = df[features].values
y = df['source'].values

# Label Encoding
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Hold out the test set BEFORE anything is fitted. The scaler statistics and
# the hyperparameter search below must only ever see training rows; otherwise
# the test accuracy reported at the end is optimistically biased.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Feature Scaling: statistics are learned from the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full matrix under the same train-fitted scaling; kept so that code relying
# on the `X_scaled` name keeps working.
X_scaled = scaler.transform(X)

# Optional: Handle Class Imbalance (resample the training rows only)
# smote = SMOTE(random_state=42)
# X_train, y_train = smote.fit_resample(X_train, y_train)

# 2. Model Definition and Hyperparameter Tuning
# Define the parameter grid (2 * 2 * 2 * 2 = 16 combinations)
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0;
# with the default of 0 the 'subsample' entry has no effect. Confirm intent.
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'num_leaves': [31, 50],
    'max_depth': [-1, 10],
    'min_child_samples': [20],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'reg_alpha': [0.1],
    'reg_lambda': [0.1]
}
# Initialize the LightGBM classifier
lgbm_clf = lgb.LGBMClassifier(random_state=42)

# Initialize Grid Search with cross-validation
grid_search = GridSearchCV(
    estimator=lgbm_clf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit Grid Search on the training split only (single-step progress bar)
with tqdm(total=1, desc="Grid Search") as pbar:
    grid_search.fit(X_train, y_train)
    pbar.update(1)

# Best Parameters and Score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# 3. Evaluate the Best Model with Cross-Validation
best_lgbm = grid_search.best_estimator_

# Define Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate within the training split. Each fold refits `best_lgbm`
# in place; it is refitted on the whole training split afterwards.
cv_scores = []
for train_idx, val_idx in tqdm(skf.split(X_train, y_train),
                               total=skf.get_n_splits(), desc="Cross-validation"):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    best_lgbm.fit(X_train_fold, y_train_fold)
    score = best_lgbm.score(X_val_fold, y_val_fold)
    cv_scores.append(score)

cv_scores = np.array(cv_scores)
print(f"Final LightGBM Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# 4. Final Model Training and Evaluation
# Train the best model on the entire training set with progress bar
with tqdm(total=1, desc="Final Training") as pbar:
    best_lgbm.fit(X_train, y_train)
    pbar.update(1)

# Predict on the held-out test set
y_pred = best_lgbm.predict(X_test)
# Column 1 is the probability assigned to le.classes_[1].
# NOTE(review): this is only a complete score if there are exactly two
# sources; for more classes keep the full predict_proba matrix.
y_prob = best_lgbm.predict_proba(X_test)[:, 1]

# Evaluate Performance on the held-out predictions
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1, labelled with the original source names
class_names = le.classes_
report_text = classification_report(y_test, y_pred, target_names=class_names)
print("\nClassification Report:")
print(report_text)

# Confusion Matrix: rows are the actual classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,5))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

# Feature Importance: pair each feature name with the fitted model's
# importance value and rank from most to least important.
importances = best_lgbm.feature_importances_
feature_names = features
feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart, one bar per feature in ranked order
plt.figure(figsize=(10,6))
importance_ax = sns.barplot(
    data=feature_importance_df,
    x='importance',
    y='feature',
    palette='viridis',
)
importance_ax.set_title('Feature Importance')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
plt.show()