In [1]:
# IMPORTS

import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import textstat
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, mean_absolute_error
)

In [2]:
# ========= 1. Load Data =========
with open("final_labeled_questions.json", "r") as f:
    data = json.load(f)

df = pd.DataFrame(data)
print(f"Loaded {len(df)} questions")

# ========= 2. Define NLP Tools =========
nlp = spacy.load("en_core_web_sm")

LOGICAL_CONNECTIVES = [
    "if", "then", "therefore", "because", "and", "or", "not", "but", 
    "hence", "since", "implies", "thus", "however", "although", "unless"
]

# ========= 3. Custom Feature Extractor =========
class TextFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        flesch = [textstat.flesch_kincaid_grade(text) for text in X]

        # POS ratios and logical connectives
        pos_ratios = []
        logic_counts = []

        for text, doc in zip(X, nlp.pipe(X, disable=["ner"])):
            tokens = [t for t in doc if t.is_alpha]
            total = len(tokens) if len(tokens) > 0 else 1

            pos_count = doc.count_by(spacy.attrs.POS)
            noun_ratio = pos_count.get(nlp.vocab.strings["NOUN"], 0) / total
            verb_ratio = pos_count.get(nlp.vocab.strings["VERB"], 0) / total
            adj_ratio = pos_count.get(nlp.vocab.strings["ADJ"], 0) / total
            adv_ratio = pos_count.get(nlp.vocab.strings["ADV"], 0) / total

            pos_ratios.append([noun_ratio, verb_ratio, adj_ratio, adv_ratio])

            # count logical connectives
            logic = sum(text.lower().split().count(word) for word in LOGICAL_CONNECTIVES)
            logic_counts.append(logic)

        pos_ratios = np.array(pos_ratios)
        logic_counts = np.array(logic_counts).reshape(-1, 1)
        flesch = np.array(flesch).reshape(-1, 1)

        # combine all numeric features
        return np.hstack([pos_ratios, logic_counts, flesch])

# ========= 4. TF-IDF and Feature Union =========
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1,2), stop_words='english')
text_feats = TextFeatureExtractor()

# We’ll fit TF-IDF separately, and then concatenate numeric features
X_tfidf = tfidf.fit_transform(df["question"])
X_extra = text_feats.fit_transform(df["question"])

# Scale numeric features and concatenate
scaler = StandardScaler()
X_extra_scaled = scaler.fit_transform(X_extra)

from scipy.sparse import hstack
X_all = hstack([X_tfidf, X_extra_scaled])



Loaded 1000 questions


In [3]:
# ========= 5. Split & Train Models =========
X_train, X_test, yb_train, yb_test = train_test_split(
    X_all, df["binary"], test_size=0.2, random_state=42, stratify=df["binary"]
)
_, _, ym_train, ym_test = train_test_split(
    X_all, df["multiclass"], test_size=0.2, random_state=42, stratify=df["multiclass"]
)

In [4]:
# Helper to safely compute ROC-AUC for binary
def safe_roc_auc(y_true, y_pred):
    try:
        return roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

# Helper to compute and store metrics
def evaluate_model(name, y_true, y_pred, task_type="binary"):
    results = {"Model": name}
    results["Accuracy"] = accuracy_score(y_true, y_pred)
    
    if task_type == "binary":
        results["Precision"] = precision_score(y_true, y_pred, zero_division=0)
        results["Recall"] = recall_score(y_true, y_pred, zero_division=0)
        results["F1"] = f1_score(y_true, y_pred, zero_division=0)
        results["ROC-AUC"] = safe_roc_auc(y_true, y_pred)
    else:
        results["Macro-F1"] = f1_score(y_true, y_pred, average="macro", zero_division=0)
        results["Weighted-F1"] = f1_score(y_true, y_pred, average="weighted", zero_division=0)
        results["MAE"] = mean_absolute_error(y_true, y_pred)
    
    return results

In [5]:
# ========= 6. Binary Models =========
binary_metrics = []

# Majority Baseline
maj_bin = DummyClassifier(strategy="most_frequent")
maj_bin.fit(X_train, yb_train)
yb_pred_maj = maj_bin.predict(X_test)
binary_metrics.append(evaluate_model("Majority (Binary)", yb_test, yb_pred_maj, "binary"))

# Random Baseline
rand_bin = DummyClassifier(strategy="uniform", random_state=42)
rand_bin.fit(X_train, yb_train)
yb_pred_rand = rand_bin.predict(X_test)
binary_metrics.append(evaluate_model("Random (Binary)", yb_test, yb_pred_rand, "binary"))

# Logistic Regression
clf_bin = LogisticRegression(max_iter=200, class_weight="balanced")
clf_bin.fit(X_train, yb_train)
yb_pred = clf_bin.predict(X_test)
binary_metrics.append(evaluate_model("Logistic Regression (Binary)", yb_test, yb_pred, "binary"))

# ========= 7. Multiclass Models =========
multi_metrics = []

# Majority Baseline
maj_multi = DummyClassifier(strategy="most_frequent")
maj_multi.fit(X_train, ym_train)
ym_pred_maj = maj_multi.predict(X_test)
multi_metrics.append(evaluate_model("Majority (Multi)", ym_test, ym_pred_maj, "multi"))

# Random Baseline
rand_multi = DummyClassifier(strategy="uniform", random_state=42)
rand_multi.fit(X_train, ym_train)
ym_pred_rand = rand_multi.predict(X_test)
multi_metrics.append(evaluate_model("Random (Multi)", ym_test, ym_pred_rand, "multi"))

# Logistic Regression
clf_multi = LogisticRegression(max_iter=200, class_weight="balanced", multi_class="multinomial")
clf_multi.fit(X_train, ym_train)
ym_pred = clf_multi.predict(X_test)
multi_metrics.append(evaluate_model("Logistic Regression (Multi)", ym_test, ym_pred, "multi"))




In [6]:
# ========= 8. Combine and Export =========
binary_df = pd.DataFrame(binary_metrics)
multi_df = pd.DataFrame(multi_metrics)

# Write to one Excel file with two sheets
with pd.ExcelWriter("evaluation_results.xlsx") as writer:
    binary_df.to_excel(writer, sheet_name="Binary", index=False)
    multi_df.to_excel(writer, sheet_name="Multi-class", index=False)

print("✅ Results saved to evaluation_results.xlsx")
display(binary_df)
display(multi_df)

✅ Results saved to evaluation_results.xlsx


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC-AUC
0,Majority (Binary),0.505,0.0,0.0,0.0,0.5
1,Random (Binary),0.485,0.48,0.484848,0.482412,0.484998
2,Logistic Regression (Binary),0.82,0.824742,0.808081,0.816327,0.819882


Unnamed: 0,Model,Accuracy,Macro-F1,Weighted-F1,MAE
0,Majority (Multi),0.405,0.115302,0.233488,0.975
1,Random (Multi),0.2,0.169389,0.227063,1.505
2,Logistic Regression (Multi),0.31,0.180235,0.307992,1.175
