In [2]:
from extractors.extractors import extractors
from evaluation.evaluation import evaluate_concepts, read_json_from_file

json_data = read_json_from_file("NER_prerequisites.json")
f1_results = []
f1_at_O_results = []
for name, extractor in extractors.items():
    evaluation_data = []
    for item in json_data:
        prerequisite_text = item.get('prerequisite_text', '')
        gt_concepts = item.get('concepts', [])
        extracted_concepts = extractor.extractKeyWords(prerequisite_text, int(len(gt_concepts)*2))
        evaluation_data.append((gt_concepts, extracted_concepts))
    results = evaluate_concepts(name, evaluation_data)
    f1_results.append(results['f1'])
    f1_at_O_results.append(results['f1_at_O'])

### Position Rank
Precision: 0.53
Recall: 0.85
F1 Score: 0.64
Precision@O: 0.56
Recall@O: 0.56
F1@O Score: 0.56


### YAKE
Precision: 0.39
Recall: 0.76
F1 Score: 0.51
Precision@O: 0.56
Recall@O: 0.56
F1@O Score: 0.56


### Key BERT
Precision: 0.33
Recall: 0.65
F1 Score: 0.44
Precision@O: 0.46
Recall@O: 0.46
F1@O Score: 0.46


### RAKE
Precision: 0.47
Recall: 0.74
F1 Score: 0.56
Precision@O: 0.50
Recall@O: 0.50
F1@O Score: 0.50


### TextRank
Precision: 0.53
Recall: 0.83
F1 Score: 0.63
Precision@O: 0.57
Recall@O: 0.57
F1@O Score: 0.57


### LLM Powered Approach
Precision: 0.78
Recall: 0.78
F1 Score: 0.77
Precision@O: 0.82
Recall@O: 0.77
F1@O Score: 0.79




In [4]:
import numpy as np
from scipy.stats import ttest_rel

# Assume last row is your LLM-based method
llm_f1 = np.array(f1_results[-1])
llm_f1_at_o = np.array(f1_at_O_results[-1])

print("=== Paired t-test results vs. LLM (F1) ===")
for i, method_f1 in enumerate(f1_results[:-1]):  # Skip LLM row
    method_f1 = np.array(method_f1)
    t_stat, p_value = ttest_rel(llm_f1, method_f1)
    print(f"Method {i+1}: t = {t_stat:.4f}, p = {p_value:.4f} -> {'Significant' if p_value < 0.05 else 'Not significant'}")

print("\n=== Paired t-test results vs. LLM (F1@O) ===")
for i, method_f1o in enumerate(f1_at_O_results[:-1]):
    method_f1o = np.array(method_f1o)
    t_stat, p_value = ttest_rel(llm_f1_at_o, method_f1o)
    print(f"Method {i+1}: t = {t_stat:.4f}, p = {p_value:.4f} -> {'Significant' if p_value < 0.05 else 'Not significant'}")


=== Paired t-test results vs. LLM (F1) ===
Method 1: t = 4.1884, p = 0.0001 -> Significant
Method 2: t = 7.7896, p = 0.0000 -> Significant
Method 3: t = 8.9242, p = 0.0000 -> Significant
Method 4: t = 5.5223, p = 0.0000 -> Significant
Method 5: t = 4.5800, p = 0.0000 -> Significant

=== Paired t-test results vs. LLM (F1@O) ===
Method 1: t = 5.4932, p = 0.0000 -> Significant
Method 2: t = 5.3710, p = 0.0000 -> Significant
Method 3: t = 7.5544, p = 0.0000 -> Significant
Method 4: t = 6.7479, p = 0.0000 -> Significant
Method 5: t = 5.2580, p = 0.0000 -> Significant


In [5]:
from scipy.stats import wilcoxon
import numpy as np

# Your LLM-based results
llm_f1 = np.array(f1_results[-1])
llm_f1_at_o = np.array(f1_at_O_results[-1])

print("=== Wilcoxon signed-rank test vs. LLM (F1) ===")
for i, method_f1 in enumerate(f1_results[:-1]):  # All baselines
    method_f1 = np.array(method_f1)
    stat, p = wilcoxon(llm_f1, method_f1)
    print(f"Method {i+1}: W = {stat}, p = {p:.4f} -> {'Significant' if p < 0.05 else 'Not significant'}")

print("\n=== Wilcoxon signed-rank test vs. LLM (F1@O) ===")
for i, method_f1o in enumerate(f1_at_O_results[:-1]):
    method_f1o = np.array(method_f1o)
    stat, p = wilcoxon(llm_f1_at_o, method_f1o)
    print(f"Method {i+1}: W = {stat}, p = {p:.4f} -> {'Significant' if p < 0.05 else 'Not significant'}")


=== Wilcoxon signed-rank test vs. LLM (F1) ===
Method 1: W = 442.0, p = 0.0000 -> Significant
Method 2: W = 206.5, p = 0.0000 -> Significant
Method 3: W = 155.0, p = 0.0000 -> Significant
Method 4: W = 361.0, p = 0.0000 -> Significant
Method 5: W = 380.0, p = 0.0000 -> Significant

=== Wilcoxon signed-rank test vs. LLM (F1@O) ===
Method 1: W = 224.0, p = 0.0000 -> Significant
Method 2: W = 178.0, p = 0.0000 -> Significant
Method 3: W = 161.5, p = 0.0000 -> Significant
Method 4: W = 156.5, p = 0.0000 -> Significant
Method 5: W = 228.5, p = 0.0000 -> Significant
