# 500,000-Sentence BERTopic Analysis on the UNSC Corpus (inference output)

In [None]:

!pip install bertopic umap-learn --quiet


import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import torch, gc


# Load the sentence-level inference output (one dict per sentence).
with open("/kaggle/input/labelingunsc/unsc_wholecorpus_inference_output.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# prepare DataFrame: one row per sentence with the number of predicted hedges
sentences = [d["Sentence"] for d in data]
hedge_counts = [len(d["Gold_Hedges"]) for d in data]
df = pd.DataFrame({"Sentence": sentences, "Hedge_Count": hedge_counts})

# sampling for memory issue
# DataFrame.sample raises ValueError when n exceeds the number of rows, so cap
# the sample size at the corpus size (smaller inputs are simply shuffled).
sample_size = min(500000, len(df))
df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

# run bertopic
print("Running BERTopic...")
topic_model = BERTopic(verbose=True)
# BERTopic documents its input as a list of strings, so hand it a plain list
# rather than a pandas Series.
topics, _ = topic_model.fit_transform(df["Sentence"].tolist())
df["Topic"] = topics
# Topic -1 is excluded from the analysis below.
df = df[df["Topic"] != -1]

# Per-topic hedging statistics: sentence count, mean hedges per sentence and
# total hedges.
per_topic = df.groupby("Topic")
topic_summary = per_topic.agg(
    Num_Sentences=("Sentence", "count"),
    Avg_Hedges_Per_Sentence=("Hedge_Count", "mean"),
    Total_Hedges=("Hedge_Count", "sum"),
).reset_index()

# Label every topic with its first three keywords as returned by the model.
topic_labels = [
    ", ".join(word for word, _ in topic_model.get_topic(topic_num)[:3])
    for topic_num in topic_summary["Topic"]
]
topic_summary["Top_Keywords"] = topic_labels

# Keep the ten topics with the highest average number of hedges per sentence.
top_topics = topic_summary.sort_values(
    by="Avg_Hedges_Per_Sentence",
    ascending=False,
).head(10)

# Bar chart of the most hedged topics; the figure is written to disk before it
# is displayed.
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(
    data=top_topics,
    x="Top_Keywords",
    y="Avg_Hedges_Per_Sentence",
    palette="mako",
    ax=ax,
)
ax.set_title(" Most Hedged Topics")
ax.set_xlabel("Topic Keywords")
ax.set_ylabel("Avg. Hedge Cues per Sentence")
plt.xticks(rotation=45, ha="right")
fig.tight_layout()
fig.savefig("/kaggle/working/auto_topics_hedging_analysis_500ksentences.png")
plt.show()

# UNSC Evaluation On Manually Annotated Dataset

* # UNSC evaluation with different trained models

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.nn.functional import softmax
from sklearn.metrics import precision_score, recall_score, f1_score


UNSC_PATH = " " # placeholder: path to the annotated UNSC subset (JSON) used for evaluation
MODEL_PATH = " " # placeholder: path to the .pt state dict of the trained model you want to evaluate
BASE_MODEL = "bert-base-cased"  # this must match the base model used in training
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 128  # sentences are truncated/padded to this many tokens at prediction time

# Load the tokenizer and build the token-classification architecture with a
# two-label head; the prediction code below treats label 1 as "hedge".
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForTokenClassification.from_pretrained(BASE_MODEL, num_labels=2).to(DEVICE)

# Overwrite the freshly initialised weights with the trained .pt state dict and
# switch the model to evaluation mode.
state_dict = torch.load(MODEL_PATH, map_location=DEVICE)
model.load_state_dict(state_dict)
model.eval()

#  Predict hedge cue words 
def predict_hedge_cues(sentence):
    """Return the distinct hedge cue strings predicted for ``sentence``.

    The sentence is tokenized (truncated/padded to ``MAX_LEN``), run through
    the module-level ``model`` and every token predicted as label 1 is mapped
    back to the sentence through its character offsets.

    Consecutive flagged word pieces that belong to the same word are merged
    into one string. Previously each piece was reported separately (for
    example "possib" and "ly"), and such fragments can never equal a gold cue
    in the evaluation. Which tokens count as hedges is unchanged; only the way
    their text is assembled differs.

    Args:
        sentence: Raw sentence text.

    Returns:
        A list of unique cue strings in order of first appearance.
    """
    encoding = tokenizer(
        sentence,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_offsets_mapping=True,
        return_attention_mask=True
    )

    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)
    offset_mapping = encoding["offset_mapping"][0].tolist()
    # Word index per token; None marks special and padding tokens.
    word_ids = encoding.word_ids(0)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()

    # A dict is used as an insertion-ordered set so the result is deterministic.
    cue_words = {}
    open_piece = None  # (word_id, char_start, char_end) of the fragment being built

    for pred, word_id, (start, end) in zip(predictions, word_ids, offset_mapping):
        flagged = pred == 1 and word_id is not None
        if flagged and open_piece is not None and open_piece[0] == word_id:
            # Another flagged piece of the same word: extend the fragment.
            open_piece = (word_id, open_piece[1], end)
            continue
        if open_piece is not None:
            cue_words[sentence[open_piece[1]:open_piece[2]]] = None
            open_piece = None
        if flagged:
            open_piece = (word_id, start, end)

    if open_piece is not None:
        cue_words[sentence[open_piece[1]:open_piece[2]]] = None

    return list(cue_words)

# Read the annotated evaluation subset.
with open(UNSC_PATH, "r", encoding="utf-8") as f:
    unsc_data = json.load(f)

# running the inference: one result record per annotated sentence
all_results = []

for entry in unsc_data:
    review_id = entry["Review_id"]
    all_results.extend(
        {
            "Review_id": review_id,
            "Sentence_id": sent["Sentence_id"],
            "Sentence": sent["Sentence"],
            "Predicted_Cues": predict_hedge_cues(sent["Sentence"]),
            "Gold_Hedges": sent.get("Gold_Hedges", []),
        }
        for sent in entry["Sentences"]
    )

# Persist predictions next to the gold annotations for later inspection.
serialized = json.dumps(all_results, indent=2, ensure_ascii=False)
with open("unsc_hedge_cue_predictions.json", "w", encoding="utf-8") as f:
    f.write(serialized)

print("Saved predictions to 'unsc_hedge_cue_predictions.json'")

# evaluate
with open("unsc_hedge_cue_predictions.json", "r", encoding="utf-8") as f:
    predictions = json.load(f)

all_true = []
all_pred = []

for entry in predictions:
    # Normalise both sides the same way the inter-annotator agreement cell
    # does (strip + lowercase), so stray whitespace or capitalisation in the
    # annotations does not turn a correct prediction into a miss.
    pred_cues = {cue.strip().lower() for cue in entry.get("Predicted_Cues", [])}
    gold_cues = {h["Hedge"].strip().lower() for h in entry.get("Gold_Hedges", [])}

    # Every gold cue is a positive example: found -> (1, 1), missed -> (1, 0).
    for cue in gold_cues:
        all_true.append(1)
        all_pred.append(1 if cue in pred_cues else 0)

    # Every predicted cue that is not gold is a false positive: (0, 1).
    for cue in pred_cues:
        if cue not in gold_cues:
            all_true.append(0)
            all_pred.append(1)

# metrics computation
# zero_division=0 reports 0 instead of warning when a denominator is empty
# (for example when the model predicts no cue at all).
precision = precision_score(all_true, all_pred, zero_division=0)
recall = recall_score(all_true, all_pred, zero_division=0)
f1 = f1_score(all_true, all_pred, zero_division=0)

print("Cue Detection Evaluation Results:")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")


#  Inter-Annotator Agreement for Hedge Cues (Kseniya&Melisa)UNSC

In [None]:
import json
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support

def load_annotations(path):
    """Read the UTF-8 JSON file at ``path`` and return the parsed content."""
    with open(path, mode='r', encoding='utf-8') as handle:
        raw = handle.read()
    return json.loads(raw)

def extract_cues(sentence):
    """Return the set of stripped, lower-cased hedge cues of a sentence record.

    A record without a "Gold_Hedges" entry yields an empty set.
    """
    cues = set()
    for hedge in sentence.get("Gold_Hedges", []):
        cues.add(hedge['Hedge'].strip().lower())
    return cues

def compute_agreement(my_data, friend_data):
    """Print cue-level agreement scores and return a per-sentence comparison.

    Reviews and their sentences are paired positionally. Precision is the
    share of the first annotator's cues that the second annotator also marked,
    recall is the share of the second annotator's cues that the first one also
    marked, and F1 is their harmonic mean.

    Args:
        my_data: Reviews of the first annotator, each with a "Sentences" list.
        friend_data: Reviews of the second annotator, in the same order.

    Returns:
        A list with one dict per sentence pair holding both cue lists and the
        cues both annotators agree on.

    Raises:
        AssertionError: If the two inputs contain a different number of reviews.
    """
    assert len(my_data) == len(friend_data), "Mismatch in number of reviews"

    per_sentence_results = []
    n_shared = n_mine = n_friend = 0

    for own_review, other_review in zip(my_data, friend_data):
        sentence_pairs = zip(own_review["Sentences"], other_review["Sentences"])
        for own_sent, other_sent in sentence_pairs:
            own_cues = extract_cues(own_sent)
            other_cues = extract_cues(other_sent)
            shared = own_cues.intersection(other_cues)

            n_shared += len(shared)
            n_mine += len(own_cues)
            n_friend += len(other_cues)

            record = {
                "Sentence_id": own_sent["Sentence_id"],
                "Sentence": own_sent["Sentence"],
                "My Cues": list(own_cues),
                "Friend's Cues": list(other_cues),
                "Agreed Cues": list(shared),
            }
            per_sentence_results.append(record)

    # An empty denominator yields a score of 0 rather than a division error.
    precision = n_shared / n_mine if n_mine else 0
    recall = n_shared / n_friend if n_friend else 0
    score_sum = precision + recall
    f1 = 2 * precision * recall / score_sum if score_sum else 0

    print(f"\n Agreement Scores (Cue-level):")
    print(f"  • Precision: {precision:.2f}")
    print(f"  • Recall:    {recall:.2f}")
    print(f"  • F1 Score:  {f1:.2f}")

    return per_sentence_results


# replace these with your actual file paths
my_annotations_path = "/kaggle/input/gold-labeled-unsc-last/gold_labeled_unsc_MELISA.json"
friend_annotations_path = "/kaggle/input/gold-labeled-unsc-last/gold_labeled_unsc_kseniya_LAST.json"

# Load both annotators' files and compare them cue by cue.
my_data = load_annotations(my_annotations_path)
friend_data = load_annotations(friend_annotations_path)
results = compute_agreement(my_data, friend_data)

# Store the per-sentence comparison for manual inspection.
report = json.dumps(results, indent=2, ensure_ascii=False)
with open("/kaggle/working/unsc_annotation_agreement.json", "w", encoding="utf-8") as f:
    f.write(report)

print("\n per-sentence comparison saved to: /kaggle/working/unsc_annotation_agreement.json")


# Cohen’s Kappa for Hedge Cues

In [None]:
import json
import re
from sklearn.metrics import cohen_kappa_score
from transformers import AutoTokenizer


def load_annotations(path):
    """Parse the UTF-8 encoded JSON annotation file at ``path``."""
    with open(path, mode='r', encoding='utf-8') as source:
        content = source.read()
    return json.loads(content)

def get_token_labels(sentence_text, hedge_cues, tokenizer):
    """Flag every sentence token that is part of an annotated hedge cue.

    The sentence and each cue are tokenized with ``tokenizer``; every position
    where a cue's token sequence occurs in the sentence is marked, so a cue
    that appears several times is flagged at all of its occurrences.

    Args:
        sentence_text: Raw sentence text.
        hedge_cues: Iterable of dicts carrying the cue text under "Hedge".
        tokenizer: Object exposing ``tokenize(text) -> list[str]``.

    Returns:
        ``(flags, tokens)`` where ``flags[i]`` is 1 if ``tokens[i]`` belongs to
        a cue occurrence and 0 otherwise.
    """
    tokens = tokenizer.tokenize(sentence_text)
    cue_token_flags = [0] * len(tokens)

    for cue in hedge_cues:
        cue_tokens = tokenizer.tokenize(cue['Hedge'].strip())
        cue_len = len(cue_tokens)
        if cue_len == 0:
            # A blank cue matches nothing; skip the scan entirely.
            continue
        for i in range(len(tokens) - cue_len + 1):
            if tokens[i:i + cue_len] == cue_tokens:
                cue_token_flags[i:i + cue_len] = [1] * cue_len

    return cue_token_flags, tokens

# compute kappa across all tokens
def compute_kappa(my_data, friend_data, tokenizer):
    """Print and return token-level Cohen's kappa between two annotators.

    Reviews and sentences are paired positionally. Each sentence is labelled
    once per annotator, using the first annotator's sentence text for both,
    and the label vectors are concatenated over the whole dataset.

    Args:
        my_data: Reviews of the first annotator.
        friend_data: Reviews of the second annotator, in the same order.
        tokenizer: Tokenizer forwarded to ``get_token_labels``.

    Returns:
        The kappa value computed by ``cohen_kappa_score``.
    """
    y_true, y_pred = [], []

    for my_review, friend_review in zip(my_data, friend_data):
        sentence_pairs = zip(my_review["Sentences"], friend_review["Sentences"])
        for my_sent, friend_sent in sentence_pairs:
            text = my_sent["Sentence"]
            my_labels = get_token_labels(text, my_sent.get("Gold_Hedges", []), tokenizer)[0]
            friend_labels = get_token_labels(text, friend_sent.get("Gold_Hedges", []), tokenizer)[0]

            # NOTE(review): both label vectors are derived from the same text,
            # so this guard is not expected to trigger; it does not detect
            # sentences that differ between the two annotation files.
            if len(my_labels) != len(friend_labels):
                print(f" Token length mismatch in sentence: {text[:50]}... Skipping.")
                continue

            y_true += friend_labels
            y_pred += my_labels

    kappa = cohen_kappa_score(y_true, y_pred)
    print(f"\n Cohen’s Kappa (token-level): {kappa:.4f}")
    return kappa

my_path = "/kaggle/input/gold-labeled-unsc-last/gold_labeled_unsc_MELISA.json"
friend_path = "/kaggle/input/gold-labeled-unsc-last/gold_labeled_unsc_kseniya_LAST.json"

# Read both annotators' files.
with open(my_path, 'r', encoding='utf-8') as f:
    my_data = json.load(f)
with open(friend_path, 'r', encoding='utf-8') as f:
    friend_data = json.load(f)

# Tokenizer used to split sentences and cues into comparable tokens.
# NOTE(review): the model cells in this notebook load "bert-base-cased";
# confirm that the uncased tokenizer is intended here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

compute_kappa(my_data, friend_data, tokenizer)


# Span agreement(fuzzy)

In [None]:
import json
import re
from sklearn.metrics import precision_recall_fscore_support

def clean_text(text):
    """Collapse each whitespace run to a single space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def tokenize(text):
    """Split whitespace-normalised ``text`` into whitespace-separated tokens."""
    normalised = clean_text(text)
    return normalised.split()

def span_iou(span1, span2):
    """Return the intersection-over-union of the token sets of two spans.

    Returns 0 when either span has no tokens.
    """
    left = set(tokenize(span1))
    right = set(tokenize(span2))
    if not (left and right):
        return 0
    return len(left & right) / len(left | right)

def match_spans(my_spans, friend_spans, threshold=1):
    """Greedily pair spans of two annotators and count the pairs.

    Each span in ``my_spans`` claims the first still-unclaimed span in
    ``friend_spans`` whose token IoU reaches ``threshold``; a span in
    ``friend_spans`` can be claimed at most once.

    Args:
        my_spans: Span dicts of the first annotator (text under "Span").
        friend_spans: Span dicts of the second annotator (text under "Span").
        threshold: Minimum IoU for two spans to count as a match.

    Returns:
        The number of matched spans.
    """
    claimed = set()
    for own in my_spans:
        candidate = next(
            (
                idx
                for idx, other in enumerate(friend_spans)
                if idx not in claimed
                and span_iou(own["Span"], other["Span"]) >= threshold
            ),
            None,
        )
        if candidate is not None:
            claimed.add(candidate)
    return len(claimed)

def extract_spans(sentence):
    """Return the sentence record's "Gold_Hedges" list, or an empty list."""
    spans = sentence.get("Gold_Hedges", [])
    return spans

def evaluate_fuzzy_span_agreement(my_data, friend_data, threshold=1):
    """Print and return span-level precision, recall and F1 between annotators.

    Reviews and sentences are paired positionally; spans are matched per
    sentence with ``match_spans`` at the given IoU ``threshold``.

    Args:
        my_data: Reviews of the first annotator.
        friend_data: Reviews of the second annotator, in the same order.
        threshold: Minimum token IoU for two spans to count as a match.

    Returns:
        ``(precision, recall, f1)``; each is 0 when its denominator is empty.
    """
    tp = total_my = total_friend = 0

    for my_review, friend_review in zip(my_data, friend_data):
        sentence_pairs = zip(my_review["Sentences"], friend_review["Sentences"])
        for my_sent, friend_sent in sentence_pairs:
            mine = extract_spans(my_sent)
            theirs = extract_spans(friend_sent)

            total_my += len(mine)
            total_friend += len(theirs)
            tp += match_spans(mine, theirs, threshold)

    precision = tp / total_my if total_my else 0
    recall = tp / total_friend if total_friend else 0
    score_sum = precision + recall
    f1 = 2 * precision * recall / score_sum if score_sum else 0

    print(f" Span-Level Agreement (IOU ≥ {threshold}):")
    print(f"  • Precision: {precision:.2f}")
    print(f"  • Recall:    {recall:.2f}")
    print(f"  • F1 Score:  {f1:.2f}")
    return precision, recall, f1

my_path = "/kaggle/input/gold-labeled-unsc-last/gold_labeled_unsc_MELISA.json"
friend_path = "/kaggle/input/gold-labeled-unsc-last/gold_labeled_unsc_kseniya_LAST.json"

# Read both annotation files, then score span agreement with exact token-set
# overlap (threshold=1).
with open(my_path, "r", encoding="utf-8") as f:
    my_data = json.loads(f.read())

with open(friend_path, "r", encoding="utf-8") as f:
    friend_data = json.loads(f.read())

evaluate_fuzzy_span_agreement(my_data, friend_data, threshold=1)


# Span-Level Cohen’s Kappa

In [None]:
import json
import re
from sklearn.metrics import cohen_kappa_score
from transformers import AutoTokenizer

def clean_text(text):
    """Replace every whitespace run with one space and strip the result."""
    squeezed = re.sub(r'\s+', ' ', text)
    return squeezed.strip()

def tokenize(text, tokenizer):
    """Tokenize whitespace-normalised ``text`` with ``tokenizer.tokenize``."""
    normalised = clean_text(text)
    return tokenizer.tokenize(normalised)

def label_tokens_with_spans(text, spans, tokenizer):
    """Mark the tokens covered by the first occurrence of each span.

    Args:
        text: Sentence text.
        spans: Iterable of dicts carrying the span text under "Span".
        tokenizer: Object exposing ``tokenize(text) -> list[str]``.

    Returns:
        ``(labels, tokens)`` where ``labels[i]`` is 1 if ``tokens[i]`` lies in
        the first occurrence of any span, else 0.
    """
    tokens = tokenize(text, tokenizer)
    labels = [0] * len(tokens)

    for span in spans:
        span_tokens = tokenizer.tokenize(clean_text(span['Span']))
        width = len(span_tokens)

        # Position of the first place where the span's tokens occur, if any.
        first_hit = next(
            (
                pos
                for pos in range(len(tokens) - width + 1)
                if tokens[pos:pos + width] == span_tokens
            ),
            None,
        )
        if first_hit is not None:
            labels[first_hit:first_hit + width] = [1] * width

    return labels, tokens

def compute_span_kappa(my_data, friend_data, tokenizer):
    """Print and return Cohen's kappa over token labels derived from spans.

    Reviews and sentences are paired positionally. The first annotator's
    whitespace-normalised sentence text is labelled once per annotator and the
    label vectors are concatenated over the whole dataset.

    Args:
        my_data: Reviews of the first annotator.
        friend_data: Reviews of the second annotator, in the same order.
        tokenizer: Tokenizer forwarded to ``label_tokens_with_spans``.

    Returns:
        The kappa value computed by ``cohen_kappa_score``.
    """
    y_true, y_pred = [], []

    for my_review, friend_review in zip(my_data, friend_data):
        sentence_pairs = zip(my_review["Sentences"], friend_review["Sentences"])
        for my_sent, friend_sent in sentence_pairs:
            sentence = clean_text(my_sent["Sentence"])

            my_labels = label_tokens_with_spans(
                sentence, my_sent.get("Gold_Hedges", []), tokenizer
            )[0]
            friend_labels = label_tokens_with_spans(
                sentence, friend_sent.get("Gold_Hedges", []), tokenizer
            )[0]

            # NOTE(review): both label vectors come from the same sentence
            # text, so this guard is not expected to trigger.
            if len(my_labels) != len(friend_labels):
                print(f" Skipping due to token length mismatch:\n{sentence[:100]}...")
                continue

            y_true += friend_labels
            y_pred += my_labels

    kappa = cohen_kappa_score(y_true, y_pred)
    print(f"\n Cohen's Kappa for SPAN-level (token-based): {kappa:.4f}")
    return kappa

my_path = "/kaggle/input/gold-labeled-unsc-last/gold_labeled_unsc_MELISA.json"
friend_path = "/kaggle/input/gold-labeled-unsc-last/gold_labeled_unsc_kseniya_LAST.json"

# Read both annotation files.
with open(my_path, "r", encoding="utf-8") as f:
    my_data = json.loads(f.read())

with open(friend_path, "r", encoding="utf-8") as f:
    friend_data = json.loads(f.read())

# Tokenizer used to split sentences and spans into comparable tokens.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
compute_span_kappa(my_data, friend_data, tokenizer)


# UNSC whole Corpus Inference with the finetuned bert on bioscope

In [None]:
import os
import json
import torch
import pyreadr
import nltk
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification


RDATA_FILE = "/kaggle/input/rdataunsc/docs.RData"  # full UNSC corpus
PT_MODEL_PATH = "/kaggle/input/clean-new-training-cue-hedgepeer-bioscope/unsc_finetuned_bert/finetuned_unsc_bert.pt"  # .pt file
BASE_MODEL = "bert-base-cased"  # make sure it matches the architecture
OUTPUT_JSON = "/kaggle/working/unsc_wholecorpus_inference_output.json"  # where the sentence-level predictions are written
MAX_LEN = 128  # sentences are truncated/padded to this many tokens at prediction time
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and build the token-classification architecture with a
# two-label head.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForTokenClassification.from_pretrained(BASE_MODEL, num_labels=2).to(DEVICE)

# Load the trained .pt state dict into the architecture and switch the model to
# evaluation mode.
state_dict = torch.load(PT_MODEL_PATH, map_location=DEVICE)
model.load_state_dict(state_dict)
model.eval()

# load unsc data: keep only the document text and its identifier
result = pyreadr.read_r(RDATA_FILE)
df = result["raw_docs"]
df = df[["text", "doc_id"]]

# Sentence splitter. Recent NLTK releases look up the "punkt_tab" resource for
# sent_tokenize while older ones use "punkt", so request both; a failed
# download is reported by NLTK without raising.
nltk.download("punkt")
nltk.download("punkt_tab")
sent_tokenizer = nltk.sent_tokenize

# inference function
def predict_hedges(sentence, model, tokenizer):
    """Return the hedge spans predicted for ``sentence``.

    The sentence is tokenized (truncated/padded to ``MAX_LEN``) and every run
    of consecutive tokens predicted as label 1 becomes one span. The span text
    is sliced from the original sentence using the character offsets of the
    run's first and last token, so word pieces are no longer glued together
    with artificial spaces ("possib ly") and the original spacing and
    punctuation are preserved.

    Args:
        sentence: Raw sentence text.
        model: Token-classification model; label 1 is treated as "hedge".
        tokenizer: Tokenizer that supports ``return_offsets_mapping``.

    Returns:
        A list of ``{"Hedge": span_text}`` dicts in sentence order.
    """
    encoding = tokenizer(
        sentence,
        return_offsets_mapping=True,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )

    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)
    offsets = encoding["offset_mapping"][0].tolist()

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)[0].cpu().tolist()

    hedge_spans = []
    span_start = None  # character offset where the open run begins
    span_end = None    # character offset where the open run currently ends

    for label, (start, end) in zip(predictions, offsets):
        if start == 0 and end == 0:
            # Tokens with a (0, 0) offset are skipped without closing an open
            # run, as before.
            continue
        if label == 1:
            if span_start is None:
                span_start = start
            span_end = end
        elif span_start is not None:
            hedge_spans.append({"Hedge": sentence[span_start:span_end]})
            span_start = None

    if span_start is not None:
        hedge_spans.append({"Hedge": sentence[span_start:span_end]})

    return hedge_spans


# One output record per sentence of every document in the corpus.
final_data = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing UNSC Inference"):
    doc_id, doc_text = row["doc_id"], row["text"]
    final_data.extend(
        {
            "Review_id": doc_id,
            "Sentence_id": position,  # 1-based position within the document
            "Sentence": sent,
            "Gold_Hedges": predict_hedges(sent, model, tokenizer),
        }
        for position, sent in enumerate(sent_tokenizer(doc_text), start=1)
    )

# Write all sentence-level predictions to a single JSON file.
with open(OUTPUT_JSON, "w", encoding="utf-8") as out_file:
    json.dump(final_data, out_file, indent=2, ensure_ascii=False)

print(f"Inference completed. Results saved to {OUTPUT_JSON}")

# Data Analysis on UNSC Hedge Cues

* # Percentage of Hedged vs Non-Hedged Sentences Per Year

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import defaultdict


with open("/kaggle/input/labelingunsc/unsc_wholecorpus_inference_output.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Tally hedged and non-hedged sentences per year; the year is parsed from the
# document identifier and records without a matching identifier are ignored.
year_counts = defaultdict(lambda: {"hedged": 0, "non_hedged": 0})
year_pattern = re.compile(r"UNSC_(\d{4})_")

for entry in data:
    found = year_pattern.search(entry["Review_id"])
    if found is None:
        continue
    bucket = "hedged" if entry["Gold_Hedges"] else "non_hedged"
    year_counts[int(found.group(1))][bucket] += 1

# One row per year with both shares expressed as percentages.
rows = [
    {
        "Year": year,
        "Hedged (%)": counts["hedged"] / (counts["hedged"] + counts["non_hedged"]) * 100,
        "Non-Hedged (%)": counts["non_hedged"] / (counts["hedged"] + counts["non_hedged"]) * 100,
    }
    for year, counts in sorted(year_counts.items())
]

df = pd.DataFrame(rows)

# Long format (Year, Category, Percentage) for the grouped bar chart.
df_melted = df.melt(id_vars="Year", var_name="Category", value_name="Percentage")


plot_path = "/kaggle/working/percentage_of_hedged_vs_nonhedged_sentences_peryear.png"

plt.figure(figsize=(14, 7))
sns.barplot(data=df_melted, x="Year", y="Percentage", hue="Category")
plt.title("Percentage of Hedged vs. Non-Hedged Sentences per Year")
plt.ylabel("Percentage")
plt.xlabel("Year")
plt.xticks(rotation=45)
plt.legend(title="Sentence Type")
plt.tight_layout()

# Save before showing: once plt.show() has displayed the figure it is no longer
# the current figure, so a later savefig would write an empty image.
plt.savefig(plot_path)
plt.show()


* # Number of hedge cues

In [None]:
import json


OUTPUT_JSON = "/kaggle/input/labelingunsc/unsc_wholecorpus_inference_output.json"

with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# Total number of hedge entries over all sentence records.
total_hedge_cues = sum(len(entry["Gold_Hedges"]) for entry in data)

print(f"Total hedge cues found: {total_hedge_cues}")


* # number of hedged vs non-hedged sentences

In [None]:
import json

# load inference results 
OUTPUT_JSON = "/kaggle/input/labelingunsc/unsc_wholecorpus_inference_output.json"

with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# A sentence counts as hedged when its hedge list is non-empty.
hedged_count = sum(1 for entry in data if entry["Gold_Hedges"])
non_hedged_count = len(data) - hedged_count

total_sentences = hedged_count + non_hedged_count
hedged_ratio = hedged_count / total_sentences * 100
non_hedged_ratio = non_hedged_count / total_sentences * 100

print(f" Total sentences: {total_sentences}")
print(f"Hedged sentences: {hedged_count} ({hedged_ratio:.2f}%)")
print(f" Non-hedged sentences: {non_hedged_count} ({non_hedged_ratio:.2f}%)")


* # total sentences and documents number

In [None]:
import json


OUTPUT_JSON = "/kaggle/input/labelingunsc/unsc_wholecorpus_inference_output.json"

with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# Each record is one sentence; documents are identified by "Review_id".
total_sentences = len(data)
print(f"Total number of sentences processed in inference: {total_sentences}")

unique_docs = {entry["Review_id"] for entry in data}
print(f"Total unique UNSC documents: {len(unique_docs)}")


* # most common hedge cues

In [None]:
import json
from collections import Counter


OUTPUT_JSON = "/kaggle/input/labelingunsc/unsc_wholecorpus_inference_output.json"

with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten all hedge texts, normalised by stripping and lower-casing.
hedge_list = [
    hedge["Hedge"].strip().lower()
    for entry in data
    for hedge in entry["Gold_Hedges"]
]

# Frequency table of the normalised cues; show the 20 most frequent.
hedge_counter = Counter(hedge_list)
top_hedges = hedge_counter.most_common(20)

print("Most Common Hedge Cues:")
for cue, count in top_hedges:
    print(f"{cue:30s} → {count}")
