In [None]:
from xml.etree import ElementTree as ET
from torch.utils.data import Dataset, DataLoader
from transformers import XLNetTokenizer, XLNetModel, XLNetForSequenceClassification, AutoTokenizer, AutoModel, BertTokenizer, BertModel
import torch
from torch import nn, optim
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder,LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
import json
import csv
import zipfile



# Data preprocessing for Span detection training # 

In [None]:
# Configuration: the two switches below drive the rest of the notebook.

# Transformer backbone to fine-tune. Accepted values: 'bert', 'scibert' or 'xlnet'
# (these are the keys of the tokenizer table and the branches of the model class).
trans_model = 'xlnet'

# Corpus used for training. Accepted values: 'hedgepeer' or 'bioscope'
# (any other value makes the dataset-loading cell raise ValueError).
data = 'bioscope'

In [None]:
# Dataset load and visualization
root = '../input'
os.chdir(root)  # the dataset paths below are relative to this directory

# Change dataset path if necessary 
hedgepeer_path = 'hedgepeer/HedgePeer.jsonl'
bioscope_path = 'merged-bioscope/merged_bioscope.jsonl'

# Select dataset path based on 'data' variable
if data not in ('hedgepeer', 'bioscope'):
    raise ValueError("Invalid dataset name. Define 'data' as 'bioscope' or 'hedgepeer'.")
dataObj = pd.read_json(
    path_or_buf=hedgepeer_path if data == 'hedgepeer' else bioscope_path,
    lines=True,
)

# Flatten the nested review -> sentence -> hedge structure into one record per
# (sentence, hedge) pair. A sentence without hedges still yields exactly one
# record, labelled 'NO HEDGE' and carrying no span.
data_list = []
for _, review in dataObj.iterrows():
    review_key = review['Review_id']
    for sentence in review['Sentences']:
        annotations = sentence['Hedges']
        shared = {
            'Review_id': review_key,
            'Sentence_id': sentence['Sentence_id'],
            'Raw Sentence': sentence['Sentence'],
        }
        if len(annotations) == 0:
            data_list.append({
                **shared,
                'Hedged Sentence': sentence['Sentence'],
                'Hedge': 'NO HEDGE',
                'Span': None,
            })
            continue
        data_list.extend(
            {
                **shared,
                'Hedged Sentence': annotation['Hedged Sentence'],
                'Hedge': annotation['Hedge'],
                'Span': annotation['Span'],
            }
            for annotation in annotations
        )

In [None]:
# One row per (sentence, hedge) pair; a sentence without hedges contributes a
# single row whose 'Hedge' is 'NO HEDGE' and whose 'Span' is None.
df = pd.DataFrame(data_list)
df

In [None]:

# Build a per-sentence key "<Review_id>_<Sentence_id>" so that rows belonging
# to the same sentence can be grouped together later on.
rev_id, sen_id = df['Review_id'], df['Sentence_id']
unq_id = []
for review_key, sentence_key in zip(rev_id, sen_id):
    unq_id.append(review_key + '_' + str(sentence_key))
df['Unique_id'] = unq_id
df

In [None]:
# Number of distinct Unique_id values, i.e. distinct sentences. df has more rows
# than this whenever a sentence carries several hedges (one row per hedge).
unique_id_count = df['Unique_id'].nunique()
unique_id_count

In [None]:
# Inspect the span annotations (None for the rows created for un-hedged sentences).
df['Span']

In [None]:
# Persist the flattened table, one file per dataset name. index=False is not
# passed here, so the row index is written out as well.
df.to_csv(f'/kaggle/working/{data}_df_unique_id.csv')

In [None]:
# Group by unique_id: collapse the per-hedge rows into one record per sentence.

unq_list = []          # sentence keys (Unique_id)
sent_list = []         # distinct raw sentence texts found for each key
hedged_sent_list = []  # hedged (marked-up) sentence variants, one per kept hedge
hed_list = []          # hedge cue labels, aligned with hedged_sent_list
span_list = []         # span annotations, aligned with hedged_sent_list

# Rows carrying one of these hedge labels are left out of the per-sentence lists.
skipped_hedges = ['NO HEDGE','IDENT_PRECED']

# Group on the bare column name rather than a one-element list: with a list,
# pandas >= 2.0 yields 1-tuples such as ('id',) as group keys, and those tuples
# would end up in unq_list and in the CSV exported from it.
gp = df.groupby(by='Unique_id')
for name,grp in tqdm(gp):
    # `grp` already is the sub-frame for `name`, so no get_group() lookup is needed.
    raw_sent = list(set(grp['Raw Sentence']))

    sent_hed_span = [
        (hedged, hedge, span)
        for hedged, hedge, span in zip(grp['Hedged Sentence'], grp['Hedge'], grp['Span'])
        if hedge not in skipped_hedges
    ]

    hedged_sents = [entry[0] for entry in sent_hed_span]
    hedges = [entry[1] for entry in sent_hed_span]
    spans = [entry[2] for entry in sent_hed_span]

    unq_list.append(name)
    sent_list.append(raw_sent)
    hedged_sent_list.append(hedged_sents)
    hed_list.append(hedges)
    span_list.append(spans)
    

In [None]:
# Data dictionary: one row per sentence key, each remaining column holding the
# list collected for that key in the grouping cell.

data_dict = {'sentence_id':unq_list, 'sentence':sent_list, 'hedged_sentence':hedged_sent_list, 'speculative_cues':hed_list, 'scope_string':span_list}
df_dict = pd.DataFrame(data_dict)

# Exported without the row index.
df_dict.to_csv(f'/kaggle/working/{data}_data_dic.csv',  index=False)

# Create dataloaders


In [None]:
# Creates dictionary with input_tokens, att_mask, targets tensors

class Dataset_gen(Dataset):
    """Map-style dataset over three parallel sequences.

    Item ``i`` is a dict with the keys 'input', 'attention_mask' and 'targets',
    each holding ``torch.tensor`` of the i-th entry of the matching sequence.
    """

    def __init__(self,sentences,targets,att_masks):
        self.sent, self.tar, self.att = sentences, targets, att_masks

    def __len__(self):
        # The sentence list defines the dataset size.
        return len(self.sent)

    def __getitem__(self, item):
        fields = (
            ('input', self.sent),
            ('attention_mask', self.att),
            ('targets', self.tar),
        )
        return {key: torch.tensor(store[item]) for key, store in fields}

In [None]:
# Data loader 

def dataloader_gen(sent,data2mark,trans_model,tokenizer,max_len,batch_size):
    """Build a DataLoader from hashed sentences and their scope strings.

    The sentences are converted by ``Biot2_dataset.tokenids_gen`` into token
    ids, attention masks and targets, which are then wrapped in a
    ``Dataset_gen`` and served in batches of ``batch_size``.
    """
    builder = Biot2_dataset(sent, data2mark, trans_model, tokenizer, max_len)
    input_ids, attention, labels = builder.tokenids_gen()
    # Dataset_gen expects (sentences, targets, attention masks) in that order.
    return DataLoader(Dataset_gen(input_ids, labels, attention), batch_size=batch_size)


In [None]:
# leng_more = list of indices with sent tokens length > max_len

def remove_big_instances(data,sen_list,sent,data2mark,tokenizer,max_len):
    """Drop every instance whose sentence tokenises to more than ``max_len`` ids.

    Parameters:
        data: mapping/DataFrame whose 'sentence' entry holds the texts to measure.
        sen_list: optional sequence aligned with ``sent``; ``None`` is passed through.
        sent, data2mark: parallel sequences filtered by position.
        tokenizer: object exposing ``encode_plus(text, ...)['input_ids']``.
        max_len: maximum number of input ids an instance may have.

    Returns:
        ``(leng_more, sen_list, sent, data2mark)`` where ``leng_more`` lists the
        positions that were too long and the other three are the filtered lists.
    """
    leng_more = [
        idx
        for idx, text in enumerate(list(data['sentence']))
        if len(tokenizer.encode_plus(text,truncation=False,return_token_type_ids=True,return_attention_mask=True)['input_ids'])>max_len
    ]
    # Set membership keeps the filtering linear instead of scanning the list per item.
    too_long = set(leng_more)

    def _keep(items):
        # Keep only the entries whose position was not flagged as too long.
        return [item for idx, item in enumerate(items) if idx not in too_long]

    # Identity test: `!= None` is evaluated element-wise on array-like inputs.
    if sen_list is not None:
        sen_list = _keep(sen_list)
    return (leng_more, sen_list, _keep(sent), _keep(data2mark))

In [None]:
# Convert hedged sentences to hashed sentences. Sent and spans used in Biot2_dataset

sent_span_d = df['Hedged Sentence'].to_list()
span_span_d = df['Span'].to_list()
sen_t2list = None
sent = []
spans = []

# (opening cue tag, closing cue tag, placeholder written in place of the opening tag).
# Only the first markup found in a sentence is applied, in this order.
cue_markup = [
    ('<h>', '</h>', 'token[0]'),
    ('<mh>', '</mh>', 'token[1]'),
]

for hs, s in zip(sent_span_d, span_span_d):
    for open_tag, close_tag, placeholder in cue_markup:
        if hs.find(open_tag) > -1:
            # Scope boundaries become '#'; the cue tags become a placeholder token.
            hs = hs.replace('<span>','#').replace('</span>','#')
            hs = hs.replace(open_tag, placeholder).replace(close_tag, '')
            s = s.replace(open_tag, placeholder).replace(close_tag, '')
            break
    # Rows without a span annotation are represented by an empty scope string.
    if type(s) is not str:
        s = ''
    sent.append(hs)
    spans.append(s)

In [None]:
# 0=out of scope, 1=in scope 
# Class to process input data

class Biot2_dataset(Dataset):
    """Turn '#'-delimited hashed sentences into padded token ids, masks and targets.

    Each sentence is tokenised with the tokenizer given to the constructor. When
    its scope string is non-empty, the tokens between the first two '#' tokens
    are labelled 1 (in scope) and everything else 0 (out of scope). Tokens
    without any ASCII letter or digit are dropped, and the remaining sequences
    are right-padded to ``max_len``.

    Note: sequences longer than ``max_len`` are NOT truncated here; over-long
    instances have to be filtered out before this class is used.
    """

    def __init__(self,sentences,spans,trans_model,tokenizer,max_len):
        self.sent = sentences
        self.trans_model = trans_model
        self.token = tokenizer
        self.max = max_len
        self.spans = spans

    def __len__(self):
        return len(self.sent)

    def tokenids_gen(self):
        """Return ``(senids, attention_masks, targets)`` as lists of int lists."""
        targets = []
        senids=[]
        attention_masks=[]
        pad_token_ids = {'xlnet':5,'bert':0, 'scibert':0}
        # Known backbones keep their fixed pad id; any other name falls back to
        # whatever the supplied tokenizer reports.
        pad_id = pad_token_ids.get(self.trans_model)
        if pad_id is None:
            pad_id = self.token.pad_token_id
        has_alnum = re.compile('[A-Za-z0-9]+')

        for s,sc in zip(self.sent,self.spans):
            # Use the tokenizer injected through the constructor, not a global one.
            encodings = self.token.encode_plus(s,
                                  return_tensors='pt',
                                  truncation=False,
                                  return_token_type_ids=True,
                                  return_attention_mask=True,
                                  )

            att = encodings['attention_mask'][0].tolist()
            senid = encodings['input_ids'][0].tolist()
            # Decode id by id so that k[i] is the text of token i.
            k = [self.token.decode(i) for i in senid]

            tar = [0] * len(k)
            if(sc != ''):
                idxstart = k.index('#')
                idxend = k.index('#',idxstart+1)
                # The delimiters are tagged -1; they are removed again just below.
                tar[idxstart] = -1
                tar[idxend] = -1
                tar[idxstart+1:idxend] = [1] * (idxend-idxstart-1)

                # Remove both '#' delimiters, together with an empty-string token
                # directly in front of them when there is one.
                for _ in range(2):
                    idx = k.index('#')
                    cut = slice(idx-1, idx+1) if k[idx-1]=='' else slice(idx, idx+1)
                    del k[cut]
                    del senid[cut]
                    del tar[cut]
                    del att[cut]

            # Keep only tokens containing at least one ASCII letter or digit;
            # the mask is computed once and applied to all parallel lists.
            keep = [has_alnum.search(tok) is not None for tok in k]
            senid = [v for v, ok in zip(senid, keep) if ok]
            tar = [v for v, ok in zip(tar, keep) if ok]
            att = [v for v, ok in zip(att, keep) if ok]
            k = [tok for tok, ok in zip(k, keep) if ok]

            if(len(k)!=len(tar)):
                print(k)
                print('#'*40)

            ## adding pad token at the end....
            n_pad = self.max - len(k)  # a non-positive value adds no padding
            tar = tar + [0] * n_pad
            senid = senid + [pad_id] * n_pad
            att = att + [0] * n_pad

            targets.append(tar)
            senids.append(senid)
            attention_masks.append(att)
        return (senids,attention_masks,targets)
        
# Same table as df, with the index column produced by reset_index() discarded
# and 'Raw Sentence' exposed under the name 'sentence'.
data_span = df.reset_index()
data_span = data_span.drop(columns=['index'])
data_span = data_span.rename(columns = {'Raw Sentence': 'sentence'})
data_span

In [None]:
# Choose tokenizer type 
# All three tokenizers are instantiated up front; only the one matching
# trans_model is used afterwards.

tokenizer1 = XLNetTokenizer.from_pretrained('xlnet-base-cased')

tokenizer2 = BertTokenizer.from_pretrained('bert-base-cased')

tokenizer3 = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_cased')

# Keys are the values accepted for trans_model.
tokenizer_dict = {'xlnet':tokenizer1,'bert':tokenizer2,'scibert':tokenizer3}


tokenizer = tokenizer_dict[trans_model]

# Remove instances with length more than 100 (max_size might be changed)
# NOTE: the limit is measured on data_span['sentence'] (the raw sentences), while
# the filtered lists are the hashed sentences/scope strings at the same positions.
# NOTE: the literal 100 must be kept in sync with max_len, which is set in a later cell.
len_more,sen_t2list,sent,spans = remove_big_instances(data_span,sen_t2list,sent,spans,tokenizer,100)

In [None]:
# Number of instances dropped for exceeding the token limit.
len(len_more)

In [None]:
max_len = 100
batch_size = 4

# Target proportions of the train / validation / test partitions
train_percentage = 0.7  # 70%
val_percentage = 0.2    # 20%
test_percentage = 0.1   # 10%

# Absolute partition sizes; the test set absorbs the rounding remainder
total_samples = len(sent)
train_size = int(total_samples * train_percentage)
val_size = int(total_samples * val_percentage)
test_size = total_samples - train_size - val_size

y12 = spans  

# Two-stage split: first carve off validation+test together (as an absolute
# count), then divide that hold-out part between validation and test.
holdout_size = val_size + test_size
sen_train, sen_temp, y12_train, y12_temp = train_test_split(
    sent, y12, test_size=holdout_size, random_state=0)
sen_val, sen_test, y12_val, y12_test = train_test_split(
    sen_temp, y12_temp, test_size=test_size / holdout_size, random_state=0)

# One DataLoader per partition, built in train -> validation -> test order
train_data_loader, val_data_loader, test_data_loader = [
    dataloader_gen(sentences, scopes, trans_model, tokenizer, max_len, batch_size)
    for sentences, scopes in (
        (sen_train, y12_train),
        (sen_val, y12_val),
        (sen_test, y12_test),
    )
]


In [None]:
# Total number of instances left after the length filter.
len(sent)

In [None]:
# Size of the training partition.
len(sen_train)

In [None]:
# Size of the validation partition.
len(sen_val)

In [None]:
# Size of the test partition.
len(sen_test)

# Training

In [None]:
# Model
# Select model based on 'trans_model' variable
class scoperes_model(nn.Module):
    """Per-token two-class classifier: a pretrained transformer plus a linear head.

    Parameters:
        model_name: 'xlnet', 'bert' or 'scibert'. When omitted, the notebook-level
            ``trans_model`` setting is used, so ``scoperes_model()`` keeps working.

    Raises:
        ValueError: if the resolved name is not one of the three supported values.
    """

    def __init__(self, model_name=None):
        super().__init__()
        if model_name is None:
            model_name = trans_model
        if model_name == 'xlnet':
            self.model = XLNetModel.from_pretrained('xlnet-base-cased')
        elif model_name == 'bert':
            self.model = BertModel.from_pretrained('bert-base-cased')
        elif model_name == 'scibert':
            self.model = AutoModel.from_pretrained('allenai/scibert_scivocab_cased')
        else:
            raise ValueError("Invalid model name. Choose 'xlnet', 'bert', or 'scibert'.")

        # Size the head from the loaded backbone instead of hard-coding 768, so a
        # checkpoint with a different hidden width does not break the layer.
        self.lin = nn.Linear(self.model.config.hidden_size, 2)

    def forward(self, x, att):
        """Return one row of two logits per token position.

        The first element of the backbone output is flattened over its first
        two dimensions before the linear head is applied, so the result is
        2-D with two columns.
        """
        xl = self.model(x, attention_mask=att)[0]
        xl = xl.view(-1, xl.shape[2])
        lin = self.lin(xl)
        return lin  


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Instantiate the scope classifier (backbone selected by trans_model) and move
# it to the selected device.
model = scoperes_model()
model.to(device)

In [None]:
# Evaluation function

def evaluate(model,val_data):
    """Run ``model`` over ``val_data`` without gradient tracking.

    Parameters:
        model: module called as ``model(input_ids, attention_mask)``.
        val_data: iterable of batches, each a dict with the keys 'input',
            'attention_mask' and 'targets'.

    Returns:
        ``(mean_loss, true, pred)``: the loss averaged over batches, plus the
        flattened gold labels and predicted labels of every position.

    Raises:
        ValueError: if ``val_data`` yields no batch (previously this failed with
            an unbound loop variable when computing the average).

    Relies on the notebook-level ``device`` and ``cse_loss`` objects.
    """
    model.eval()
    model.to(device)
    main_loss = 0
    n_batches = 0
    true=[]
    pred=[]
    with torch.no_grad():
        for d in val_data:
            inp = d['input'].to(device)
            att = d['attention_mask'].to(device)
            # Flatten the targets so they line up with the flattened logits.
            targets = d['targets'].view(-1).to(device)

            logits = model(inp,att)

            loss = cse_loss(logits,targets)
            main_loss += loss.item()

            # Predicted class = index of the largest logit.
            _,predictions = torch.max(logits,dim=1)

            targets = targets.cpu().detach().numpy()
            predictions = predictions.cpu().detach().numpy()

            true += list(targets)
            pred += list(predictions)
            n_batches += 1

    if n_batches == 0:
        raise ValueError("evaluate() received an empty data loader; cannot average the loss over zero batches.")

    main_loss = main_loss/n_batches
    return (main_loss,true,pred)

In [None]:
# Train: fine-tune the model for a fixed number of epochs, validating, plotting
# and checkpointing after every epoch.

epochs = 5
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
# No ignore_index is configured, so every position of the flattened targets
# (padding included, which is labelled 0) contributes to the loss.
cse_loss = torch.nn.CrossEntropyLoss()

loss_list = []

# Output directory
output_dir = '/kaggle/working/output_metrics/'
os.makedirs(output_dir, exist_ok=True)

# Lists to store training and validation metrics
metrics = []

# Training Loop
for ep in range(epochs):
    total_loss = 0
    true, pred = [], []
    model.train()

    # Training phase
    for i, d in enumerate(train_data_loader):
        # Progress message every 300 batches.
        if(i%300 == 299):
            print('batch - ',i+1)
        
        inp = d['input'].to(device)
        att = d['attention_mask'].to(device)
        # Flatten the targets so they line up with the flattened logits.
        targets = d['targets'].view(-1).to(device)
        
        logits = model(inp, att)
        
        loss = cse_loss(logits, targets)

        # Predicted class = index of the largest logit.
        _, predictions = torch.max(logits, dim=1)
        
        targets = targets.cpu().detach().numpy()
        predictions = predictions.cpu().detach().numpy()

        true += list(targets)
        pred += list(predictions)
        
        total_loss += loss.item()

        # Gradients are cleared right after the update, ready for the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
    # Mean training loss over the batches of this epoch.
    total_loss = total_loss/(i+1)

    f1 = f1_score(true,pred,average='macro')
    acc = accuracy_score(true, pred)
    cm = confusion_matrix(true, pred)
    print('epoch : ',ep+1,' --','\n','loss : ',total_loss,'\t','f1 : ',f1,'\t','acc : ',acc)
    print('train confusion matrix :')
    print(cm)
    print(classification_report(true, pred))
    
    # Confusion Matrix plot
    # NOTE: the file name carries no epoch number, so each epoch overwrites the
    # previous epoch's training plot.
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['out-scope', 'in-scope'],
                yticklabels=['out-scope', 'in-scope'])
    plt.title(f'{trans_model} - Span Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'{trans_model}_{data}_span_train_confusion_matrix.png'))
    plt.close()
    
    # Validation phase
    val_loss, val_true, val_pred = evaluate(model=model, val_data=val_data_loader)

    # Compute validation metrics
    # Precision/recall are reported for class 1; F1 is macro-averaged.
    val_precision = precision_score(val_true, val_pred, pos_label=1)
    val_recall = recall_score(val_true, val_pred, pos_label=1)
    val_f1 = f1_score(val_true, val_pred, average='macro')
    val_acc = accuracy_score(val_true, val_pred)
    val_cm = confusion_matrix(val_true, val_pred)

    print(f'Epoch {ep+1}/{epochs}')
    print(f'val loss: {val_loss}\t val_f1: {val_f1}\t val_acc: {val_acc}')
    print('val confusion matrix :')
    print(val_cm)

    # Confusion Matrix plot
    # NOTE: as with the training plot, this file is overwritten every epoch.
    plt.figure(figsize=(8, 6))
    sns.heatmap(val_cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['out-scope', 'in-scope'],
                yticklabels=['out-scope', 'in-scope'])
    plt.title(f'{trans_model} - Span Val Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'{trans_model}_{data}_span_val_confusion_matrix.png'))
    plt.close()
    
    # Save model
    # The whole module object is saved (not just a state dict), one file per epoch.
    torch.save(model, f'/kaggle/working/{trans_model}_{data}_span_only_model_ep{ep+1}.pt')
    
    loss_list.append({'train_loss':total_loss,'val_loss':val_loss})
    
    # Store metrics for CSV
    metrics.append({
        'model_name': trans_model,
        'epoch': ep + 1,
        'train_data': data,
        'total_train_loss': total_loss,
        'total_train_f1': f1,
        'train_acc': acc,
        'val_loss': val_loss,
        'val_precision': val_precision,
        'val_recall': val_recall,
        'val_f1': val_f1,
        'val_acc': val_acc,
        
    })

# Save metrics to CSV
# One row per epoch.
df_train_span = pd.DataFrame(metrics)
csv_path = os.path.join(output_dir, 'span_training_metrics.csv')
df_train_span.to_csv(csv_path, index=False)
print(f"Span train metrics saved to {csv_path}")


In [None]:
# Results on Test Data: evaluate every saved checkpoint on the held-out test set.

# Define model name
# (rebound below to each checkpoint file name while iterating)
model_name = trans_model

# Store results in a list
test_metrics = []

# Evaluate on test data
print(f'{trans_model} MODEL RESULTS ON {data} TEST DATA')
root = '/kaggle/working/'

# os.listdir returns entries in arbitrary order; sort them so that the printed
# report and the rows of the metrics CSV come out in a stable order.
for model_name in sorted(os.listdir(root)):
    # Only checkpoint files are evaluated.
    if model_name[-3:] != '.pt':
        continue
    model_path = root+model_name
    # NOTE: torch.load unpickles a full module object here, so only checkpoints
    # written by this notebook should be present in the directory.
    # NOTE(review): recent PyTorch releases may require weights_only=False to
    # load a fully pickled module - confirm against the installed version.
    model = torch.load(model_path)
    model.to(device)
    test_loss,test_true,test_pred = evaluate(model=model,val_data=test_data_loader)

    print("Evaluate function outputs:")
    print(f"Test Loss: {test_loss}")
    
    # Compute evaluation metrics
    # Precision/recall are reported for class 1; F1 is macro-averaged.
    precision = precision_score(test_true, test_pred, pos_label=1)
    recall = recall_score(test_true, test_pred, pos_label=1)
    test_f1 = f1_score(test_true, test_pred, average='macro')
    test_acc = accuracy_score(test_true, test_pred)
    test_cm = confusion_matrix(test_true, test_pred)
                               
    # Print results
    print(f'Model: {model_name}')
    print(f'Test Loss: {test_loss:.4f} | Test precision: {precision:.4f} | Test recall: {recall:.4f} | Test F1: {test_f1:.4f} | Test Accuracy: {test_acc:.4f}')
    print('Test Confusion Matrix:')
    print(test_cm)
    print(classification_report(test_true, test_pred))
    print('\n')

    # Confusion Matrix plot
    # Plot the matrix computed for THIS checkpoint on the test set (test_cm),
    # not the validation matrix left over from the training cell.
    plt.figure(figsize=(8, 6))
    sns.heatmap(test_cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['out-scope', 'in-scope'],
                yticklabels=['out-scope', 'in-scope'])
    plt.title(f'{model_name} - Span Test Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'{model_name}_{data}_span_test_confusion_matrix.png'))
    plt.close()

    
    # Store metrics for CSV
    test_metrics.append({
        'model_name': model_name,
        'train_data': data,
        'test_loss': test_loss,
        'test_precision': precision,
        'test_recall': recall,
        'test_f1': test_f1,
        'test_accuracy': test_acc,
        
    })

# Save test metrics to CSV
# One row per evaluated checkpoint.
df_test_span = pd.DataFrame(test_metrics)
metrics_path = os.path.join(output_dir, 'span_test_metrics.csv')
df_test_span.to_csv(metrics_path, index=False)
# Report the path that was actually written (the test metrics file).
print(f'Span test metrics saved to {metrics_path}')