## Run Challenge Dataset on Standalone Functions

In [3]:
# One-time environment setup (run these in a shell, or prefix with "%pip" / "!" inside the notebook):
# pip install transformers[torch] datasets accelerate scipy numpy pandas joblib spacy scikit-learn
# python -m spacy download en_core_web_sm

In [4]:
from helpers import get_complex_feature, get_pos_feature, get_lemma_feature, tokenize_and_align_labels, make_word_ids
from joblib import load
from scipy.sparse import hstack
from datasets import Dataset
import numpy as np
import os
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig, Trainer
import json
import pandas as pd
from datasets import disable_progress_bar

  from .autonotebook import tqdm as notebook_tqdm


### Logistic Regression Standalone Function

Download models

https://github.com/orlandocloss/SRLClassification/releases/download/model/SRLLogisticRegression.pkl

https://github.com/orlandocloss/SRLClassification/releases/download/model/SRLEncoders.pkl

In [5]:
# SECURITY NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary code.
# Only load artifacts obtained from a source you trust (here: the release links listed above).
logistic_regression_model = load('./SRLLogisticRegression.pkl')
# The encoders file unpacks into two objects: the feature encoders (indexed 0..2 by
# lr_standalone_function) and the label encoder used to decode predicted class ids.
feature_encoders, label_encoder = load('./SRLEncoders.pkl')

In [6]:
def lr_standalone_function(tokenized_sentence, predicate_indicators, model, feature_encoders, label_encoder):
    """
    Label every token of a sentence once per predicate with the logistic-regression SRL model.

    Parameters:
    - tokenized_sentence: list of token strings
    - predicate_indicators: list parallel to the tokens (0=not predicate, 1=predicate)
    - model: classifier object exposing predict()
    - feature_encoders: sequence of three encoders exposing transform(), used in the
      order complex feature, lemma feature, POS feature
    - label_encoder: encoder exposing inverse_transform() to turn class ids into labels

    Returns:
    - A list with one entry per predicate (in sentence order). Each entry is a list with
      one label per token and 'V' at that predicate's position. Returns [] when no
      indicator equals 1.

    Relies on the helper functions get_complex_feature, get_lemma_feature and
    get_pos_feature being available in the notebook namespace.
    """
    # Find all predicate positions from the indicators
    predicate_positions = [i for i, indicator in enumerate(predicate_indicators) if indicator == 1]

    if not predicate_positions:
        return []  # No predicates found

    # The lemma and POS features are computed from the sentence alone (they take no
    # predicate position), so extract and encode them once instead of once per predicate.
    lemma_features = [[feature] for feature in get_lemma_feature(tokenized_sentence)]
    pos_features = [[feature] for feature in get_pos_feature(tokenized_sentence)]
    encoded_lemma = feature_encoders[1].transform(lemma_features)
    encoded_pos = feature_encoders[2].transform(pos_features)

    all_predictions = []

    for predicate_position in predicate_positions:
        # Only the complex feature depends on which predicate is being processed.
        complex_features = [
            [feature] for feature in get_complex_feature(tokenized_sentence, predicate_position)
        ]
        encoded_complex = feature_encoders[0].transform(complex_features)

        # Column order must stay complex | lemma | POS to match the encoders' order.
        features = hstack([encoded_complex, encoded_lemma, encoded_pos])

        decoded_labels = label_encoder.inverse_transform(model.predict(features))
        # A decoded label of None is reported as 'O'.
        predicted_labels = [label if label is not None else 'O' for label in decoded_labels]

        predicted_labels[predicate_position] = 'V'  # Mark the predicate itself

        all_predictions.append(predicted_labels)

    return all_predictions

Test the model on an example sentence with two predicates

In [7]:
# Smoke test: one sentence with two marked predicates ("bought" at index 1, "drove" at index 5),
# so the function should return two label sequences, one per predicate.
tokenized_sentence = ["John", "bought", "a", "car", "and", "drove", "it", "to", "work", "."]
predicate_indicators = [0, 1, 0, 0, 0, 1, 0, 0, 0, 0]
print(lr_standalone_function(tokenized_sentence, predicate_indicators, logistic_regression_model,  feature_encoders, label_encoder))

[['ARG0', 'V', 'O', 'ARG1', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'V', 'ARG1', 'O', 'O', 'O']]


### Bert Transformers Standalone Function

Download model
https://github.com/orlandocloss/SRLClassification/releases/download/finetuned-model/BertSRLModel.zip

In [8]:
# Local directory holding the fine-tuned model files (the unzipped download linked above).
bert_model_path = "./BertSRLModel"
# Path to the saved training arguments; in the cells shown here it is only referenced by
# the commented-out torch.load lines in the model-loading cell.
training_args_path = os.path.join(bert_model_path, "training_args.bin")

In [9]:
# Load the fine-tuned token-classification model and its tokenizer from the local directory.
# Both names are notebook globals: predict_srl_tags reads `tokenizer` directly.
model = AutoModelForTokenClassification.from_pretrained(bert_model_path)
tokenizer = AutoTokenizer.from_pretrained(bert_model_path)
# Disabled: restoring the saved training arguments. Re-enabling this would also need
# `import torch`, which is not part of the import cell.
# training_args = torch.load(training_args_path, weights_only=False)
# training_args.evaluation_strategy = "no"
# Trainer built with default arguments; in this notebook it is only used for trainer.predict().
# NOTE(review): this cell's output shows a warning pointing at the Trainer(...) call; newer
# transformers releases deprecate `tokenizer=` in favour of `processing_class=` - confirm
# against the installed version before changing it.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [10]:
def predict_srl_tags(tokens, predicate_indicators, trainer, label_list):
    """
    Predict SRL tags with the transformer model, one tag sequence per marked predicate.

    Parameters:
    - tokens: list of token strings for one sentence
    - predicate_indicators: list parallel to tokens (1 marks a predicate)
    - trainer: object whose predict() returns a 3-tuple with per-token class scores first
    - label_list: label strings, indexed by predicted class id

    Returns:
    - One list of tags per predicate (in sentence order) with 'V' at the predicate
      position, or [] when no indicator equals 1.

    Note: the notebook-level `tokenizer` global is used for tokenization and for
    rebuilding the sub-token -> word alignment.
    """
    marked_positions = [idx for idx, flag in enumerate(predicate_indicators) if flag == 1]
    if not marked_positions:
        return []

    # One dataset row per predicate, all sharing the same sentence
    n_predicates = len(marked_positions)
    sentence_length = len(tokens)
    batch = Dataset.from_dict({
        "id": list(range(n_predicates)),
        "tokens": [tokens] * n_predicates,
        "predicate_pos": marked_positions,
        "SRL_tags": [[0] * sentence_length] * n_predicates,  # Placeholder tags
    })

    def tokenize_fn(examples, **kwargs):
        return tokenize_and_align_labels(examples, tokenizer, **kwargs)

    disable_progress_bar()
    encoded_batch = batch.map(tokenize_fn, batched=True)  # Tokenize the whole batch at once

    scores, _, _ = trainer.predict(encoded_batch)
    class_ids = np.argmax(scores, axis=2)

    tag_sequences = []
    for row_number, predicate_pos in enumerate(marked_positions):
        row = class_ids[row_number]

        word_ids = make_word_ids(
            tokens[:predicate_pos + 1],
            tokens[predicate_pos + 1:],
            predicate_pos,
            tokenizer,
        )

        tags = [None] * sentence_length
        for token_idx, word_idx in enumerate(word_ids):
            if word_idx is not None:  # None marks special tokens
                tags[word_idx] = label_list[row[token_idx]]

        # Words that received no prediction default to 'O'
        tags = ['O' if tag is None else tag for tag in tags]

        tags[predicate_pos] = 'V'  # The predicate itself is always 'V'
        tag_sequences.append(tags)

    return tag_sequences


Manually enter label list

In [11]:
# Label strings for the token-classification model. predict_srl_tags looks labels up with
# label_list[predicted_class_id], so the ORDER of this list is significant: position i must
# be the label for class id i.
# NOTE(review): entered by hand; presumably mirrors the label order used at training time -
# verify against the training notebook / saved model config.
label_list=['O',
 'ARG0',
 'ARG1',
 'ARG1-DSP',
 'ARG2',
 'ARG3',
 'ARG4',
 'ARG5',
 'ARGA',
 'ARGM-ADJ',
 'ARGM-ADV',
 'ARGM-CAU',
 'ARGM-COM',
 'ARGM-CXN',
 'ARGM-DIR',
 'ARGM-DIS',
 'ARGM-EXT',
 'ARGM-GOL',
 'ARGM-LOC',
 'ARGM-LVB',
 'ARGM-MNR',
 'ARGM-MOD',
 'ARGM-NEG',
 'ARGM-PRD',
 'ARGM-PRP',
 'ARGM-PRR',
 'ARGM-REC',
 'ARGM-TMP',
 'C-ARG0',
 'C-ARG1',
 'C-ARG1-DSP',
 'C-ARG2',
 'C-ARG3',
 'C-ARG4',
 'C-ARGM-ADV',
 'C-ARGM-COM',
 'C-ARGM-CXN',
 'C-ARGM-DIR',
 'C-ARGM-EXT',
 'C-ARGM-GOL',
 'C-ARGM-LOC',
 'C-ARGM-MNR',
 'C-ARGM-PRP',
 'C-ARGM-PRR',
 'C-ARGM-TMP',
 'C-V',
 'R-ARG0',
 'R-ARG1',
 'R-ARG2',
 'R-ARG3',
 'R-ARG4',
 'R-ARGM-ADV',
 'R-ARGM-CAU',
 'R-ARGM-COM',
 'R-ARGM-DIR',
 'R-ARGM-GOL',
 'R-ARGM-LOC',
 'R-ARGM-MNR',
 'R-ARGM-TMP']

In [12]:
# Same two-predicate example sentence as the logistic-regression smoke test, so the two
# models' outputs can be compared side by side.
tokenized_sentence = ["John", "bought", "a", "car", "and", "drove", "it", "to", "work", "."]
predicate_indicators = [0, 1, 0, 0, 0, 1, 0, 0, 0, 0]

print(predict_srl_tags(tokenized_sentence, predicate_indicators, trainer, label_list))

[['ARG0', 'V', 'O', 'ARG1', 'O', 'O', 'O', 'O', 'O', 'O'], ['ARG0', 'O', 'O', 'O', 'O', 'V', 'ARG1', 'O', 'ARGM-PRP', 'O']]


### Challenge Set

A single function that evaluates either model on the whole challenge set in one call

In [13]:
def _first_predicate_tags(tokens, predicate_index, model_choice):
    """Tag `tokens` with the chosen model, marking only `predicate_index` as a predicate.

    Returns the tag list for that predicate, or None when the model returns no predictions.
    """
    predicate_indicators = [0] * len(tokens)
    predicate_indicators[predicate_index] = 1

    if model_choice == "lr":
        predictions = lr_standalone_function(
            tokens,
            predicate_indicators,
            logistic_regression_model,
            feature_encoders,
            label_encoder
        )
    else:  # any value other than "lr" selects the transformer model
        predictions = predict_srl_tags(
            tokens,
            predicate_indicators,
            trainer,
            label_list
        )

    return predictions[0] if predictions else None


def _tag_at(tags, index, expected):
    """True when `tags` exists, `index` is below its length and the tag there equals `expected`."""
    return tags is not None and index < len(tags) and tags[index] == expected


def _example_passes(test_type, example, model_choice):
    """Score one challenge-set example.

    Returns True (pass) or False (fail) for the test types MFT, DIR, INV and specialDIR,
    and None for any other test type so that such examples are not counted.
    """
    if test_type == "MFT":
        tokens = example["tokenized"]
        predicate_index = example["predicate_index"]
        target_index = example["target_index"]
        expected_srl = example["target_SRL"]

        tags = _first_predicate_tags(tokens, predicate_index, model_choice)
        return _tag_at(tags, target_index, expected_srl)

    if test_type in ("DIR", "INV"):
        tokens1 = example["tokenized1"]
        tokens2 = example["tokenized2"]
        predicate_index = example["predicate_index"]  # shared by both sentences
        target_index1 = example["target_index1"]
        target_index2 = example["target_index2"]
        if test_type == "DIR":
            expected_srl1 = example["target_SRL1"]
            expected_srl2 = example["target_SRL2"]
        else:  # INV: the same label is expected in both sentences
            expected_srl1 = expected_srl2 = example["target_SRL"]

        tags1 = _first_predicate_tags(tokens1, predicate_index, model_choice)
        tags2 = _first_predicate_tags(tokens2, predicate_index, model_choice)
        return (_tag_at(tags1, target_index1, expected_srl1) and
                _tag_at(tags2, target_index2, expected_srl2))

    if test_type == "specialDIR":
        # The two sentences have different predicates and no specific target index.
        tokens1 = example["tokenized1"]
        tokens2 = example["tokenized2"]
        predicate_index1 = example["predicate_index1"]
        predicate_index2 = example["predicate_index2"]
        expected_srl = example["SRL"]

        tags1 = _first_predicate_tags(tokens1, predicate_index1, model_choice)
        tags2 = _first_predicate_tags(tokens2, predicate_index2, model_choice)
        if tags1 is None or tags2 is None:
            return False
        # Pass only if the label appears somewhere in the second sentence but nowhere in the first.
        return expected_srl not in tags1 and expected_srl in tags2

    return None


def evaluate_srl_tests(dataset_path, model_choice="lr"):
    """
    Evaluate Semantic Role Labeling tests from a dataset and show results in a table.

    Parameters:
    - dataset_path: Path to the dataset.json file. It is read as UTF-8 and must contain a
      "capabilities" mapping of capability -> test type -> test number -> list of examples.
    - model_choice: "lr" uses the logistic-regression model; any other value
      (conventionally "bert") uses the transformer model.

    Returns:
    - A pandas DataFrame with one row per (capability, test type, test number) and the
      columns Capability, Test Number, Test Type, Total Examples, Failures, Failure Rate.
      Examples whose test type is not MFT, DIR, INV or specialDIR are not counted.

    Uses the notebook-level models and helpers (lr_standalone_function, predict_srl_tags,
    logistic_regression_model, feature_encoders, label_encoder, trainer, label_list).
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    results = []

    for capability, tests in dataset["capabilities"].items():
        for test_type, test_groups in tests.items():
            for test_number, examples in test_groups.items():
                outcomes = [
                    _example_passes(test_type, example, model_choice)
                    for example in examples
                ]
                counted = [outcome for outcome in outcomes if outcome is not None]

                total = len(counted)
                failure = sum(1 for outcome in counted if not outcome)
                failure_rate = failure / total if total > 0 else 0

                results.append({
                    "Capability": capability,
                    "Test Number": test_number,
                    "Test Type": test_type,
                    "Total Examples": total,
                    "Failures": failure,
                    "Failure Rate": f"{failure_rate:.2%}"
                })

    results_df = pd.DataFrame(results)

    return results_df



Logistic Regression Performance

In [14]:
# Run the whole challenge set with the logistic-regression model; the bare expression on
# the last line displays the resulting table.
results_table = evaluate_srl_tests("dataset.json", model_choice="lr")
results_table

Unnamed: 0,Capability,Test Number,Test Type,Total Examples,Failures,Failure Rate
0,distance,1,MFT,86,68,79.07%
1,distance,2,MFT,100,100,100.00%
2,spacetemp,1,DIR,59,59,100.00%
3,dativealter,1,INV,57,38,66.67%
4,negation,1,MFT,50,34,68.00%
5,head,1,MFT,50,45,90.00%
6,head,2,MFT,50,48,96.00%
7,predicatedis,1,specialDIR,6,6,100.00%


DistilBert Performance

In [16]:
# Run the same challenge set with the transformer model (any model_choice other than "lr"
# selects it) and display the table.
results_table2 = evaluate_srl_tests("dataset.json", model_choice="bert")
results_table2

Unnamed: 0,Capability,Test Number,Test Type,Total Examples,Failures,Failure Rate
0,distance,1,MFT,86,4,4.65%
1,distance,2,MFT,100,6,6.00%
2,spacetemp,1,DIR,59,8,13.56%
3,dativealter,1,INV,57,12,21.05%
4,negation,1,MFT,50,0,0.00%
5,head,1,MFT,50,29,58.00%
6,head,2,MFT,50,24,48.00%
7,predicatedis,1,specialDIR,6,6,100.00%


In [17]:
# Re-display of the transformer results computed in the previous cell (no new computation).
results_table2

Unnamed: 0,Capability,Test Number,Test Type,Total Examples,Failures,Failure Rate
0,distance,1,MFT,86,4,4.65%
1,distance,2,MFT,100,6,6.00%
2,spacetemp,1,DIR,59,8,13.56%
3,dativealter,1,INV,57,12,21.05%
4,negation,1,MFT,50,0,0.00%
5,head,1,MFT,50,29,58.00%
6,head,2,MFT,50,24,48.00%
7,predicatedis,1,specialDIR,6,6,100.00%
