In [None]:
# Install required packages
# !pip install -r requirements.txt

In [1]:
import spacy
from spacy.tokens import Token, DocBin, Doc, Span
from spacy.training import Example, offsets_to_biluo_tags
import spacy_udpipe
from sklearn.metrics import classification_report

from parc3corpus import Parc3Corpus
from df_corpus import CsvCorpus
import verb_cue_classifier
import content_classifier
import source_classifier
import content_resolver
import source_resolver
import quote_resolver

2023-04-05 00:20:33.374896: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-04-05 00:20:33.374928: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-04-05 00:20:35.351454: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-04-05 00:20:35.351476: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-04-05 00:20:35.351492: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (lordinateur): /proc/driver/nvidia/version does not exist


In [2]:
def init_pipeline(nlp, text_features=False, ner=False):
    """Append the quotation-extraction components to ``nlp`` and return it.

    Args:
        nlp: pipeline object exposing ``add_pipe``; it is modified in place.
        text_features: when true, the ``*_text_features`` extractors and the
            ``*_text_classifier`` components are added instead of the plain
            ``content_classifier`` / ``source_classifier``.
        ner: when true, the ``ner`` component of the model loaded from
            ``verb-cue-classifier/output/model-best`` is inserted under the
            name ``ner_vcc`` before the pipeline's existing ``ner`` component.

    Returns:
        The same ``nlp`` object that was passed in.
    """
    if ner:
        cue_model = spacy.load("verb-cue-classifier/output/model-best")
        nlp.add_pipe("ner", source=cue_model, name="ner_vcc", before="ner")

    # The classifier stage of each element depends on the feature mode.
    if text_features:
        content_stage = ['content_classifier_text_features', 'content_text_classifier']
        source_stage = ['source_classifier_text_features', 'source_text_classifier']
    else:
        content_stage = ['content_classifier']
        source_stage = ['source_classifier']

    # Components are appended strictly in this order.
    component_names = (
        ['verb_cue_classifier', 'content_classifier_features']
        + content_stage
        + ['source_classifier_features']
        + source_stage
        + ['content_resolver', 'source_resolver', 'quote_resolver']
    )
    for component_name in component_names:
        nlp.add_pipe(component_name)
    return nlp

def span_after_alignment(span, example):
    """Map ``span`` onto ``example.reference`` through ``example.alignment.x2y``.

    The returned span starts at the first index aligned with ``span``'s first
    token and ends right after the last index aligned with its last token.
    """
    x2y = example.alignment.x2y
    first_aligned = x2y[span.start][0]
    last_aligned = x2y[span.end - 1][-1]
    return Span(example.reference, first_aligned, last_aligned + 1)

def is_span_in(span, arr):
    """Return True when ``arr`` holds an item with the same ``start`` and ``end`` as ``span``."""
    return any(
        span.start == candidate.start and span.end == candidate.end
        for candidate in arr
    )

def get_other_span(span, arr):
    """Return the first item of ``arr`` whose ``start`` and ``end`` equal those of ``span``, else None."""
    same_boundaries = (
        candidate
        for candidate in arr
        if span.start == candidate.start and span.end == candidate.end
    )
    return next(same_boundaries, None)

def exact_matching_metrics(corpus):
    """Compute cue-level precision, recall and F1 for the predicted quotations.

    Every example's predicted doc is run through the notebook-level ``nlp``
    pipeline. A predicted cue counts as a true positive when its aligned span
    has the same boundaries as one of the reference ``verb_cues`` and every
    predicted content span and source span attached to it aligns to a span
    attached to that reference cue; any other predicted cue is a false
    positive.

    NOTE(review): reference content/source spans that have no predicted
    counterpart do not prevent a match -- confirm this leniency is intended.

    Args:
        corpus: iterable of examples exposing ``predicted``, ``reference`` and
            ``alignment``.

    Returns:
        ``(precision, recall, f1)``. ``precision`` is 0.0 when no cue was
        predicted and ``recall`` is 0.0 when the references hold no cue
        (instead of raising ``ZeroDivisionError``); ``f1`` is ``None`` when
        precision and recall are both zero.
    """
    tp = 0
    fp = 0
    true_count = 0
    for ex in corpus:
        # Relies on the notebook-level `nlp` pipeline.
        doc = nlp(ex.predicted)
        other = ex.reference
        true_count += len(other._.cue_to_content)
        for cue, content_spans in doc._.cue_to_content.items():
            other_cue = span_after_alignment(cue, ex)
            match = True

            # Swap the projected span for the reference's own cue object so it
            # can be used as a key into the reference mappings.
            other_cue = get_other_span(other_cue, other._.verb_cues)
            if other_cue is None:
                match = False
            else:
                for content in content_spans:
                    other_content = span_after_alignment(content, ex)
                    if not is_span_in(other_content, other._.cue_to_content[other_cue]):
                        match = False
                        break
                for source in doc._.cue_to_source[cue]:
                    other_source = span_after_alignment(source, ex)
                    if not is_span_in(other_source, other._.cue_to_source[other_cue]):
                        match = False
                        break
            if match:
                tp += 1
            else:
                fp += 1

    # Guard both ratios: an empty corpus, a pipeline that predicts no cue, or
    # references without cues would otherwise divide by zero.
    predicted_count = tp + fp
    precision = tp / predicted_count if predicted_count else 0.0
    recall = tp / true_count if true_count else 0.0
    if precision + recall != 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = None
    return precision, recall, f1

In [3]:
def spans_to_label(example, span_f):
    """Build per-token labels for the predicted doc and matching reference labels.

    Args:
        example: object exposing ``predicted``, ``reference`` and ``alignment``.
        span_f: callable returning the spans of interest for a given doc.

    Returns:
        ``(tags_pred, tags_true)``: two lists with one entry per predicted
        token, each entry being ``'lbl'`` or ``'None'``.
    """
    label = 'lbl'

    def token_tags(doc):
        # Every tag other than 'O' is collapsed into the single label.
        offsets = [(s.start_char, s.end_char, label) for s in span_f(doc)]
        return ['None' if tag == 'O' else label
                for tag in offsets_to_biluo_tags(doc, offsets)]

    tags_pred = token_tags(example.predicted)
    reference_tags = token_tags(example.reference)

    # Re-express the reference labels on the predicted tokenization: a
    # predicted token is labelled as soon as one of the reference tokens
    # aligned with it is labelled.
    x2y = example.alignment.x2y
    tags_true = []
    for idx in range(len(tags_pred)):
        aligned_labels = {reference_tags[i] for i in x2y[idx]}
        tags_true.append(label if label in aligned_labels else 'None')
    return tags_pred, tags_true

def per_label_metrics(examples):
    """Print token-level classification reports for sources, cues and contents.

    Each example's predicted doc is run through the notebook-level ``nlp``
    pipeline; token labels are then collected with ``spans_to_label`` and
    accumulated over all examples.

    Returns:
        Always 0; the reports are only printed.
    """
    # Listed in the order in which labels are extracted for each example.
    span_getters = (
        ("Cue", lambda d: d._.verb_cues),
        ("Content", lambda d: d._.content_spans),
        ("Source", lambda d: d._.source_spans),
    )
    predicted = {name: [] for name, _ in span_getters}
    gold = {name: [] for name, _ in span_getters}

    for ex in examples:
        # The returned doc is not used: spans are read back from ex.predicted.
        nlp(ex.predicted)
        for name, getter in span_getters:
            pred_tags, true_tags = spans_to_label(ex, getter)
            predicted[name].extend(pred_tags)
            gold[name].extend(true_tags)

    for name in ("Source", "Cue", "Content"):
        print(name)
        print(classification_report(gold[name], predicted[name]))

    return 0

In [4]:
def per_sentence_labels(examples):
    """Collect, per reference sentence, whether it contains any content token.

    Tokens covered by a content span are flagged through the
    ``sent_content`` Token extension (registered here when missing) on both
    the predicted and the reference doc. For the predicted side, each
    reference sentence is first projected onto the predicted doc through
    ``alignment.y2x``.

    Returns:
        ``(pred_sent_labels, true_sent_labels)``: two lists of booleans with
        one entry per reference sentence.
    """
    marker = 'content'
    pred_sent_labels = []
    true_sent_labels = []

    if not Token.has_extension("sent_content"):
        Token.set_extension("sent_content", default='None')

    for ex in examples:
        # Relies on the notebook-level `nlp` pipeline.
        doc = nlp(ex.predicted)

        # Flag content tokens on the predicted doc first, then the reference.
        for side in (ex.predicted, ex.reference):
            for content_span in side._.content_spans:
                for token in content_span:
                    token._.sent_content = marker

        y2x = ex.alignment.y2x
        for sent in ex.reference.sents:
            projected = Span(doc, y2x[sent.start][0], y2x[sent.end - 1][-1] + 1)
            pred_sent_labels.append(
                any(token._.sent_content == marker for token in projected)
            )

        for sent in ex.reference.sents:
            true_sent_labels.append(
                any(token._.sent_content == marker for token in sent)
            )

    return pred_sent_labels, true_sent_labels

### Czech

Data loading

In [5]:
# Build the Czech corpus from the token-level and sentence-level CSV files.
cs = CsvCorpus("./data/cs_tokens.csv", "./data/cs_sentences.csv")

#### Evaluation of the system trained without text features

Initialization of the Czech language model and initialization of a pipeline for a system without text features

In [6]:
# Rebinds the notebook-level `nlp`, which the metric functions read as a global.
nlp = init_pipeline(spacy_udpipe.load("cs"), text_features=False)
# Prints the (precision, recall, f1) tuple of the exact-match evaluation.
print(exact_matching_metrics(cs(nlp)))

(0.0, 0.0, None)


Metrics for each element of a quotation (source, cue, content)

In [27]:
# Prints the Source/Cue/Content reports; `p` only receives the constant 0 the function returns.
p = per_label_metrics(cs(nlp))

Source
              precision    recall  f1-score   support

        None       1.00      0.71      0.83      4012
         lbl       0.17      0.96      0.29       250

    accuracy                           0.73      4262
   macro avg       0.58      0.84      0.56      4262
weighted avg       0.95      0.73      0.80      4262

Cue
              precision    recall  f1-score   support

        None       1.00      1.00      1.00      4137
         lbl       0.92      0.84      0.88       125

    accuracy                           0.99      4262
   macro avg       0.96      0.92      0.94      4262
weighted avg       0.99      0.99      0.99      4262

Content
              precision    recall  f1-score   support

        None       0.92      0.43      0.59      2715
         lbl       0.48      0.94      0.64      1547

    accuracy                           0.61      4262
   macro avg       0.70      0.68      0.61      4262
weighted avg       0.76      0.61      0.60      4262



Generation of labels for sentence-based metrics

In [46]:
# One boolean per reference sentence: does it contain a content token (predicted vs. reference)?
pred_sent_labels, true_sent_labels = per_sentence_labels(cs(nlp))

Evaluation of the model per sentence

In [47]:
# Reference labels are passed first, predictions second.
print(classification_report(true_sent_labels, pred_sent_labels))

              precision    recall  f1-score   support

       False       1.00      0.59      0.74       108
        True       0.71      1.00      0.83       108

    accuracy                           0.80       216
   macro avg       0.86      0.80      0.79       216
weighted avg       0.86      0.80      0.79       216



#### Evaluation of the system trained with text features

Initialization of the Czech language model and initialization of a pipeline for a system with text features

In [15]:
# Rebinds the notebook-level `nlp` to the Czech pipeline with the text-feature classifiers.
nlp = init_pipeline(spacy_udpipe.load("cs"), text_features=True)
# Prints the (precision, recall, f1) tuple of the exact-match evaluation.
print(exact_matching_metrics(cs(nlp)))

(0.017699115044247787, 0.018018018018018018, 0.01785714285714286)


In [49]:
# Per-token reports for the Czech pipeline with text features; `p` is always 0.
p = per_label_metrics(cs(nlp))

Source
              precision    recall  f1-score   support

        None       0.98      0.94      0.96      4012
         lbl       0.40      0.68      0.50       250

    accuracy                           0.92      4262
   macro avg       0.69      0.81      0.73      4262
weighted avg       0.94      0.92      0.93      4262

Cue
              precision    recall  f1-score   support

        None       1.00      1.00      1.00      4137
         lbl       0.92      0.84      0.88       125

    accuracy                           0.99      4262
   macro avg       0.96      0.92      0.94      4262
weighted avg       0.99      0.99      0.99      4262

Content
              precision    recall  f1-score   support

        None       0.99      0.65      0.79      2715
         lbl       0.62      0.99      0.76      1547

    accuracy                           0.78      4262
   macro avg       0.80      0.82      0.77      4262
weighted avg       0.86      0.78      0.78      4262



Generation of labels for sentence-based metrics

In [50]:
# Overwrites the sentence labels from the previous (no text features) run.
pred_sent_labels, true_sent_labels = per_sentence_labels(cs(nlp))

Evaluation of the model per sentence

In [51]:
# Reference labels are passed first, predictions second.
print(classification_report(true_sent_labels, pred_sent_labels))

              precision    recall  f1-score   support

       False       1.00      0.88      0.94       108
        True       0.89      1.00      0.94       108

    accuracy                           0.94       216
   macro avg       0.95      0.94      0.94       216
weighted avg       0.95      0.94      0.94       216



### Russian

In [19]:
# Build the Russian corpus from the token-level and sentence-level CSV files.
ru = CsvCorpus("./data/ru_tokens.csv", "./data/ru_sentences.csv")

#### Evaluation of the system trained without text features

Initialization of the Russian language model and initialization of a pipeline for a system without text features

In [11]:
# Download a language model for Russian
# !python -m spacy download ru_core_news_sm

In [28]:
# Rebinds the notebook-level `nlp` to the Russian pipeline without text features.
nlp = init_pipeline(spacy.load("ru_core_news_sm"), text_features=False)
# Prints the (precision, recall, f1) tuple of the exact-match evaluation.
print(exact_matching_metrics(ru(nlp)))

(0.0, 0.0, None)


In [54]:
# Per-token Source/Cue/Content reports for Russian without text features; `p` is always 0.
p = per_label_metrics(ru(nlp))

Source
              precision    recall  f1-score   support

        None       0.99      0.89      0.94      4217
         lbl       0.28      0.84      0.42       208

    accuracy                           0.89      4425
   macro avg       0.64      0.87      0.68      4425
weighted avg       0.96      0.89      0.92      4425

Cue
              precision    recall  f1-score   support

        None       0.99      1.00      0.99      4299
         lbl       0.81      0.71      0.75       126

    accuracy                           0.99      4425
   macro avg       0.90      0.85      0.87      4425
weighted avg       0.99      0.99      0.99      4425

Content
              precision    recall  f1-score   support

        None       0.87      0.87      0.87      2920
         lbl       0.74      0.74      0.74      1505

    accuracy                           0.82      4425
   macro avg       0.80      0.80      0.80      4425
weighted avg       0.82      0.82      0.82      4425



In [57]:
# Generation of labels for sentence-based metrics (Russian, no text features).
pred_sent_labels, true_sent_labels = per_sentence_labels(ru(nlp))

In [58]:
# Evaluation of the model per sentence: reference labels first, predictions second.
print(classification_report(true_sent_labels, pred_sent_labels))

              precision    recall  f1-score   support

       False       0.84      0.95      0.90       108
        True       0.94      0.81      0.87       100

    accuracy                           0.88       208
   macro avg       0.89      0.88      0.88       208
weighted avg       0.89      0.88      0.88       208



#### Evaluation of the system trained with text features

Initialization of the Russian language model and initialization of a pipeline for a system with text features

In [29]:
# Rebinds the notebook-level `nlp` to the Russian pipeline with the text-feature classifiers.
nlp = init_pipeline(spacy.load("ru_core_news_sm"), text_features=True)
# Prints the (precision, recall, f1) tuple of the exact-match evaluation.
print(exact_matching_metrics(ru(nlp)))

(0.0, 0.0, None)


In [60]:
# Per-token Source/Cue/Content reports for Russian with text features; `p` is always 0.
p = per_label_metrics(ru(nlp))

Source
              precision    recall  f1-score   support

        None       0.98      0.95      0.97      4217
         lbl       0.40      0.65      0.50       208

    accuracy                           0.94      4425
   macro avg       0.69      0.80      0.73      4425
weighted avg       0.95      0.94      0.94      4425

Cue
              precision    recall  f1-score   support

        None       0.99      1.00      0.99      4299
         lbl       0.81      0.71      0.75       126

    accuracy                           0.99      4425
   macro avg       0.90      0.85      0.87      4425
weighted avg       0.99      0.99      0.99      4425

Content
              precision    recall  f1-score   support

        None       0.96      0.78      0.86      2920
         lbl       0.68      0.94      0.79      1505

    accuracy                           0.83      4425
   macro avg       0.82      0.86      0.82      4425
weighted avg       0.87      0.83      0.84      4425



In [61]:
# Generation of labels for sentence-based metrics (Russian, text features).
pred_sent_labels, true_sent_labels = per_sentence_labels(ru(nlp))

In [62]:
# Evaluation of the model per sentence: reference labels first, predictions second.
print(classification_report(true_sent_labels, pred_sent_labels))

              precision    recall  f1-score   support

       False       0.95      0.94      0.94       108
        True       0.93      0.95      0.94       100

    accuracy                           0.94       208
   macro avg       0.94      0.94      0.94       208
weighted avg       0.94      0.94      0.94       208



### English

In [77]:
# NOTE: the English dataset isn't provided; this cell expects it at the path below.
en = Parc3Corpus('./data/PARC3_complete/test/') #English dataset isn't provided

#### Evaluation of the system trained without text features

In [81]:
# Rebinds the notebook-level `nlp`; ner=True also loads the `ner` component from
# verb-cue-classifier/output/model-best and inserts it as `ner_vcc`.
nlp = init_pipeline(spacy.load("en_core_web_sm"), text_features=False, ner=True)
# Prints the (precision, recall, f1) tuple of the exact-match evaluation.
print(exact_matching_metrics(en(nlp)))



(0.4541176470588235, 0.37658536585365854, 0.41173333333333334)


In [7]:
# Per-token Source/Cue/Content reports for English without text features; `p` is always 0.
p = per_label_metrics(en(nlp))

Source
              precision    recall  f1-score   support

        None       0.98      0.98      0.98     54277
         lbl       0.77      0.71      0.74      3920

    accuracy                           0.97     58197
   macro avg       0.87      0.85      0.86     58197
weighted avg       0.97      0.97      0.97     58197

Cue
              precision    recall  f1-score   support

        None       0.99      1.00      1.00     56916
         lbl       0.84      0.72      0.78      1281

    accuracy                           0.99     58197
   macro avg       0.92      0.86      0.89     58197
weighted avg       0.99      0.99      0.99     58197

Content
              precision    recall  f1-score   support

        None       0.89      0.94      0.91     38269
         lbl       0.87      0.78      0.82     19928

    accuracy                           0.88     58197
   macro avg       0.88      0.86      0.87     58197
weighted avg       0.88      0.88      0.88     58197



In [27]:
# Generation of labels for sentence-based metrics (English, no text features).
pred_sent_labels, true_sent_labels = per_sentence_labels(en(nlp))

In [28]:
# Evaluation of the model per sentence: reference labels first, predictions second.
print(classification_report(true_sent_labels, pred_sent_labels))

              precision    recall  f1-score   support

       False       0.82      0.95      0.88      1306
        True       0.93      0.75      0.83      1110

    accuracy                           0.86      2416
   macro avg       0.88      0.85      0.86      2416
weighted avg       0.87      0.86      0.86      2416



#### Evaluation of the system trained with text features

In [80]:
# Rebinds the notebook-level `nlp` to the English pipeline with text features and the
# extra `ner_vcc` component.
nlp = init_pipeline(spacy.load("en_core_web_sm"), text_features=True, ner=True)
# Prints the (precision, recall, f1) tuple of the exact-match evaluation.
print(exact_matching_metrics(en(nlp)))
# NOTE(review): `f1_bbc` is not defined in the visible part of this notebook -- leftover
# from an earlier experiment? Confirm before re-enabling.
#print(f1_bbc(en(nlp)))



(0.4517593643586833, 0.3882926829268293, 0.4176285414480587)


In [9]:
# Per-token Source/Cue/Content reports for English with text features; `p` is always 0.
p = per_label_metrics(en(nlp))

Source
              precision    recall  f1-score   support

        None       0.98      0.99      0.98     54277
         lbl       0.80      0.77      0.78      3920

    accuracy                           0.97     58197
   macro avg       0.89      0.88      0.88     58197
weighted avg       0.97      0.97      0.97     58197

Cue
              precision    recall  f1-score   support

        None       0.99      1.00      1.00     56916
         lbl       0.84      0.72      0.78      1281

    accuracy                           0.99     58197
   macro avg       0.92      0.86      0.89     58197
weighted avg       0.99      0.99      0.99     58197

Content
              precision    recall  f1-score   support

        None       0.89      0.94      0.91     38269
         lbl       0.87      0.78      0.82     19928

    accuracy                           0.89     58197
   macro avg       0.88      0.86      0.87     58197
weighted avg       0.88      0.89      0.88     58197



In [30]:
# Generation of labels for sentence-based metrics (English, text features).
pred_sent_labels, true_sent_labels = per_sentence_labels(en(nlp))

In [31]:
# Evaluation of the model per sentence: reference labels first, predictions second.
print(classification_report(true_sent_labels, pred_sent_labels))

              precision    recall  f1-score   support

       False       0.83      0.95      0.89      1306
        True       0.93      0.77      0.85      1110

    accuracy                           0.87      2416
   macro avg       0.88      0.86      0.87      2416
weighted avg       0.88      0.87      0.87      2416

