In [None]:
# Install required packages
# !pip install -r requirements.txt

In [6]:
import spacy
from spacy.tokens import Token, DocBin, Doc, Span
from spacy.training import Example, offsets_to_biluo_tags
import spacy_udpipe
from sklearn.metrics import classification_report

import baseline
from parc3corpus import Parc3Corpus
from df_corpus import CsvCorpus

In [4]:
def spans_to_label(example, span_f):
    """Convert the spans of an example into two parallel per-token tag lists.

    Args:
        example: object exposing `.predicted` and `.reference` docs and an
            `.alignment` whose `x2y[i]` yields the reference-token indices
            matched to predicted token `i`.
        span_f: callable taking a doc and returning the spans of interest
            (each span must provide `start_char` and `end_char`).

    Returns:
        A `(tags_pred, tags_true)` tuple of equally long lists, one entry per
        token of `example.predicted`. Every entry is either 'lbl' (token is
        covered by a span) or the string 'None'.
    """
    label = 'lbl'

    def _binary_tags(doc):
        # Express the spans as character offsets, let spaCy derive BILUO tags,
        # then collapse every tag other than 'O' onto the single label.
        offsets = [(span.start_char, span.end_char, label) for span in span_f(doc)]
        return [label if tag != 'O' else 'None'
                for tag in offsets_to_biluo_tags(doc, offsets)]

    tags_pred = _binary_tags(example.predicted)
    reference_tags = _binary_tags(example.reference)

    # Project the reference tags onto the predicted tokenization: a predicted
    # token counts as labelled as soon as at least one of the reference tokens
    # aligned to it is labelled.
    tags_true = [
        label
        if label in {reference_tags[j] for j in example.alignment.x2y[i]}
        else 'None'
        for i in range(len(tags_pred))
    ]
    return tags_pred, tags_true

def per_label_metrics(examples):
    """Print per-token classification reports for source, cue and content spans.

    For every example the predicted doc is first passed through the
    module-level `nlp` pipeline. The spans stored in the `verb_cues`,
    `content_spans` and `source_spans` custom doc attributes are then turned
    into per-token tags by `spans_to_label` and accumulated over all examples.

    Args:
        examples: iterable of example objects accepted by `spans_to_label`.

    Returns:
        Always 0; the three reports are only printed (Source, Cue, Content).
    """
    # Extraction order mirrors the order in which the spans are converted.
    extractors = (
        ("Cue", lambda doc: doc._.verb_cues),
        ("Content", lambda doc: doc._.content_spans),
        ("Source", lambda doc: doc._.source_spans),
    )
    # title -> (predicted tags, true tags)
    collected = {title: ([], []) for title, _ in extractors}

    for ex in examples:
        # The return value is not needed here; the call is kept for its effect
        # on ex.predicted (presumably the pipeline fills in the span
        # attributes read below -- TODO confirm).
        nlp(ex.predicted)
        for title, extract in extractors:
            pred_tags, true_tags = spans_to_label(ex, extract)
            collected[title][0].extend(pred_tags)
            collected[title][1].extend(true_tags)

    for title in ("Source", "Cue", "Content"):
        pred_tags, true_tags = collected[title]
        print(title)
        print(classification_report(true_tags, pred_tags))

    return 0

In [7]:
def per_sentence_labels(examples):
    """Derive sentence-level "contains quoted content" labels.

    Every token inside a content span (custom doc attribute `content_spans`)
    of either doc of an example is marked through the `sent_content` token
    extension. Each sentence of the reference doc then yields two booleans:
    whether the aligned token range of the predicted doc contains a marked
    token, and whether the reference sentence itself does.

    Args:
        examples: iterable of example objects exposing `.predicted`,
            `.reference` and `.alignment` (with `y2x`).

    Returns:
        `(pred_sent_labels, true_sent_labels)`: two lists of booleans with one
        entry per reference sentence, in iteration order.
    """
    if not Token.has_extension("sent_content"):
        Token.set_extension("sent_content", default='None')

    def _mark_content(doc_):
        # Flag every token that lies inside one of the doc's content spans.
        for span in doc_._.content_spans:
            for tok in span:
                tok._.sent_content = 'content'

    def _has_content(tokens):
        return any(tok._.sent_content == 'content' for tok in tokens)

    pred_sent_labels, true_sent_labels = [], []

    for ex in examples:
        # NOTE(review): the marks are written through ex.predicted but read
        # through `doc`; this assumes nlp() hands back the very Doc it was
        # given -- TODO confirm.
        doc = nlp(ex.predicted)
        _mark_content(ex.predicted)
        _mark_content(ex.reference)

        # Predicted side: map the reference sentence boundaries onto predicted
        # tokens (first aligned token of the sentence start, last aligned
        # token of the sentence end) and inspect that token range.
        pred_sent_labels.extend(
            _has_content(Span(doc,
                              ex.alignment.y2x[sent.start][0],
                              ex.alignment.y2x[sent.end - 1][-1] + 1))
            for sent in ex.reference.sents
        )

        # Reference side: inspect the sentence's own tokens.
        true_sent_labels.extend(_has_content(sent) for sent in ex.reference.sents)

    return pred_sent_labels, true_sent_labels

## English

Data loading

In [3]:
# Build the English corpus object from the PARC3 test directory.
en = Parc3Corpus('./data/PARC3_complete/test/') # NOTE: the English dataset isn't provided, so this directory has to exist locally

Initialization of the English language model and a custom-made pipeline for a baseline rule-based model

In [None]:
# Download a language model for English
# !python -m spacy download en_core_web_sm

In [3]:
# Load the small English spaCy model and append the rule-based attribution
# component. NOTE: `nlp` is a module-level name that per_label_metrics and
# per_sentence_labels read directly, so this cell must run before they are
# called for English.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('rule_based_attribution')

Metrics for each element of a quotation (source, cue, content)

In [8]:
# Print the per-token reports for English. `en(nlp)` supplies the examples;
# `p` only receives the function's constant return value (0).
p = per_label_metrics(en(nlp))

Source
              precision    recall  f1-score   support

        None       0.94      1.00      0.97     54277
         lbl       0.95      0.13      0.22      3920

    accuracy                           0.94     58197
   macro avg       0.94      0.56      0.60     58197
weighted avg       0.94      0.94      0.92     58197

Cue
              precision    recall  f1-score   support

        None       0.98      1.00      0.99     56916
         lbl       0.94      0.23      0.37      1281

    accuracy                           0.98     58197
   macro avg       0.96      0.61      0.68     58197
weighted avg       0.98      0.98      0.98     58197

Content
              precision    recall  f1-score   support

        None       0.74      0.99      0.85     38269
         lbl       0.94      0.33      0.49     19928

    accuracy                           0.76     58197
   macro avg       0.84      0.66      0.67     58197
weighted avg       0.81      0.76      0.72     58197



Generation of labels for sentence-based metrics

In [9]:
# Materialise the English examples into a list and derive one predicted and
# one true boolean per reference sentence.
pred_sent_labels, true_sent_labels = per_sentence_labels(list(en(nlp)))

Evaluation of the model per sentence

In [10]:
# Sentence-level report for English: true labels first, predicted labels second.
print(classification_report(true_sent_labels, pred_sent_labels))

              precision    recall  f1-score   support

       False       0.66      0.97      0.78      1306
        True       0.92      0.40      0.56      1110

    accuracy                           0.71      2416
   macro avg       0.79      0.69      0.67      2416
weighted avg       0.78      0.71      0.68      2416



## Czech

Data loading

In [14]:
# Czech corpus built from two CSV files. The first path follows the same
# `<lang>_tokens.csv` / `<lang>_sentences.csv` naming scheme as the Russian
# corpus cell; it previously read "./data/okens.csv", which looks like a
# truncated file name.
# NOTE(review): confirm that ./data/cs_tokens.csv is the actual file name.
cs = CsvCorpus("./data/cs_tokens.csv", "./data/cs_sentences.csv")

Initialization of the Czech language model and a custom-made pipeline for a baseline rule-based model

In [9]:
# Rebind the module-level `nlp` to a Czech pipeline loaded through
# spacy_udpipe and append the rule-based attribution component. Everything
# evaluated after this cell uses the Czech pipeline.
nlp = spacy_udpipe.load("cs")
nlp.add_pipe('rule_based_attribution')

<baseline.RuleBasedAttribution at 0x7f9e8689bcd0>

Metrics for each element of a quotation (source, cue, content)

In [10]:
# Print the per-token reports for Czech. `cs(nlp)` supplies the examples;
# `p` only receives the function's constant return value (0).
p = per_label_metrics(cs(nlp))

1 quotation marks found, indicating an unclosed quotation; given the limitations of this method, it's safest to bail out rather than guess which quotation is unclosed
Source
              precision    recall  f1-score   support

        None       0.95      1.00      0.97      4012
         lbl       1.00      0.12      0.21       250

    accuracy                           0.95      4262
   macro avg       0.97      0.56      0.59      4262
weighted avg       0.95      0.95      0.93      4262

Cue
              precision    recall  f1-score   support

        None       0.98      1.00      0.99      4137
         lbl       1.00      0.23      0.38       125

    accuracy                           0.98      4262
   macro avg       0.99      0.62      0.68      4262
weighted avg       0.98      0.98      0.97      4262

Content
              precision    recall  f1-score   support

        None       0.74      0.99      0.85      2715
         lbl       0.97      0.40      0.56      1547

Generation of labels for sentence-based metrics

In [43]:
# Materialise the Czech examples into a list and derive one predicted and
# one true boolean per reference sentence.
pred_sent_labels, true_sent_labels = per_sentence_labels(list(cs(nlp)))

1 quotation marks found, indicating an unclosed quotation; given the limitations of this method, it's safest to bail out rather than guess which quotation is unclosed


Evaluation of the model per sentence

In [44]:
# Sentence-level report for Czech: true labels first, predicted labels second.
print(classification_report(true_sent_labels, pred_sent_labels))

              precision    recall  f1-score   support

       False       0.61      0.96      0.75       108
        True       0.91      0.39      0.55       108

    accuracy                           0.68       216
   macro avg       0.76      0.68      0.65       216
weighted avg       0.76      0.68      0.65       216



## Russian

Data loading

In [12]:
# Russian corpus built from two CSV files (a tokens file and a sentences
# file, judging by their names).
ru = CsvCorpus("./data/ru_tokens.csv", "./data/ru_sentences.csv")

Initialization of the Russian language model and a custom-made pipeline for a baseline rule-based model

In [16]:
# Download a language model for Russian
# !python -m spacy download ru_core_news_sm

In [50]:
# Rebind the module-level `nlp` to the small Russian spaCy model and append
# the rule-based attribution component. Everything evaluated after this cell
# uses the Russian pipeline.
nlp = spacy.load("ru_core_news_sm")
nlp.add_pipe('rule_based_attribution')


[W111] Jupyter notebook detected: if using `prefer_gpu()` or `require_gpu()`, include it in the same cell right before `spacy.load()` to ensure that the model is loaded on the correct device. More information: http://spacy.io/usage/v3#jupyter-notebook-gpu



<baseline.RuleBasedAttribution at 0x7f73310d7be0>

Metrics for each element of a quotation (source, cue, content)

In [47]:
# Print the per-token reports for Russian. `ru(nlp)` supplies the examples;
# `p` only receives the function's constant return value (0).
p = per_label_metrics(ru(nlp))

1 quotation marks found, indicating an unclosed quotation; given the limitations of this method, it's safest to bail out rather than guess which quotation is unclosed
1 quotation marks found, indicating an unclosed quotation; given the limitations of this method, it's safest to bail out rather than guess which quotation is unclosed
Source
              precision    recall  f1-score   support

        None       0.96      1.00      0.98      4217
         lbl       0.90      0.18      0.30       208

    accuracy                           0.96      4425
   macro avg       0.93      0.59      0.64      4425
weighted avg       0.96      0.96      0.95      4425

Cue
              precision    recall  f1-score   support

        None       0.98      1.00      0.99      4299
         lbl       0.95      0.31      0.47       126

    accuracy                           0.98      4425
   macro avg       0.97      0.65      0.73      4425
weighted avg       0.98      0.98      0.97      4425

Content

Generation of labels for sentence-based metrics

In [51]:
# Materialise the Russian examples into a list and derive one predicted and
# one true boolean per reference sentence.
pred_sent_labels, true_sent_labels = per_sentence_labels(list(ru(nlp)))

1 quotation marks found, indicating an unclosed quotation; given the limitations of this method, it's safest to bail out rather than guess which quotation is unclosed
1 quotation marks found, indicating an unclosed quotation; given the limitations of this method, it's safest to bail out rather than guess which quotation is unclosed


Evaluation of the model per sentence

In [52]:
# Sentence-level report for Russian: true labels first, predicted labels second.
print(classification_report(true_sent_labels, pred_sent_labels))

              precision    recall  f1-score   support

       False       0.68      0.93      0.78       108
        True       0.87      0.53      0.66       100

    accuracy                           0.74       208
   macro avg       0.77      0.73      0.72       208
weighted avg       0.77      0.74      0.72       208

