# Classification-pipeline Demo
This is a demo version of the classification pipeline. The pipeline will be applied to a one sample document from the MAKG.

In [6]:
# Imports
from parsing.mag_parser import read_documents, parse_document
from ner.tsener import extract_entities
from usage_classification.bert_model import init_bert_model, usage_classification
from aggregation.aggregator import aggregate_probabilities, filter_entities
from os.path import join

## Step 1: Read and parse a MAG paper and extract all named entities

In [2]:
# Get the first document id from the CSV file
with open('document_id_sampled_per_year.csv', 'r') as f:
    f.readline() # header
    document_id = f.readline().strip()
    
document_id

'2118681326'

In [4]:
# Read the GROBID fulltext
with open(join('mag_papers_sample1', document_id + ".tei.xml"), 'r') as f:
    document = parse_document(f)
    
# Document is a list tuples, where each tuple consists of a section name and
# a list of all sentences inside that section.
document

[('Abstract',
  ['In Statistics-Based Summarization -Step One: Sentence Compression, Knight and Marcu [reference]) (K&M) present a noisy-channel model for sentence compression.',
   'The main difficulty in using this method is the lack of data; Knight and Marcu use a corpus of 1035 training sentences.',
   'More data is not easily available, so in addition to improving the original K&M noisy-channel model, we create unsupervised and semi-supervised models of the task.',
   'Finally, we point out problems with modeling the task in this way.',
   'They suggest areas for future research.']),
 ('Introduction',
  ['Summarization in general, and sentence compression in particular, are popular topics.',
   'Knight and Marcu (henceforth K&M) introduce the task of statistical sentence compression in Statistics-Based Summarization -Step One: Sentence Compression [reference]).',
   'The appeal of this problem is that it produces summarizations on a small scale.',
   'It simplifies general compres

In [5]:
# Perform a named entity recognition for the entity type 'method' using the trained TSE-NER model.
entities = extract_entities(document, 'method')
# entities is a list of tuples, where each tuple consists of the named entity,
# the sentence in which the entity occurs, the pre- and post-sentence and the
# section name.
entities

Tagging 4107 words


[('Marcu',
  'The main difficulty in using this method is the lack of data; Knight and Marcu use a corpus of 1035 training sentences.',
  'In Statistics-Based Summarization -Step One: Sentence Compression, Knight and Marcu [reference]) (K&M) present a noisy-channel model for sentence compression.',
  'More data is not easily available, so in addition to improving the original K&M noisy-channel model, we create unsupervised and semi-supervised models of the task.',
  'Abstract'),
 ('machine translation',
  'The K&M probabilistic model, adapted from machine translation to this task, is the noisy-channel model.',
  '',
  'In machine translation, one imagines that a string was originally in English, but that someone adds some noise to make it a foreign string.',
  'The K&M Model'),
 ('machine translation',
  'In machine translation, one imagines that a string was originally in English, but that someone adds some noise to make it a foreign string.',
  'The K&M probabilistic model, adapted f

## 2. Classify all extracted entities using the fine-tuned SciBERT model

In [8]:
# Initialize the BERT tokenizer and model
tokenizer, model = init_bert_model(finetuned_name='bert_model.bin')

In [11]:
# Probabilities is a dictionary with ne as key and a list of probabilities as value.
probabilities = {}
for entity in entities:
    # Apply BERT model
    prob = usage_classification(tokenizer, model, *entity)
    # Insert all returned probabilities into the probabilities-dictionary.
    ne = entity[0].lower()
    sentence = entity[1]
    if ne not in probabilities:
        probabilities[ne] = ([], sentence)
        probabilities[ne][0].append(prob)
        
probabilities

{'marcu': ([0.002498308662325144],
  'The main difficulty in using this method is the lack of data; Knight and Marcu use a corpus of 1035 training sentences.'),
 'machine translation': ([0.0036958118434995413],
  'The K&M probabilistic model, adapted from machine translation to this task, is the noisy-channel model.'),
 'tree': ([0.34160423278808594],
  'We do not use a packed tree structure, because we make far fewer sentences.'),
 'whereas': ([0.9868675470352173],
  'The unsupervised version does not compress at all, whereas the semi-supervised version is identical with the better supervised version.'),
 'although': ([0.9273919463157654],
  'Although the final length of the sentences is roughly the same, the unsupervised and semisupervised versions are able to take the action of deleting the parenthetical.'),
 'algorithm': ([0.0019390813540667295],
  'original: The faster transfer rate is made possible by an MTI-proprietary data buffering algorithm that off-loads lock-manager functio

## 3. Aggregate the probabilities for all extracted entities

In [12]:
# Aggregate the probabilities for each entity using majority voting.
unique_entities = aggregate_probabilities(probabilities)
# Filter out non-relevant entities:
filtered_entities = filter_entities(unique_entities)

filtered_entities

[('whereas',
  0.9868675470352173,
  1,
  'The unsupervised version does not compress at all, whereas the semi-supervised version is identical with the better supervised version.'),
 ('although',
  0.9273919463157654,
  1,
  'Although the final length of the sentences is roughly the same, the unsupervised and semisupervised versions are able to take the action of deleting the parenthetical.'),
 ('machine translation',
  0.0036958118434995413,
  1,
  'The K&M probabilistic model, adapted from machine translation to this task, is the noisy-channel model.'),
 ('marcu',
  0.002498308662325144,
  1,
  'The main difficulty in using this method is the lack of data; Knight and Marcu use a corpus of 1035 training sentences.')]