# Main

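Using the notebook allows you to load the models once and reuse them as many times as
you want. It also makes the workflow more resilient to errors: if a cell fails (for
example, because of a bad path), you can fix the problem and re-run that cell without
reloading the models.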

In [1]:
from pathlib import Path

# Tags: INFO_TAG selects the corpus sub-directory under each stage directory,
# PROCESS_TAG selects the batch of documents to run inference on.
INFO_TAG = "persuasive_essays_paragraph"
PROCESS_TAG = "granma_letters"
BASE_DATA = Path("data")

# Language pair handed to the pipelines in the cells below.
SOURCE_LANGUAGE = "english"
TARGET_LANGUAGE = "spanish"

# Corpus Projection -- one directory per stage, each keyed by INFO_TAG.
CORPUS = BASE_DATA.joinpath("corpus", INFO_TAG)
PROCESSED_CORPUS = BASE_DATA.joinpath("parsed_to_conll", INFO_TAG)
SENTENCE_ALIGN = BASE_DATA.joinpath("sentence_alignment", INFO_TAG)
BIDIRECTIONAL_ALIGN = BASE_DATA.joinpath("bidirectional_alignment", INFO_TAG)
PROJECTION = BASE_DATA.joinpath("projection", INFO_TAG)

# Link Prediction -- outputs are nested under INFO_TAG, then PROCESS_TAG.
TO_PROCESS = BASE_DATA.joinpath("to_process", PROCESS_TAG)
SEGMENTER = BASE_DATA.joinpath("segmenter_processed", INFO_TAG, PROCESS_TAG)
LINK_PREDICTION = BASE_DATA.joinpath("link_prediction_processed", INFO_TAG, PROCESS_TAG)

# Export to Brat -- this path is rooted at "brat", not at BASE_DATA.
BRAT = Path("brat") / "data" / PROCESS_TAG


# Corpus Projection

In [None]:
from pipelines.corpus_pipelines import full_corpus_processing_pipeline

from aligner.aligner import AwesomeAlignAligner as Aligner
from corpus_parser.unified_parser import UnifiedParser as Parser
from projector.projector import CrossLingualAnnotationProjector as Projector
from sentence_aligner.sentence_aligner import SentenceAligner
from sentence_aligner.translator import GoogleDeepTranslator as Translator

# Run the corpus-processing pipeline over CORPUS, passing one destination
# directory per stage together with the component instances to use.
# The components are constructed inline, so no reference to them is kept in the
# notebook namespace after the call returns.
# NOTE(review): this cell is saved as `In [None]`, i.e. it was not executed in the
# saved session -- later cells presumably rely on artifacts from an earlier run; confirm.
full_corpus_processing_pipeline(
    # Input corpus and per-stage output directories (all defined in the config cell).
    corpus_dir=CORPUS,
    standard_corpus_dest_dir=PROCESSED_CORPUS,
    sentence_alignment_dest_dir=SENTENCE_ALIGN,
    bidirectional_alignment_dest_dir=BIDIRECTIONAL_ALIGN,
    projection_dest_dir=PROJECTION,
    # Pipeline components (aliases of the classes imported above).
    corpus_parser=Parser(),  # UnifiedParser
    sentence_aligner=SentenceAligner(Translator()),  # translator: GoogleDeepTranslator
    aligner=Aligner(),  # AwesomeAlignAligner
    projector=Projector(),  # CrossLingualAnnotationProjector
    # Language pair: annotations go from SOURCE_LANGUAGE to TARGET_LANGUAGE
    # (presumably -- confirm direction in the pipeline implementation).
    source_language=SOURCE_LANGUAGE,
    target_language=TARGET_LANGUAGE
)

# Link Prediction

In [2]:
from pipelines.segmenter_pipelines import perform_full_inference_pipeline, perform_segmentation_pipeline, perform_link_prediction_pipeline


In [3]:
from segmenter.tf_segmenter import TensorflowArgumentSegmenter as Segmenter

# Build the segmenter once and bind it to a notebook-level name so the inference
# cells can reuse it without constructing it again.
# It is keyed by the corpus tag and the target language; presumably this loads the
# model trained for that pair -- confirm in segmenter.tf_segmenter.
# Construction prints vocabulary/tag statistics (see the cell output).
segmenter = Segmenter(INFO_TAG, TARGET_LANGUAGE)


tag_amount 11
char_amount 89
word_amount 9007
max_word_size 19
max_seq_size 579

word_to_index
Length: 9009
First 20: [('', 0), ('[UNK]', 1), (',', 2), ('de', 3), ('.', 4), ('la', 5), ('que', 6), ('los', 7), ('en', 8), ('y', 9), ('a', 10), ('el', 11), ('las', 12), ('para', 13), ('un', 14), ('es', 15), ('una', 16), ('más', 17), ('no', 18), ('se', 19)]

tag_to_index
Length: 13
First 20: [('', 0), ('[UNK]', 1), ('I-Premise', 2), ('O', 3), ('I-Claim', 4), ('I-MajorClaim', 5), ('E-Premise', 6), ('B-Premise', 7), ('E-Claim', 8), ('B-Claim', 9), ('E-MajorClaim', 10), ('B-MajorClaim', 11), ('S-MajorClaim', 12)]

char_to_index
Length: 91
First 20: [('', 0), ('[UNK]', 1), (' ', 2), ('e', 3), ('a', 4), ('s', 5), ('o', 6), ('n', 7), ('r', 8), ('i', 9), ('d', 10), ('l', 11), ('t', 12), ('u', 13), ('c', 14), ('m', 15), ('p', 16), ('b', 17), (',', 18), ('g', 19)]
tf.Tensor(
[[b'I-Premise' b'I-Premise' b'O' b'O' b'O' b'O' b'O' b'O' b'O' b'O' b'O'
  b'O' b'O' b'O' b'O' b'O' b'O' b'O' b'O' b'O' b'O' b'B

In [3]:
from link_prediction.tf_link_predictor import TensorflowLinkPredictor as LinkPredictor

# Build the link predictor once and bind it to a notebook-level name so the
# inference cell can reuse it without constructing it again.
# It is keyed by the corpus tag and the target language; presumably this loads the
# model trained for that pair -- confirm in link_prediction.tf_link_predictor.
# Construction prints dataset/relation statistics (see the cell output).
link_predictor = LinkPredictor(INFO_TAG, TARGET_LANGUAGE)

dev relations 652
dev source argumentative units 326
dev target argumentative units 144
test relations 1618
test source argumentative units 809
test target argumentative units 365
train relations 5392
train source argumentative units 2696
train target argumentative units 1198
Vocab size 8958
Relation tags ['supports_Inverse', 'attacks', 'attacks_Inverse', 'supports']
Proposition tags ['Claim', 'Premise', 'MajorClaim']
max_size_prop 70
max_amount_doc 20
[('supports', 'Premise', 'Premise'), ('', 'Premise', 'Premise'), ('supports_Inverse', 'Premise', 'Premise')]
3


In [5]:

# Three alternative inference entry points are kept in this cell; only one is
# active at a time. Uncomment the one you need and comment out the others.

# Option 1 -- segmentation only: source_dir=TO_PROCESS, destination_dir=SEGMENTER.
# perform_segmentation_pipeline(
#     segmenter=segmenter,
#     source_dir=TO_PROCESS,
#     destination_dir=SEGMENTER,
#     language=TARGET_LANGUAGE,
# )

# Option 2 (active) -- link prediction only: source_dir=SEGMENTER,
# destination_dir=LINK_PREDICTION.
# NOTE(review): with option 1 commented out, SEGMENTER is presumably expected to
# already contain output from an earlier segmentation run -- confirm before
# running on a fresh checkout.
perform_link_prediction_pipeline(
    link_predictor=link_predictor,
    source_dir=SEGMENTER,
    destination_dir=LINK_PREDICTION,
    source_language=TARGET_LANGUAGE
)

# Option 3 -- segmenter and link predictor in a single call, starting from
# TO_PROCESS, with SEGMENTER and LINK_PREDICTION as destinations.
# perform_full_inference_pipeline(
#     segmenter=segmenter,
#     link_predictor=link_predictor,
#     source_dir=TO_PROCESS,
#     segmenter_destination_dir=SEGMENTER,
#     destination_dir=LINK_PREDICTION,
#     source_language=TARGET_LANGUAGE
# )

# Export to Brat

In [6]:
from corpus_parser.brat_parser import BratParser
from corpus_parser.conll_parser import ConllParser

# Parse everything under the link-prediction output directory with a CoNLL parser
# configured with bioes=True. The result is kept in a notebook-level name;
# presumably a mapping of dataframes, per its name -- confirm in conll_parser.
dataframes_dict = ConllParser(bioes=True).parse_dir(LINK_PREDICTION)

# Hand the parsed result to the Brat exporter, targeting the BRAT directory.
BratParser().export_from_dataframes(BRAT, dataframes_dict)
