# Main

Change INFO_TAG according to the data you want to project and/or train the models with.

Change PROCESS_TAG according to the data you want to process with the trained models.

In [None]:
from pathlib import Path


# Tag of the corpus used for projection and for training the models.
# Keep exactly one of the assignments below active.
INFO_TAG = "cdcp"
# INFO_TAG = "persuasive_essays_paragraph_all_linked"
# INFO_TAG = "abstrct"

# Tag of the data set that the trained models are applied to.
# Keep exactly one of the assignments below active.
# PROCESS_TAG = "responded_granma_letters"
# PROCESS_TAG = "response_responded_granma_letters"
PROCESS_TAG = "selected_response_responded_granma_letters"


# Root directory under which every data folder below is located.
BASE_DATA = Path("data")

SOURCE_LANGUAGE = "english"
TARGET_LANGUAGE = "spanish"

# Corpus Projection: one folder per pipeline stage, keyed by INFO_TAG.
CORPUS = BASE_DATA.joinpath("corpus", INFO_TAG)
PROCESSED_CORPUS = BASE_DATA.joinpath("parsed_to_conll", INFO_TAG)
SENTENCE_ALIGN = BASE_DATA.joinpath("sentence_alignment", INFO_TAG)
BIDIRECTIONAL_ALIGN = BASE_DATA.joinpath("bidirectional_alignment", INFO_TAG)
PROJECTION = BASE_DATA.joinpath("projection", INFO_TAG)

# Link Prediction: inference inputs and outputs, keyed by PROCESS_TAG
# (outputs are additionally keyed by INFO_TAG).
TO_PROCESS = BASE_DATA.joinpath("to_process", PROCESS_TAG)
SEGMENTER = BASE_DATA.joinpath("segmenter_processed", INFO_TAG, PROCESS_TAG)
LINK_PREDICTION = BASE_DATA.joinpath(
    "link_prediction_processed", INFO_TAG, PROCESS_TAG
)

# Export to Brat
BRAT = Path("brat") / "data" / PROCESS_TAG / INFO_TAG


# Corpus Projection

Project the corpus from SOURCE_LANGUAGE to TARGET_LANGUAGE.

To change the algorithm used in a step, import a different implementation of it under the same alias. For example:

```python
from aligner.aligner import FastAlignAligner as Aligner
```


In [None]:
from pipelines.corpus_pipelines import full_corpus_processing_pipeline, make_alignemnts_pipeline
from aligner.aligner import AwesomeAlignAligner as Aligner
from corpus_parser.unified_parser import UnifiedParser as Parser
from projector.projector import CrossLingualAnnotationProjector as Projector
from sentence_aligner.sentence_aligner import SentenceAligner
from sentence_aligner.translator import GoogleDeepTranslator as Translator
from data_augmentation.translation_augmentation import TranslateDataAugmentator as DataAugmentator

# Run the complete corpus processing pipeline once for every corpus split.
# Each split reads from and writes to its own sub-folder of the stage
# directories defined in the configuration cell.
for split in ("dev", "test", "train"):

    # Progress marker: the split name followed by an empty line.
    print(f"{split}\n")

    full_corpus_processing_pipeline(
        corpus_dir=CORPUS.joinpath(split),
        standard_corpus_dest_dir=PROCESSED_CORPUS.joinpath(split),
        sentence_alignment_dest_dir=SENTENCE_ALIGN.joinpath(split),
        bidirectional_alignment_dest_dir=BIDIRECTIONAL_ALIGN.joinpath(split),
        projection_dest_dir=PROJECTION.joinpath(split),
        # Fresh component instances are created for every split.
        corpus_parser=Parser(),
        sentence_aligner=SentenceAligner(Translator()),
        aligner=Aligner(),
        projector=Projector(),
        data_augmentator=DataAugmentator(),
        source_language=SOURCE_LANGUAGE,
        target_language=TARGET_LANGUAGE,
        # NOTE(review): the middle language equals the target language here;
        # confirm this is the intended setting.
        middle_language=TARGET_LANGUAGE,
        use_spacy=True,
    )

    # Alternative call, kept commented out: it reads from PROCESSED_CORPUS
    # instead of CORPUS and takes no corpus parser.
#     make_alignemnts_pipeline(
#         standard_corpus_dir=PROCESSED_CORPUS/ split,
#         sentence_alignment_dest_dir=SENTENCE_ALIGN/ split,
#         bidirectional_alignment_dest_dir=BIDIRECTIONAL_ALIGN/ split,
#         projection_dest_dir=PROJECTION/ split,
#         sentence_aligner=SentenceAligner(Translator()),
#         aligner=Aligner(),
#         projector=Projector(),
#         data_augmentator=DataAugmentator(),
#         source_language=SOURCE_LANGUAGE,
#         target_language=TARGET_LANGUAGE,
#         use_spacy=True,
#     )
    

## Train Segmenter



In [None]:
from segmenter.models.train import train as segmenter_train

# Keyword arguments for the segmenter: corpus tag and language.
# Define other kwargs by adding entries here. For the full list see the param
# dictionary at segmenter/models/segmenter.ipynb
segmenter_kwargs = {"corpus_tag": INFO_TAG, "language": TARGET_LANGUAGE}


In [None]:
# Train the segmenter, forwarding every entry of segmenter_kwargs as a
# keyword argument.
segmenter_train(**segmenter_kwargs)

## Train Link Predictor

In [None]:
from link_prediction.models.train import train as link_prediction_train

# Keyword arguments for the link predictor: corpus tag and language.
# Define other kwargs by adding entries here. For the full list see the param
# dictionary at link_prediction/models/link_prediction.ipynb
link_prediction_kwargs = {"corpus_tag": INFO_TAG, "language": TARGET_LANGUAGE}


In [None]:
# Train the link predictor, forwarding every entry of link_prediction_kwargs
# as a keyword argument.
link_prediction_train(**link_prediction_kwargs)

# Segmentation

In [None]:
from pipelines.segmenter_pipelines import perform_segmentation_pipeline
from segmenter.tf_segmenter import TensorflowArgumentSegmenter as Segmenter

# Build the segmenter instance that the inference cells below pass to the
# pipelines.
# NOTE(review): segmenter_kwargs already contains "corpus_tag" and "language".
# If the constructor's first two parameters carry those names, also passing
# them positionally raises TypeError (multiple values for the same argument).
# Confirm against the Segmenter signature.
segmenter = Segmenter(INFO_TAG, TARGET_LANGUAGE, **segmenter_kwargs)

In [None]:
# Only segmentation: run the segmentation pipeline with TO_PROCESS as the
# source directory and SEGMENTER as the destination directory.

perform_segmentation_pipeline(
    segmenter=segmenter,
    source_dir=TO_PROCESS,
    destination_dir=SEGMENTER,
    language=TARGET_LANGUAGE,
)

# Link Prediction

In [None]:
from pipelines.segmenter_pipelines import perform_link_prediction_pipeline
from link_prediction.tf_link_predictor import TensorflowLinkPredictor as LinkPredictor

# Build the link predictor instance that the inference cells below pass to
# the pipelines.
# NOTE(review): link_prediction_kwargs already contains "corpus_tag" and
# "language". If the constructor's first two parameters carry those names,
# also passing them positionally raises TypeError (multiple values for the
# same argument). Confirm against the LinkPredictor signature.
link_predictor = LinkPredictor(INFO_TAG, TARGET_LANGUAGE, **link_prediction_kwargs)

In [None]:
# Link prediction on its own. The source directory is SEGMENTER, so the
# segmentation step has to be run before this cell.

perform_link_prediction_pipeline(
    link_predictor=link_predictor,
    source_dir=SEGMENTER,
    destination_dir=LINK_PREDICTION,
    source_language=TARGET_LANGUAGE,
)

# Link Prediction and Segmentation

In [None]:
from pipelines.segmenter_pipelines import perform_full_inference_pipeline

# Segmentation and link prediction in a single call: TO_PROCESS is the source
# directory, SEGMENTER the segmenter destination and LINK_PREDICTION the
# final destination.
perform_full_inference_pipeline(
    segmenter=segmenter,
    link_predictor=link_predictor,
    source_dir=TO_PROCESS,
    segmenter_destination_dir=SEGMENTER,
    destination_dir=LINK_PREDICTION,
    source_language=TARGET_LANGUAGE,
)


# Export to Brat

To start the brat server, run the script `run_brat.sh`.

In [None]:
from corpus_parser.brat_parser import BratParser
from corpus_parser.conll_parser import ConllParser

# Parse the LINK_PREDICTION directory with a ConllParser configured with
# bioes=True; presumably the result maps names to dataframes (verify against
# ConllParser.parse_dir).
dataframes_dict = ConllParser(bioes=True).parse_dir(LINK_PREDICTION)
# Export the parsed data to the BRAT directory.
BratParser().export_from_dataframes(BRAT, dataframes_dict)