# Argument Mining Repo

This notebook presents an end-to-end, cross-language algorithm for extracting the argumentative structure of texts.

# Install Software

This section installs and sets up everything you need to run the program.

In [None]:
# Delete repo
# !rm -r argument-mining/

# Clone repo
!git clone https://github.com/luisoibarra/argument-mining.git

# Update repo with master
# !cd argument-mining/; git reset --hard && git clean -fd; git pull

In [None]:
from pathlib import Path

# Changing .sh permissions
!cd argument-mining/ && chmod 777 install.sh
for file in Path("argument-mining", "scripts").iterdir():
  if file.is_file() and file.suffix == ".sh":
    name = str(file)
    !chmod 777 $name

!cd argument-mining && ./install.sh
!cd argument-mining/scripts && ./install_tools.sh nobrat


# Configuration


In [None]:
#@title Setting params: { display-mode: "form" }
import os
import time
from IPython.display import clear_output
from IPython.utils import capture

#@markdown ## Set the target language and the source language 

#@markdown ---

with capture.capture_output() as cap: 
  %cd /content/
#@markdown Cropus language (can try with another language, although is not been tested with other):
source_language = "english" #@param ["english", "spanish", "french"]  {allow-input: true}

#@markdown Target language (same as source language):
target_language = "spanish" #@param ["english", "spanish", "french"]  {allow-input: true}

#@markdown ## Set the training corpus
#@markdown ---

#@markdown This tag must exist in argument-mining/data/corpus/ with train, test and dev subfolders.
corpus = "persuasive_essays_paragraph_all_linked" #@param ["abstrct", "cdcp", "persuasive_essays_paragraph", "persuassive_essays_paragraph_all_linked"] {allow-input: true}

#@markdown ## Set the process tag.
#@markdown ---

#@markdown This tag must exist in argument-mining/data/to_process/
process = "testing" #@param ["testing", "granma_letters", "response_responded_granma_letters", "selected_response_responded_granma_letters"] {allow-input: true}

#@markdown ## Path of the saved models, if any.
#@markdown ---

#@markdown Segmenter path
segmenter_saved_path = "/content/drive/MyDrive/argument-mining/segmenter" #@param {type:"string"}
segmenter_saved_path = Path(segmenter_saved_path, corpus)

#@markdown Link Predictor path
link_predictor_saved_path = "/content/drive/MyDrive/argument-mining/link_prediction" #@param {type:"string"}
link_predictor_saved_path = Path(link_predictor_saved_path, corpus)

#@markdown Segmenter path
segmenter_to_save_path = "/content/segmenter" #@param {type:"string"}
segmenter_to_save_path = Path(segmenter_to_save_path, corpus)

#@markdown Link Predictor path
link_predictor_to_save_path = "/content/link_prediction" #@param {type:"string"}
link_predictor_to_save_path = Path(link_predictor_to_save_path, corpus)

if segmenter_saved_path.exists():
  path = str(segmenter_saved_path)
  target = Path(f"/content/argument-mining/data/segmenter_corpus/")
  target.mkdir(parents=True, exist_ok=True)
  target_path = str(target)
  !cp -r "$path/" "$target_path"
  print("Model segmenter copied.")

if link_predictor_saved_path.exists():
  path = str(link_predictor_saved_path)
  target = Path(f"/content/argument-mining/data/link_prediction/")
  target.mkdir(parents=True, exist_ok=True)
  target_path = str(target)
  !cp -r "$path/" "$target_path"
  print("Model link predictor copied.")

# Corpus Projection

This section creates the corpus in the target language by projecting the labels from the corpus in the source language. It can be skipped if abstrct, cdcp or persuasive_essays_paragraph_all_linked was selected, or if you already have the projections in the folder /content/argument-mining/data/projection/$corpus_name.

A corpus can be added to the /content/argument-mining/data/corpus/$corpus_name with train, dev and test subfolders.

In [None]:
# Projecting corpus
for split in ["dev", "test", "train"]:
  !cd argument-mining/ python3 project_corpus.py \
  "data/corpus/$corpus/$split" \
  "data/parsed_to_conll/$corpus/$split" \
  "data/sentence_alignment/$corpus/$split" \
  "data/bidirectional_alignment/$corpus/$split" \
  "data/projection/$corpus/$split"\
  --source_language $source_language\
  --target_language $target_language


# Train Models

This section trains the segmenter and the link predictor. If the saved models were loaded in the configuration step, you can skip this section.

## Segmenter

The segmenter model splits the text into argumentative discourse units (ADUs) and classifies them.

## Link Predictor

The link predictor will perform the prediction and classification of the links between the extracted ADUs.

In [None]:
# Training segmenter
# Runs scripts/train_segmenter.sh with the corpus tag and target language set in the configuration cell.
# NOTE(review): `--epochs 1` looks like a quick smoke-test value — confirm the epoch count before a real training run.
!cd argument-mining/scripts && ./train_segmenter.sh $corpus $target_language --epochs 1

In [None]:
# Training link predictor
# Runs scripts/train_link_predictor.sh with the corpus tag and target language set in the configuration cell.
# NOTE(review): `--epochs 1` looks like a quick smoke-test value — confirm the epoch count before a real training run.
!cd argument-mining/scripts && ./train_link_predictor.sh $corpus $target_language --epochs 1

In [None]:
# Create the export folders chosen in the configuration cell (no error if they already exist)
segmenter_to_save_path.mkdir(parents=True, exist_ok=True)
link_predictor_to_save_path.mkdir(parents=True, exist_ok=True)

# Plain strings, so the paths can be interpolated into the shell commands below
segmenter_to_save_path_str = str(segmenter_to_save_path)
link_predictor_to_save_path_str = str(link_predictor_to_save_path)

# Copy the content of each model folder of the selected corpus into its export folder
!cp -r /content/argument-mining/data/segmenter_corpus/$corpus/* $segmenter_to_save_path_str
!cp -r /content/argument-mining/data/link_prediction/$corpus/* $link_predictor_to_save_path_str

# Extract Argumentative Structures

This section will extract the argumentative structure using the trained models.

You can copy the text to process into /content/argument-mining/data/to_process/$name

In [None]:
# Output folder of the link prediction step, relative to the repository root
processed_data_path = Path(f"data/link_prediction_processed/{corpus}/{process}")
processed_data_path_str = str(processed_data_path)

# Applying segmenter and link predictor
# NOTE(review): the target language is passed as --source_language, presumably because the
# texts to process are written in the target language — verify against predict_relations.py.
!cd argument-mining/ && python3 predict_relations.py --corpus_tag $corpus --source_language $target_language \
"data/to_process/$process" \
"data/segmenter_processed/$corpus/$process" \
$processed_data_path_str


# View Results


In [12]:
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from typing import Dict, Iterable, List, Optional, Tuple
import re
import logging as log
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
from concurrent.futures import ThreadPoolExecutor, Future, wait
import pandas as pd


# Parsed document: (argumentative_units, relations, non_argumentative_units) DataFrames
ArgumentationInfo = Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
# Exported document: (annotated text, raw text)
AnnotatedRawTextInfo = Tuple[str,str]

class Parser():
    """
    Shared base for corpus parsers.

    Subclasses implement `parse` (text -> DataFrames) and `from_dataframes`
    (DataFrames -> annotated text). This class supplies the directory
    traversal and the export helpers built on top of those two methods.
    """

    def __init__(self, accepted_files: Iterable[str], suffix: str) -> None:
        """
        accepted_files: File name endings (extensions) that mark a file as readable
        suffix: Ending appended to the name of every exported annotated file
        """
        self.suffix = suffix
        self.accepted_files = tuple(accepted_files)

    def _should_read_file(self, file: Path):
        """
        Tells whether `file` is a regular file whose name ends with one of
        the accepted endings.
        """
        if not file.is_file():
            return False
        return file.name.endswith(self.accepted_files)

    def parse_dir(self, corpus_path: Path, **kwargs) -> Dict[str, ArgumentationInfo]:
        """
        Parse every accepted file found directly inside `corpus_path`.

        The directory entries are divided in contiguous chunks, each chunk is
        parsed by one task of a thread pool.

        corpus_path: Directory holding the corpus files
        kwargs: Forwarded untouched to `parse_file`

        return: A dictionary mapping each file address to its parsed information

        raises: Exception wrapping the list of errors raised by the tasks, if any
        """
        worker_count = 20
        parsed = {}

        entries = list(corpus_path.iterdir())
        chunk_size = len(entries)//worker_count + 1

        def parse_chunk(chunk_index):
            # Handles the entries of the `chunk_index`-th chunk
            start = chunk_size*chunk_index
            for entry in entries[start:start + chunk_size]:
                if not self._should_read_file(entry):
                    continue
                file_info = self.parse_file(entry, **kwargs)
                parsed[str(entry)] = file_info

        with ThreadPoolExecutor(max_workers=worker_count) as pool:
            pending = [pool.submit(parse_chunk, chunk_index) for chunk_index in range(worker_count)]
        wait(pending)

        failures = [task.exception() for task in pending if task.exception()]
        if failures:
            raise Exception(failures)

        return parsed

    def parse_file(self, file: Path, **kwargs) -> ArgumentationInfo:
        """
        Read `file` and hand its text to `parse`.

        The result is the triple (argumentative_units, relations,
        non_argumentative_units) of DataFrames:

        argumentative_units columns:
          - `prop_id` Proposition ID inside the document
          - `prop_type` Proposition type
          - `prop_init` Where the proposition starts in the original text
          - `prop_end` Where the proposition ends in the original text
          - `prop_text` Proposition text

        relations columns:
          - `relation_id` Relation ID inside the document
          - `relation_type` Relation type
          - `prop_id_source` Relation's source proposition id
          - `prop_id_target` Relation's target proposition id

        non_argumentative_units columns:
          - `prop_init` Where the proposition starts in the original text
          - `prop_end` Where the proposition ends in the original text
          - `prop_text` Proposition text

        return: (argumentative_units, relations, non_argumentative_units)
        """
        content = file.read_text()
        return self.parse(content, file, **kwargs)

    def parse(self, content:str, file: Optional[Path] = None, **kwargs) -> ArgumentationInfo:
        """
        Turn `content` into the DataFrames describing its argumentative units
        and relations. Must be implemented by every subclass.

        content: Text to parse
        file: Optional, file the content was read from

        argumentative_units columns:
          - `prop_id` Proposition ID inside the document
          - `prop_type` Proposition type
          - `prop_init` Where the proposition starts in the original text
          - `prop_end` Where the proposition ends in the original text
          - `prop_text` Proposition text

        relations columns:
          - `relation_id` Relation ID inside the document
          - `relation_type` Relation type
          - `prop_id_source` Relation's source proposition id
          - `prop_id_target` Relation's target proposition id

        non_argumentative_units columns:
          - `prop_init` Where the proposition starts in the original text
          - `prop_end` Where the proposition ends in the original text
          - `prop_text` Proposition text

        return: (argumentative_units, relations, non_argumentative_units)
        """
        raise NotImplementedError()

    def from_dataframes(self, dataframes: Dict[str, ArgumentationInfo], source_language="english", **kwargs) -> Dict[str, AnnotatedRawTextInfo]:
        """
        Build the annotated corpus text that represents the received
        DataFrames. Must be implemented by every subclass.

        dataframes: Result of a parse function of any Parser class. The keys
        are not relevant, a mock key can be used.
        source_language: Language used by the tokenization process

        returns: For every key, the annotated string and the raw entire text
        """
        raise NotImplementedError()

    def export_from_dataframes(self, dest_address: Path, dataframes: Dict[str, ArgumentationInfo], **kwargs):
        """
        Convert the DataFrame version of the corpus into this parser's
        representation and save it inside `dest_address`.

        dest_address: Path where the corpus is saved. May not exist
        dataframes: DataFrame representation of the corpus
        """
        annotated_files = self.from_dataframes(dataframes, **kwargs)
        self.export_corpus_from_files(dest_address, annotated_files, **kwargs)
        self.create_conf_files_from_argumentation_dict(dest_address, dataframes)

    def create_conf_files_from_argumentation_dict(self, corpus_path: Path, argumentation_dict: Dict[str, ArgumentationInfo]):
        """
        Hook to create any extra file the corpus requires. Does nothing here.

        corpus_path: Path where the generated files are saved
        argumentation_dict: Result of calling the `parse_dir` function
        """
        return None

    def export_corpus_from_files(self, dest_address: Path, files: Dict[str,AnnotatedRawTextInfo], **kwargs):
        """
        Write the corpus inside `dest_address`. Two files are written per key:
        the annotated text (key's file name + suffix) and the raw text (same
        name followed by ".txt").

        dest_address: Path where the corpus is saved. May not exist
        files: Maps a file address or file name to its annotated text and its full text
        """
        already_a_directory = dest_address.is_dir()
        if not already_a_directory:
            dest_address.mkdir(exist_ok=True, parents=True)

        for filedir, (annotated_text, raw_text) in files.items():
            annotated_name = Path(filedir).name + self.suffix
            raw_name = annotated_name + ".txt"
            (dest_address / annotated_name).write_text(annotated_text)
            (dest_address / raw_name).write_text(raw_text)

# Information of one CONLL annotation line, keyed by field name (e.g. "tok", "bio_tag")
ConllTagInfo = Dict[str, Union[str,int]]

class ConllParser(Parser):
    
    # One annotation line: a token, one whitespace, a tag letter, an optional
    # `-<prop_type>` and zero or more `-<relation>-<distance>` groups (the
    # distance may be negative, hence the optional second dash).
    # `{TAGS}` is replaced in `__init__` by the allowed tag letters (BIO or BIOES).
    ANNOTATION_REGEX = r"^(?P<tok>[^\s]+)\s(?P<bio_tag>[{TAGS}])(-(?P<prop_type>\w+))?(?P<relations>(-\w+--?\d+)*)\s*$"
    # Layout of the tag column when writing an annotation
    TAG_FORMAT = "{bio_tag}-{prop_type}-{relations_string}"
    # Layout of a full annotation line: token, tab, tag, line break
    ANNOTATION_FORMAT = f"{{tok}}\t{TAG_FORMAT}\n"
    
    def __init__(self, *additional_supported_formats, bioes=False, use_spacy=False, **kwargs) -> None:
        """
        additional_supported_formats: Extra file endings to read besides ".conll"
        bioes: If true the BIOES tag set is used, else the BIO tag set
        use_spacy: If true spaCy is used for tokenization and sentence splitting
        kwargs: Accepted but not used
        """
        super().__init__((".conll", *additional_supported_formats), ".conll")
        self.bioes = bioes
        self.use_spacy = use_spacy

        # The annotation pattern only accepts the tag letters of the chosen tag set
        allowed_tags = "BIOES" if bioes else "BIO"
        self.ANNOTATION_REGEX = self.ANNOTATION_REGEX.format_map({"TAGS": allowed_tags})
        self.annotation_regex = re.compile(self.ANNOTATION_REGEX)

        # Matches a single `-<relation>-<distance>` group of a tag
        self.__relation_regex = re.compile(r"-(?P<relation_tag>\w+)-(?P<distance>-?\d+)")
        # Marker placed between sentences in the lists of annotations
        self.__sent_separator = {"tok":"\n", "bio_tag":""}
    
    def __split_sentences(self, line_infos: list, language: str) -> list:
        """
        Create a new list with the content of `line_infos` plus a sentence
        separator after every detected sentence. The separator is the
        dictionary stored in `self.__sent_separator`
        (`{"tok":"\\n", "bio_tag":""}`).
        
        Separators already present in `line_infos` delimit the segments that
        are sentence-tokenized independently.
        
        line_infos: Original information list
        language: Language content
        
        returns: A new list containing a sentence separator 
        
        raises: AssertionError if the words produced by the sentence
        tokenizer can't be aligned with the original tokens
        """
        new_line_infos = []
        # Positions of the separators already present (they have an empty bio_tag)
        previous_splitted = [i for i, tok in enumerate(line_infos) if tok["bio_tag"] == ""]
        # The tokens after the last separator (or all of them) form the final segment
        if len(previous_splitted) == 0 or line_infos[-1] != self.__sent_separator:
             previous_splitted.append(len(line_infos))
        # Position in `line_infos` of the next token to be matched
        index = 0
        
        for end in previous_splitted:
            # Rebuild the segment text so it can be split into sentences
            content = " ".join(tok["tok"] for tok in line_infos[index:end])
            if self.use_spacy:
                # NOTE(review): `get_spacy_model` is not defined nor imported in the
                # visible part of the notebook — confirm where it comes from.
                nlp = get_spacy_model(language)
                sentences = [x.text for x in nlp(content).sents]
            else:
                sentences = sent_tokenize(content, language=language)

            # First part of a token that the tokenizer cut between two sentences
            prev_word_sentence_split = None
            for i, sentence in enumerate(sentences):
                words = sentence.strip().split(" ")
                if prev_word_sentence_split:
                    # Glue the pending part back to the first word of this sentence
                    words[0] = prev_word_sentence_split + words[0]
                    prev_word_sentence_split = None
                for j, word in enumerate(words):
                    if word != line_infos[index]["tok"]:
                        if j == len(words) - 1 and i < len(sentences) - 1 and \
                           word + sentences[i+1].strip().split(" ")[0] == line_infos[index]["tok"]:
                            # Sentence split in the middle of a token and continues in the next one
                            prev_word_sentence_split = word
                        else:
                            # Raise exception
                            assert word == line_infos[index]["tok"]
                    else:
                        new_line_infos.append(line_infos[index])
                        index += 1

                # No separator is added when the sentence boundary fell inside a token
                if prev_word_sentence_split is not None:
                    continue

                # Sentence separator
                new_line_infos.append(self.__sent_separator)
            # Every token of the segment must have been consumed
            assert index == end
            # Skip the separator that closed the segment in the original list
            index += 1
        return new_line_infos
        
    def __extract_relations(self, relations_string: Optional[str]) -> Optional[List[Tuple[str,int]]]:
        """
        Decodes the relation part of a tag into (relation tag, distance) pairs,
        in the order they appear. `None` is returned untouched.
        
        Example "-RelA--1-RelB-2-RelA--3-RelC-4" -> [("RelA", -1), ("RelB", 2), ("RelA", -3), ("RelC", 4)]
        """
        if relations_string is None:
            return None

        return [
            (found.group('relation_tag'), int(found.group('distance')))
            for found in self.__relation_regex.finditer(relations_string)
        ]
       
    def __get_relations_string(self, relations: List[Tuple[str, int]]) -> str:
        """
        Encodes (relation tag, distance) pairs as `<tag>-<distance>` pieces
        joined by dashes. An empty list is encoded as 'none'.
        
        Example [("RelA", -1), ("RelB", 2)] -> "RelA--1-RelB-2"
        """
        if relations:
            pieces = [f"{tag}-{distance}" for tag, distance in relations]
            return "-".join(pieces)

        return 'none'
        
    def parse(self, content:str, file: Optional[Path] = None, get_tags=False, **kwargs) -> ArgumentationInfo:
        """
        Parse `content` returning DataFrames containing
        the argumentative unit and the relation information.
        
        content: text containing the content to parse
        file: Optional, content's original file. Only used in the warning messages
        get_tags: If true, the list with the information of every line is returned
        instead of the DataFrame representation
        
        Lines that don't match the annotation format and aren't empty are
        skipped after logging a warning.
        
        argumentative_units columns: 
          - `prop_id` Proposition ID inside the document
          - `prop_type` Proposition type
          - `prop_init` When the proposition starts in the original text
          - `prop_end` When the proposition ends in the original text
          - `prop_text` Proposition text
          
        relations columns:
          - `relation_id` Relation ID inside the document
          - `relation_type` Relation type
          - `prop_id_source` Relation's source proposition id 
          - `prop_id_target` Relation's target proposition id
          
        non_argumentative_units columns:
          - `prop_init` When the proposition starts in the original text
          - `prop_end` When the proposition ends in the original text
          - `prop_text` Proposition text
          
        return: (argumentative_units, relations, non_argumentative_units)
        """
        
        content = content.splitlines()
        
        line_parse = []
        
        for i,line in enumerate(content):
            match = self.annotation_regex.match(line)
            
            if match:
                g_dict = match.groupdict()
                g_dict['relations'] = self.__extract_relations(g_dict['relations'])
                line_parse.append(g_dict)
            elif line == "":
                # Empty lines separate sentences
                line_parse.append(self.__sent_separator)
            else:
                if file:
                    log.warning(f"Line {i} file {file.name}. Match not found: {line}")
                else:
                    log.warning(f"Line {i}. Match not found: {line}")

        if get_tags:
            return line_parse

        def extract_proposition(propositions: List[dict], start_index=0):
            """
            Extracts the proposition that starts at `start_index`: a sentence
            separator, a run of O tokens or an argumentative unit.
            
            returns: The proposition text and the index following its last token
            
            Raise: IndexError if `start_index` is out of range
            """
            current = start_index
            
            def extract_language_tag(word):
                """
                Check if the word is annotated with a language tag i.e. [_es, _en, _de]
                and return the unannotated word. Any word longer than three
                characters whose third last character is "_" loses its last
                three characters.
                """
                if len(word) > 3 and word[-3] == "_":
                    return word[:-3]
                return word

            if propositions[current]["bio_tag"] == "": # Sentence separator
                proposition_text = extract_language_tag(propositions[current]["tok"])
                current += 1
            elif propositions[current]["bio_tag"] == "O":
                proposition_text = extract_language_tag(propositions[current]["tok"])
                current += 1

                # Join all tokens
                while current < len(propositions) and propositions[current]["bio_tag"] == "O":
                    proposition_text += " " + extract_language_tag(propositions[current]["tok"])
                    current += 1
            else:
                if self.bioes and propositions[current]["bio_tag"] == "S":
                    # Single token proposition
                    proposition_text = extract_language_tag(propositions[current]["tok"])
                    current += 1
                    return proposition_text, current
                
                if propositions[current]["bio_tag"] != "B":
                    proposition = propositions[current]
                    if file:
                        log.warning(f"File {file.name}. Proposition '{proposition['tok']}' at index {current} doesn't start with a B")
                    else:
                        log.warning(f"Proposition '{proposition['tok']}' at index {current} doesn't start with a B")
                
                # Current should be B
                proposition_text = extract_language_tag(propositions[current]["tok"])
                current += 1
                
                # Join all tokens
                while current < len(propositions) and propositions[current]["bio_tag"] == "I":
                    proposition_text += " " + extract_language_tag(propositions[current]["tok"])
                    current += 1
                    
                if self.bioes:
                    # The bounds are checked before indexing: a proposition may reach the
                    # very last token, leaving no closing token to inspect.
                    if current < len(propositions) and propositions[current]["bio_tag"] == "E":
                        proposition_text += " " + extract_language_tag(propositions[current]["tok"])
                        current += 1
                    else:
                        # Report the offending token, or the last token of the proposition
                        # when the input ended before any closing token.
                        proposition = propositions[current] if current < len(propositions) else propositions[current - 1]
                        if file:
                            log.warning(f"File {file.name}. Proposition '{proposition['tok']}' at index {current} doesn't end with an E")
                        else:
                            log.warning(f"Proposition '{proposition['tok']}' at index {current} doesn't end with an E")
                        
            return proposition_text, current
        
        argumentative_units = {
            "prop_id": [], 
            "prop_type": [], 
            "prop_init": [], 
            "prop_end": [], 
            "prop_text": [],
        }
        
        non_argumentative_units = {
            "prop_init": [], 
            "prop_end": [], 
            "prop_text": [],
        }
        
        relations = {
            "relation_id": [], 
            "relation_type": [], 
            "prop_id_source": [], 
            "prop_id_target": [],            
        }

        current = 0
        # Offset of the current proposition in the text rebuilt from the tokens
        accumulative_offset = 0
        while current < len(line_parse):
            
            proposition, current = extract_proposition(line_parse, current)
            prop_info = line_parse[current-1] # All annotations of the argument are equal 
            prop_id = len(argumentative_units['prop_id']) + 1 # 0 is the root node
            
            if prop_info["bio_tag"] in ["O", ""]:
                non_argumentative_units['prop_init'].append(accumulative_offset)
                non_argumentative_units['prop_end'].append(accumulative_offset + len(proposition))
                non_argumentative_units['prop_text'].append(proposition)
            else:
                argumentative_units['prop_id'].append(prop_id)
                argumentative_units['prop_type'].append(prop_info["prop_type"])
                argumentative_units['prop_init'].append(accumulative_offset)
                argumentative_units['prop_end'].append(accumulative_offset + len(proposition))
                argumentative_units['prop_text'].append(proposition)

                if prop_info["relations"] is not None:
                    for relation_tag, distance in prop_info['relations']:
                        relations['relation_id'].append(len(relations['relation_id']))
                        relations['relation_type'].append(relation_tag)
                        relations['prop_id_source'].append(prop_id)
                        # The distance is relative to the source proposition id
                        relations['prop_id_target'].append(prop_id + int(distance))
            
            accumulative_offset += len(proposition)
            if prop_info["bio_tag"] != "":
                accumulative_offset += 1 # Extra separator when rebuilding text
        
        return pd.DataFrame(argumentative_units), pd.DataFrame(relations), pd.DataFrame(non_argumentative_units)
        
    def fix_annotations(self, annotations: List[ConllTagInfo]) -> List[ConllTagInfo]:
        """
        Returns a copy of `annotations` without the sentence separators that
        cut a proposition in two, i.e. the separators placed between a B/I
        annotation and an I annotation that share proposition type and
        relations. The first and last annotations are never removed.
        
        annotations: Original list of conll annotations
        """
        last_position = len(annotations) - 1

        def splits_a_proposition(position: int) -> bool:
            # Only separators having both neighbours are candidates
            if annotations[position] != self.__sent_separator:
                return False
            if position >= last_position or position <= 0:
                return False

            before = annotations[position-1]
            after = annotations[position+1]
            if after["bio_tag"] != "I":
                return False
            if before["bio_tag"] not in ["B", "I"]:
                return False
            if before["prop_type"] != after["prop_type"]:
                return False
            return before["relations"] == after["relations"]

        return [
            annotation
            for position, annotation in enumerate(annotations)
            if not splits_a_proposition(position)
        ]

    def from_dataframes(self, dataframes: Dict[str, ArgumentationInfo], source_language="english", get_tags=False, exact_text=True, split_sentences=True, **kwargs) -> Dict[str, Union[AnnotatedRawTextInfo, Tuple[List[ConllTagInfo], str]]]:
        """
        Creates a CONLL annotated corpus representing the received DataFrames. 
        
        dataframes: The result from calling a parse function in any Parser class
        the keys aren't important, so a mock key can be passed.
        source_language: Language for tokenization process
        get_tags: If true, returns the tags instead of the annotated text
        exact_text: If true, returns the exact text representation else will 
        be returned the tokens separated by whitespaces
        split_sentences: If true, sentence separators are added to the annotations
        
        returns: CONLL annotated string or CONLL annotations, Raw text
        """
        
        results = {}
        default_gap = " "
                
        for file_path_str, (argumentative_units, relations, non_argumentative_units) in dataframes.items():

            tags_info = []
            # Argumentative and non argumentative units together, in text order
            all_units = pd.concat([argumentative_units, non_argumentative_units], sort=True)
            all_units = all_units.sort_values(by="prop_init")
            all_units = all_units.reindex(columns=["prop_id", "prop_type", "prop_init", "prop_end", "prop_text"])
            all_units['prop_init'] = all_units['prop_init'].apply(np.int64)
            all_units['prop_end'] = all_units['prop_end'].apply(np.int64)
            # A document without units has no maximum (it would be NaN), its text is empty
            max_length = all_units["prop_end"].max() if len(all_units) > 0 else 0
            
            # With exact_text the units are written over a blank canvas at their original offsets
            text = default_gap*max_length if exact_text else ""
            
            for index, (prop_id, prop_type, prop_init, prop_end, prop_text) in all_units.iterrows():
                # prop_id is NaN for the non argumentative units
                prop_id = int(prop_id) if pd.notna(prop_id) else prop_id
                if self.use_spacy:
                    # NOTE(review): `get_spacy_model` is not defined nor imported in the
                    # visible part of the notebook — confirm where it comes from.
                    nlp = get_spacy_model(source_language)
                    doc = nlp(prop_text)
                    prop_tokens = [x.text for x in doc if "\n" not in x.text and x.text.strip()]
                else:
                    prop_tokens = word_tokenize(prop_text, language=source_language)
                
                if exact_text:
                    text = text[:prop_init] + prop_text + text[prop_end:]
                else:
                    text += default_gap.join(prop_tokens) + default_gap
                
                if pd.notna(prop_type):
                    # It's the begining of a proposition
                    # Its outgoing relations are the same for all its tokens, so
                    # they are looked up once instead of once per token.
                    source_relations = relations[relations["prop_id_source"] == prop_id]
                    prop_relations = [
                        (relation["relation_type"], relation["prop_id_target"] - relation["prop_id_source"])
                        for _, relation in source_relations.iterrows()
                    ]
                    for i,tok in enumerate(prop_tokens):
                        # Every token keeps its own list, as the annotations may be edited later
                        current_relations = list(prop_relations)
                        if i == 0:
                            if len(prop_tokens) == 1 and self.bioes:
                                bio_tag = "S"
                            else:    
                                bio_tag = "B"  
                        elif i == len(prop_tokens) - 1 and self.bioes:
                            bio_tag = "E"
                        else:
                            bio_tag = "I"
                        
                        tags_info.append({
                                "tok": tok,
                                "bio_tag": bio_tag,
                                "prop_type": prop_type,
                                "relations": current_relations,
                                "relations_string": self.__get_relations_string(current_relations)
                        })
                        tags_info[-1]["full_tag"] = self.TAG_FORMAT.format_map(tags_info[-1]).replace("-none", "")

                else:
                    if all(x == "\n" for x in prop_text): # Sentence and Paragraph separators
                        for x in prop_text:
                            tags_info.append(self.__sent_separator)
                    else:
                        # Out of proposition
                        for tok in prop_tokens:
                            # Fill the gap with O until the proposition is found
                            tags_info.append({
                                "tok": tok,
                                "bio_tag": "O",
                                "prop_type": "none",
                                "relations": "none",
                                "relations_string": "none",
                            })
                            tags_info[-1]["full_tag"] = "O"
                
            if split_sentences:
                tags_info = self.__split_sentences(tags_info, source_language)
            
            # Drop the separators that ended up in the middle of a proposition
            tags_info = self.fix_annotations(tags_info)
            
            if get_tags:
                results[file_path_str] = tags_info, text
            else:
                result = self.get_conll_text_from_annotation_dicts(tags_info)
                results[file_path_str] = result, text
        
        return results

    def get_conll_text_from_annotation(self, annotations: List[str]) -> str:
        """
        Maps the annotation lines to their normalized conll text representation.
        
        annotations: List containing the annotation lines (`<token> <tag>`). An
        empty string stands for a sentence separator
        
        returns: The annotated conll text representation
        
        raises: AssertionError if a line doesn't follow the annotation format
        """
        text = ""
        for annotation in annotations:
            if annotation == "":
                to_write = "\n"
            else:
                match = self.annotation_regex.match(annotation)
                assert match
                fields = match.groupdict()
                # The line format expects `relations_string`, which is not a group of
                # the regex: it is derived from the matched `relations` group by
                # dropping its leading dash ("-RelA--1" -> "RelA--1"); without it
                # formatting any annotation failed with a KeyError.
                matched_relations = fields["relations"]
                fields["relations_string"] = matched_relations[1:] if matched_relations else "none"
                to_write = self.ANNOTATION_FORMAT.format_map(fields)
                to_write = to_write.replace("-none", "") # Remove unnecesary labels
                to_write = to_write.replace("-None", "") # Remove unnecesary labels
            text += to_write
        return text

    def get_text_from_annotation(self, annotations: List[ConllTagInfo]) -> str:
        """
        Builds the plain text of `annotations`: the tokens of all the
        annotations in a single line, one whitespace between each pair.
        
        annotations: List with the dictionaries holding the information of each tag
        
        returns: The text representation
        """
        tokens = (tag_info["tok"] for tag_info in annotations)
        return " ".join(tokens)
    
    def get_conll_text_from_annotation_dicts(self, annotations: List[ConllTagInfo]) -> str:
        """
        Builds the conll text of `annotations`: one line per annotation and a
        line break for every sentence separator.
        
        annotations: List with the dictionaries holding the information of each tag
        
        returns: The annotated conll text representation
        """
        pieces = []
        for tag_info in annotations:
            if tag_info == self.__sent_separator:
                pieces.append("\n")
                continue
            annotation_line = self.ANNOTATION_FORMAT.format_map(tag_info)
            # Unnecessary labels are removed
            pieces.append(annotation_line.replace("-none", ""))
        return "".join(pieces)
    

In [None]:
# NOTE(review): `conll_content` is not used in the visible cells — confirm whether it is still needed
conll_content = []
# `processed_data_path` is defined in the extraction cell and is relative to the repository root
global_processed_data_path = Path("/content/argument-mining", processed_data_path)

# Maps every parsed file to its (argumentative_units, relations, non_argumentative_units) DataFrames
info = ConllParser().parse_dir(global_processed_data_path)

In [None]:

import networkx as nx
import matplotlib.pyplot as plt

# Print and draw the argumentative structure parsed from every file in `info`.
for file in info:
  print("Argumenttative structure of", Path(file).name)
  print()
  arguments, relations, non_arguments = info[file]

  # Types are stored on the graph itself. networkx draws nodes and edges in its
  # own iteration order (edges are grouped by source node and duplicated edges
  # collapse), so colours have to be read back from the graph rather than
  # collected in DataFrame row order.
  G = nx.DiGraph()

  print("ARGUMENTS:")
  print()
  for i, a in arguments.iterrows():
    G.add_node(a['prop_id'], prop_type=a['prop_type'])
    print(f"{i}: {a['prop_type']}:  {a['prop_text']}")
  print()
  print("RELATIONS:")
  print()
  for i, a in relations.iterrows():
    G.add_edge(a['prop_id_source'], a['prop_id_target'], relation_type=a['relation_type'])
    print(f"{i}:  {a['prop_id_source']}-{a['relation_type']}->{a['prop_id_target']}")
  print()
  print()

  colors = ["red", "green", "blue", "yellow", "black", "orange"]

  # Sorting makes the type -> colour assignment identical on every run
  # (iterating a plain set of strings is not stable between kernel sessions).
  arg_types = sorted({t for _, t in G.nodes(data="prop_type") if t is not None}, key=str)
  rel_types = sorted({t for _, _, t in G.edges(data="relation_type")}, key=str)
  # Node colours start at offset 3 of the palette, relation colours at 0
  arg_colors = { arg_type: colors[i % len(colors)] for i, arg_type in enumerate(arg_types, 3) }
  rel_colors = { rel_type: colors[i % len(colors)] for i, rel_type in enumerate(rel_types) }

  # One colour per node/edge, in drawing order. A node that only appears as a
  # relation endpoint has no type and gets a neutral border.
  node_color = [arg_colors.get(prop_type, "gray") for _, prop_type in G.nodes(data="prop_type")]
  edge_color = [rel_colors[rel_type] for _, _, rel_type in G.edges(data="relation_type")]
  options = {
      "font_size": 36,
      "node_size": 3000,
      "node_color": "white",
      "edge_color": edge_color,
      "linewidths": 5,
      "width": 5,
      "edgecolors": node_color,
  }

  nx.draw_networkx(G, **options)

  # Set margins for the axes so that nodes aren't clipped
  ax = plt.gca()
  ax.margins(0.20)
  plt.axis("off")
  plt.show()


# Export models and files processed

This section creates zip files with the content you wish to export. Once they are created, you can download the files.

In [6]:
#@title Export options: { display-mode: "form" }

#@markdown ## Export options
#@markdown ---

#@markdown Zip the segmenter folder (`argument-mining/data/segmenter_corpus/`)
export_segmenter = True #@param {type: "boolean"}
#@markdown Zip the link predictor folder (`argument-mining/data/link_prediction/`)
export_link_predictor = True #@param {type: "boolean"}
#@markdown Zip the processed files folder
export_processed_files = True #@param {type: "boolean"}



In [None]:
segmenter_target = Path(f"/content/argument-mining/data/segmenter_corpus/")
link_prediction_target = Path(f"/content/argument-mining/data/link_prediction/")
processed_data_target = Path("argument-mining") / processed_data_path

processed_data_path_str = str(processed_data_target.resolve())
segmenter_to_save_path_str = str(segmenter_target.resolve())
link_predictor_to_save_path_str = str(link_prediction_target.resolve())

processed_name = processed_data_path.name
segmenter_name = segmenter_to_save_path.name
link_predictor_name = link_predictor_to_save_path.name

if export_segmenter:
  !zip -r "$segmenter_name segmenter.zip" $segmenter_to_save_path_str
if export_link_predictor:
  !zip -r "$link_predictor_name link predictor.zip" $link_predictor_to_save_path_str
if export_processed_files:
  !zip -r "$processed_name data.zip" $processed_data_path_str
