# Metrics

In [None]:
from pathlib import Path
import pandas as pd
from typing import List, Union
from corpus_parser.brat_parser import BratParser
from corpus_parser.conll_parser import ConllParser
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from collections import Counter


# Root directory of the data; corpus-specific sub-directories are joined onto
# it where the corpora are loaded.
BASE_DATA = Path("data")
# Identifier of the processing run: used as a directory name under each
# processed corpus and as a suffix of the keys in the results dictionary.
PROCESS_TAG = "selected_response_responded_granma_letters"

# Type alias used to annotate the arguments of `stredit`: a list or a string.
Indexable = Union[List, str]

def jaccard_sim(s1, s2):
    """
    Jaccard similarity between the distinct elements of two iterables.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is ``len(intersection) / len(union)``.

    Two empty inputs have an empty union; they are treated as identical and
    1.0 is returned instead of raising ``ZeroDivisionError``.
    """
    a = set(s1)
    b = set(s2)
    union = a | b
    if not union:
        # Both inputs are empty: nothing differs between them.
        return 1.0
    return len(a & b) / len(union)

def stredit(s1:Indexable, s2:Indexable):
    """
    Levenshtein distance

    Minimum number of single-element insertions, deletions and substitutions
    needed to turn ``s1`` into ``s2``. Elements are compared with ``==``, so
    the inputs may be strings (character distance) or lists of tokens
    (word distance).

    Only the previous row of the dynamic-programming table is kept, so the
    extra memory is proportional to ``len(s2)`` instead of
    ``len(s1) * len(s2)``.
    """
    # previous[j] is the distance between the s1 prefix handled so far and
    # s2[:j]; for the empty prefix that is j insertions.
    previous = list(range(len(s2) + 1))

    for i, item1 in enumerate(s1, start=1):
        # Column 0: turning s1[:i] into an empty sequence takes i deletions.
        current = [i]
        for j, item2 in enumerate(s2, start=1):
            substitution_cost = 0 if item1 == item2 else 1
            copy_sub_value = previous[j - 1] + substitution_cost
            insert_value = current[j - 1] + 1
            delete_value = previous[j] + 1
            current.append(min(copy_sub_value, delete_value, insert_value))
        previous = current

    return previous[-1]


def data_augmentation_metrics(corpus_path: Path):
    """
    Compare every original ``.conll`` file with its augmented variants.

    Each sub-directory of ``corpus_path`` is scanned for ``.conll`` files:

    * a file whose name contains both "from" and "augmented" is an augmented
      variant; the 4th "_"-separated field of its name (up to the first ".")
      identifies the original document and the 2nd field is recorded as the
      augmentation language;
    * any other ``.conll`` file is an original, identified by its name up to
      the first ".".

    Tokens are the first tab-separated field of every non-empty line.

    Entries of ``corpus_path`` that are not directories are skipped.

    Returns a DataFrame with one row per (original, augmented) pair and the
    columns ``file``, ``augmented_language``, ``original_word_len``,
    ``augmented_word_len``, ``edit_word_distance`` and ``jaccard_similarity``.
    The columns are present even when no pair is found.
    """
    columns = (
        "file",
        "augmented_language",
        "original_word_len",
        "augmented_word_len",
        "edit_word_distance",
        "jaccard_similarity",
    )

    def read_tokens(file: Path):
        # First tab-separated field of every non-empty line.
        return [line.split("\t")[0] for line in file.read_text().split("\n") if line]

    def process_dir(path: Path):
        originals = {}
        augmented = {}

        for file in path.iterdir():
            if not (file.is_file() and file.suffix == ".conll"):
                continue
            if all(x in file.name for x in ["from", "augmented"]):
                # Is augmented
                name = file.name.split("_")[3].split(".")[0]
                augmented.setdefault(name, []).append(file)
            else:
                # Is original
                name = file.name.split(".")[0]
                if name in originals:
                    print("WARNING: Repeated original file:", name)
                else:
                    originals[name] = file

        info = {column: [] for column in columns}

        for orig, orig_file in originals.items():
            augmented_files = augmented.get(orig, [])
            if not augmented_files:
                continue

            # Tokenize the original once; it is shared by all its variants.
            orig_tokens = read_tokens(orig_file)

            for augmented_file in augmented_files:
                augm_tokens = read_tokens(augmented_file)
                language = augmented_file.name.split("_")[1]

                info["file"].append(orig)
                info["augmented_language"].append(language)
                info["original_word_len"].append(len(orig_tokens))
                info["augmented_word_len"].append(len(augm_tokens))
                info["edit_word_distance"].append(stredit(orig_tokens, augm_tokens))
                info["jaccard_similarity"].append(jaccard_sim(orig_tokens, augm_tokens))

        return info

    data = {column: [] for column in columns}
    for path in corpus_path.iterdir():
        if not path.is_dir():
            # Stray files next to the split directories cannot be iterated.
            continue
        for key, value in process_dir(path).items():
            data[key].extend(value)

    return pd.DataFrame(data)


def get_corpus_info(path: Path, is_corpus: bool = False):
    """
    Build a table of per-document statistics from parsed CoNLL data.

    With ``is_corpus`` set, the "dev", "test" and "train" sub-directories of
    ``path`` are parsed with ``bioes=False`` and merged; otherwise ``path``
    itself is parsed with ``bioes=True``.

    The path and the counters of proposition and relation types are printed.
    The returned DataFrame has one row per parsed key with the number of
    tokens, tokens whose ``bio_tag`` is not "O", relations, argumentative
    units and non-argumentative units (those whose ``prop_text`` is not a
    lone newline).
    """
    if not is_corpus:
        tags_info = ConllParser(bioes=True).parse_dir(path, get_tags=True)
        file_info = ConllParser(bioes=True).parse_dir(path)
    else:
        dev_dir = path / "dev"
        tags_info = ConllParser(bioes=False).parse_dir(dev_dir, get_tags=True)
        file_info = ConllParser(bioes=False).parse_dir(dev_dir)

        for split in ("test", "train"):
            split_dir = path / split
            split_tags = ConllParser(bioes=False).parse_dir(split_dir, get_tags=True)
            split_files = ConllParser(bioes=False).parse_dir(split_dir)

            tags_info.update(split_tags)
            file_info.update(split_files)

    # Type frequencies over all parsed files.
    arg_type_counter = Counter(
        prop_type
        for arguments, _, _ in file_info.values()
        for prop_type in arguments['prop_type']
    )
    relation_type_counter = Counter(
        relation_type
        for _, relations, _ in file_info.values()
        for relation_type in relations['relation_type']
    )

    keys = []
    token_amounts = []
    inside_token_amounts = []
    relation_amounts = []
    argumentative_amounts = []
    non_argumentative_amounts = []

    for key, tags in tags_info.items():
        bio_tags = [tag_info['bio_tag'] for tag_info in tags]
        arguments, relations, non_arguments = file_info[key]
        # Non-argumentative pieces that are only a line break are not counted.
        real_non_arguments = non_arguments[non_arguments['prop_text'] != "\n"]

        keys.append(key)
        token_amounts.append(len(bio_tags))
        inside_token_amounts.append(sum(0 if tag == "O" else 1 for tag in bio_tags))
        relation_amounts.append(len(relations))
        argumentative_amounts.append(len(arguments))
        non_argumentative_amounts.append(len(real_non_arguments))

    statistic_df = pd.DataFrame({
        "key": keys,
        "token_amount": token_amounts,
        "inside_token_amount": inside_token_amounts,
        "relation_amount": relation_amounts,
        "argumentative_units_amount": argumentative_amounts,
        "non_argumentative_units_amount": non_argumentative_amounts,
    })

    print(path)
    print(arg_type_counter)
    print(relation_type_counter)

    return statistic_df

def plot_histogram(serie, title: str, bin_factor: int = 1, xlabel: str = None, ylabel: str = None):
    """
    Plot a histogram of ``serie`` through its ``hist`` method and set the
    title and axis labels with pyplot.

    The number of bins is the number of distinct values in ``serie``
    integer-divided by ``bin_factor``, with a lower bound of one bin.
    """
    distinct_values = len(set(serie))
    # Fewer distinct values than bin_factor would give 0 bins, which is not a
    # valid bin count for a histogram; always ask for at least one.
    bin_count = max(1, distinct_values // bin_factor)
    serie.hist(bins=bin_count)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


def plot_increasing_values(serie: pd.Series, title: str, xlabel: str = None, ylabel: str = None):
    """
    Plot the values of ``serie`` sorted in ascending order against their
    position (0, 1, 2, ...) and set the title and axis labels.
    """
    ascending = serie.sort_values()
    positions = list(range(len(ascending)))
    plt.plot(positions, ascending)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

def plot_and_save(save=None):
    """
    Show the current pyplot figure, saving it first when ``save`` is given.

    ``save`` is handed to ``plt.savefig`` unchanged; a falsy value skips the
    saving step.
    """
    wants_file = bool(save)
    if wants_file:
        plt.savefig(save)
    plt.show()
        
def plot_results(statistic_dfs: dict, normalize_document_amount: bool=True):
    """
    Draw four comparison figures for the given statistics tables.

    ``statistic_dfs`` maps a label to a statistics DataFrame; the labels are
    used as the legend entries. With ``normalize_document_amount`` set, every
    table is randomly sampled down to the size of the smallest one (no seed
    is fixed, so the sample differs between runs). The caller's dictionary is
    not modified.

    Figures, in order: histogram of token counts, sorted ratio of inside
    tokens, sorted ratio of argumentative units, histogram of relations per
    argumentative unit.
    """
    frames = statistic_dfs.copy()
    legend_labels = list(frames.keys())
    smallest_size = min([len(frame) for frame in frames.values()])

    if normalize_document_amount:
        # Randomly drop rows so that every dataset has the same length.
        frames = {
            label: frame.sample(smallest_size)
            for label, frame in frames.items()
        }

    # Figure 1: token count distribution.
    for frame in frames.values():
        plot_histogram(
            frame['token_amount'],
            "Cantidad de tokens",
            bin_factor=5,
            xlabel="Cantidad de tokens",
            ylabel="Cantidad de documentos",
        )
    plt.legend(legend_labels)
    plot_and_save()

    # Figure 2: share of tokens that are inside an annotation.
    print("Percentage of tokens annotated with BIES")
    for frame in frames.values():
        inside_ratio = frame['inside_token_amount'] / frame['token_amount']
        plot_increasing_values(
            inside_ratio,
            "Porciento tokens argumentativos",
            xlabel="Índice de documentos",
            ylabel="Porciento de tokens argumentativos",
        )
    plt.legend(legend_labels)
    plot_and_save()

    # Figure 3: share of units that are argumentative.
    print("Percentage of argumentative units")
    for frame in frames.values():
        argumentative = frame['argumentative_units_amount']
        all_units = argumentative + frame['non_argumentative_units_amount']
        plot_increasing_values(
            argumentative / all_units,
            "Porciento componentes argumentativas",
            xlabel="Índice de documentos",
            ylabel="Porciento de componentes argumentativas",
        )
    plt.legend(legend_labels)
    plot_and_save()

    # Figure 4: relations per argumentative unit.
    print("Relations per argumentative component")
    for frame in frames.values():
        relations_per_unit = frame['relation_amount'] / frame['argumentative_units_amount']
        plot_histogram(
            relations_per_unit,
            "Relaciones",
            bin_factor=3,
            xlabel="Cantidad de relaciones normalizada",
            ylabel="Cantidad de documentos",
        )
    plt.legend(legend_labels)
    plot_and_save()


def print_tabular_info(statistics_df: dict):
    """
    Summarize each statistics table as one row of averages.

    ``statistics_df`` maps a label to a statistics DataFrame. The result is
    indexed by those labels and holds, per table, the mean ratio of
    argumentative units, the mean ratio of inside tokens and the mean number
    of relations per argumentative unit.
    """
    unit_ratio_means = []
    token_ratio_means = []
    relation_means = []

    for table in statistics_df.values():
        argumentative_units = table['argumentative_units_amount']
        all_units = argumentative_units + table['non_argumentative_units_amount']

        unit_ratio_means.append((argumentative_units / all_units).mean())
        token_ratio_means.append((table['inside_token_amount'] / table["token_amount"]).mean())
        relation_means.append((table['relation_amount'] / argumentative_units).mean())

    summary = {
        "% promedio de UDA": unit_ratio_means,
        "% promedio de tokens argumentatvos": token_ratio_means,
        "Promedio de relaciones por UDA": relation_means,
    }
    return pd.DataFrame(summary, index=list(statistics_df.keys()))
    

In [None]:
CORPUS_TO_MEASURE = ["cdcp", "abstrct", "persuasive_essays_paragraph_all_linked"]

# One table of original-vs-augmented metrics per corpus, keyed by corpus name.
data_augmentation_dict = {}
for key in CORPUS_TO_MEASURE:
    data_augmentation_dict[key] = data_augmentation_metrics(BASE_DATA / "parsed_to_conll" / key)

# Ratio between the original and the augmented length, in tokens.
for df in data_augmentation_dict.values():
    df['word_len_relation'] = df["original_word_len"] / df["augmented_word_len"]
    # To inspect a table, call display(df.describe()) here.


In [None]:

# Statistics tables keyed by "<corpus>_<PROCESS_TAG>" for the processed data
# and by "<corpus>_corpus" for the corpora themselves.
corpus_info_dict = {}
for key in CORPUS_TO_MEASURE:
    corpus_info_dict[f"{key}_{PROCESS_TAG}"] = get_corpus_info(
        BASE_DATA / 'link_prediction_processed' / key / PROCESS_TAG
    )
for key in CORPUS_TO_MEASURE:
    corpus_info_dict[f"{key}_corpus"] = get_corpus_info(
        BASE_DATA / 'parsed_to_conll' / key,
        is_corpus=True,
    )


In [None]:

# Each corpus compared with its processed counterpart
for key in CORPUS_TO_MEASURE:
    plot_results({
        f"{key}_{PROCESS_TAG}": corpus_info_dict[f"{key}_{PROCESS_TAG}"],
        f"{key}_corpus": corpus_info_dict[f"{key}_corpus"],
    })

# All processed datasets compared with each other
plot_results({
    f"{key}_{PROCESS_TAG}": corpus_info_dict[f"{key}_{PROCESS_TAG}"]
    for key in CORPUS_TO_MEASURE
})


In [None]:
# Summary table of averages, one row per dataset
info = print_tabular_info(dict(corpus_info_dict))
display(info)

# Descriptive statistics of every individual table
for table_name, table in corpus_info_dict.items():
    print(table_name)
    display(table.describe())
