In [36]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import warnings
from argminer.data import ArgumentMiningDataset, PersuadeProcessor, TUDarmstadtProcessor
from argminer.utils import decode_model_name
from argminer.evaluation import inference
from argminer.config import LABELS_MAP_DICT
import time
from torch.utils.data import DataLoader
from mlutils.torchtools.metrics import FScore

## PATHS

In [37]:
# Every data/model path used below is relative, so the working directory is
# switched to a base directory first. The base directory can be overridden with
# the ARGMINER_BASE_DIR environment variable; when it is unset the original
# location is used, so existing runs behave exactly as before.
import os

BASE_DIR = os.environ.get('ARGMINER_BASE_DIR', '/Users/yousefnami/Desktop')
os.chdir(BASE_DIR)

# Directory (relative to BASE_DIR) that is listed to discover the jobs to
# evaluate; each entry is expected to contain a `models` sub-directory.
JOB_DIR = 'tmpdir/job'

In [38]:
# TODO double check config MAP!
# Job directory name -> how that job's model is evaluated:
#   'processor': processor class used to load the model's own dataset
#   'strat':     labelling strategy name ('bieo', 'bio' or 'io')
CONFIG_MAP = {
    '822594.undefined': {'processor': PersuadeProcessor, 'strat': 'bieo'},
    '822595.undefined': {'processor': PersuadeProcessor, 'strat': 'bio'},
    '822596.undefined': {'processor': PersuadeProcessor, 'strat': 'io'},
    '820966.undefined': {'processor': PersuadeProcessor, 'strat': 'bieo'},
    '820965.undefined': {'processor': PersuadeProcessor, 'strat': 'bio'},
    '820962.undefined': {'processor': PersuadeProcessor, 'strat': 'io'},
    '820985.undefined': {'processor': TUDarmstadtProcessor, 'strat': 'bieo'},
    '820986.undefined': {'processor': TUDarmstadtProcessor, 'strat': 'bio'},
    '820987.undefined': {'processor': TUDarmstadtProcessor, 'strat': 'io'},
}

# Base model name -> value used as `max_length` when building the test dataset.
MAX_LENGTH_DICT = {
    'google/bigbird-roberta-base': 1024,
    'roberta-base': 512,
}

In [39]:
# Accumulator filled by the evaluation loop:
#   RESULTS[job]['self' | 'other']['core' | <augmentation name>]
#       -> {'macro_f1': ..., 'scores': ...}
RESULTS = dict()

In [40]:
# TODO: metrics are currently disabled; the intended value is: metrics = [FScore(average='macro')]

In [41]:
def _get_loader(df_label_map, df_test, batch_size):
    """Wrap a test dataframe in an ``ArgumentMiningDataset`` and a ``DataLoader``.

    NOTE: ``tokenizer``, ``max_length`` and ``strategy`` are *not* parameters.
    They are read from the notebook's global namespace when this function is
    called, so they must already be defined (the evaluation loop sets them).

    Args:
        df_label_map: label map passed as the dataset's first argument.
        df_test: dataframe passed as the dataset's second argument.
        batch_size: batch size of the returned loader.

    Returns:
        A ``DataLoader`` over the dataset, which is built with ``is_train=False``.
    """
    strategy_name = f'standard_{strategy}'
    dataset = ArgumentMiningDataset(
        df_label_map,
        df_test,
        tokenizer,
        max_length,
        strategy_name,
        is_train=False,
    )
    return DataLoader(dataset, batch_size=batch_size)

def _get_data(path, Processor, strategy, batch_size=32, limit=None):
    """Load the test dataframe and the label map for one dataset.

    Args:
        path: location handed to ``Processor``. If the string contains 'test'
            the processor's whole dataframe is used; otherwise the 'test' part
            of ``get_tts(test_size=0.3)`` is used.
        Processor: processor class; instantiated as ``Processor(path).from_json()``.
        strategy: key selecting the label map for this dataset.
        batch_size: accepted for signature compatibility; not used here.
        limit: when not None, only the first ``limit`` rows are kept (a warning
            is emitted).

    Returns:
        Tuple ``(df_test, df_label_map)`` where ``df_test`` has the columns
        'text' and 'labels'.
    """
    columns = ['text', 'labels']
    loaded = Processor(path).from_json()

    if 'test' not in path:
        frame = loaded.get_tts(test_size=0.3)['test'][columns]
    else:
        # TODO here might need to do a label_map
        frame = loaded.dataframe[columns]

    if limit is not None:
        warnings.warn('LOADING LIMITED DATA')
        frame = frame.head(limit)

    # The label-map key is the processor's class name up to 'Processor'.
    dataset_key = loaded.__class__.__name__.partition('Processor')[0]
    return frame, LABELS_MAP_DICT[dataset_key][strategy]

def _get_core_data(path, Processor, strategy, batch_size=32, limit=None):
    """Build the test loader for a model evaluated on its own dataset.

    Thin composition of ``_get_data`` (load frame + label map) and
    ``_get_loader`` (wrap them in a loader).

    Args:
        path: data location forwarded to ``_get_data``.
        Processor: processor class forwarded to ``_get_data``.
        strategy: labelling strategy forwarded to ``_get_data``.
        batch_size: forwarded to both helpers.
        limit: optional row cap forwarded to ``_get_data``.

    Returns:
        The loader returned by ``_get_loader``.
    """
    test_frame, label_map = _get_data(path, Processor, strategy, batch_size, limit)
    return _get_loader(label_map, test_frame, batch_size)


def _get_other_data(path, Processor, strategy, batch_size=32, limit=None):
    if Processor == PersuadeProcessor:
        # we are using a Persuade Model but would like to test on TUDarmstadt
        df_test, _ = _get_data(path, TUDarmstadtProcessor, strategy, batch_size, limit)
        # HARD RESET DF-LABEL-MAP
        df_label_map = LABELS_MAP_DICT['Persuade'][strategy]
        
        label_map_dict = {
            'Claim': 'Claim',
            'MajorClaim': 'Position',
            'Premise': 'O',
            'O': 'O'
        }
        
    else:
        df_test, _ = _get_data(path, PersuadeProcessor, strategy, batch_size, limit)
        df_label_map = LABELS_MAP_DICT['TUDarmstadt'][strategy]


        label_map_dict = {
            'Lead': 'O',
            'Rebuttal': 'O',
            'Concluding Statement': 'O',
            'Position': 'MajorClaim',
            'Evidence': 'O',
            'Claim': 'Claim',
            'Counterclaim': 'O',
            'O': 'O'
        }
    df_test.labels = df_test.labels.apply(
        lambda x: [
            'O' if label_map_dict[text.split('-')[-1]] =='O' else text.replace(
                text.split('-')[-1],
                label_map_dict[text.split('-')[-1]]
            ) for text in x]
    )


    testloader = _get_loader(df_label_map, df_test, batch_size)
    return testloader

def _get_scores_agg(df):
    df = df.groupby('class').sum()
    df['f1'] = df.tp / (df.tp + 1/2*(df.fp + df.fn))
    macro_f1 = df['f1'].mean()
    return macro_f1, df

In [42]:
# Settings passed to every loader-building call in the evaluation loop.
LIMIT = None      # optional cap on the number of test rows; None keeps all rows
BATCH_SIZE = 128  # batch size of each DataLoader

In [43]:
# Keys of the two evaluation groups stored under RESULTS[job].
SELF = 'self'    # model evaluated on data loaded with its own processor
OTHER = 'other'  # model evaluated on data loaded with the other processor


def _list_entries(directory):
    """Return the entries of ``directory`` in sorted order, without '.DS_Store'."""
    return sorted(entry for entry in os.listdir(directory) if entry != '.DS_Store')


def _evaluate_loader(model, loader):
    """Run ``inference`` on ``loader`` and package the aggregated scores.

    Returns:
        dict with the keys 'macro_f1' and 'scores', holding the two values
        returned by ``_get_scores_agg``.
    """
    # TODO add metrics support
    _, df_scores = inference(model, loader)
    macro_f1, df_scores_agg = _get_scores_agg(df_scores)
    return {'macro_f1': macro_f1, 'scores': df_scores_agg}


# Jobs are filtered and sorted *before* enumerate so that MODEL_ID is contiguous
# and does not depend on the arbitrary order returned by os.listdir.
for MODEL_ID, job in enumerate(_list_entries(JOB_DIR)):
    # Look the job up first: an unconfigured job raises KeyError here, before
    # any tokenizer or model is loaded.
    job_config = CONFIG_MAP[job]
    Processor = job_config['processor']
    # NOTE: `strategy`, `max_length` and `tokenizer` must remain module-level
    # names; `_get_loader` reads them from the global namespace.
    strategy = job_config['strat']

    job_path = os.path.join(JOB_DIR, job)
    model_dir = os.path.join(job_path, 'models')
    # the first non-.DS_Store entry of the models directory is the model to load
    model_name = [item for item in os.listdir(model_dir) if item != '.DS_Store'][0]
    model_name_decoded = decode_model_name(model_name).split('_')[0]  # get base model name
    model_path = os.path.join(model_dir, model_name)

    max_length = MAX_LENGTH_DICT[model_name_decoded]
    tokenizer = AutoTokenizer.from_pretrained(model_name_decoded, add_prefix_space=True)

    print(
        f'RUNNING FOR MODEL {MODEL_ID}: {model_name_decoded} at PATH: {model_path}\n'
        f'PARAMETERS: {strategy}. {Processor}'
    )
    print('=================================================================================')

    s = time.time()
    trained_model = AutoModelForTokenClassification.from_pretrained(model_path)

    RESULTS[job] = {SELF: {}}

    # --- test the model against its own dataset ---------------------------
    # specify the path to the json
    path = f'Main/0.Education/2.UCL/Courses/NLP/argument-mining/data/core/{strategy}'
    testloader = _get_core_data(path, Processor, strategy, batch_size=BATCH_SIZE, limit=LIMIT)
    print(f'Model and data loaded. Time: {time.time() - s: .3g}')

    s = time.time()
    RESULTS[job][SELF]['core'] = _evaluate_loader(trained_model, testloader)
    print(f'Took {time.time() -s:.3g} to get scores')
    # TODO save these scores!!!!!

    # --- test the model against the augmented (adversarial) examples -------
    augmented_path = f'test/{strategy}'
    augmentations = _list_entries(augmented_path)
    for augmentation in augmentations:
        augmentation_path = os.path.join(augmented_path, augmentation)
        testloader = _get_core_data(
            augmentation_path, Processor, strategy, batch_size=BATCH_SIZE, limit=LIMIT
        )
        RESULTS[job][SELF][augmentation] = _evaluate_loader(trained_model, testloader)

    # --- test the model against the other dataset --------------------------
    RESULTS[job][OTHER] = {}
    testloader = _get_other_data(path, Processor, strategy, batch_size=BATCH_SIZE, limit=LIMIT)
    RESULTS[job][OTHER]['core'] = _evaluate_loader(trained_model, testloader)

    for augmentation in augmentations:
        augmentation_path = os.path.join(augmented_path, augmentation)
        testloader = _get_other_data(
            augmentation_path, Processor, strategy, batch_size=BATCH_SIZE, limit=LIMIT
        )
        RESULTS[job][OTHER][augmentation] = _evaluate_loader(trained_model, testloader)


        



RUNNING FOR MODEL 0: roberta-base at PATH: tmpdir/job/822594.undefined/models/cm9iZXJ0YS1iYXNlX2ZpbmFs
PARAMETERS: bieo. <class 'argminer.data.PersuadeProcessor'>
Loaded and data loaded. Time:  8.91
Prediction time: 468
tensor([0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7])
Agg to word time: 2.21
Get predstring time: 0.226
Evaluate time: 1.77
Batch 1 complete.
Prediction time: 555
tensor([0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7])
Agg to word time: 1.51
Get predstring time: 0.163
Evaluate time: 0.411
Batch 2 complete.
Prediction time: 420
tensor([0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7])
Agg to word time: 1.66
Get predstring time: 0.153
Evaluate time: 0.475
Batch 3 complete.
Prediction time: 433
tensor([0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7])
Agg to word time: 1.31
Get predstring time: 0.12
Evaluate time: 0.406
Batch 4 complete.
Prediction time: 315
tensor([0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4

KeyboardInterrupt: 

In [35]:
RESULTS

{'822594.undefined': {'self': {'core': {'macro_f1': 0.46401515151515155,
    'scores':         tp   fn   fp        f1
    class                         
    0      0.0  2.0  4.0  0.000000
    1      1.0  1.0  1.0  0.500000
    2      1.0  1.0  1.0  0.500000
    3      3.0  4.0  6.0  0.375000
    6      5.0  0.0  1.0  0.909091
    7      1.0  1.0  1.0  0.500000},
   'custom_fillers': {'macro_f1': 0.0,
    'scores':         tp  fn  fp   f1
    class                  
    1      0.0   0   2  0.0
    2      0.0   1   2  0.0
    3      0.0   0   2  0.0
    5      0.0   1   1  0.0
    6      0.0   1   2  0.0
    7      0.0   0   1  0.0},
   'synonym': {'macro_f1': 0.0,
    'scores':         tp  fn  fp   f1
    class                  
    1      0.0   0   1  0.0
    2      0.0   1   2  0.0
    3      0.0   0   4  0.0
    5      0.0   1   0  0.0
    6      0.0   1   4  0.0},
   'keywordChange': {'macro_f1': 0.0,
    'scores':         tp  fn  fp   f1
    class                  
    1      0.0  