In [1]:
!pip install argminer@git+https://github.com/namiyousef/argument-mining.git@develop

from transformers import AutoTokenizer, AutoModelForTokenClassification
import warnings
from argminer.data import ArgumentMiningDataset, PersuadeProcessor, TUDarmstadtProcessor
from argminer.utils import decode_model_name
from argminer.evaluation import inference
from argminer.config import LABELS_MAP_DICT
import time
from torch.utils.data import DataLoader
from mlutils.torchtools.metrics import FScore

Collecting argminer@ git+https://github.com/namiyousef/argument-mining.git@develop
  Cloning https://github.com/namiyousef/argument-mining.git (to revision develop) to /tmp/pip-install-pvauaoei/argminer_be5a9b3d68214639b7cdef015ef9ce26
  Running command git clone -q https://github.com/namiyousef/argument-mining.git /tmp/pip-install-pvauaoei/argminer_be5a9b3d68214639b7cdef015ef9ce26
Collecting mlutils@ git+https://git@github.com/namiyousef/ml-utils.git@develop
  Cloning https://****@github.com/namiyousef/ml-utils.git (to revision develop) to /tmp/pip-install-pvauaoei/mlutils_03245099f2ad467f85afcc1ec64b8c0f
  Running command git clone -q 'https://****@github.com/namiyousef/ml-utils.git' /tmp/pip-install-pvauaoei/mlutils_03245099f2ad467f85afcc1ec64b8c0f
Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.1 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x8

## PATHS

In [2]:
# need to have a relative load of the models
!pip install colab-dev-tools
from colabtools.utils import mount_drive
import os

drive_path = mount_drive()
BASE_PATH = os.path.join(drive_path, 'Desktop/')

# path to models
JOB_DIR = os.path.join(BASE_PATH, 'tmpdir/job')
TEST_DIR = os.path.join(BASE_PATH, 'test')
MODEL_PATH = os.path.join(drive_path, 'COMP0087/data/core')

Mounted at /content/drive
Google Drive mount successful.


In [3]:
# TODO double check config MAP!
CONFIG_MAP = {
    '822594.undefined': dict(
        processor=PersuadeProcessor,
        strat='bieo',
        dataset='Persuade',
    ),
    '822595.undefined': dict(
        processor=PersuadeProcessor,
        strat='bio',
        dataset='Persuade'
    ),
    '822596.undefined': dict(
        processor=PersuadeProcessor,
        strat='io',
        dataset='Persuade'
    ),
    '820966.undefined': dict(
        processor=PersuadeProcessor,
        strat='bieo',
        dataset='Persuade'
    ),
    '820965.undefined': dict(
       processor=PersuadeProcessor,
        strat='bio',
        dataset='Persuade'
    ),
    '820962.undefined': dict(
        processor=PersuadeProcessor,
        strat='io',
        dataset='Persuade'
    ),
    '820985.undefined': dict(
        processor=TUDarmstadtProcessor,
        strat='bieo',
        dataset='TUDarmstadt'
    ),
    '820986.undefined': dict(
        processor=TUDarmstadtProcessor,
        strat='bio',
        dataset='TUDarmstadt'
    ),
    '820987.undefined': dict(
        processor=TUDarmstadtProcessor,
        strat='io',
        dataset='TUDarmstadt'
    ),
    '826025.undefined': dict(
        processor=TUDarmstadtProcessor,
        strat='bieo',
        dataset='TUDarmstadt'
    ),
    '826026.undefined': dict(
        processor=TUDarmstadtProcessor,
        strat='bio',
        dataset='TUDarmstadt'
    ),
    '826027.undefined': dict(
        processor=TUDarmstadtProcessor,
        strat='io',
        dataset='TUDarmstadt'
    ),

}

MAX_LENGTH_DICT = {
    'google/bigbird-roberta-base': 1024,
    'roberta-base': 512
}

In [4]:
RESULTS = {}

In [5]:
#metrics = [FScore(average='macro')]

In [6]:
def _get_loader(df_label_map, df_test, batch_size):
    testset = ArgumentMiningDataset(
        df_label_map, df_test, tokenizer, max_length, f'standard_{strategy}', is_train=False
    )
    testloader = DataLoader(testset, batch_size=batch_size)
    return testloader

def _get_data(path, Processor, strategy, batch_size=32, limit=None):
    processor = Processor(path).from_json()
    if 'test' in path:
        df_test = processor.dataframe[['text', 'labels']]
        # TODO here might need to do a label_map
    else:
        df_dict = processor.get_tts(test_size=0.3)
        df_test = df_dict['test'][['text', 'labels']]
    if limit is not None:
        warnings.warn('LOADING LIMITED DATA')
        df_test = df_test.head(limit)
    
    df_label_map = LABELS_MAP_DICT[processor.__class__.__name__.split('Processor')[0]][strategy]
    return df_test, df_label_map

def _get_core_data(path, Processor, strategy, batch_size=32, limit=None):
    #processor = Processor(path).from_json()
    #if 'test' in path:
    #    print( processor.dataframe.head())
    #    df_test = processor.dataframe[['text', 'labels']]
    #else:
    #    df_dict = processor.get_tts(test_size=0.3)
    #    df_test = df_dict['test'][['text', 'labels']]
    #if limit is not None:
    #    warnings.warn('LOADING LIMITED DATA')
    #   df_test = df_test.head(limit)
    
    #df_label_map = LABELS_MAP_DICT[processor.__class__.__name__.split('Processor')[0]][strategy]
    
    #testset = ArgumentMiningDataset(
    #    df_label_map, df_test, tokenizer, max_length, f'standard_{strategy}', is_train=False
    #)
    #testloader = DataLoader(testset, batch_size=batch_size)
    df_test, df_label_map = _get_data(path, Processor, strategy, batch_size, limit)
    testloader = _get_loader(df_label_map, df_test, batch_size)
    
    return testloader


def _get_other_data(path, Processor, strategy, batch_size=32, limit=None):
    if Processor == PersuadeProcessor:
        # we are using a Persuade Model but would like to test on TUDarmstadt
        df_test, _ = _get_data(path, TUDarmstadtProcessor, strategy, batch_size, limit)
        # HARD RESET DF-LABEL-MAP
        df_label_map = LABELS_MAP_DICT['Persuade'][strategy]
        
        label_map_dict = {
            'Claim': 'Claim',
            'MajorClaim': 'Position',
            'Premise': 'O',
            'O': 'O'
        }
        
    else:
        df_test, _ = _get_data(path, PersuadeProcessor, strategy, batch_size, limit)
        df_label_map = LABELS_MAP_DICT['TUDarmstadt'][strategy]


        label_map_dict = {
            'Lead': 'O',
            'Rebuttal': 'O',
            'Concluding Statement': 'O',
            'Position': 'MajorClaim',
            'Evidence': 'O',
            'Claim': 'Claim',
            'Counterclaim': 'O',
            'O': 'O'
        }
    df_test.labels = df_test.labels.apply(
        lambda x: [
            'O' if label_map_dict[text.split('-')[-1]] =='O' else text.replace(
                text.split('-')[-1],
                label_map_dict[text.split('-')[-1]]
            ) for text in x]
    )


    testloader = _get_loader(df_label_map, df_test, batch_size)
    return testloader

def _get_scores_agg(df):
    df = df.groupby('class').sum()
    df['f1'] = df.tp / (df.tp + 1/2*(df.fp + df.fn))
    df['recall'] = df.tp / (df.tp + df.fn)
    df['precision'] = df.tp / (df.tp + df.fp)
    avgs = {'f1':df['f1'].mean(), 'recall': df['recall'].mean(), 'precision': df['precision'].mean()}
    return avgs, df

In [7]:
BATCH_SIZE = 64
LIMIT = None

In [26]:
for MODEL_ID, job in enumerate(os.listdir(JOB_DIR)):
    if job != '.DS_Store':
        job_path = os.path.join(JOB_DIR, job)
        model_dir = os.path.join(job_path, 'models')
        model_name = [item for item in os.listdir(model_dir) if item != '.DS_Store'][0]
        model_name_decoded = decode_model_name(model_name).split('_')[0] # get base model name
        
        max_length = MAX_LENGTH_DICT[model_name_decoded]
        # define tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name_decoded, add_prefix_space=True)
        model_path = os.path.join(model_dir, model_name)
        
        strategy = CONFIG_MAP[job].get('strat')
        Processor = CONFIG_MAP[job].get('processor')
        dataset = CONFIG_MAP[job].get('dataset')
        if dataset == 'Persuade':
          continue
        print(
            f'{MODEL_ID+1}/12: {dataset}->{model_name_decoded}->{strategy} at PATH: {model_path}'
        )
        print('=================================================================================')
        
        


        s = time.time()
        trained_model = AutoModelForTokenClassification.from_pretrained(model_path)
        
        RESULTS[job] = {}
        
        # test the model against itself
        RESULTS[job]['self'] = {}
        
        # specify the path to the json
        path = os.path.join(MODEL_PATH, strategy)
        testloader = _get_core_data(path, Processor, strategy, batch_size=BATCH_SIZE, limit=LIMIT)
        print(f'Loaded and data loaded. Time: {time.time() - s: .3g}')


        # TODO add metrics support
        SELF = 'self'
        s = time.time()
        df_metrics, df_scores = inference(trained_model, testloader, )
        macro_f1, df_scores_agg = _get_scores_agg(df_scores)
        RESULTS[job][SELF]['core'] = {'avgs': macro_f1, 'scores': df_scores_agg}
        
        print(f'Took {time.time() -s:.3g} to get scores')
        
        
        
        # test models against self adversarial examples
        augmented_path = os.path.join(TEST_DIR, strategy)
        for augmentation in os.listdir(augmented_path):
            if augmentation != '.DS_Store':
                print(f'RUNNING AUGMENTATION: {augmentation}')
                augmentation_path = os.path.join(augmented_path, augmentation)
                testloader = _get_core_data(augmentation_path, Processor, strategy,
                                                       batch_size=BATCH_SIZE, limit=LIMIT)
                df_metrics, df_scores = inference(trained_model, testloader, )
                macro_f1, df_scores_agg = _get_scores_agg(df_scores)
                RESULTS[job][SELF][augmentation] = {'avgs': macro_f1, 'scores': df_scores_agg}
                # aggregate the scores
        
        
        '''OTHER = 'other'
        RESULTS[job][OTHER] = {}
        
        # get other processor
        testloader = _get_other_data(path, Processor, strategy, batch_size=BATCH_SIZE, limit=LIMIT)
        df_metrics, df_scores = inference(trained_model, testloader)
        macro_f1, df_scores_agg = _get_scores_agg(df_scores)
        RESULTS[job][OTHER]['core'] = {'macro_f1': macro_f1, 'scores': df_scores_agg}
        
        
        for augmentation in os.listdir(augmented_path):
            if augmentation != '.DS_Store':
                augmentation_path = os.path.join(augmented_path, augmentation)
                testloader = _get_other_data(augmentation_path, Processor, strategy,
                                             batch_size=BATCH_SIZE, limit=LIMIT)
                df_metrics, df_scores = inference(trained_model, testloader)
                macro_f1, df_scores_agg = _get_scores_agg(df_scores)
                RESULTS[job][OTHER][augmentation] = {'macro_f1': macro_f1, 'scores': df_scores_agg}'''
        
        #RESULTS[model_name_decoded]['self']['core'] # the score
        
        
        # test model against other datasets, need to convert datasets
        #RESULTS[model_name_decoded]['transfer'] = {}


        



1/12: TUDarmstadt->roberta-base->bieo at PATH: drive/MyDrive/Desktop/tmpdir/job/826025.undefined/models/cm9iZXJ0YS1iYXNlX2ZpbmFs
Loaded and data loaded. Time:  1.61
Prediction time: 0.017
Agg to word time: 5.31
Get predstring time: 0.374
Evaluate time: 1.56
Batch 1 complete.
Took 7.96 to get scores
RUNNING AUGMENTATION: custom_fillers
Prediction time: 0.0188
Agg to word time: 6.24
Get predstring time: 0.391
Evaluate time: 1.43
Batch 1 complete.
RUNNING AUGMENTATION: synonym
Prediction time: 0.017
Agg to word time: 5.47
Get predstring time: 0.369
Evaluate time: 1.55
Batch 1 complete.
RUNNING AUGMENTATION: spellingError
Prediction time: 0.0159
Agg to word time: 5.29
Get predstring time: 0.364
Evaluate time: 1.5
Batch 1 complete.
RUNNING AUGMENTATION: keywordChange
Prediction time: 0.0187
Agg to word time: 5.33
Get predstring time: 0.372
Evaluate time: 1.54
Batch 1 complete.
RUNNING AUGMENTATION: antonym
Prediction time: 0.0166
Agg to word time: 5.37
Get predstring time: 0.375
Evaluate ti

  * num_indices_to_pick_from


Prediction time: 6.8
Agg to word time: 4.41
Get predstring time: 0.551
Evaluate time: 2.73
Batch 1 complete.
Took 15.4 to get scores
RUNNING AUGMENTATION: antonym
Prediction time: 6.82
Agg to word time: 4.91
Get predstring time: 0.544
Evaluate time: 2.45
Batch 1 complete.
RUNNING AUGMENTATION: synonym
Prediction time: 6.85
Agg to word time: 4.13
Get predstring time: 0.54
Evaluate time: 2.55
Batch 1 complete.
RUNNING AUGMENTATION: keywordChange
Prediction time: 6.84
Agg to word time: 4.54
Get predstring time: 0.352
Evaluate time: 2.99
Batch 1 complete.
RUNNING AUGMENTATION: custom_fillers
Prediction time: 6.85
Agg to word time: 4.97
Get predstring time: 0.375
Evaluate time: 1.64
Batch 1 complete.
RUNNING AUGMENTATION: spellingError
Prediction time: 6.85
Agg to word time: 3.57
Get predstring time: 0.367
Evaluate time: 1.56
Batch 1 complete.
8/12: TUDarmstadt->google/bigbird-roberta-base->io at PATH: drive/MyDrive/Desktop/tmpdir/job/820987.undefined/models/Z29vZ2xlL2JpZ2JpcmQtcm9iZXJ0YS1i

In [None]:
RESULTS

{'822594.undefined': {'self': {'core': {'macro_f1': 0.46401515151515155,
    'scores':         tp   fn   fp        f1
    class                         
    0      0.0  2.0  4.0  0.000000
    1      1.0  1.0  1.0  0.500000
    2      1.0  1.0  1.0  0.500000
    3      3.0  4.0  6.0  0.375000
    6      5.0  0.0  1.0  0.909091
    7      1.0  1.0  1.0  0.500000},
   'custom_fillers': {'macro_f1': 0.0,
    'scores':         tp  fn  fp   f1
    class                  
    1      0.0   0   2  0.0
    2      0.0   1   2  0.0
    3      0.0   0   2  0.0
    5      0.0   1   1  0.0
    6      0.0   1   2  0.0
    7      0.0   0   1  0.0},
   'synonym': {'macro_f1': 0.0,
    'scores':         tp  fn  fp   f1
    class                  
    1      0.0   0   1  0.0
    2      0.0   1   2  0.0
    3      0.0   0   4  0.0
    5      0.0   1   0  0.0
    6      0.0   1   4  0.0},
   'keywordChange': {'macro_f1': 0.0,
    'scores':         tp  fn  fp   f1
    class                  
    1      0.0  

In [28]:
for model in CONFIG_MAP:
  if CONFIG_MAP[model].get('dataset') == 'TUDarmstadt':
    strategy = CONFIG_MAP[model].get('strat')
    Processor = CONFIG_MAP[model].get('processor')
    dataset = CONFIG_MAP[model].get('dataset')
    print(
            f'{dataset}->{model_name_decoded}->{strategy} at PATH: {model_path}'
        )
    print('=================================================================================')
        
    results_self = RESULTS[model]['self']
    for attack in results_self:
      print(f'{attack}-------------------------------------------------------------------------------')
      for metric, val in results_self[attack]['avgs'].items():
        print(f'{metric}: {val}')
      print(results_self[attack]['scores'])
      print('\n\n\n')

TUDarmstadt->roberta-base->bieo at PATH: drive/MyDrive/Desktop/tmpdir/job/822594.undefined/models/cm9iZXJ0YS1iYXNlX2ZpbmFs
core-------------------------------------------------------------------------------
f1: 0.7079025125846855
recall: 0.7373476287271112
precision: 0.6812618396866449
        tp   fn   fp        f1    recall  precision
class                                              
0      920  426  438  0.680473  0.683507   0.677467
1      121   32   51  0.744615  0.790850   0.703488
2      188  116  145  0.590267  0.618421   0.564565
3      693  116  196  0.816254  0.856613   0.779528




custom_fillers-------------------------------------------------------------------------------
f1: 0.40157620191896376
recall: 0.43058029733586756
precision: 0.37939143515073415
        tp   fn    fp        f1    recall  precision
class                                               
0      384  962  1053  0.275961  0.285290   0.267223
1       63   90    81  0.424242  0.411765   0.437500
2      1

In [9]:
for MODEL_ID, job in enumerate(os.listdir(JOB_DIR)):
    limit = 500
    if job != '.DS_Store':
        job_path = os.path.join(JOB_DIR, job)
        model_dir = os.path.join(job_path, 'models')
        model_name = [item for item in os.listdir(model_dir) if item != '.DS_Store'][0]
        model_name_decoded = decode_model_name(model_name).split('_')[0] # get base model name
        
        max_length = MAX_LENGTH_DICT[model_name_decoded]
        # define tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name_decoded, add_prefix_space=True)
        model_path = os.path.join(model_dir, model_name)
        
        strategy = CONFIG_MAP[job].get('strat')
        Processor = CONFIG_MAP[job].get('processor')
        dataset = CONFIG_MAP[job].get('dataset')
        if dataset == 'TUDarmstadt':
          continue
        print(
            f'{MODEL_ID+1}/12: {dataset}->{model_name_decoded}->{strategy} at PATH: {model_path}'
        )
        print('=================================================================================')
        
        


        s = time.time()
        trained_model = AutoModelForTokenClassification.from_pretrained(model_path)
        
        RESULTS[job] = {}
        
        # test the model against itself
        RESULTS[job]['self'] = {}
        
        # specify the path to the json
        path = os.path.join(MODEL_PATH, strategy)
        testloader = _get_core_data(path, Processor, strategy, batch_size=BATCH_SIZE, limit=limit)
        print(f'Loaded and data loaded. Time: {time.time() - s: .3g}')


        # TODO add metrics support
        SELF = 'self'
        s = time.time()
        df_metrics, df_scores = inference(trained_model, testloader, )
        macro_f1, df_scores_agg = _get_scores_agg(df_scores)
        RESULTS[job][SELF]['core'] = {'avgs': macro_f1, 'scores': df_scores_agg}
        
        print(f'Took {time.time() -s:.3g} to get scores')
        
        
        
        # test models against self adversarial examples
        augmented_path = os.path.join(TEST_DIR, strategy)
        for augmentation in os.listdir(augmented_path):
            if augmentation != '.DS_Store':
                print(f'RUNNING AUGMENTATION: {augmentation}')
                augmentation_path = os.path.join(augmented_path, augmentation)
                testloader = _get_core_data(augmentation_path, Processor, strategy,
                                                       batch_size=BATCH_SIZE, limit=limit)
                df_metrics, df_scores = inference(trained_model, testloader, )
                macro_f1, df_scores_agg = _get_scores_agg(df_scores)
                RESULTS[job][SELF][augmentation] = {'avgs': macro_f1, 'scores': df_scores_agg}
                # aggregate the scores
        
        
        '''OTHER = 'other'
        RESULTS[job][OTHER] = {}
        
        # get other processor
        testloader = _get_other_data(path, Processor, strategy, batch_size=BATCH_SIZE, limit=LIMIT)
        df_metrics, df_scores = inference(trained_model, testloader)
        macro_f1, df_scores_agg = _get_scores_agg(df_scores)
        RESULTS[job][OTHER]['core'] = {'macro_f1': macro_f1, 'scores': df_scores_agg}
        
        
        for augmentation in os.listdir(augmented_path):
            if augmentation != '.DS_Store':
                augmentation_path = os.path.join(augmented_path, augmentation)
                testloader = _get_other_data(augmentation_path, Processor, strategy,
                                             batch_size=BATCH_SIZE, limit=LIMIT)
                df_metrics, df_scores = inference(trained_model, testloader)
                macro_f1, df_scores_agg = _get_scores_agg(df_scores)
                RESULTS[job][OTHER][augmentation] = {'macro_f1': macro_f1, 'scores': df_scores_agg}'''
        
        #RESULTS[model_name_decoded]['self']['core'] # the score
        
        
        # test model against other datasets, need to convert datasets
        #RESULTS[model_name_decoded]['transfer'] = {}


        



3/12: Persuade->google/bigbird-roberta-base->bio at PATH: drive/MyDrive/Desktop/tmpdir/job/820965.undefined/models/Z29vZ2xlL2JpZ2JpcmQtcm9iZXJ0YS1iYXNlX2ZpbmFs




Loaded and data loaded. Time:  6.22


RuntimeError: ignored