In [1]:
# -- env setup
import os
import gc

!python3.7 -m pip install git+https://github.com/namiyousef/colab-utils.git
from colabtools.utils import get_gpu_utilization, mount_drive, install_private_library

# Mount Google Drive; the returned mount point is the root for every path built below.
drive_path = mount_drive()
# Directory on the mounted drive that holds the core project data.
project_path = os.path.join(drive_path, 'COMP0087/data/core')
#development_dir = os.path.join(drive_path, 'argument-mining/argminer')

# Install the private 'argument-mining' package, pointing the helper at github_config.json on the drive.
# NOTE(review): presumably that file holds the GitHub credentials for the private repo — confirm in colab-utils.
install_private_library(os.path.join(drive_path, 'github_config.json'), 'argument-mining')

Collecting git+https://github.com/namiyousef/colab-utils.git
  Cloning https://github.com/namiyousef/colab-utils.git to /tmp/pip-req-build-n7563p22
  Running command git clone -q https://github.com/namiyousef/colab-utils.git /tmp/pip-req-build-n7563p22
Building wheels for collected packages: colab-dev-tools
  Building wheel for colab-dev-tools (setup.py) ... [?25l[?25hdone
  Created wheel for colab-dev-tools: filename=colab_dev_tools-0.0.7-py3-none-any.whl size=3654 sha256=82edc88c1bf452b5401647d79864a699488c701b2b3d2796a34b9ba99eecc33b
  Stored in directory: /tmp/pip-ephem-wheel-cache-7r0t3gt3/wheels/1c/35/c0/364531e4ff0f0fe0f3296c80f1ee668b03ae6c6c378c5a44bf
Successfully built colab-dev-tools
Installing collected packages: colab-dev-tools
Successfully installed colab-dev-tools-0.0.7
Google Drive import successful.
CUDA device detected. Using GPU...
Mounted at /content/drive
Google Drive mount successful.


In [None]:
# import pkg_resources
# version = pkg_resources.require("argminer")[0].version

In [2]:
from argminer.data import ArgumentMiningDataset, TUDarmstadtProcessor, PersuadeProcessor, DataProcessor, create_labels_doc_level, df_from_text_files
from argminer.evaluation import inference
import time


Import of ArgMiner successful


In [3]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.10-py3-none-any.whl (410 kB)
[K     |████████████████████████████████| 410 kB 4.3 MB/s 
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.10


In [4]:
import numpy as np
import pandas as pd
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

import nltk
# Fetch the NLTK resources this notebook downloads up front; the final status flag is
# left as the last expression so the cell still displays it.
for resource in ('averaged_perceptron_tagger', 'wordnet', 'omw-1.4'):
    downloaded = nltk.download(resource)
downloaded

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [7]:
# Sample sentence, wrapped in a one-element list, used as the input for the augmenter demos below.
text = ["From this point of view, I firmly believe that we should attach more importance to cooperation during primary education."]

'From this point of view, I firmly believe that we should attach more importance to cooperation during primary education.'

In [6]:
def filler_augment(text, fillers=None):
    """Prepend a randomly chosen filler phrase to ``text``.

    The default filler phrases are taken from the TUDarmstadt annotation guidelines, with the
    addition of some other common fillers used in English.

    Args:
        text: Text to augment. Anything with ``len(text) <= 1`` is returned unchanged.
        fillers: Optional sequence of filler phrases to draw from. When ``None`` the built-in
            list is used. An empty sequence means "no fillers available" and ``text`` is
            returned unchanged.

    Returns:
        The selected filler followed by ``text`` with its first character lower-cased, or
        ``text`` itself when there is nothing to augment or nothing to augment with.
    """
    if len(text) <= 1:
        return text
    if fillers is None:
        fillers = [
            "According to the previous fact, ",
            "As can be seen, ",
            "For example, ",
            "Another important point which contributes to my argument is that ",
            "I agree to this view that ",
            "In this context, ",
            "At the end of the day, ",
        ]
    if len(fillers) == 0:
        # np.random.choice(0) raises ValueError; with no candidates, leave the text as it is.
        return text
    random_idx = np.random.choice(len(fillers))
    filler = fillers[random_idx]
    # The original opening character is lower-cased because it no longer starts the sentence.
    # NOTE(review): this is unconditional, so a leading pronoun "I" becomes "i".
    aug_text = filler + text[0].lower() + text[1:]

    return aug_text

In [None]:
# Time a single filler_augment call. The result is stored under its own name so that `text`
# remains the untouched baseline for later cells and re-running this cell cannot stack fillers.
start = time.perf_counter()
filler_text = filler_augment(text)
print(time.perf_counter() - start)

0.0056934356689453125


In [None]:
# Synonym augmentation: build nlpaug's SynonymAug with its default settings
# and apply it to the sample text.

aug = naw.SynonymAug()
augmented_text = aug.augment(text)


In [None]:
# Random spelling mistakes: build nlpaug's SpellingAug with its default settings.
# The augmenter is only constructed here; this cell does not apply it to any text.
aug = naw.SpellingAug()

In [None]:
# Contextual word insertion using the 'bert-base-uncased' model. Only action="insert" is
# configured, so this augmenter inserts words; no substitution variant is set up in this cell.
# Constructing it downloads the model files (see the progress bars in the output).
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

In [None]:
# Antonym augmentation: build nlpaug's AntonymAug with its default settings.
# The augmenter is only constructed here; this cell does not apply it to any text.
aug = naw.AntonymAug()

In [8]:
# Random keyword change: ReservedAug is given one group of first-person phrases and applied
# to the sample text; the original and the augmented text are printed for comparison.
first_person_variants = [
    "I",
    "I therefore",
    "I actually",
    "I basically",
    "I seriously",
    "I really",
    "I highly",
    "I totally",
    "I absolutely",
]
reserved_tokens = [first_person_variants]

aug = naw.ReservedAug(reserved_tokens=reserved_tokens)
augmented_text = aug.augment(text)

print("Original:", text, sep="\n")
print("Augmented Text:", augmented_text, sep="\n")

Original:
['From this point of view, I firmly believe that we should attach more importance to cooperation during primary education.']
Augmented Text:
['From this point of view, I totally firmly believe that we should attach more importance to cooperation during primary education.']


In [None]:
# Wall-clock time of one augment() call on the sample text, using the augmenter currently
# bound to `aug`.
start = time.time()
augmented_text = aug.augment(text)
elapsed = time.time() - start
print(elapsed)

0.0004763603210449219


In [None]:
# Preprocess the Kaggle feedback-prize data and save the result as JSON under
# drive/MyDrive/augmented_dataset. `processor` ends up bound to whatever save_json returns.
path_kaggle = 'drive/MyDrive/feedback-prize-2021'
processor = (
    PersuadeProcessor(path_kaggle)
    .preprocess()
    .save_json(dir_path='drive/MyDrive/augmented_dataset')
)

In [None]:
!ls drive/MyDrive/augmented_dataset

Persuade_preprocessed.json  test  TUDarmstadt_preprocessed.json


In [9]:
# Both datasets are read from, and their augmented variants written under, the same directory.
path_persuade = 'drive/MyDrive/augmented_dataset'
path_tu =  'drive/MyDrive/augmented_dataset'
# Used below both as the `split` argument to process() and as the output sub-folder name.
test = 'test'

In [None]:

# Load the preprocessed TUDarmstadt data from its own path variable (path_tu) rather than
# the Persuade one, so the cell keeps working if the two locations ever diverge.
processor = TUDarmstadtProcessor(path_tu).from_json(status='preprocessed')


<argminer.data.TUDarmstadtProcessor at 0x7f0ca2f604d0>

In [None]:

for strategy in ['bio','io','bieo']:
  for aug_name, aug_func in {'synonym':naw.SynonymAug().augment,'spellingError':naw.SpellingAug().augment,
                             "antonym":naw.AntonymAug().augment, 'keywordChange':naw.ReservedAug(reserved_tokens=reserved_tokens).augment}.items():
    processor = PersuadeProcessor(path_persuade).from_json(status='preprocessed')
    processor = processor.process(strategy=strategy,test_size=0.3, processors=[aug_func],split=test).postprocess()
    save_path = path_persuade+f'/{test}/{strategy}/{aug_name}'
    if not os.path.exists(save_path):
      os.makedirs(save_path)
      print(f"{save_path} is created")
    processor.save_json(save_path)


  


drive/MyDrive/augmented_dataset/test/bio/synonym is created
drive/MyDrive/augmented_dataset/test/bio/spellingError is created
drive/MyDrive/augmented_dataset/test/bio/antonym is created
drive/MyDrive/augmented_dataset/test/bio/keywordChange is created
drive/MyDrive/augmented_dataset/test/io/synonym is created
drive/MyDrive/augmented_dataset/test/io/spellingError is created
drive/MyDrive/augmented_dataset/test/io/antonym is created
drive/MyDrive/augmented_dataset/test/io/keywordChange is created
drive/MyDrive/augmented_dataset/test/bieo/synonym is created
drive/MyDrive/augmented_dataset/test/bieo/spellingError is created
drive/MyDrive/augmented_dataset/test/bieo/antonym is created
drive/MyDrive/augmented_dataset/test/bieo/keywordChange is created
1203.1301529407501


In [None]:
for strategy in ['bio','io','bieo']:
  for aug_name, aug_func in {'custom_fillers':filler_augment}.items():
    processor = PersuadeProcessor(path_persuade).from_json(status='preprocessed')
    processor = processor.process(strategy=strategy,test_size=0.3, processors=[aug_func],split=test).postprocess()
    save_path = path_persuade+f'/{test}/{strategy}/{aug_name}'
    if not os.path.exists(save_path):
      os.makedirs(save_path)
      print(f"{save_path} is created")
    processor.save_json(save_path)

  after removing the cwd from sys.path.


drive/MyDrive/augmented_dataset/test/bio/custom_fillers is created
drive/MyDrive/augmented_dataset/test/io/custom_fillers is created
drive/MyDrive/augmented_dataset/test/bieo/custom_fillers is created


In [10]:
for strategy in ['bio','io','bieo']:
  for aug_name, aug_func in {'synonym':naw.SynonymAug().augment,'spellingError':naw.SpellingAug().augment,
                             "antonym":naw.AntonymAug().augment, 'keywordChange':naw.ReservedAug(reserved_tokens=reserved_tokens).augment,
                             'custom_fillers':filler_augment}.items():
    processor = TUDarmstadtProcessor(path_tu).from_json(status='preprocessed')
    processor = processor.process(strategy=strategy,test_size=0.3, processors=[aug_func],split=test).postprocess()
    save_path = path_tu+f'/{test}/{strategy}/{aug_name}'
    if not os.path.exists(save_path):
      os.makedirs(save_path)
      print(f"{save_path} is created")
    processor.save_json(save_path)

  
