71432


('attributes', (227, 237))

In [208]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import random
import math
from enum import Enum

DATA_PATH = 'data/simplified/'

class FeatureSequence():
    
    def __init__(self, feature_num, start_loc, end_loc):
        self.feature_num = feature_num
        self.start_loc = start_loc
        self.end_loc = end_loc

class Sample():
    
    def __init__(self, text, labels, case_num, pn_num):
        self.text = text
        self.labels = labels
        self.case_num = case_num
        self.pn_num = pn_num
        self.feat_seqs = []
        self.update_feat_seqs()
        
    def update_feat_seqs(self):
        self.feat_seq = []
        prev_label = self.labels[0]
        start_loc = 0
        end_loc = 0
        for i, label in enumerate(self.labels):
            if prev_label != label:
                end_loc = i
                self.feat_seqs.append(FeatureSequence(prev_label, start_loc, end_loc))
                prev_label = label
                start_loc = i
        end_loc = i
        self.feat_seqs.append(FeatureSequence(prev_label, start_loc, end_loc))
    
    def get_num_word(self):
        return len(self.text.split())
    
    def feat_seq_num(self):
        return len(self.feat_seqs)
    
    def select_random_feat_seqs(self, feature_num=None, min_length=5):
        filtered_seqs = [seq for seq in self.feat_seqs if len(seq) >= 5]
        if feature_num != None:
            filtered_seqs = [seq for seq in filtered_seqs if seq.feature_num == feature_num]
        return random.sample(filtered_seqs)   
    
    def select_random_word(self):
        text = ''.join(self.text)
        words = text.split()
        word = random.choice(words)
        loc = text.find(word)
        return word, loc
        
        
       
    

class Augmenter_Type(Enum):
    WORD = 1
    SEQUENCE = 2


    
def augment(aug_type, augmenter, sample, percentage_to_augment = 0.1):
    if aug_type == Augmenter_Type.WORD:
        words_to_augment = math.ceil(sample.get_num_word() * percentage_to_augment)
        print()
        for i in range(words_to_augment):
            word, loc = sample.select_random_word()
            aug_word = augmenter.augment(word)
            print(word, aug_word)
            if len(word) < len(aug_word) :
                print('adding', len(aug_word) - len(word))
                for j in range(len(aug_word) - len(word)):
                    sample.labels.insert(loc, sample.labels[loc])
            if len(word) > len(aug_word):
                print('removing', len(word) - len(aug_word))
                for j in range(len(word) - len(aug_word)):
                    print('inside')
                    sample.labels.pop(loc)
            sample.text = sample.text.replace(word, aug_word, 1)
        sample.update_feat_seqs()
    
    
def read_simplified_data(path):
    
    data_files = [f for f in listdir(path) if isfile(join(path, f))]
    samples = []
    for data_file in data_files:
        case_num, pn_num = (int(i) for i in data_file[:-4].split('_'))
        df = pd.read_csv(path + data_file)  
        tokens = ''.join(df.word.to_list())
        labels = df.label.to_list()
        samples.append(Sample(tokens, labels, case_num, pn_num))
        
    return samples



In [128]:
#samples = read_simplified_data(DATA_PATH)

#print(samples[0].text)
#for seq in samples[0].feat_seqs:
#    print(seq.feature_num, seq.start_loc, seq.end_loc)

In [97]:
sample = samples[0]
word, loc = sample.select_random_word()
print(word, loc)
print(sample.text[loc:(loc + len(word))])

cancer. 704
cancer.


In [109]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en'
)

In [142]:

text = 'asdgs '
back_translation_aug.augment(text)

''

In [143]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

synonym_wordnet_aug = naw.SynonymAug(aug_src='wordnet')
synonym_wordnet_aug.augment(text)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mc/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/mc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mc/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


'asdgs'

In [220]:
text = 'Speedy brown pole walks bar'
labels = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0]
print(text)
print(len(text), len(labels))
sample = Sample(text, labels, 1, 1)
augment(Augmenter_Type.WORD, synonym_wordnet_aug, sample, 1)
print(sample.text)

Speedy brown pole walks bar
27 27

brown robert brown
adding 7
robert henry m. robert
adding 9
pole celestial pole
adding 10
pole terminal
adding 4
Speedy Speedy
Speedy henry m. robert brown celestial terminal walks bar


In [221]:
print(len(sample.text), len(sample.labels))


for i, j in zip(sample.text, sample.labels):
    print(i, j)

57 57
S 0
p 0
e 0
e 0
d 0
y 0
  0
h 1
e 1
n 1
r 1
y 1
  1
m 1
. 1
  1
r 1
o 1
b 1
e 1
r 1
t 1
  1
b 1
r 1
o 1
w 1
n 1
  0
c 0
e 0
l 0
e 0
s 0
t 0
i 0
a 0
l 0
  0
t 0
e 0
r 0
m 0
i 0
n 0
a 0
l 0
  0
w 2
a 2
l 2
k 2
s 2
  0
b 0
a 0
r 0


In [218]:
lis = [0, 1, 2]
print(lis.pop(1))
print(lis)

1
[0, 2]
