In [1]:
import nbimporter
from augmentation import *
from parsing import *
from subsampling import *
import os

In [2]:
def create_augmented_data(simplified_data_path_in, simplified_augmented_data_path_out, feats_path, augmented_train_path_out,
                          patient_notes_path, augmenter, num_augmentations, percentage_to_augment=0.1, min_length=1):
    """Build an augmented copy of a simplified dataset and write it to disk.

    Steps performed:
      1. Load the samples with ``read_simplified_data`` and the patient notes CSV.
      2. Call ``augment_simplified_data`` for every sample and collect the results
         after the original samples.
      3. Tokenize every sample's text with ``CharacterTokenizer`` and compute the
         (start, end) offsets of consecutive tokens.
      4. Save one CSV per sample via ``save_simplified_data``.
      5. Run ``complexify_data`` over the output directory and write ``dtrain.csv``
         and ``dpatient_notes.csv``.

    Args:
        simplified_data_path_in: Location passed to ``read_simplified_data``.
        simplified_augmented_data_path_out: Prefix for the per-sample CSV files.
            File names are appended by plain string concatenation, so a directory
            must end with a path separator.
        feats_path: Passed through to ``complexify_data``.
        augmented_train_path_out: Prefix for ``dtrain.csv`` and
            ``dpatient_notes.csv`` (also concatenated, so it must end with a
            path separator when it is a directory).
        patient_notes_path: CSV file read with ``pd.read_csv``. Each augmented
            sample adds a row of (pn_num, case_num, text) under this file's
            columns, so it is expected to have exactly three columns.
        augmenter: Augmenter object forwarded to ``augment_simplified_data``.
        num_augmentations: Forwarded to ``augment_simplified_data``.
        percentage_to_augment: Forwarded to ``augment_simplified_data``.
        min_length: Forwarded to ``augment_simplified_data``.

    Returns:
        None. All results are written to disk.
    """
    samples = read_simplified_data(simplified_data_path_in)
    df_patient_notes = pd.read_csv(patient_notes_path)

    tokenizer = CharacterTokenizer()

    # Original samples first, followed by every augmented variant.
    all_samples = samples[:]
    # Rows for the augmented notes are gathered here and turned into a DataFrame
    # once; concatenating row by row inside the loop is quadratic in the row count.
    new_note_rows = []
    for sample in samples:
        augmented_samples = augment_simplified_data(num_augmentations, percentage_to_augment, augmenter, sample, min_length)
        for augmented_sample in augmented_samples:
            new_note_rows.append([augmented_sample.pn_num, augmented_sample.case_num, augmented_sample.text])
        all_samples += augmented_samples

    if new_note_rows:
        df_augmented_notes = pd.DataFrame(new_note_rows, columns=df_patient_notes.columns)
        df_new_patient_notes = pd.concat([df_patient_notes, df_augmented_notes])
    else:
        # Nothing was augmented: the output notes are just the input notes.
        df_new_patient_notes = df_patient_notes.copy()

    simplified_data = []
    for sample in all_samples:
        tokens = tokenizer.tokenize(sample.text)
        word_locs = []
        curr_loc = 0

        # Offsets assume the tokens are contiguous: each token starts where the
        # previous one ended.
        for word in tokens:
            next_loc = curr_loc + len(word)
            word_locs.append((curr_loc, next_loc))
            curr_loc = next_loc

        simplified_data.append((tokens, word_locs, sample.labels, sample.pn_num, sample.case_num))

    # NOTE(review): the file name depends only on pn_num and case_num. If augmented
    # samples reuse the identifiers of their source sample, files would overwrite
    # each other -- confirm augment_simplified_data assigns distinct pn_num values.
    for tokens, locations, labels, pn_num, case_num in simplified_data:
        path = simplified_augmented_data_path_out + str(pn_num) + '_' + str(case_num) + '.csv'
        save_simplified_data(tokens, locations, labels, path)

    df_out = complexify_data(simplified_augmented_data_path_out, feats_path)
    df_out.to_csv(augmented_train_path_out + 'dtrain.csv', index=False)
    df_new_patient_notes.to_csv(augmented_train_path_out + 'dpatient_notes.csv', index=False)

In [3]:
# SIMPLIFIED_PATH_IN = 'dataset/subset_250/simplified_train/'
# SIMPLIFIED_AUGMENTED_PATH_OUT = 'dataset/subset_250/simplified_train_augmented/'
# FEATS_PATH = 'dataset/subset_250/features.csv'
# AUGMENTED_TRAIN_PATH_OUT = 'dataset/subset_250/train_augmented/'
# PATIENT_NOTES_PATH = 'dataset/subset_250/patient_notes.csv'

# synonym_wordnet_aug = SynonymAugmenter()
# num_aug = 1

# Creates a new dataset made up of the examples in SIMPLIFIED_PATH_IN and their augmented versions
# and saves it to AUGMENTED_TRAIN_PATH_OUT; also creates a new patient_notes.csv that includes the augmented examples.

# create_augmented_data(SIMPLIFIED_PATH_IN, SIMPLIFIED_AUGMENTED_PATH_OUT, FEATS_PATH, AUGMENTED_TRAIN_PATH_OUT, 
#                       PATIENT_NOTES_PATH, synonym_wordnet_aug, num_aug)

In [4]:
# Project data root, resolved relative to the current working directory.
# It ends with '/', so the paths below are built by appending relative names.
PATH = os.getcwd() + '/tar-project/'
PATIENT_NOTES_PATH = PATH + 'patient_notes.csv'
FEATS_PATH = PATH + 'features.csv'
# No leading slash here: PATH already ends with one (avoids '//' in the path).
SIMPLIFIED_PATH_IN = PATH + 'simplified_train_250_subset/'


# Experiment grid iterated by the dataset generation loop.
augmenters = [LwtrAugmenter(SIMPLIFIED_PATH_IN)]
# Values passed as percentage_to_augment. The misspelled name is kept because
# the generation loop refers to it.
percenteges = [0.1, 0.3, 0.5]
# Values passed as num_augmentations.
num_augmented_examples = [1, 3]
# Run identifiers; each run gets its own output directory.
runs = [1, 2, 3]

In [5]:
# Create the new datasets and save each one into its own directory.
# One dataset is generated per (augmenter, percentage, num, run) combination.

for augmenter in augmenters:
    for percentage in percenteges:
        for num in num_augmented_examples:
            for run in runs:
                print(augmenter.type, percentage, num, run)

                destination = (PATH + 'datasets/' + str(type(augmenter).__name__)
                               + '-percent-aug-' + str(percentage)
                               + '-num-augs-' + str(num)
                               + '-run-' + str(run))

                SIMPLIFIED_AUGMENTED_PATH_OUT = destination + '/simplified_train_augmented/'
                AUGMENTED_TRAIN_PATH_OUT = destination + '/train_augmented/'

                # makedirs creates missing parents (including `destination`), and
                # exist_ok=True makes re-running the cell safe without a separate
                # existence check.
                os.makedirs(SIMPLIFIED_AUGMENTED_PATH_OUT, exist_ok=True)
                os.makedirs(AUGMENTED_TRAIN_PATH_OUT, exist_ok=True)

                create_augmented_data(SIMPLIFIED_PATH_IN, SIMPLIFIED_AUGMENTED_PATH_OUT, FEATS_PATH,
                                      AUGMENTED_TRAIN_PATH_OUT, PATIENT_NOTES_PATH, augmenter, num, percentage)

AugmenterType.WORD_LWTR 0.1 1 1
AugmenterType.WORD_LWTR 0.1 1 2



KeyboardInterrupt

