In [1]:
import os

import nbimporter
import pandas as pd

from augmentation import *
from parsing import *
from subsampling import *

In [2]:
def _contiguous_spans(tokens):
    """Return one (start, end) offset pair per token, laid end to end starting at 0."""
    spans = []
    start = 0
    for token in tokens:
        end = start + len(token)
        spans.append((start, end))
        start = end
    return spans


def create_augmented_data(simplified_data_path_in, simplified_augmented_data_path_out, feats_path, augmented_train_path_out, 
                          patient_notes_path, augmenter, num_augmentations, percentage_to_augment = 0.1, min_length=10,
                          sample_indices=None):
    """Augment the simplified samples and write the combined dataset to disk.

    Every selected sample is passed to ``augment_simplified_data``; the augmented
    samples and the originals are then tokenized with ``CharacterTokenizer`` and
    saved one file per sample via ``save_simplified_data``. Finally
    ``complexify_data`` is run over the output directory and two CSV files are
    written: ``augmented_train.csv`` (its result) and
    ``augmented_patient_notes.csv`` (the input patient notes plus one row per
    augmented sample).

    Parameters
    ----------
    simplified_data_path_in : str
        Passed to ``read_simplified_data`` to load the input samples.
    simplified_augmented_data_path_out : str
        Directory receiving the per-sample files; created if missing and passed
        unchanged to ``complexify_data``.
    feats_path : str
        Passed unchanged to ``complexify_data``.
    augmented_train_path_out : str
        Directory receiving the two output CSV files; created if missing.
    patient_notes_path : str
        CSV read with ``pd.read_csv``. Its columns must be, in order, the three
        values written per augmented sample: pn_num, case_num, text.
    augmenter, num_augmentations, percentage_to_augment, min_length
        Forwarded unchanged to ``augment_simplified_data``.
    sample_indices : iterable of int, optional
        If given, only the samples at these positions are processed (useful for
        debugging on a small subset). By default every sample is processed.

    Raises
    ------
    IndexError
        If ``sample_indices`` contains a position outside the loaded samples.
    """
    samples = list(read_simplified_data(simplified_data_path_in))
    if sample_indices is not None:
        samples = [samples[i] for i in sample_indices]

    df_patient_notes = pd.read_csv(patient_notes_path)
    tokenizer = CharacterTokenizer()

    # Collect the augmentations of *every* sample, not only the last one.
    augmented_samples = []
    for sample in samples:
        augmented_samples.extend(
            augment_simplified_data(num_augmentations, percentage_to_augment, augmenter, sample, min_length)
        )

    # Append all new patient-note rows with a single concat instead of one per row.
    if augmented_samples:
        new_rows = pd.DataFrame(
            [[aug.pn_num, aug.case_num, aug.text] for aug in augmented_samples],
            columns=df_patient_notes.columns,
        )
        df_new_patient_notes = pd.concat([df_patient_notes, new_rows], ignore_index=True)
    else:
        df_new_patient_notes = df_patient_notes

    # Make sure both output directories exist before anything is written.
    for directory in (simplified_augmented_data_path_out, augmented_train_path_out):
        if directory:
            os.makedirs(directory, exist_ok=True)

    # Augmented samples are written first, originals last.
    for sample in augmented_samples + samples:
        tokens = tokenizer.tokenize(sample.text)
        # Offsets assume the tokens are contiguous pieces of the text.
        locations = _contiguous_spans(tokens)
        # File name is '<pn_num>_<case_num>.csv'.
        # NOTE(review): confirm this is the layout complexify_data expects.
        file_name = str(sample.pn_num) + '_' + str(sample.case_num) + '.csv'
        save_simplified_data(tokens, locations, sample.labels,
                             os.path.join(simplified_augmented_data_path_out, file_name))

    df_out = complexify_data(simplified_augmented_data_path_out, feats_path)
    df_out.to_csv(os.path.join(augmented_train_path_out, 'augmented_train.csv'), index=False)
    df_new_patient_notes.to_csv(os.path.join(augmented_train_path_out, 'augmented_patient_notes.csv'), index=False)

In [3]:
# Input paths.
SIMPLIFIED_PATH_IN = 'dataset/subset_250/simplified_train/'
FEATS_PATH = 'dataset/subset_250/features.csv'
PATIENT_NOTES_PATH = 'dataset/subset_250/patient_notes.csv'

# Output paths.
SIMPLIFIED_AUGMENTED_PATH_OUT = 'dataset/subset_250/simplified_train_augmented/'
AUGMENTED_TRAIN_PATH_OUT = 'dataset/subset_250/train_augmented/'

# Augmentation settings.
synonym_wordnet_aug = SynonymAugmenter()
num_aug = 1

# Run the augmentation pipeline: examples read from SIMPLIFIED_PATH_IN are combined
# with their augmented variants and written under AUGMENTED_TRAIN_PATH_OUT, together
# with a patient-notes CSV that also contains the augmented examples.
create_augmented_data(
    simplified_data_path_in=SIMPLIFIED_PATH_IN,
    simplified_augmented_data_path_out=SIMPLIFIED_AUGMENTED_PATH_OUT,
    feats_path=FEATS_PATH,
    augmented_train_path_out=AUGMENTED_TRAIN_PATH_OUT,
    patient_notes_path=PATIENT_NOTES_PATH,
    augmenter=synonym_wordnet_aug,
    num_augmentations=num_aug,
)