# augment.py

# /////////////// Replaces original words in the training file with random ones generated by the module 'generate'
# /////////////// augments the training file with the new sentences

import csv
import time
import os
import argparse
from tqdm import tqdm
from generate import mainG, mainGacc, mainGintern, mainGsupertag, lG, lGacc, lGIntern, info, color, tags

# //////////////////////////////////////////////////
def sentLengths(linesList, emptyList):
    """Record, for every blank row, the first column of the row before it.

    In the token files handled by this script a blank row closes a sentence,
    so the recorded value is presumably the index of the sentence's last
    token, i.e. the sentence length (verify against the data format).

    Args:
        linesList: rows as returned by ``csv.reader``; a blank line is ``[]``.
        emptyList: list that receives the recorded values; mutated in place.

    Returns:
        None. Results are delivered through ``emptyList``.
    """
    for i, row in enumerate(linesList):
        if row:
            continue
        # NOTE(review): for a blank *first* row, i - 1 is -1 and the last row
        # of the file is consulted; that long-standing behaviour is kept as is.
        previous = linesList[i - 1]
        if not previous:
            # /// blank row right after another blank row: nothing to record.
            # /// Skipping it (instead of aborting the whole scan on the
            # /// IndexError) keeps the sentences that follow.
            continue
        emptyList.append(previous[0])
# //////////////////////////////////////////////////

# /////////////////////////////////////////////////////////////////////////////////////////////////////
def checkVowels(i, lines):
    """Make the indefinite article before row ``i`` agree with the word in row ``i``.

    If the row directly before ``i`` holds 'a'/'A' or 'an'/'An' in column 1
    and its column 3 contains 'OP-DEF <>', the article is rewritten in place:
    'an'/'An' when the word in row ``i`` starts with a, e, i, o or u (either
    case), otherwise 'a'/'A'. The decision is based on spelling only.

    Args:
        i: index of the row whose word decides the article form.
        lines: list of rows (lists of strings); mutated in place.

    Returns:
        None. Rows that are too short (e.g. blank sentence separators) and
        out-of-range indices are ignored.
    """
    if i == 0:
        # /// the first row has no predecessor; lines[-1] would wrap around
        # /// and rewrite the last row of the file
        return

    try:
        word = lines[i][1]
        previous = lines[i - 1]
        article = previous[1]
        supertag = previous[3]
    except IndexError:
        # /// blank or truncated row on either side: nothing to adjust
        return

    if 'OP-DEF <>' not in supertag:
        return

    vowels = ('a', 'A', 'e', 'E', 'i', 'I', 'o', 'O', 'u', 'U')
    if word.startswith(vowels):
        replacement = {'a': 'an', 'A': 'An'}.get(article)
    else:
        replacement = {'an': 'a', 'An': 'A'}.get(article)

    if replacement is not None:
        previous[1] = replacement
# /////////////////////////////////////////////////////////////////////////////////////////////////////   
   

# /// fixed file locations used by the augmentation steps
# NOTE(review): the swap modes and --original always read the base training
# file and the fixed modified-CoNLL-U path below, regardless of
# -ti/--trainInput and -o/--RRGoutput. Kept as in the original; confirm that
# this matches where the 'generate' module writes its output.
_BASE_TRAIN_PATH = './experiments/rrgparbank-en/base/train.supertags'
_MODIFIED_CONLLU_PATH = './rrgparbank/conllu/modified_filtered_acc_en_conllu.conllu'
_TEMP_TRAIN_PATH = './experiments/rrgparbank-en/tempTr.txt'
_TEMP_SUPERTAG_PATH = './experiments/rrgparbank-en/tempST.txt'
_DEFAULT_OUT_PREFIX = './experiments/rrgparbank-en/new_'
_MODES = ('unimorph0', 'unimorph1', 'internal', 'supertag', 'original')


def _readRows(path):
    """Return every tab-separated row of `path` as a list (blank lines become [])."""
    with open(path, 'r', encoding='UTF-8', newline='\n') as infile:
        return list(csv.reader(infile, delimiter='\t'))


def _writeRows(path, rows, mode):
    """Write `rows` tab-separated to `path`, opened with `mode` ('w' or 'a').

    NOTE(review): csv.writer terminates rows with '\\r\\n' by default and
    newline='\\n' disables newline translation, so the files are written with
    CRLF line endings. Kept unchanged; confirm downstream tools accept this.
    """
    with open(path, mode, encoding='UTF-8', newline='\n') as outfile:
        csv.writer(outfile, delimiter='\t').writerows(rows)


def _countdown(seconds=2):
    """Show a short on-screen countdown (one second per step) as a buffer."""
    for i in range(seconds, 0, -1):
        print(f"{i}", end="\r", flush=True)
        time.sleep(1)
    print()


def _printRun(x, extensionSize):
    """Print the 'Run k / total' indicator for the zero-based run index `x`."""
    print(color.BOLD, color.PURPLE, '\n\nRun ', (x+1), '/', extensionSize-1, color.END, '\n')


def _sentenceLengthPerRow(rows):
    """Return, for each row, column 0 of the last row of its sentence.

    Sentences are runs of non-blank rows; blank rows get None. A final
    sentence without a trailing blank row is handled as well.
    """
    lengths = [None] * len(rows)
    sentenceStart = 0
    for idx in range(len(rows) + 1):
        if idx < len(rows) and rows[idx]:
            continue
        # /// idx is a blank row or the end of the file: close the open sentence
        if idx > sentenceStart:
            lengths[sentenceStart:idx] = [rows[idx - 1][0]] * (idx - sentenceStart)
        sentenceStart = idx + 1
    return lengths


def _swapAllowed(pos, feats, supertag):
    """Tell whether a replacement with this POS/features fits the training supertag."""
    if pos == 'NOUN':
        if feats == 'Number=Sing':
            return 'NUC_N (N <>' in supertag or 'NUC_N (V-GER <>' in supertag
        return feats == 'Number=Plur' and 'NUC_N (N <>' in supertag

    if pos == 'ADJ':
        if feats == 'Degree=Pos':
            return ('NUC_A (A <>' in supertag or 'NUC_A (V-PART <>' in supertag
                    or 'NUC_A (V-GER <>' in supertag)
        return feats in ('Degree=Cmp', 'Degree=Sup') and 'NUC_A (A <>' in supertag

    if pos == 'VERB':
        past = 'Tense=Past' in feats
        participle = 'VerbForm=Part' in feats
        verbNucleus = 'NUC (V <>' in supertag
        if past and 'VerbForm=Fin' in feats and verbNucleus:
            return True
        if 'VerbForm=Inf' in feats and verbNucleus:
            return True
        if 'Tense=Pres' in feats and participle and (
                verbNucleus or 'NUC_A (V-PART <>' in supertag or 'NUC_A (V-GER <>' in supertag):
            return True
        # /// 'V-PART <>' also covers 'NUC_A (V-PART <>' and 'NUC (V-PART <>'
        return past and participle and (verbNucleus or 'V-PART <>' in supertag)

    return False


def _swapWords(linesTr, linesMod):
    """Replace words in `linesTr` (in place) with those from the modified CoNLL-U rows.

    A modified row applies to a training row when column 0 matches, column 4
    of the modified row equals the training sentence's length, column 10
    equals the current training word (presumably the word that was replaced;
    TODO confirm against 'generate') and `_swapAllowed` accepts the pair.
    Matching rows are applied in file order, exactly as a full scan would.
    """
    # /// bucket the usable rows once instead of scanning every modified row
    # /// for every training row; order inside a bucket is file order
    buckets = {}
    for lineMod in linesMod:
        if len(lineMod) > 10 and lineMod[3] in ('NOUN', 'ADJ', 'VERB'):
            buckets.setdefault((lineMod[0], lineMod[4]), []).append(lineMod)

    progressbar = tqdm(total=len(linesTr))
    for lineTr, length in zip(linesTr, _sentenceLengthPerRow(linesTr)):
        if len(lineTr) > 3:
            for lineMod in buckets.get((lineTr[0], length), ()):
                if lineTr[1] == lineMod[10] and _swapAllowed(lineMod[3], lineMod[5], lineTr[3]):
                    lineTr[1] = lineMod[1]
        progressbar.update()
    progressbar.close()


def _fixArticles(rows, dropLengthColumn):
    """Run checkVowels over all rows; optionally drop column 4 of non-blank rows first."""
    if dropLengthColumn:
        print('\nChecking vowels and deleting unnecessary columns...')
    else:
        print('\nChecking vowels...')
    progressbar = tqdm(total=len(rows))
    if dropLengthColumn:
        # /// deletes columns with sentence lengths
        for row in rows:
            if row:
                row.pop(4)
    # /// checks vowels
    for i in range(len(rows)):
        checkVowels(i, rows)
        progressbar.update()
    progressbar.close()


if __name__ == '__main__':
    # // commands //////////////////////////////////////////////////////////////////////////////////////////
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('-h', '--help', help=info(), action='help')
    parser.add_argument('--unimorph0', action='store_true', required=False, help='UniMorph Inaccurate verb replacements with regard to transitivity. In place of --unimorph1, --internal, --supertag, or --original.')
    parser.add_argument('--unimorph1', action='store_true', required=False, help='UniMorph Accurate verb replacements with regard to transitivity. In place of --unimorph0, --internal, --supertag, or --original.')
    parser.add_argument('--internal', action='store_true', required=False, help='For internal word swapping. In place of --unimorph0, unimorph1, --supertag, or --original.')
    parser.add_argument('--supertag', action='store_true', required=False, help='For internal supertag word swapping. In place of --unimorph0, unimorph1, --internal, or --original.')
    parser.add_argument('--original', action='store_true', required=False, help='Appends the training data by itself without changes. In place of --unimorph0, unimorph1, --internal, or --supertag.')
    parser.add_argument('-i', '--RRGinput', type=str, required=False, default='./rrgparbank/conllu/filtered_acc_en_conllu.conllu', help='(OPTIONAL) Filtered RRG file input. Default file: "rrgparbank/conllu/filtered_acc_en_conllu.conllu".')
    parser.add_argument('-o', '--RRGoutput', type=str, required=False, default='./rrgparbank/conllu', help='(OPTIONAL) Filtered RRG file output directory. Default directory: "rrgparbank/conllu".')
    parser.add_argument('-t', '--tag', type=str, required=False, help='Word tags.')
    parser.add_argument('-ti', '--trainInput', required=False, default='./experiments/rrgparbank-en/base/train.supertags', type=str, help='(OPTIONAL) train.supertags file input. Default file: "experiments/rrgparbank-en/base/train.supertags".')
    parser.add_argument('-to', '--trainOutput', required=False, default='./experiments/rrgparbank-en/new_', type=str, help='(OPTIONAL) train.supertags file output directory. If the directory is not specified, the default directory is used and filename changes to "new_train.extension".')
    parser.add_argument('-s', '--extensionSize', required=True, type=int, help='Extension size of the resulting training file. Must be >= 2. "2" doubles the size (sentences) of the base training file, thus does 1 run through the file (-s input-1).')
    args = parser.parse_args()
    # //////////////////////////////////////////////////////////////////////////////////////////////////////

    # /// argument validation; every failure exits with a non-zero status
    if args.tag is not None and args.tag not in tags:
        print(color.RED, color.BOLD, '\nWrong tag.\n')
        print("Enter available tag.", color.END)
        raise SystemExit(1)
    if args.tag is None and (args.unimorph0 or args.unimorph1 or args.internal):
        print(color.RED, color.BOLD, '\nMissing tag (-t) input!')
        print("Enter available tag.", color.END)
        raise SystemExit(1)

    if args.extensionSize < 2:
        print(color.RED, color.BOLD, '\n', 'ValueError', ' -s (--extensionSize) has to be >= 2.')
        raise SystemExit(1)

    # /// exactly one mode must be chosen; checked before any file is touched
    chosenModes = [name for name in _MODES if getattr(args, name)]
    if len(chosenModes) != 1:
        print('Options --unimorph0, --unimorph1, --internal, --supertag, and --original can only be used separately!')
        raise SystemExit(1)
    mode = chosenModes[0]

    # /// word lists filled and consumed by the 'generate' functions
    L_nS, L_nP, L_aPoss, L_aCmpr, L_aSup = [], [], [], [], []
    L_vPst, L_vInf, L_vPresPart, L_vPstPart = [], [], [], []
    L_adv, L_advInt, L_advSup, L_advCmpr = [], [], [], []
    L_vPstIntr, L_vInfIntr, L_vPresPartIntr, L_vPstPartIntr = [], [], [], []

    baseLists = (L_nS, L_nP, L_aPoss, L_aCmpr, L_aSup)
    verbLists = (L_vPst, L_vInf, L_vPresPart, L_vPstPart)
    intrLists = (L_vPstIntr, L_vInfIntr, L_vPresPartIntr, L_vPstPartIntr)
    advLists = (L_adv, L_advInt, L_advSup, L_advCmpr)
    # /// mode -> (list filler, generator, word lists in the order both expect)
    swapModes = {
        'unimorph0': (lG, mainG, baseLists + verbLists),
        'unimorph1': (lGacc, mainGacc, baseLists + verbLists + intrLists),
        'internal': (lGIntern, mainGintern, baseLists + advLists + verbLists + intrLists),
    }

    if not os.path.isdir(args.trainOutput):
        print(color.RED, color.BOLD, '\nDirectory (-to) missing or does not exist!',)
        print('Using default directory:\n./experiments/rrgparbank-en/', color.END)
        args.trainOutput = _DEFAULT_OUT_PREFIX
        outPrefix = args.trainOutput
    else:
        # /// make sure a directory given without trailing separator still
        # /// yields '<dir>/train.supertags' and not '<dir>train.supertags'
        outPrefix = os.path.join(args.trainOutput, '')
    if not os.path.isfile(args.trainInput):
        print(color.RED, color.BOLD, '\nTraining input (-ti) missing or file does not exist!')
        print('Using default train.supertags file.', color.END)
        args.trainInput = _BASE_TRAIN_PATH

    outTrainPath = outPrefix + 'train.supertags'
    # /// the log goes next to the new training file
    logPath = os.path.join(os.path.dirname(outTrainPath), 'log.txt')
    runs = args.extensionSize - 1

    start = time.perf_counter()

    # /// Copying input/original data to new training file
    print('\nCopying input/original data to new training file...')
    progressbar = tqdm(total=1)
    _writeRows(outTrainPath, _readRows(args.trainInput), 'w')
    progressbar.update()
    progressbar.close()

    try:
        # /// creating (or emptying) the temp file that collects the new sentences
        with open(_TEMP_TRAIN_PATH, 'w', encoding='UTF-8', newline='\n'):
            pass

        if mode in swapModes:
            fillLists, generate, wordLists = swapModes[mode]
            # /// running list function
            fillLists(*wordLists)

            for x in range(runs):
                generate(args.RRGinput, args.trainInput, args.RRGoutput, args.tag, *wordLists)
                # /// sleep for 2 seconds for buffer
                _countdown()

                linesMod = _readRows(_MODIFIED_CONLLU_PATH)
                linesTr = _readRows(_BASE_TRAIN_PATH)

                # /// run number indicator
                _printRun(x, args.extensionSize)

                # /// Swapping words in training file
                print('\nSwapping words in training file...')
                _swapWords(linesTr, linesMod)

                _fixArticles(linesTr, False)
                _writeRows(_TEMP_TRAIN_PATH, linesTr, 'a')

        elif mode == 'supertag':
            for x in range(runs):
                mainGsupertag(args.trainInput, args.trainOutput, args.tag)
                # /// sleep for 2 seconds for buffer
                _countdown()

                linesTr = _readRows(_TEMP_SUPERTAG_PATH)

                # /// run number indicator
                _printRun(x, args.extensionSize)

                _fixArticles(linesTr, True)
                _writeRows(_TEMP_TRAIN_PATH, linesTr, 'a')

        else:
            # /// --original: append the unchanged base sentences once per run
            linesTr = _readRows(_BASE_TRAIN_PATH)
            print('\nExtending training file with original sentences...')
            progressbar = tqdm(total=runs)
            for x in range(runs):
                _writeRows(_TEMP_TRAIN_PATH, linesTr, 'a')
                progressbar.update()
            progressbar.close()

        # /// Writing to new training file
        print('\nWriting to new training file...')
        progressbar = tqdm(total=1)
        _writeRows(outTrainPath, _readRows(_TEMP_TRAIN_PATH), 'a')
        progressbar.update()
        progressbar.close()
    finally:
        # Removing temporary files (also when a step above failed)
        for tempPath in (_TEMP_TRAIN_PATH, _TEMP_SUPERTAG_PATH):
            if os.path.exists(tempPath):
                os.remove(tempPath)

    # /// Creating logfile
    print('\nCreating logfile...')
    time.sleep(1)
    progressbar = tqdm(total=1)
    newTrlen = []
    orgTrlen = []
    sentLengths(_readRows(outTrainPath), newTrlen)
    sentLengths(_readRows(_BASE_TRAIN_PATH), orgTrlen)

    if orgTrlen:
        ratio = str(len(newTrlen)/len(orgTrlen))+'x of base training file'
    else:
        # /// avoid a ZeroDivisionError when no sentence was found in the base file
        ratio = 'n/a (no sentences found in base training file)'

    modeNotes = {
        'internal': '\n\nInternal constituent replacements',
        'supertag': '\n\nSupertag constituent replacements',
        'original': '\n\nAugmentation without constituent replacements',
        'unimorph0': '\n\nUniMorph constituent replacements',
        'unimorph1': '\n\nUniMorph constituent replacements\nVerb transitivity accounted for',
    }

    with open(logPath, 'w', encoding='UTF-8', newline='\n') as logfile:
        if mode in swapModes:
            logfile.write('Tag used for replacements:\t'+str(args.tag)+'\n')
        logfile.write('Training file used:\t'+args.trainInput)
        logfile.write('\nNumber of sentences:\t'+str(len(newTrlen)))
        logfile.write('\nExtension size input:\t'+str(args.extensionSize))
        logfile.write('\nActual ratio:\t'+ratio)
        logfile.write(modeNotes[mode])
    progressbar.update()
    progressbar.close()

    end = time.perf_counter()
    mins = int((end-start)/60)
    secs = int((end-start)%60)
    print(f'\nTime taken: {mins}min {secs}s')