# Experiment Processor
- Given a dataset labelled for positive (1) and negative (0) presence of some pragmatic feature, create euphemism-detection train/test splits, and process the results.
- Specify parameters in the first chunk.

In [8]:
# A helper notebook to create a number of train-test splits from the data
# The names below are the experiment configuration read by the cells further down.
# NOTE: some later analysis cells reassign FOLDER and PROPERTY; re-run this cell before generating new splits.
#
# Previous configuration, kept for reference:
# DATASET = "VET_Corpus_v0.2.csv" # the dataset containing pragmatic labels
# NUM_TESTS = 10 # number of splits
# FOLDER = "TEST_4.0.3" # directory where train/test files should be output
# PROPERTY = 'is_vague' # linguistic property being investigated
# PREPROCESS = True
# NOTES = 'Preprocessing, including the removal of <> (the subject of study). Strong assumption vagueness labels, includes all examples.'
#
# Active configuration:
DATASET = "Sentiment_Corpus_v1.2.csv" # the dataset containing pragmatic labels; loaded with pd.read_csv, needs the PROPERTY, 'is_euph' and 'edited_text' columns
NUM_TESTS = 10 # number of splits (one train/test file pair is written per split)
FOLDER = "TEST_5.1" # directory where train/test files should be output; created with os.mkdir, which fails if it already exists
PROPERTY = 'is_neu_PET' # linguistic property being investigated; name of a column compared against 1 and 0
PREPROCESS = False # read by hfify(): True -> lowercase the text and replace every non-letter (including <>) with a space
NOTES = 'High-agreement PETs only. Neutral label using mean neutral sentiment strength as the threshold. From Euphemism_Corpus_v2.1.' # free text written to FOLDER/info.txt

In [4]:
# hfify ("HuggingFace-ify") takes a sample of the euphemism corpus and converts it into a format suitable for the HuggingFace Trainer class
# NEW: it also preprocesses the text, if PREPROCESS is set in the first chunk
import re
def hfify(df, preprocess=None):
    """Convert a corpus sample into the two-column text/label format.

    Parameters:
        df: DataFrame containing the columns 'edited_text' and 'is_euph'.
            It is not modified.
        preprocess: whether to clean the text. When left as None, the
            module-level PREPROCESS flag from the first chunk is used.

    Returns:
        A new DataFrame with the columns 'text' and 'label', holding the
        same rows (and the same index) as df.
    """
    if preprocess is None:
        preprocess = PREPROCESS
    # rename() returns a new frame, so the column assignment below never
    # writes into a slice of the caller's data
    out = df[['edited_text', 'is_euph']].rename(columns={'edited_text': 'text', 'is_euph': 'label'})
    if preprocess:
        # lowercase, turn everything that is not a letter into a space (we're
        # removing the <> brackets too...), then collapse runs of whitespace.
        # Working on the column values instead of df.loc[i, ...] keeps this
        # correct even when index labels repeat.
        out['text'] = [" ".join(re.sub(r"[^a-z]", " ", text.lower()).split()) for text in out['text']]
    return out

## Produce multiple training splits

In [9]:
import pandas as pd
import os

# makes train-test splits BALANCED for a particular linguistic property (the PROPERTY column)

# Load the labelled corpus. The first CSV column becomes the row index; those index
# labels are used further down to remove the training rows before the test rows are drawn.
corpus = pd.read_csv(DATASET, index_col=0)

# excluding certain terms for balance
# banned_list = ['aging', 'deprived', 'gluteus maximus', 'got clean', 'let [pro] go', 'oldest profession', 'plump', 'sober', 'wealthy', 'with child']
# for term in banned_list:
#     banned_rows = corpus.loc[corpus['type'] == term]
#     # print(banned_rows.index)
#     corpus = corpus.drop(banned_rows.index)
    
# this chunk is for limiting data to high-agreement PETs
# high_agreement_PETs = ['slim', 'between jobs', 'accident', 'late', 'number one', 'sleep with', 'seasoned', 'wealthy', 'over the hill', 'plump', 'let go of', 'go all the way', 'overweight', 'sober', 'number two', 'slept with', 'dismissed', 'let them go', 'aging', 'expecting', 'stout', 'troubled', 'with child', 'invalid', 'experienced', 'getting clean', 'custodian', 'got clean', 'long sleep', 'mixed up', 'chest', 'same-sex', 'economical', 'passing on', 'neutralize', 'outspoken', 'gluteus maximus', 'sleep around', 'pass on', 'disabled', 'special needs', 'pass away', 'a certain age', 'well off', 'less fortunate', 'mistruths', 'droppings', 'lose your lunch', 'pregnancy termination', 'let him go', 'golden years', 'mentally challenged', 'tinkle', 'demise', 'drinking problem', 'indigent', 'detainee', 'advanced age', 'comfort women', 'time of the month', 'pass gas', 'portly', 'went to heaven', 'venereal disease', 'put to sleep', 'mistruth', 'differently-abled', 'intoxicated', 'economical with the truth', 'lavatory', 'birds and the bees', 'deceased', 'terminating a pregnancy', 'inebriated', 'inner city', 'regime change', 'enhanced interrogation techniques', 'adult beverages', 'to go to heaven', 'dearly departed', 'passed away', 'downsize', 'ethnic cleansing', 'substance abusers', 'broken home', 'made love', 'plus-sized', 'underprivileged', 'rear end', 'armed conflict', 'substance abuse', 'disadvantaged', 'neutralized', 'capital punishment', 'street person', 'making love', 'freedom fighters']
# corpus = corpus.loc[corpus['keyword'].isin(high_agreement_PETs)]

# this is for limiting to parallel-examples only
# corpus = corpus.loc[corpus['euph_status']=='somestimes_euph']

def _draw_train_test(group, n_train, n_test):
    """Return two disjoint random samples (train, test) drawn from the rows of group.

    The training rows are dropped by index label before the test rows are
    drawn, so a row cannot land in both samples. DataFrame.sample already
    returns the rows in random order, so no extra shuffle is needed here.
    """
    train_sample = group.sample(n_train)
    test_sample = group.drop(train_sample.index).sample(n_test)
    return train_sample, test_sample


# Split the corpus into four groups: target property positive/negative crossed with is_euph 1/0
pos_examples = corpus.loc[corpus[PROPERTY]==1] # note, "pos" and "neg" stand for "positive" and "negative" for the target property
neg_examples = corpus.loc[corpus[PROPERTY]==0]
pos_1s = pos_examples.loc[pos_examples['is_euph']==1]
pos_0s = pos_examples.loc[pos_examples['is_euph']==0]
neg_1s = neg_examples.loc[neg_examples['is_euph']==1]
neg_0s = neg_examples.loc[neg_examples['is_euph']==0]

print("# POS 1s:", len(pos_1s))
print("# POS 0s:", len(pos_0s))
print("# NEG 1s:", len(neg_1s))
print("# NEG 0s:", len(neg_0s))

# choosing sizes to balance number of 0s and 1s in experiments
constraint = min(len(pos_1s),len(pos_0s),len(neg_1s),len(neg_0s)) # constrained by smallest number of pos/neg 0s/1s
train_size = round(constraint*0.8) - round(constraint*0.8)%2 # make it an even number for easier split between pos/neg later; this is the size of each P/N-0/1 sample
test_size = constraint - train_size # how much test-set samples to take from each P/N-0/1 sample
if train_size == 0 or test_size == 0:
    # fail before anything is written instead of silently producing empty split files
    raise ValueError("The smallest group has only {} examples; cannot build non-empty train and test sets".format(constraint))
# information
s1 = "The smallest group is {} examples. Training set will have {} of each label; test set will have {} of each label".format(constraint, train_size*2, test_size*2)
s2 = "The training set will have {} total examples; the test set, {}, for a total of {} examples".format(train_size*4, test_size*4, train_size*4+test_size*4)
print(s1)
print(s2)
# compare case-insensitively so that "No" / "NO" abort as well
if ('no' in input("Ok?").lower()):
    raise KeyboardInterrupt

# os.mkdir fails if FOLDER already exists, which protects earlier results from being overwritten
os.mkdir(FOLDER)
with open(FOLDER + "/info.txt", "a") as f:
    f.write(s1 + "\n" + s2 + "\n\nCUSTOM NOTES: " + NOTES)

for x in range(0, NUM_TESTS):
    # draw a disjoint train/test pair from each of the four groups
    pairs = [_draw_train_test(group, train_size, test_size)
             for group in (pos_1s, pos_0s, neg_1s, neg_0s)]
    # merge the groups and shuffle (.sample(frac=1)) so rows are not ordered by group
    train_set = pd.concat([train_part for train_part, _ in pairs]).sample(frac=1)
    test_set = pd.concat([test_part for _, test_part in pairs]).sample(frac=1)

    # save them to folder: first all columns (indexed by corpus row), then the text/label-only version
    train_set.to_csv(FOLDER + '/train_' + str(x) + '.csv')
    train_set = hfify(train_set)
    train_set.to_csv(FOLDER + '/hf_train_' + str(x) + '.csv', index=False)
    test_set.to_csv(FOLDER + '/test_' + str(x) + '.csv')
    test_set = hfify(test_set)
    test_set.to_csv(FOLDER + '/hf_test_' + str(x) + '.csv', index=False)
print("Tests successfully output to " + FOLDER)

# POS 1s: 402
# POS 0s: 203
# NEG 1s: 442
# NEG 0s: 245
The smallest group is 203 examples. Training set will have 324 of each label; test set will have 82 of each label
The training set will have 648 total examples; the test set, 164, for a total of 812 examples


Ok? yes


Tests successfully output to TEST_5.1


In [49]:
# just verifying sizes here...
import os 
# NOTE: these assignments overwrite the FOLDER / PROPERTY values set in the first chunk,
# so any cell run after this one sees these values until the first chunk is re-run.
FOLDER = 'TEST_4.0.2'
PROPERTY = 'is_vague'

# FOLDER = 'TEST_5.0'
# PROPERTY = 'is_neu_PET'
# curr_intersection = set()
# for file in os.listdir(FOLDER):
#     if (file[0:5] != 'train'):
#         continue
#     fn = FOLDER + '/' + file
#     data = pd.read_csv(fn, index_col=0)
#     data_indices = data.index
#     if (len(curr_intersection) == 0):
#         curr_intersection = data_indices
#     else:
#         curr_intersection = curr_intersection.intersection(data_indices)
        
# print(curr_intersection)
# print(len(curr_intersection))

import pandas as pd  # this cell only imported os; importing here lets it run on its own


def _count_ambiguous(subset):
    """Count PETs in subset that occur with both is_euph labels.

    Returns a tuple (number of such PETs, total number of rows belonging to
    them). A PET is identified by its value in the 'type' column.
    """
    num_pets = 0
    num_examples = 0
    for _, sel in subset.groupby('type', sort=False):
        labels = set(sel['is_euph'])
        if 0 in labels and 1 in labels:
            num_pets += 1
            num_examples += len(sel)
    return num_pets, num_examples


for n in range(0, 10):
    print("TEST " + str(n))
    data = pd.read_csv(FOLDER + '/train_' + str(n) + '.csv', index_col=0)
    # split the training file by the property under study
    vague = data.loc[data[PROPERTY] == 1]
    nonvague = data.loc[data[PROPERTY] == 0]

    num_ambiguous_PETs, num_ambiguous_examples = _count_ambiguous(vague)
    print("NUM AMBIGUOUS PETS IN VAGUE:", num_ambiguous_PETs)
    print("NUM AMBIGUOUS EXAMPLES IN VAGUE:", num_ambiguous_examples)

    num_ambiguous_PETs, num_ambiguous_examples = _count_ambiguous(nonvague)
    print("NUM AMBIGUOUS PETS IN UNVAGUE:", num_ambiguous_PETs)
    print("NUM AMBIGUOUS EXAMPLES IN UNVAGUE:", num_ambiguous_examples)
    print()

TEST 0
NUM AMBIGUOUS PETS IN VAGUE: 13
NUM AMBIGUOUS EXAMPLES IN VAGUE: 150
NUM AMBIGUOUS PETS IN UNVAGUE: 16
NUM AMBIGUOUS EXAMPLES IN UNVAGUE: 187

TEST 1
NUM AMBIGUOUS PETS IN VAGUE: 14
NUM AMBIGUOUS EXAMPLES IN VAGUE: 151
NUM AMBIGUOUS PETS IN UNVAGUE: 14
NUM AMBIGUOUS EXAMPLES IN UNVAGUE: 168

TEST 2
NUM AMBIGUOUS PETS IN VAGUE: 14
NUM AMBIGUOUS EXAMPLES IN VAGUE: 153
NUM AMBIGUOUS PETS IN UNVAGUE: 13
NUM AMBIGUOUS EXAMPLES IN UNVAGUE: 169

TEST 3
NUM AMBIGUOUS PETS IN VAGUE: 12
NUM AMBIGUOUS EXAMPLES IN VAGUE: 140
NUM AMBIGUOUS PETS IN UNVAGUE: 16
NUM AMBIGUOUS EXAMPLES IN UNVAGUE: 186

TEST 4
NUM AMBIGUOUS PETS IN VAGUE: 13
NUM AMBIGUOUS EXAMPLES IN VAGUE: 138
NUM AMBIGUOUS PETS IN UNVAGUE: 10
NUM AMBIGUOUS EXAMPLES IN UNVAGUE: 157

TEST 5
NUM AMBIGUOUS PETS IN VAGUE: 14
NUM AMBIGUOUS EXAMPLES IN VAGUE: 154
NUM AMBIGUOUS PETS IN UNVAGUE: 13
NUM AMBIGUOUS EXAMPLES IN UNVAGUE: 173

TEST 6
NUM AMBIGUOUS PETS IN VAGUE: 14
NUM AMBIGUOUS EXAMPLES IN VAGUE: 152
NUM AMBIGUOUS PETS IN UN

## Produce multiple, CUSTOM-SIZED splits

In [9]:
import pandas as pd
import os

# specify numbers here: total (train + test) examples to draw from each group; each total is split 80/20 below
tot_num_pos_1s, tot_num_neg_1s = 400, 400 # 'pos' and 'neg' refer to the property of interest; e.g., 'is_vague'
tot_num_pos_0s, tot_num_neg_0s = 200, 200 # '1s' / '0s' refer to the is_euph label

# makes train-test splits with the CUSTOM group sizes above (label counts follow those numbers rather than being equalized)

# Load the labelled corpus; the first CSV column becomes the row index
corpus = pd.read_csv(DATASET, index_col=0)

# this chunk is for limiting data to high-agreement PETs
# high_agreement_PETs = ['slim', 'between jobs', 'accident', 'late', 'number one', 'sleep with', 'seasoned', 'wealthy', 'over the hill', 'plump', 'let go of', 'go all the way', 'overweight', 'sober', 'number two', 'slept with', 'dismissed', 'let them go', 'aging', 'expecting', 'stout', 'troubled', 'with child', 'invalid', 'experienced', 'getting clean', 'custodian', 'got clean', 'long sleep', 'mixed up', 'chest', 'same-sex', 'economical', 'passing on', 'neutralize', 'outspoken', 'gluteus maximus', 'sleep around', 'pass on', 'disabled', 'special needs', 'pass away', 'a certain age', 'well off', 'less fortunate', 'mistruths', 'droppings', 'lose your lunch', 'pregnancy termination', 'let him go', 'golden years', 'mentally challenged', 'tinkle', 'demise', 'drinking problem', 'indigent', 'detainee', 'advanced age', 'comfort women', 'time of the month', 'pass gas', 'portly', 'went to heaven', 'venereal disease', 'put to sleep', 'mistruth', 'differently-abled', 'intoxicated', 'economical with the truth', 'lavatory', 'birds and the bees', 'deceased', 'terminating a pregnancy', 'inebriated', 'inner city', 'regime change', 'enhanced interrogation techniques', 'adult beverages', 'to go to heaven', 'dearly departed', 'passed away', 'downsize', 'ethnic cleansing', 'substance abusers', 'broken home', 'made love', 'plus-sized', 'underprivileged', 'rear end', 'armed conflict', 'substance abuse', 'disadvantaged', 'neutralized', 'capital punishment', 'street person', 'making love', 'freedom fighters']
# corpus = corpus.loc[corpus['keyword'].isin(high_agreement_PETs)]

# this is for limiting to parallel-examples only
# corpus = corpus.loc[corpus['euph_status']=='somestimes_euph']

def _draw_train_test(group, n_train, n_test):
    """Return two disjoint random samples (train, test) drawn from the rows of group.

    The training rows are dropped by index label before the test rows are
    drawn, so a row cannot land in both samples. DataFrame.sample already
    returns the rows in random order, so no extra shuffle is needed here.
    """
    train_sample = group.sample(n_train)
    test_sample = group.drop(train_sample.index).sample(n_test)
    return train_sample, test_sample


# Split the corpus into four groups: target property positive/negative crossed with is_euph 1/0
pos_examples = corpus.loc[corpus[PROPERTY]==1] # note, "pos" and "neg" stand for "positive" and "negative" for the target property
neg_examples = corpus.loc[corpus[PROPERTY]==0]
pos_1s = pos_examples.loc[pos_examples['is_euph']==1]
pos_0s = pos_examples.loc[pos_examples['is_euph']==0]
neg_1s = neg_examples.loc[neg_examples['is_euph']==1]
neg_0s = neg_examples.loc[neg_examples['is_euph']==0]

print("# POS 1s:", len(pos_1s))
print("# POS 0s:", len(pos_0s))
print("# NEG 1s:", len(neg_1s))
print("# NEG 0s:", len(neg_0s))

# Check the requested totals against what the corpus offers BEFORE creating FOLDER.
# Otherwise .sample() fails halfway through, leaving a folder behind that makes the
# next run fail on os.mkdir.
for group_name, wanted, group in (("POS 1s", tot_num_pos_1s, pos_1s),
                                  ("POS 0s", tot_num_pos_0s, pos_0s),
                                  ("NEG 1s", tot_num_neg_1s, neg_1s),
                                  ("NEG 0s", tot_num_neg_0s, neg_0s)):
    if wanted > len(group):
        raise ValueError("Requested {} {} but the corpus only has {}".format(wanted, group_name, len(group)))

# determine train and test sizes (80% train, remainder test, per group)
train_pos_1s, train_neg_1s = round(tot_num_pos_1s*0.8), round(tot_num_neg_1s*0.8)
train_pos_0s, train_neg_0s = round(tot_num_pos_0s*0.8), round(tot_num_neg_0s*0.8)
test_pos_1s, test_neg_1s = tot_num_pos_1s-train_pos_1s, tot_num_neg_1s-train_neg_1s
test_pos_0s, test_neg_0s = tot_num_pos_0s-train_pos_0s, tot_num_neg_0s-train_neg_0s

# information
s1 = "TRAIN: There will be {} positive 1s, {} negative 1s, {} positive 0s, and {} negative 0s.".format(train_pos_1s, train_neg_1s, train_pos_0s, train_neg_0s)
s2 = "TEST: There will be {} positive 1s, {} negative 1s, {} positive 0s, and {} negative 0s.".format(test_pos_1s, test_neg_1s, test_pos_0s, test_neg_0s)
print(s1)
print(s2)
# compare case-insensitively so that "No" / "NO" abort as well
if ('no' in input("Ok?").lower()):
    raise KeyboardInterrupt

# os.mkdir fails if FOLDER already exists, which protects earlier results from being overwritten
os.mkdir(FOLDER)
with open(FOLDER + "/info.txt", "a") as f:
    f.write(s1 + "\n" + s2 + "\n\nCUSTOM NOTES: " + NOTES)

# (group, train size, test size) for each of the four groups
sampling_plan = (
    (pos_1s, train_pos_1s, test_pos_1s),
    (pos_0s, train_pos_0s, test_pos_0s),
    (neg_1s, train_neg_1s, test_neg_1s),
    (neg_0s, train_neg_0s, test_neg_0s),
)

for x in range(0, NUM_TESTS):
    # draw a disjoint train/test pair from each group
    pairs = [_draw_train_test(group, n_train, n_test) for group, n_train, n_test in sampling_plan]
    # merge the groups and shuffle (.sample(frac=1)) so rows are not ordered by group
    train_set = pd.concat([train_part for train_part, _ in pairs]).sample(frac=1)
    test_set = pd.concat([test_part for _, test_part in pairs]).sample(frac=1)

    # save them to folder: first all columns (indexed by corpus row), then the text/label-only version
    train_set.to_csv(FOLDER + '/train_' + str(x) + '.csv')
    train_set = hfify(train_set)
    train_set.to_csv(FOLDER + '/hf_train_' + str(x) + '.csv', index=False)
    test_set.to_csv(FOLDER + '/test_' + str(x) + '.csv')
    test_set = hfify(test_set)
    test_set.to_csv(FOLDER + '/hf_test_' + str(x) + '.csv', index=False)
print("Tests successfully output to " + FOLDER)

# POS 1s: 401
# POS 0s: 368
# NEG 1s: 981
# NEG 0s: 215
TRAIN: There will be 320 positive 1s, 320 negative 1s, 160 positive 0s, and 160 negative 0s.
TEST: There will be 80 positive 1s, 80 negative 1s, 40 positive 0s, and 40 negative 0s.


Ok? yes


Tests successfully output to TEST_4.2.1


## Temporary - make customized splits

In [6]:
import pandas as pd
import os

# makes train-test splits, restricting train/test to a particular linguistic property
# (separate property-positive and property-negative sets, plus a mixed test set)

# Load the labelled corpus; the first CSV column becomes the row index
corpus = pd.read_csv(DATASET, index_col=0)
# this chunk is for limiting data to high-agreement PETs
# high_agreement_PETs = ['slim', 'between jobs', 'accident', 'late', 'number one', 'sleep with', 'seasoned', 'wealthy', 'over the hill', 'plump', 'let go of', 'go all the way', 'overweight', 'sober', 'number two', 'slept with', 'dismissed', 'let them go', 'aging', 'expecting', 'stout', 'troubled', 'with child', 'invalid', 'experienced', 'getting clean', 'custodian', 'got clean', 'long sleep', 'mixed up', 'chest', 'same-sex', 'economical', 'passing on', 'neutralize', 'outspoken', 'gluteus maximus', 'sleep around', 'pass on', 'disabled', 'special needs', 'pass away', 'a certain age', 'well off', 'less fortunate', 'mistruths', 'droppings', 'lose your lunch', 'pregnancy termination', 'let him go', 'golden years', 'mentally challenged', 'tinkle', 'demise', 'drinking problem', 'indigent', 'detainee', 'advanced age', 'comfort women', 'time of the month', 'pass gas', 'portly', 'went to heaven', 'venereal disease', 'put to sleep', 'mistruth', 'differently-abled', 'intoxicated', 'economical with the truth', 'lavatory', 'birds and the bees', 'deceased', 'terminating a pregnancy', 'inebriated', 'inner city', 'regime change', 'enhanced interrogation techniques', 'adult beverages', 'to go to heaven', 'dearly departed', 'passed away', 'downsize', 'ethnic cleansing', 'substance abusers', 'broken home', 'made love', 'plus-sized', 'underprivileged', 'rear end', 'armed conflict', 'substance abuse', 'disadvantaged', 'neutralized', 'capital punishment', 'street person', 'making love', 'freedom fighters']
# corpus = corpus.loc[corpus['keyword'].isin(high_agreement_PETs)]

def _draw_train_test(group, n_train, n_test):
    """Return two disjoint random samples (train, test) drawn from the rows of group.

    The training rows are dropped by index label before the test rows are
    drawn, so a row cannot land in both samples. DataFrame.sample already
    returns the rows in random order, so no extra shuffle is needed here.
    """
    train_sample = group.sample(n_train)
    test_sample = group.drop(train_sample.index).sample(n_test)
    return train_sample, test_sample


def _save_split(frame, folder, kind, x):
    """Write frame to folder twice: '<kind>_<x>.csv' with all columns and the
    corpus index, and 'hf_<kind>_<x>.csv' with only the hfify() text/label columns."""
    frame.to_csv(folder + '/' + kind + '_' + str(x) + '.csv')
    hfify(frame).to_csv(folder + '/hf_' + kind + '_' + str(x) + '.csv', index=False)


# Split the corpus into four groups: target property positive/negative crossed with is_euph 1/0
pos_examples = corpus.loc[corpus[PROPERTY]==1] # note, "pos" and "neg" stand for "positive" and "negative" for the target property
neg_examples = corpus.loc[corpus[PROPERTY]==0]
pos_1s = pos_examples.loc[pos_examples['is_euph']==1]
pos_0s = pos_examples.loc[pos_examples['is_euph']==0]
neg_1s = neg_examples.loc[neg_examples['is_euph']==1]
neg_0s = neg_examples.loc[neg_examples['is_euph']==0]

print("# POS 1s:", len(pos_1s))
print("# POS 0s:", len(pos_0s))
print("# NEG 1s:", len(neg_1s))
print("# NEG 0s:", len(neg_0s))

# choosing sizes to balance number of 0s and 1s in experiments
constraint = min(len(pos_1s),len(pos_0s),len(neg_1s),len(neg_0s)) # constrained by smallest number of pos/neg 0s/1s
train_size = round(constraint*0.8) - round(constraint*0.8)%2 # make it an even number for easier split between pos/neg later; this is the size of each P/N-0/1 sample
test_size = constraint - train_size # how much test-set samples to take from each P/N-0/1 sample
if train_size == 0 or test_size == 0:
    # fail before anything is written instead of silently producing empty split files
    raise ValueError("The smallest group has only {} examples; cannot build non-empty train and test sets".format(constraint))
# information
# NOTE(review): these messages count the pos and neg groups together, while the files
# written below keep pos-only and neg-only sets separate -- confirm the wording is intended.
s1 = "The smallest group is {} examples. Training set will have {} of each label; test set will have {} of each label".format(constraint, train_size*2, test_size*2)
s2 = "The training set will have {} total examples; the test set, {}, for a total of {} examples".format(train_size*4, test_size*4, train_size*4+test_size*4)
print(s1)
print(s2)
# make directories (os.mkdir fails if FOLDER already exists, so earlier results are not overwritten)
os.mkdir(FOLDER)
POS_TRAIN_FOLDER = FOLDER + '/pos_train'
NEG_TRAIN_FOLDER = FOLDER + '/neg_train'
POS_TEST_FOLDER = FOLDER + '/pos_test'
NEG_TEST_FOLDER = FOLDER + '/neg_test'
MIXED_TEST_FOLDER = FOLDER + '/mixed_test'
for subfolder in (POS_TRAIN_FOLDER, NEG_TRAIN_FOLDER, POS_TEST_FOLDER, NEG_TEST_FOLDER, MIXED_TEST_FOLDER):
    os.mkdir(subfolder)

# info
with open(FOLDER + "/info.txt", "a") as f:
    f.write(s1 + "\n" + s2 + "\n\nCUSTOM NOTES: " + NOTES)

for x in range(0, NUM_TESTS):
    # draw a disjoint train/test pair from each of the four groups
    pos_1s_train_sample, pos_1s_test_sample = _draw_train_test(pos_1s, train_size, test_size)
    pos_0s_train_sample, pos_0s_test_sample = _draw_train_test(pos_0s, train_size, test_size)
    neg_1s_train_sample, neg_1s_test_sample = _draw_train_test(neg_1s, train_size, test_size)
    neg_0s_train_sample, neg_0s_test_sample = _draw_train_test(neg_0s, train_size, test_size)

    # make restricted training sets (.sample(frac=1) shuffles the rows)
    pos_only_train_set = pd.concat([pos_1s_train_sample, pos_0s_train_sample]).sample(frac=1)
    _save_split(pos_only_train_set, POS_TRAIN_FOLDER, 'train', x)

    neg_only_train_set = pd.concat([neg_1s_train_sample, neg_0s_train_sample]).sample(frac=1)
    _save_split(neg_only_train_set, NEG_TRAIN_FOLDER, 'train', x)

    # make restricted test sets
    pos_only_test_set = pd.concat([pos_1s_test_sample, pos_0s_test_sample]).sample(frac=1)
    _save_split(pos_only_test_set, POS_TEST_FOLDER, 'test', x)

    neg_only_test_set = pd.concat([neg_1s_test_sample, neg_0s_test_sample]).sample(frac=1)
    _save_split(neg_only_test_set, NEG_TEST_FOLDER, 'test', x)

    # mixed test set: a random half of the pos-only test rows plus a random half of the neg-only test rows
    # NOTE(review): the halves are drawn without regard to is_euph, so the 1/0 balance
    # of the mixed set is not guaranteed -- confirm this is acceptable.
    mixed_test_set = pd.concat([pos_only_test_set.sample(frac=0.5), neg_only_test_set.sample(frac=0.5)]).sample(frac=1)
    _save_split(mixed_test_set, MIXED_TEST_FOLDER, 'test', x)
print("Tests successfully output to " + FOLDER)

# POS 1s: 401
# POS 0s: 368
# NEG 1s: 981
# NEG 0s: 215
The smallest group is 215 examples. Training set will have 344 of each label; test set will have 86 of each label
The training set will have 688 total examples; the test set, 172, for a total of 860 examples
Tests successfully output to TEST_2.3


## Analyze `results.csv`

In [51]:
# Code to analyze results (results.csv must be in the TEST folder)
import ast
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# FOLDER = 'TEST_0.0' # the directory containing the train/test files and the results
# PROPERTY = 'is_neg_PET' # linguistic property being investigated
FOLDER = 'TEST_4.0.2'#  + '/NEG-MIXED-EXPERIMENT'
# FOLDER = '../Euphemisms/BERT_Trainer/Bracket_Test/Without_Brackets'
PROPERTY = 'is_vague'
# FOLDER = 'TEST_5.0'
# PROPERTY = 'is_neu_PET'
# NOTE: the two assignments above overwrite the FOLDER / PROPERTY values set in the first chunk
df = pd.read_csv(FOLDER + '/results_base.csv', index_col=0)

# '-1' columns are computed on the property-positive test rows, '-0' columns on the property-negative ones
results = pd.DataFrame(columns=['F1', 'P', 'R', 'tn', 'fp', 'fn', 'tp', 'A-1', 'F1-1', 'P-1', 'R-1', 'A-0', 'F1-0', 'P-0', 'R-0'])


def _subset_metrics(labels, preds):
    """Return [accuracy, macro F1, macro precision, macro recall] for one group of examples."""
    return [
        accuracy_score(labels, preds),
        f1_score(labels, preds, average='macro'),
        precision_score(labels, preds, average='macro'),
        recall_score(labels, preds, average='macro'),
    ]


# for each test, select the row with the best F1, then evaluate separate F1s for pos vs neg examples
for x in range(0, 10):
    # rows labelled 10*x .. 10*x+9 are taken as test x
    # (assumes results_base.csv holds ten consecutive rows per test -- TODO confirm)
    test = df.loc[10*x:10*x+9]
    max_f1 = test.loc[test['f1'].idxmax()] # this is the best row from this test

    best_preds = max_f1['preds'].replace(' ', ', ') # the labels don't have a comma between them...
    best_preds = ast.literal_eval(best_preds)
    # read without index_col, so ref_df.index is the row position and lines up with best_preds
    ref_df = pd.read_csv(FOLDER + '/test_' + str(x) + '.csv')
    if len(best_preds) != len(ref_df):
        # a length mismatch would pair predictions with the wrong rows
        raise ValueError("Test {}: {} predictions but {} rows in the test file".format(x, len(best_preds), len(ref_df)))

    # approach: get row IDs of pos/neg rows from test file, then pick the preds at the corresponding indices for evaluation
    pos_examples = ref_df.loc[ref_df[PROPERTY]==1]
    neg_examples = ref_df.loc[ref_df[PROPERTY]==0]

    # get predictions
    pos_preds = [best_preds[i] for i in pos_examples.index]
    neg_preds = [best_preds[i] for i in neg_examples.index]
    # get labels
    pos_labels = pos_examples['is_euph'].tolist()
    neg_labels = neg_examples['is_euph'].tolist()

    # add to an overall dataframe
    # take the base stats from the best row: its first seven values, by position
    # (presumably F1, P, R, tn, fp, fn, tp in that order -- verify against results_base.csv)
    stats = max_f1.iloc[0:7].tolist()
    stats.extend(_subset_metrics(pos_labels, pos_preds)) # add on the pos/neg ones
    stats.extend(_subset_metrics(neg_labels, neg_preds))
    results.loc[len(results.index)] = stats
results.loc['AVG'] = results.mean()
results

Unnamed: 0,F1,P,R,tn,fp,fn,tp,A-1,F1-1,P-1,R-1,A-0,F1-0,P-0,R-0
0,0.862857,0.896104,0.821429,76.0,8.0,15.0,69.0,0.928571,0.928531,0.929545,0.928571,0.797619,0.795269,0.811943,0.797619
1,0.821023,0.855263,0.77381,73.0,11.0,19.0,65.0,0.845238,0.84504,0.847009,0.845238,0.797619,0.794088,0.819537,0.797619
2,0.851059,0.873418,0.821429,74.0,10.0,15.0,69.0,0.857143,0.857062,0.857955,0.857143,0.845238,0.844156,0.855102,0.845238
3,0.821327,0.8375,0.797619,71.0,13.0,17.0,67.0,0.880952,0.880682,0.884439,0.880952,0.761905,0.761905,0.761905,0.761905
4,0.814945,0.853333,0.761905,73.0,11.0,20.0,64.0,0.869048,0.869029,0.869257,0.869048,0.761905,0.759725,0.771765,0.761905
5,0.815313,0.835443,0.785714,71.0,13.0,18.0,66.0,0.845238,0.844156,0.855102,0.845238,0.785714,0.78125,0.811111,0.785714
6,0.79759,0.790698,0.809524,66.0,18.0,16.0,68.0,0.809524,0.808547,0.815972,0.809524,0.785714,0.785227,0.78833,0.785714
7,0.803564,0.807229,0.797619,68.0,16.0,17.0,67.0,0.785714,0.785593,0.786364,0.785714,0.821429,0.821201,0.823077,0.821429
8,0.821429,0.821429,0.821429,69.0,15.0,15.0,69.0,0.845238,0.84504,0.847009,0.845238,0.797619,0.797361,0.799145,0.797619
9,0.833239,0.85,0.809524,72.0,12.0,16.0,68.0,0.869048,0.86888,0.87094,0.869048,0.797619,0.796204,0.806122,0.797619


## Alternative analyses

In [7]:
# Code to analyze performances during training (output vague/unvague scores per row)
import ast
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Per-row breakdown of euphemism-detection performance by PROPERTY value.
# For every row of results_base.csv, the stored predictions are compared with
# the matching test split, separately for examples where PROPERTY == 1 ("vague")
# and PROPERTY == 0 ("unvague"). Macro-F1 and error counts are written back to
# `df`, and per-corpus-index misclassification tallies are accumulated.
FOLDER = 'TEST_4.0.2'
PROPERTY = 'is_vague'
ROWS_PER_SPLIT = 10  # result row i is evaluated against test_{i // ROWS_PER_SPLIT}.csv

df = pd.read_csv(FOLDER + '/results_base.csv', index_col=0)
# -1 marks "not computed yet". The F1 columns are created as floats so that the
# float scores assigned below do not trigger an incompatible-dtype upcast.
df['num_vague_incorrect'] = -1
df['num_unvague_incorrect'] = -1
df['f1_vague'] = -1.0
df['f1_unvague'] = -1.0

incorrect_pos_indices = {}  # corpus index -> times misclassified (PROPERTY == 1)
incorrect_neg_indices = {}  # corpus index -> times misclassified (PROPERTY == 0)


def _parse_preds(raw):
    """Turn a stringified prediction array such as '[1 0 1]' into a list.

    Tolerates repeated whitespace, line breaks and comma separators.
    """
    tokens = raw.replace(',', ' ').strip().strip('[]').split()
    return ast.literal_eval('[' + ', '.join(tokens) + ']')


def _score_subset(examples, preds, error_tally):
    """Score the euphemism predictions for one subset of a test split.

    Args:
        examples: rows of the test split belonging to the subset; they must
            keep the split's positional index and carry the 'is_euph' and
            'corpus_index' columns.
        preds: predicted labels for the whole split, ordered like its rows.
        error_tally: dict mapping corpus index -> misclassification count;
            updated in place.

    Returns:
        Tuple (macro_f1, num_incorrect) for the subset.
    """
    labels = examples['is_euph'].tolist()
    subset_preds = [preds[j] for j in examples.index.tolist()]
    is_wrong = np.asarray(labels) != np.asarray(subset_preds)
    for corpus_index, wrong in zip(examples['corpus_index'].tolist(), is_wrong):
        if wrong:
            error_tally[corpus_index] = error_tally.get(corpus_index, 0) + 1
    macro_f1 = f1_score(labels, subset_preds, average='macro')
    return macro_f1, int(is_wrong.sum())


test_splits = {}  # split number -> test DataFrame, so each file is read only once

for i in df.index:
    preds = _parse_preds(df.loc[i, 'preds'])

    num_test = i // ROWS_PER_SPLIT
    if num_test not in test_splits:
        split = pd.read_csv(FOLDER + '/test_' + str(num_test) + '.csv')
        # The unnamed first column is used as the example's corpus index.
        test_splits[num_test] = split.rename(columns={"Unnamed: 0": "corpus_index"})
    ref_df = test_splits[num_test]

    # The positional index of ref_df lines up with positions in `preds`, so the
    # subsets keep that index and _score_subset uses it to pick predictions.
    pos_examples = ref_df.loc[ref_df[PROPERTY] == 1]
    neg_examples = ref_df.loc[ref_df[PROPERTY] == 0]

    pos_f1, num_pos_wrong = _score_subset(pos_examples, preds, incorrect_pos_indices)
    neg_f1, num_neg_wrong = _score_subset(neg_examples, preds, incorrect_neg_indices)

    df.loc[i, 'f1_vague'] = pos_f1
    df.loc[i, 'f1_unvague'] = neg_f1
    df.loc[i, 'num_vague_incorrect'] = num_pos_wrong
    df.loc[i, 'num_unvague_incorrect'] = num_neg_wrong

# Tallies ordered from least to most frequently misclassified.
pos_incorrect = dict(sorted(incorrect_pos_indices.items(), key=lambda item: item[1]))
neg_incorrect = dict(sorted(incorrect_neg_indices.items(), key=lambda item: item[1]))
df

Unnamed: 0,f1,precision,recall,tn,fp,fn,tp,preds,num_vague_incorrect,num_unvague_incorrect,f1_vague,f1_unvague
0,0.704554,0.669903,0.821429,50,34,15,69,[1 1 0 0 0 1 0 0 0 1 0 0 0 1 1 1 1 1 1 1 1 1 0...,23,26,0.719471,0.688889
1,0.772228,0.828571,0.690476,72,12,26,58,[0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 1 1 0 0 1 1 0...,17,21,0.797361,0.745638
2,0.791482,0.810127,0.761905,69,15,20,64,[0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 1 1 0 0 1 1 0...,15,20,0.821201,0.759725
3,0.832168,0.900000,0.750000,77,7,21,63,[0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 1 0...,11,17,0.868880,0.794088
4,0.843441,0.939394,0.738095,80,4,22,62,[0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 1 0...,8,18,0.904545,0.779592
...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.809416,0.795455,0.833333,66,18,14,70,[0 1 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 1 1 0 1 0 1...,13,19,0.844156,0.773521
96,0.815417,0.804598,0.833333,67,17,14,70,[0 1 0 0 1 0 1 1 1 0 0 1 1 0 0 0 0 1 1 0 1 0 1...,12,19,0.856410,0.773521
97,0.833239,0.850000,0.809524,72,12,16,68,[0 1 0 0 1 0 1 1 1 0 0 1 1 0 0 0 0 1 1 0 1 0 1...,11,17,0.868880,0.796204
98,0.797159,0.771739,0.845238,63,21,13,71,[0 1 0 0 1 0 1 1 1 0 0 1 1 0 0 0 0 1 1 0 1 0 1...,13,21,0.843441,0.749965


In [8]:
import ast
import pandas as pd

# Load the corpus and record, per corpus row, how often it was misclassified.
# Rows that never appear in either tally keep a frequency of 0.
df = pd.read_csv("VET_Corpus_v0.2.csv", index_col=0)
df['freq'] = 0

# Both tallies map corpus index -> error count; the vague (pos) tally is
# applied first, then the non-vague (neg) one.
for error_counts in (pos_incorrect, neg_incorrect):
    for corpus_index, n_errors in error_counts.items():
        df.loc[corpus_index, 'freq'] = int(n_errors)

df

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,is_vague,freq
0,tinkle,We're just getting back what was TAKEN from us...,1,body functions/parts,tinkle,always_euph,We're just getting back what was TAKEN from us...,0,8
1,tinkle,I think AB390 will pass next year now that the...,1,body functions/parts,tinkle,always_euph,I think AB390 will pass next year now that the...,0,17
2,undocumented immigrants,"Singled Out Think Like a Man, the new movie ba...",1,politics,undocumented immigrant,always_euph,Anything but Secure A federal program designed...,0,0
3,undocumented immigrants,"Not to be outdone, Sen. Rand Paul (R-Ky. ), so...",1,politics,undocumented immigrant,always_euph,In a post-election interview with POLITICO Pau...,0,0
4,undocumented immigrants,The law has also galvanized the growing immigr...,1,politics,undocumented immigrant,always_euph,Aside from undocumented immigrants the America...,0,0
...,...,...,...,...,...,...,...,...,...
1960,sleep with,There were other photos she wanted me to see: ...,0,sexual activity,sleep with,sometimes_euph,There were other photos she wanted me to see B...,0,1
1961,sleep with,I am relieved to see two pup tents marked STAF...,0,sexual activity,sleep with,sometimes_euph,Thank God I don't have to sleep with Ace Wands,0,0
1962,sleep around,"Nothing serious, just long nights of me hackin...",0,sexual activity,sleep around,sometimes_euph,With all my caterwauling it's a wonder anyone ...,0,2
1963,with child,sounds more like Jonestown. They cant leave @ ...,0,physical/mental attributes,with child,sometimes_euph,They cant leave best advice I can give them is...,0,9


In [9]:
# Write the current `df` (index included) to a CSV in the working directory.
# NOTE(review): the file name hard-codes the experiment/version tag; presumably
# it should match FOLDER from the analysis cell - confirm before re-running.
df.to_csv("Vagueness_Errors_4.0.2_v1.csv")