# Vagueness Experiments

In [2]:
import pandas as pd
import numpy as np

In [1]:
# This function, HuggingFace-ify, takes a sample of euphemism corpus and it makes into an appropriate format for the HuggingFace Trainer class
def hfify(df):
    df = df.drop(['keyword', 'category', 'type', 'euph_status', 'sentence', 'is_vague'], axis=1)
    df = df.rename(columns={'edited_text':'text', 'is_euph':'label'})
    return df

## Annotation Task

### Creating Annotation Samples

#### Pilot Sample (200)

In [2]:
euph_corpus = pd.read_csv('Euphemism_Corpus_2-24.csv', index_col = 0, encoding= 'utf-8')
always_euphs = pd.read_csv('always_euphs.csv', index_col = 0, encoding= 'utf-8') # 71 PETs here
sometimes_euphs = pd.read_csv('sometimes_euphs.csv', index_col = 0, encoding= 'utf-8') # 58 PETs here

In [80]:
# this pilot sample will be formed by taking one example of each always_euph and one example of a 0 and 1 for each sometimes_euph
# this will total 71 + 58*2 = 187 rows
selected = [] # will store the indices of rows to select for the sample

for PET in always_euphs['type']:
    matches = euph_corpus[euph_corpus['type'] == PET]
    selection = matches.sample(n=1).index # randomly select an index
    selected.append(selection)

for PET in sometimes_euphs['type']:
    lit_matches = euph_corpus[(euph_corpus['type'] == PET) & (euph_corpus['is_euph'] == 0)]
    euph_matches = euph_corpus[(euph_corpus['type'] == PET) & (euph_corpus['is_euph'] == 1)]
    lit_selection = lit_matches.sample(n=1).index # randomly select an index
    euph_selection = euph_matches.sample(n=1).index
    selected.append(lit_selection)
    selected.append(euph_selection)

# idk, some number wrangling to convert from .index, which returns Int64 lists/
selected = np.array(selected).tolist()
selected = [item for sublist in selected for item in sublist]

# print(selected)
euph_corpus = euph_corpus[euph_corpus.index.isin(selected)]
display(euph_corpus)
print(len(euph_corpus))

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence
1,tinkle,I think AB390 will pass next year now that the...,1,bodily functions,tinkle,always_euph,I think AB390 will pass next year now that the...
4,undocumented immigrants,The law has also galvanized the growing immigr...,1,politics,undocumented immigrant,always_euph,Aside from undocumented immigrants the America...
26,venereal diseases,"By 1928, there was a decided colonial response...",1,sexual activity,venereal disease,always_euph,Mass surveys movement restrictions monitoring ...
31,sex worker,"In fact, all the <sex worker> parents I know s...",1,sexual activity,sex worker,always_euph,In fact all the sex worker parents I know say ...
54,mentally disabled,Few battles are truly worth fighting. Stand up...,1,physical/mental attributes,mentally disabled,always_euph,Stand up for those who can not stand up for th...
...,...,...,...,...,...,...,...
1952,seeing each other,The guy I date said he loves me and we <seeing...,0,sexual activity,seeing someone/each other,sometimes_euph,The guy I date said he loves me and we seeing ...
1955,seasoned,More <seasoned> Afghan officials say they are ...,0,physical/mental attributes,seasoned,sometimes_euph,More seasoned Afghan officials say they are cl...
1960,sleep with,There were other photos she wanted me to see: ...,0,sexual activity,sleep with,sometimes_euph,There were other photos she wanted me to see B...
1962,sleep around,"Nothing serious, just long nights of me hackin...",0,sexual activity,sleep around,sometimes_euph,With all my caterwauling it's a wonder anyone ...


187


In [81]:
euph_corpus.to_csv('Pilot_Annotation_Sample.csv')

#### Full Samples (Mech Turk)

In [16]:
# this mech turk sample will be formed by taking 1/3 of the corpus (655 examples) each
# two samples will have 461 1's and 194 0's, while the other will have 462 1s and 193 0's
euph_corpus = pd.read_csv('Euphemism_Corpus_2-24.csv', index_col = 0, encoding= 'utf-8')

euph_corpus_1s = euph_corpus.loc[euph_corpus['is_euph'] == 1]
euph_corpus_1s = euph_corpus_1s.sample(frac=1) # idiomatic way of scrambling the rows
euph_result = np.array_split(euph_corpus_1s, 3)  

euph_corpus_0s = euph_corpus.loc[euph_corpus['is_euph'] == 0]
euph_corpus_0s = euph_corpus_0s.sample(frac=1) # idiomatic way of scrambling the rows
lit_result = np.array_split(euph_corpus_0s, 3)  

# combine each euph chunk (size 461, 461, 460) with each lit chunk (size 195, 194, 194) and shuffle
sample_1 = pd.concat([euph_result[0], lit_result[1]]).sample(frac=1)
sample_2 = pd.concat([euph_result[1], lit_result[2]]).sample(frac=1)
sample_3 = pd.concat([euph_result[2], lit_result[0]]).sample(frac=1)

In [17]:
display(sample_1)
display(sample_2)
display(sample_3)

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence
1535,slim,Given the veritable spec sheet overload that S...,0,physical/mental attributes,slim,sometimes_euph,In terms of the pursuit of the absolute slimme...
1924,late,Handling was responsive but not squirrelly on ...,0,death,late,sometimes_euph,And it managed to do all this while remaining ...
1080,underdeveloped,We should pursue people-centered cultural crea...,1,politics,underdeveloped,sometimes_euph,We should be community-and people-focused in o...
1572,collateral damage,Afghan civilians in proximity to alleged milit...,0,death,collateral damage,sometimes_euph,Afghan civilians in proximity to alleged milit...
1235,laid off,"""Let's also repeal laws which encourage sendin...",1,employment,lay off,sometimes_euph,Those who still send American jobs overseas ou...
...,...,...,...,...,...,...,...
721,droppings,Only the freshest and best Ice Cream you'll ev...,1,bodily functions,droppings,always_euph,The line for a half hour once only for him to ...
806,perished,He died in his car. Koehler suffered cardiac a...,1,death,perish,sometimes_euph,Koehler suffered cardiac arrest and perished s...
1454,overweight,"It depends. If your child is <overweight>, it'...",0,physical/mental attributes,overweight,sometimes_euph,If your child is overweight it's not a
183,elderly,"""Children 2 through 12 and their parents are i...",1,physical/mental attributes,elderly,always_euph,Children 2 through 12 and their parents are in...


Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence
1902,late,"And in a stark warning, Hunt warned that it ma...",0,death,late,sometimes_euph,And in a stark warning Hunt warned that it may...
110,detainees,It's not just that torture and other violation...,1,politics,detainee,always_euph,It's not just that torture and other violation...
1479,aging,"When we first meet him, he's undergoing psycho...",0,physical/mental attributes,aging,sometimes_euph,On a symbolic level it's about why Van Damme c...
15,undocumented immigrants,Her primary rhetorical strategy centers around...,1,politics,undocumented immigrant,always_euph,People disgusted with Obama's enthusiasm for k...
617,pro-choice,We must remember that the first question is no...,1,politics,pro-choice,always_euph,The reason why people are Pro-choice rather th...
...,...,...,...,...,...,...,...
1962,sleep around,"Nothing serious, just long nights of me hackin...",0,sexual activity,sleep around,sometimes_euph,With all my caterwauling it's a wonder anyone ...
411,capital punishment,"but not every homicide is a murder. ""Homicide ...",1,death,capital punishment,always_euph,Killing of one person by another by accident s...
481,senior citizens,"Saturday November 3, 2012, 2:53 pm SOME KIND O...",1,physical/mental attributes,senior citizen,always_euph,by leaving all those destroyed senior citizens...
1571,collateral damage,"In the current New Yorker sub req'd, Michael S...",0,death,collateral damage,sometimes_euph,It can happen both through indiscriminate use ...


Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence
1008,go all the way,Lets <go all the way> (Lets go all the way ) L...,1,sexual activity,go all the way,sometimes_euph,Lets go all the way Lets go all the way Lets g...
1758,economical,"he is a political head of them, or in such sen...",0,employment,economical,sometimes_euph,he is a political head of them or in such sens...
256,income inequality,Here is an older paper which finds that change...,1,employment,income inequality,always_euph,Here is an older paper which finds that change...
1141,intoxicated,- Be <intoxicated> during their time with the ...,1,substances,intoxicated,sometimes_euph,Be intoxicated during their time with the virgin
873,aging,"At first glance, raising the retirement age se...",1,physical/mental attributes,aging,sometimes_euph,At first glance raising the retirement age see...
...,...,...,...,...,...,...,...
851,overweight,Exercise regularly and maintain a healthy body...,1,physical/mental attributes,overweight,sometimes_euph,Lose weight if you are overweight but avoid lo...
1699,special needs,She's still in the hospital. Baby Nozomi is st...,0,physical/mental attributes,special needs,sometimes_euph,Baby Nozomi is still in the NICU while big sis...
60,correctional facilities,Confinement Youth with disabilities who are ad...,1,employment,correctional facility,always_euph,Very few correctional facilities have formal v...
1385,let go of,It would be easier than her breaking up with h...,0,employment,let go of,sometimes_euph,He knew he had to let go of her before he beca...


In [19]:
sample_1.to_csv("AMT_Sample_1.csv")
sample_2.to_csv("AMT_Sample_2.csv")
sample_3.to_csv("AMT_Sample_3.csv")

In [35]:
# generate HTML
import pandas as pd
 
# to read csv file named "samplee"
a = pd.read_csv("AMT/AMT_Sample_1.1.csv")
 
# to save as html file
# named as "Table"
a.to_html("AMT/AMT_Sample_1.1.html", index=False, na_rep='')

### Annotation Analysis

In [17]:
import pandas as pd
annotations = pd.read_csv("Annotation_Task2.1.csv", index_col=0, encoding= 'utf-8')
euph_corpus = pd.read_csv('Euphemism_Corpus_2-24.csv', index_col = 0, encoding= 'utf-8')

In [11]:
# the next 2 chunks are to fix a mistake i made; leaving out the index in the annotation samples
indices = pd.Series([])
for i, row in annotations.iterrows():
    text = annotations.loc[i, 'text']
    for j, row in euph_corpus.iterrows():
        t = euph_corpus.loc[j, 'edited_text']
        if (t.strip()==text.strip()):
            indices = pd.concat([indices, pd.Series([j])])
            continue
print(len(indices))
print(indices)

  indices = pd.Series([])


165
0    1202
0    1006
0     826
0    1147
0    1445
     ... 
0     554
0     378
0     856
0     902
0      16
Length: 165, dtype: int64


In [14]:
annotations = annotations.set_index(indices)
annotations.to_csv('Annotation_Task2.1.csv')

In [18]:
# Need to put the 0 or 1 label back onto the annotations by matching the index numbers
# also fill in blanks with the keyword
annotations["label"] = -1

for i, row in annotations.iterrows():
    annotations.loc[i, 'type'] = euph_corpus.loc[i]['type']
    
annotations

Unnamed: 0,keyword,text,paraphrase1,paraphrase2,paraphrase3,paraphrase4,paraphrase5,label
1202,between jobs,Hannah's mother explained that God was testing...,unemployed,unemployed,unemployed,unemployed,unemployed,1
1006,same-sex,"So, for instance, when the study finds that ch...",gay,gay,same-sex,homosexual,gay,1
826,pass away,Your story has touched my heart and I wanted t...,die,die,die,die,die,1
1147,intoxicated,Trying difficult is now somehow something to b...,drunk,drunk,drunk,drunk,drunk,1
1445,neutralize,@#19 Rob and #20 Paul: One thing I think is fa...,neutralize,eliminate,nullify,neutralize,kill,0
...,...,...,...,...,...,...,...,...
554,same sex,"Multiple ""partners"" as you call it was NEVER i...",gay,gay,same sex,homosexual,gay,1
378,inebriated,"In a few weeks, tens of thousands of college f...",drunk,drunk,drunk,drunk,intoxicated,1
856,overweight,"Also, there's a good bit of mounting evidence ...",fat,fat,overweight,fat,fat,1
902,demise,"At the Vatican, following the <demise> of the ...",death,death,death,death,fall,1


In [4]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

In [5]:
def compute_similarity(s1, s2):
    #Compute embedding for both lists
    embeddings1 = model.encode(s1, convert_to_tensor=True)
    embeddings2 = model.encode(s2, convert_to_tensor=True)

    #Compute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    # print("Similarity score:", cosine_scores.item())
    return cosine_scores.item()

In [7]:
s1 = "@#19 Rob and #20 Paul: One thing I think is fascinating about the RD-180 powering the Atlas is that it's derived from the RD-170/171, which was developed for the Zenit rocket, used in various configurations to boost the Energia (also used in an analogous role to the SRBs in the US Shuttle program, boosting the Energia-Buran launch stack) @ @ @ @ @ @ @ @ @ @ but in a nutshell, you have a very effective engine design that was originally intended to one-up the US and to launch Soviet military systems like the Polyus, which was intended to <neutralize> the strategic advantage of the US SDI. And now it's powering NASA (as well as US military) launches!"
s2 = "Little thought seems to have been given the question of whether to commit ground forces. The recommendations were accepted by the President, and a directive was at once sent General MacArthur authorizing him to use @ @ @ @ @ @ @ @ @ @ of the 38th parallel, and instructing him to <neutralize> Formosa by the use of the Seventh Fleet. "
compute_similarity(s1, s2)

0.20039547979831696

In [21]:
import itertools
import statistics

# stores the similarity score
annotations['semantic_similarity'] = -1
# create list of all pairs of paraphrases
iterable = ['paraphrase1', 'paraphrase2', 'paraphrase3', 'paraphrase4', 'paraphrase5']
combos = list(itertools.combinations(iterable, 2))

n = 0
for i, row in annotations.iterrows():
    annotations.loc[i] = annotations.loc[i].fillna(annotations.loc[i, 'keyword'])
    similarities = []
    # seems like only binary comparisons allowed, so have to compare each para with each other and then average them all
    for c in combos:
        p1 = annotations.loc[i, c[0]]
        p2 = annotations.loc[i, c[1]]
        sim = compute_similarity(p1, p2)
        similarities.append(sim)
    annotations.loc[i, 'semantic_similarity'] = statistics.mean(similarities)
    if (n % 20 == 0):
        print(n)
    n+=1
annotations

0
20
40
60
80
100
120
140
160


Unnamed: 0,keyword,text,paraphrase1,paraphrase2,paraphrase3,paraphrase4,paraphrase5,label,semantic_similarity
1202,between jobs,Hannah's mother explained that God was testing...,unemployed,unemployed,unemployed,unemployed,unemployed,1,1.000000
1006,same-sex,"So, for instance, when the study finds that ch...",gay,gay,same-sex,homosexual,gay,1,0.807432
826,pass away,Your story has touched my heart and I wanted t...,die,die,die,die,die,1,1.000000
1147,intoxicated,Trying difficult is now somehow something to b...,drunk,drunk,drunk,drunk,drunk,1,1.000000
1445,neutralize,@#19 Rob and #20 Paul: One thing I think is fa...,neutralize,eliminate,nullify,neutralize,kill,0,0.424927
...,...,...,...,...,...,...,...,...,...
554,same sex,"Multiple ""partners"" as you call it was NEVER i...",gay,gay,same sex,homosexual,gay,1,0.818027
378,inebriated,"In a few weeks, tens of thousands of college f...",drunk,drunk,drunk,drunk,intoxicated,1,0.869052
856,overweight,"Also, there's a good bit of mounting evidence ...",fat,fat,overweight,fat,fat,1,0.867237
902,demise,"At the Vatican, following the <demise> of the ...",death,death,death,death,fall,1,0.751394


In [10]:
euph_corpus = pd.read_csv('Euphemism_Corpus_2-24.csv', index_col = 0, encoding= 'utf-8')

a1["type"] = -1
a2["type"] = -1

for i, row in a1.iterrows():
    a1.loc[i, 'type'] = euph_corpus.loc[i]['type']
    
for i, row in a2.iterrows():
    a2.loc[i, 'type'] = euph_corpus.loc[i]['type']
    
a2

Unnamed: 0,keyword,text,paraphrase1,paraphrase2,paraphrase3,paraphrase4,paraphrase5,label,semantic_similarity,is_vague,type
1202,between jobs,Hannah's mother explained that God was testing...,unemployed,unemployed,unemployed,unemployed,unemployed,1,1.000000,0,between jobs
1006,same-sex,"So, for instance, when the study finds that ch...",gay,gay,same-sex,homosexual,gay,1,0.807432,0,same-sex
826,pass away,Your story has touched my heart and I wanted t...,die,die,die,die,die,1,1.000000,0,pass away
1147,intoxicated,Trying difficult is now somehow something to b...,drunk,drunk,drunk,drunk,drunk,1,1.000000,0,intoxicated
1445,neutralize,@#19 Rob and #20 Paul: One thing I think is fa...,neutralize,eliminate,nullify,neutralize,kill,0,0.424927,1,neutralize
...,...,...,...,...,...,...,...,...,...,...,...
554,same sex,"Multiple ""partners"" as you call it was NEVER i...",gay,gay,same sex,homosexual,gay,1,0.818027,0,same sex
378,inebriated,"In a few weeks, tens of thousands of college f...",drunk,drunk,drunk,drunk,intoxicated,1,0.869052,0,inebriated
856,overweight,"Also, there's a good bit of mounting evidence ...",fat,fat,overweight,fat,fat,1,0.867237,0,overweight
902,demise,"At the Vatican, following the <demise> of the ...",death,death,death,death,fall,1,0.751394,0,demise


In [12]:
import pandas as pd
# annotations.to_csv("Annotation_Task2_Analysis.csv")

# perform one-time linking
# a1 = pd.read_csv('Annotation_Task1_Analysis_v2.csv', index_col=0)
# a2 = pd.read_csv('Annotation_Task2_Analysis.csv', index_col=0)

d = {}
            
for i, row in a1.iterrows():
    keyword = a1.loc[i, 'type']
    label = a1.loc[i, 'label']
    keyword = keyword + '_' + str(label)
    is_vague = a1.loc[i, 'is_vague']
    if keyword not in d:
        d[keyword] = [-1, -1] # -1 in the second annotation sample means this PET didn't show up (only one example in dataset)
    d[keyword][0] = is_vague

for j, row in a2.iterrows():
    keyword = a2.loc[j, 'type']
    label = a2.loc[j, 'label']
    keyword = keyword + '_' + str(label)
    is_vague = a2.loc[j, 'is_vague']
    
    d[keyword][1] = is_vague

df = pd.DataFrame(columns = ['PET', 'is_vague_1', 'is_vague_2'])

for PET, e in d.items():
    new_row = pd.Series({'PET': PET,
                         'is_vague_1': e[0],
                         'is_vague_2': e[1]
                   })
    df = pd.concat([df, new_row.to_frame().T], ignore_index=True)
    
df
df.to_csv('VET_List.csv')

In [8]:
# output the PETs from (theoretically) less vague to vague based on 2 thresholds of similarity (for euphemistic examples)
high_sim_PETs = []
low_sim_PETs = []
med_sim_PETs = []

high_threshold = 0.7
low_threshold = 0.6
for i, row in annotations.iterrows():
    # for now, only look at euphemistic examples
    if (annotations.loc[i, 'label'] == 0):
        continue
    
    euph_status = 1 if (euph_corpus.loc[i, 'euph_status'] == "always_euph") else 0 # no apparent pattern to whether a euph is sometimes_euph and whether it's vague
    PET = (euph_status, annotations.loc[i, 'keyword'])
    if (annotations.loc[i, 'semantic_similarity'] > high_threshold):
        high_sim_PETs.append(PET)
    elif (annotations.loc[i, 'semantic_similarity'] < low_threshold):
        low_sim_PETs.append(PET)
    else:
        med_sim_PETs.append(PET)
    
print("{} HIGH SIM PETs: {}\n".format(len(high_sim_PETs), high_sim_PETs))
print("{} MED SIM PETs: {}\n".format(len(med_sim_PETs), med_sim_PETs))
print("{} LOW SIM PETs: {}".format(len(low_sim_PETs), low_sim_PETs)) 

69 HIGH SIM PETs: [(1, 'terminating a pregnancy'), (1, 'under the weather'), (0, 'overweight'), (1, 'street person'), (0, 'to go to heaven'), (1, 'sex worker'), (0, 'aging'), (1, 'portly'), (1, 'ethnic cleansing'), (0, 'stout'), (0, 'passed away'), (0, 'between jobs'), (0, 'slept with'), (1, 'time of the month'), (1, 'lavatory'), (1, 'substance abuse'), (0, 'a certain age'), (1, 'hearing impaired'), (1, 'pass gas'), (0, 'gluteus maximus'), (0, 'expecting'), (1, 'rear end'), (0, 'with child'), (0, 'dismissed'), (1, 'pro-life'), (1, 'enhanced interrogation techniques'), (1, 'correctional facility'), (1, 'tinkle'), (0, 'seasoned'), (0, 'exterminate'), (1, 'substance abusers'), (1, 'made love'), (1, 'elderly'), (1, 'underprivileged'), (1, 'advanced age'), (1, 'fatalities'), (1, 'capital punishment'), (0, 'let him go'), (1, 'undocumented workers'), (1, 'less fortunate'), (1, 'latrine'), (1, 'drinking problem'), (0, 'weed'), (0, 'passing on'), (1, 'low-income'), (0, 'perish'), (1, 'dearly de

In [4]:
high_sim_PETs = []
low_sim_PETs = []
med_sim_PETs = []

high_threshold = 0.7
low_threshold = 0.6
for i, row in annotations.iterrows():
    # for now, only look at euphemistic examples
    if (annotations.loc[i, 'label'] == 1):
        continue
    PET = annotations.loc[i, 'keyword']
    if (annotations.loc[i, 'semantic_similarity'] > high_threshold):
        high_sim_PETs.append(PET)
    elif (annotations.loc[i, 'semantic_similarity'] < low_threshold):
        low_sim_PETs.append(PET)
    else:
        med_sim_PETs.append(PET)
    
print("{} HIGH SIM PETs: {}\n".format(len(high_sim_PETs), high_sim_PETs))
print("{} MED SIM PETs: {}\n".format(len(med_sim_PETs), med_sim_PETs))
print("{} LOW SIM PETs: {}".format(len(low_sim_PETs), low_sim_PETs))

11 HIGH SIM PETs: ['plump', 'stout', 'same-sex', 'seasoned', 'overweight', 'wealthy', 'aging', 'exterminate', 'chest', 'sleep with', 'getting clean']

8 MED SIM PETs: ['disabled', 'between jobs', 'pass away', 'expecting', 'sleep around', 'seeing each other', 'a certain age', 'with child']

39 LOW SIM PETs: ['outspoken', 'economical', 'custodian', 'got clean', 'demise', 'neutralize', 'went to heaven', 'perish', 'over the hill', 'let them go', 'put to sleep', 'weed', 'outlived their usefulness', 'collateral damage', 'lay off', 'accident', 'passing on', 'downsize', 'regime change', 'special needs', 'mixed up', 'intoxicated', 'disadvantaged', 'sober', 'slim', 'invalid', 'experienced', 'oldest profession', 'go all the way', 'troubled', 'long sleep', 'let go of', 'dismissed', 'late', 'well off', 'gluteus maximus', 'to go to heaven', 'deprived', 'underdeveloped']


In [16]:
from parascore import ParaScorer
scorer = ParaScorer(lang="en", model_type = 'bert-base-uncased')
cands = ["A young person is skating.", "Terminating a pregnancy is murder."]
sources = ["There's a child on a skateboard.", "Abortion is murder."]
refs = ["A kid is skateboarding."]
# score = scorer.base_score(cands, sources, refs, batch_size=16)
score = scorer.free_score(cands, sources, batch_size=16)
print(score)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[tensor([0.6795, 0.7529]), tensor([0.6519, 0.8765])]


### Secondary Sample

In [11]:
first_sample = pd.read_csv('Pilot_Annotation_Sample.csv', index_col = 0)
euph_corpus = pd.read_csv('Euphemism_Corpus_2-24.csv', index_col = 0)

print(len(np.unique(euph_corpus['type'])))
remaining = euph_corpus.drop(first_sample.index)
always_euphs = np.unique((remaining.loc[remaining['euph_status']=='always_euph'])['type'])
sometimes_euphs = np.unique((remaining.loc[remaining['euph_status']=='sometimes_euph'])['type'])

selected = []

for PET in always_euphs:
    matches = remaining[remaining['type'] == PET]
    selection = matches.sample(n=1).index # randomly select an index
    selected.append(selection)

for PET in sometimes_euphs:
    lit_matches = remaining[(remaining['type'] == PET) & (remaining['is_euph'] == 0)]
    euph_matches = remaining[(remaining['type'] == PET) & (remaining['is_euph'] == 1)]
    lit_selection = lit_matches.sample(n=1).index # randomly select an index
    euph_selection = euph_matches.sample(n=1).index
    selected.append(lit_selection)
    selected.append(euph_selection)

# idk, some number wrangling to convert from .index, which returns Int64 lists/
selected = np.array(selected).tolist()
selected = [item for sublist in selected for item in sublist]

remaining = remaining[remaining.index.isin(selected)]
display(remaining)
remaining.to_csv('Annotation_Sample_2.csv')

129


Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence
0,tinkle,We're just getting back what was TAKEN from us...,1,bodily functions,tinkle,always_euph,We're just getting back what was TAKEN from us...
16,undocumented immigrants,The Arizona Republic reports that young <undoc...,1,politics,undocumented immigrant,always_euph,The Arizona Republic reports that young undocu...
23,venereal disease,He's being sued by a woman who claims he gave ...,1,sexual activity,venereal disease,always_euph,And Kris Humphries plans on being' relentless'...
34,sex worker,I cofounded a magazine by and for sex workers....,1,sexual activity,sex worker,always_euph,I found community in the sex worker rights mov...
57,mentally disabled,Thank you so much for sharing your story and y...,1,physical/mental attributes,mentally disabled,always_euph,I think though that you assume that those that...
...,...,...,...,...,...,...,...
1938,troubled,Speaking to the Santa Clara County Board of Su...,0,physical/mental attributes,troubled,sometimes_euph,Sara Cody health officer for Santa Clara Count...
1953,seeing each other,This guy replies with an awesome e-mail and wa...,0,sexual activity,seeing someone/each other,sometimes_euph,We clicked immediately and have been seeing ea...
1954,seasoned,"I am-- it depends on the location, though. Bec...",0,physical/mental attributes,seasoned,sometimes_euph,Because in Miami for example and in Richmond V...
1956,slept with,Calves and baby goats were sheltered in the fr...,0,sexual activity,sleep with,sometimes_euph,The other two rooms were the father's sleeping...


### Vagueness Extrapolation - Strong Assumption

In [5]:
import pandas as pd
# SINCE THE LAST TIME THE FILE "Annotation_Task1_Analysis.csv" WAS REFERENCED...
# ..I have manually (using Excel) put a "1" for Vague for paraphrase similarity scores < 0.6, "0" for > 0.7, and my own judgments for in between (Experimental: <0.63=vague)
annotations = pd.read_csv("Annotation_Task1_Analysis.csv", index_col = 0, encoding= 'utf-8')
euph_corpus = pd.read_csv('Euphemism_Corpus_v2.1.csv', index_col = 0, encoding= 'utf-8')

In [7]:
VET_dict = {} # will contain a dict for each PET and its vagueness label. If PET is sometimes_euph, dict will have 2 keys: '1' is euphemistic use and '0' literal
missing = i
for i, row in annotations.iterrows():
    # if (i == 1807):
    #     VET_dict['exterminate'][0] = 0
    #     continue
    keyword = euph_corpus.loc[i, 'type']
    label = annotations.loc[i, 'label']
    is_vague = annotations.loc[i, 'is_vague']
    if (keyword not in VET_dict):
        VET_dict[keyword] = {}
    VET_dict[keyword][label] = is_vague

# VET_dict

In [8]:
# now apply to rest of corpus
euph_corpus['is_vague'] = -1

for i, row in euph_corpus.iterrows():
    PET = euph_corpus.loc[i, 'type']
    label = euph_corpus.loc[i, 'is_euph']
    is_vague = VET_dict[PET][label]
    euph_corpus.loc[i, 'is_vague'] = is_vague

euph_corpus

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,is_vague
0,tinkle,We're just getting back what was TAKEN from us...,1,body functions/parts,tinkle,always_euph,We're just getting back what was TAKEN from us...,0
1,tinkle,I think AB390 will pass next year now that the...,1,body functions/parts,tinkle,always_euph,I think AB390 will pass next year now that the...,0
2,undocumented immigrants,"Singled Out Think Like a Man, the new movie ba...",1,politics,undocumented immigrant,always_euph,Anything but Secure A federal program designed...,0
3,undocumented immigrants,"Not to be outdone, Sen. Rand Paul (R-Ky. ), so...",1,politics,undocumented immigrant,always_euph,In a post-election interview with POLITICO Pau...,0
4,undocumented immigrants,The law has also galvanized the growing immigr...,1,politics,undocumented immigrant,always_euph,Aside from undocumented immigrants the America...,0
...,...,...,...,...,...,...,...,...
1960,sleep with,There were other photos she wanted me to see: ...,0,sexual activity,sleep with,sometimes_euph,There were other photos she wanted me to see B...,0
1961,sleep with,I am relieved to see two pup tents marked STAF...,0,sexual activity,sleep with,sometimes_euph,Thank God I don't have to sleep with Ace Wands,0
1962,sleep around,"Nothing serious, just long nights of me hackin...",0,sexual activity,sleep around,sometimes_euph,With all my caterwauling it's a wonder anyone ...,0
1963,with child,sounds more like Jonestown. They cant leave @ ...,0,physical/mental attributes,with child,sometimes_euph,They cant leave best advice I can give them is...,0


In [9]:
vague_examples = euph_corpus.loc[euph_corpus['is_vague']==1]
unvague_examples = euph_corpus.loc[euph_corpus['is_vague']==0]

vague_1s = vague_examples.loc[vague_examples['is_euph']==1]
vague_0s = vague_examples.loc[vague_examples['is_euph']==0]
unvague_1s = unvague_examples.loc[unvague_examples['is_euph']==1]
unvague_0s = unvague_examples.loc[unvague_examples['is_euph']==0]

print("There are {} vague examples. Of these, {} are 1s and {} are 0s".format(
    len(vague_examples), len(vague_1s), len(vague_0s)))
print("There are {} un-vague examples. Of these, {} are 1s and {} are 0s".format(
    len(unvague_examples), len(unvague_1s), len(unvague_0s)))

There are 769 vague examples. Of these, 408 are 1s and 361 are 0s
There are 1183 un-vague examples. Of these, 975 are 1s and 208 are 0s


In [10]:
euph_corpus.to_csv("VET_Corpus_0.2.csv")

### Vagueness Extrapolation - Weak Assumption

In [1]:
# first, we need to get both annotations in one file
import pandas as pd

a1 = pd.read_csv("Annotation_Task1_Analysis_v2.csv", index_col=0)
a2 = pd.read_csv("Annotation_Task2_Analysis.csv", index_col=0)

a3 = pd.concat([a1, a2])
a3

Unnamed: 0,keyword,text,paraphrase1,paraphrase2,paraphrase3,paraphrase4,paraphrase5,paraphrase6,label,semantic_similarity,human_sim,is_vague,changes
465,terminating a pregnancy,There's no difference. <Terminating a pregnanc...,abortion,abortion,abortion,abortion,abortion,Abortion,1,1.000000,5.0,0,
1347,sleep around,"I would too, just as I would for any other gir...",have sexual relationships with multiple people,have sex with a lot of people,practice adultery,have sex with many partners,have casual sex,have sex with many people,1,0.605805,3.0,1,
1188,custodian,"Only one Princeton faculty member, James Shinn...",janitor,cleaner,janitor,janitor,custodian,janitor,1,0.662722,4.0,0,
565,under the weather,"We don't know how bad things will get, Morocco...",sick,sick,sick,ill,sick,sick,1,0.853640,4.0,0,
847,overweight,Obesity is seen as the great pandemia of the X...,being fat,obesity,fatness,being fat,overweight,being fat,1,0.740032,3.0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
554,same sex,"Multiple ""partners"" as you call it was NEVER i...",gay,gay,same sex,homosexual,gay,,1,0.818027,,0,
378,inebriated,"In a few weeks, tens of thousands of college f...",drunk,drunk,drunk,drunk,intoxicated,,1,0.869052,,0,
856,overweight,"Also, there's a good bit of mounting evidence ...",fat,fat,overweight,fat,fat,,1,0.867237,,0,
902,demise,"At the Vatican, following the <demise> of the ...",death,death,death,death,fall,,1,0.751394,,0,


In [2]:
# retrieve types 
euph_corpus = pd.read_csv('Euphemism_Corpus_2-24.csv', index_col = 0, encoding= 'utf-8')
a3['type'] = ""
for i, row in a3.iterrows():
    PET = euph_corpus.loc[i, 'type'] # retrieve type for this index
    a3.loc[i, 'type'] = PET
    
a3
# a3.to_csv("Weak_Assumption_Corpus.csv")

Unnamed: 0,keyword,text,paraphrase1,paraphrase2,paraphrase3,paraphrase4,paraphrase5,paraphrase6,label,semantic_similarity,human_sim,is_vague,changes,type
465,terminating a pregnancy,There's no difference. <Terminating a pregnanc...,abortion,abortion,abortion,abortion,abortion,Abortion,1,1.000000,5.0,0,,pregnancy termination
1347,sleep around,"I would too, just as I would for any other gir...",have sexual relationships with multiple people,have sex with a lot of people,practice adultery,have sex with many partners,have casual sex,have sex with many people,1,0.605805,3.0,1,,sleep around
1188,custodian,"Only one Princeton faculty member, James Shinn...",janitor,cleaner,janitor,janitor,custodian,janitor,1,0.662722,4.0,0,,custodian
565,under the weather,"We don't know how bad things will get, Morocco...",sick,sick,sick,ill,sick,sick,1,0.853640,4.0,0,,under the weather
847,overweight,Obesity is seen as the great pandemia of the X...,being fat,obesity,fatness,being fat,overweight,being fat,1,0.740032,3.0,0,,overweight
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554,same sex,"Multiple ""partners"" as you call it was NEVER i...",gay,gay,same sex,homosexual,gay,,1,0.818027,,0,,same sex
378,inebriated,"In a few weeks, tens of thousands of college f...",drunk,drunk,drunk,drunk,intoxicated,,1,0.869052,,0,,inebriated
856,overweight,"Also, there's a good bit of mounting evidence ...",fat,fat,overweight,fat,fat,,1,0.867237,,0,,overweight
902,demise,"At the Vatican, following the <demise> of the ...",death,death,death,death,fall,,1,0.751394,,0,,demise


In [7]:
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')
def compute_similarity(s1, s2):
    #Compute embedding for both lists
    embeddings1 = model.encode(s1, convert_to_tensor=True)
    embeddings2 = model.encode(s2, convert_to_tensor=True)

    #Compute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    # print("Similarity score:", cosine_scores.item())
    return cosine_scores.item()

# form weakness corpus 0.1 - using sentence_transformers, copy vagueness label of sentence with highest cos similarity in annotation samples
euph_corpus = pd.read_csv('Euphemism_Corpus_2-24.csv', index_col = 0, encoding= 'utf-8')
euph_corpus['is_vague'] = -1
euph_corpus['highest_sim'] = -1 # represents the highest similarity score with representative example in the vague-annotated data
                                # this may be useful for the next assumption level, to decide on a threshold
weak_corpus = pd.read_csv("Weak_Assumption_Corpus.csv", index_col = 0)

for i, row in tqdm(euph_corpus.iterrows()):
    PET = euph_corpus.loc[i, 'type']
    label = euph_corpus.loc[i, 'is_euph']
    text = euph_corpus.loc[i, 'edited_text'] # this example's text
    # print(text)
    
    model = SentenceTransformer('all-MiniLM-L6-v2')
    sel = weak_corpus.loc[weak_corpus['type']==PET]
    examples = sel.loc[sel['label']==label]
    # display(examples)
    
    highest_sim = -1
    highest_sim_label = -1
    for j, row in examples.iterrows():
        # print(examples.loc[j, 'text'])
        sim = compute_similarity(text, examples.loc[j, 'text'])
        if (sim > highest_sim):
            highest_sim = sim
            highest_sim_label = examples.loc[j, 'is_vague'] # update vagueness label to match highest sim example
    euph_corpus.loc[i, 'is_vague'] = highest_sim_label # set vagueness label
    euph_corpus.loc[i, 'highest_sim'] = highest_sim 

euph_corpus

0it [00:00, ?it/s]

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,is_vague,highest_sim
0,tinkle,We're just getting back what was TAKEN from us...,1,bodily functions,tinkle,always_euph,We're just getting back what was TAKEN from us...,0,1.000000
1,tinkle,I think AB390 will pass next year now that the...,1,bodily functions,tinkle,always_euph,I think AB390 will pass next year now that the...,0,1.000000
2,undocumented immigrants,"Singled Out Think Like a Man, the new movie ba...",1,politics,undocumented immigrant,always_euph,Anything but Secure A federal program designed...,0,0.281206
3,undocumented immigrants,"Not to be outdone, Sen. Rand Paul (R-Ky. ), so...",1,politics,undocumented immigrant,always_euph,In a post-election interview with POLITICO Pau...,0,0.238935
4,undocumented immigrants,The law has also galvanized the growing immigr...,1,politics,undocumented immigrant,always_euph,Aside from undocumented immigrants the America...,0,1.000001
...,...,...,...,...,...,...,...,...,...
1960,sleep with,There were other photos she wanted me to see: ...,0,sexual activity,sleep with,sometimes_euph,There were other photos she wanted me to see B...,0,1.000000
1961,sleep with,I am relieved to see two pup tents marked STAF...,0,sexual activity,sleep with,sometimes_euph,Thank God I don't have to sleep with Ace Wands,0,0.337824
1962,sleep around,"Nothing serious, just long nights of me hackin...",0,sexual activity,sleep around,sometimes_euph,With all my caterwauling it's a wonder anyone ...,0,1.000000
1963,with child,sounds more like Jonestown. They cant leave @ ...,0,physical/mental attributes,with child,sometimes_euph,They cant leave best advice I can give them is...,0,1.000000


In [9]:
euph_corpus.to_csv("VET_Corpus_1.0.csv")

In [8]:
# analyze 
vet_corpus = pd.read_csv("VET_Corpus_1.0.csv")
vague_examples = vet_corpus.loc[vet_corpus['is_vague']==1]
unvague_examples = vet_corpus.loc[vet_corpus['is_vague']==0]
vague_1s = vague_examples.loc[vague_examples['is_euph']==1]
vague_0s = vague_examples.loc[vague_examples['is_euph']==0]
unvague_1s = unvague_examples.loc[unvague_examples['is_euph']==1]
unvague_0s = unvague_examples.loc[unvague_examples['is_euph']==0]

print(len(vague_1s))
print(len(vague_0s))
print(len(unvague_1s))
print(len(unvague_0s))

386
305
996
278


### Weakest assumption - for each sentence in euph corpus, get closest cos sim label from combined annotation data

In [None]:
# form weakness corpus 0.1 - using sentence_transformers, copy vagueness label of sentence with highest cos similarity in annotation samples
euph_corpus = pd.read_csv('Euphemism_Corpus_2-24.csv', index_col = 0, encoding= 'utf-8')
euph_corpus['is_vague'] = -1
weak_corpus = pd.read_csv("Weak_Assumption_Corpus.csv", index_col = 0)

for i, row in tqdm(euph_corpus.iterrows()):
    

# Archives: Creating Vagueness Train-Test Splits, Data Analyses...

## Create Vagueness Train-Test Splits

### TEST_0 - TRAIN: Mixed, TEST: Only Vague/Non-Vague

#### Vague-only Test Set

In [10]:
# Initial experimental run (assumption: PETs in a 1 or 0 context will always have the same vagueness)
# requires variables from previous chunk 

# create test set
vague_1s_sample = vague_1s.sample(197).sample(frac=1) # note the second .sample(frac=1) is to shuffle rows
vague_0s_sample = vague_0s.sample(196).sample(frac=1)
vague_1s = vague_1s.drop(vague_1s_sample.index).sample(frac=1) # remove the selected ones from the vague examples (the remaining will serve as training examples)
vague_0s = vague_0s.drop(vague_0s_sample.index).sample(frac=1) 

In [15]:
vague_train = pd.concat([vague_1s, vague_0s]) # the portion of vague examples that should will go in the train set

In [18]:
train_set = pd.concat([vague_train, unvague_examples]) # train set consists of the remaining vague examples and unvague_examples
train_set = train_set.sample(frac=1)
train_set

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,is_vague
1464,overweight,It's not OK because when kids don't eat well o...,0,physical/mental attributes,overweight,sometimes_euph,It's not OK because when kids don't eat well o...,0
1137,intoxicated,Technically the fact that he was <intoxicated>...,1,substances,intoxicated,sometimes_euph,Technically the fact that he was intoxicated m...,0
1590,collateral damage,"Keep in mind, folks, that this guy claims to b...",0,death,collateral damage,sometimes_euph,And yet he is in essence saying the murder of ...,1
561,able-bodied,Certain precautions may have to be taken in ca...,1,physical/mental attributes,able-bodied,always_euph,For instance every ATC user may have to be eit...,1
1331,troubled,"In Youngstown, the Corrections Corporation of ...",1,physical/mental attributes,troubled,sometimes_euph,It found them in the troubled Lorton prison in...,1
...,...,...,...,...,...,...,...,...
1475,aging,The outside table has been super-soaked in woo...,0,physical/mental attributes,aging,sometimes_euph,I don't mind some aging just don't want it to ...,0
1287,late,He said the situation in Kaura Namida is espec...,1,death,late,sometimes_euph,He said the family of the late Emir has been q...,0
1603,plump,It appears to be wearing a coat two sizes too ...,0,physical/mental attributes,plump,sometimes_euph,It perches over a small pool then lunges with ...,0
207,drinking problem,"From civics class, you may remember that the 2...",1,substances,drinking problem,always_euph,But while the amendment made it once again leg...,0


In [19]:
test_set = pd.concat([vague_1s_sample, vague_0s_sample])
test_set = test_set.sample(frac=1)

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,is_vague
1073,disadvantaged,"To me, saying, ""You are Black/Native American/...",1,employment,disadvantaged,sometimes_euph,you must be disadvantaged and need help is a h...,1
1071,disadvantaged,"By contrast, Chapter 6 of What Is To Be Done? ...",1,employment,disadvantaged,sometimes_euph,Now'proposes beginning those local economic re...,1
1065,disadvantaged,Immediately prior to his Senate confirmation Z...,1,employment,disadvantaged,sometimes_euph,He also co-founded The Urban Alliance Foundati...,1
1066,disadvantaged,The current study examined 205 youth and mento...,1,employment,disadvantaged,sometimes_euph,The current study examined 205 youth and mento...,1
714,mentally challenged,But the biggest by far is the generous work ru...,1,physical/mental attributes,mentally challenged,always_euph,I'm thinking of the cancer-support groups; the...,1
...,...,...,...,...,...,...,...,...
1725,underdeveloped,"Dr. Sun Qingwei, a coal campaigner for Greenpe...",0,politics,underdeveloped,sometimes_euph,The reason is that the technology is underdeve...,1
1766,over the hill,"Who knows, perhaps beneath the starry night sk...",0,physical/mental attributes,over the hill,sometimes_euph,Who knows perhaps beneath the starry night sky...,1
1428,pass away,If he were certain that there were no life to ...,0,death,pass away,sometimes_euph,The greatest act of faith the only faith which...,1
1615,weed,"Personal success, right? You are all scum, don...",0,substances,weed,sometimes_euph,hopefully a hurricane or snowstorm will weed t...,1


In [21]:
# note, for huggingface training you need to remove all columns except "text" and "label" - do this manually
train_set.to_csv('Vagueness_Splits/Test_0/hf_train.csv')
test_set.to_csv('Vagueness_Splits/Test_0/hf_test.csv')

#### Unvague-only Test Set

In [22]:
# create test set
unvague_1s_sample = unvague_1s.sample(197).sample(frac=1) # note the second .sample(frac=1) is to shuffle rows
unvague_0s_sample = unvague_0s.sample(196).sample(frac=1)
unvague_1s = unvague_1s.drop(unvague_1s_sample.index).sample(frac=1) # remove the selected ones from the vague examples (the remaining will serve as training examples)
unvague_0s = unvague_0s.drop(unvague_0s_sample.index).sample(frac=1) 

In [26]:
unvague_train = pd.concat([unvague_1s, unvague_0s])
train_set = pd.concat([unvague_train, vague_examples]) # train set consists of the remaining vague examples and unvague_examples
train_set = train_set.sample(frac=1)
train_set

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,is_vague
1718,underdeveloped,I hope you realize that the Daleks used to be ...,0,politics,underdeveloped,sometimes_euph,So in theory the chick turning out to be a Dal...,1
1407,perish,So true @ @ @ @ @ @ @ @ @ @ the regional schoo...,0,death,perish,sometimes_euph,Unfortunately in the following years there wer...,1
974,collateral damage,The ambassador said Israel has destroyed many ...,1,death,collateral damage,sometimes_euph,The ambassador said Israel has destroyed many ...,1
910,demise,"Time after time we see the intelligence, patie...",1,death,demise,sometimes_euph,Time after time we see the intelligence patien...,0
350,substance abuse,"@ @ @ @ @ @ @ @ @ @ 1997, while simply doing m...",1,substances,substance abuse,always_euph,1997 while simply doing my job as the Clinical...,0
...,...,...,...,...,...,...,...,...
380,inebriated,"Mary, You are familiar with rhetoric, aren't y...",1,substances,inebriated,always_euph,The reason they are saying this is because in ...,0
1198,between jobs,I would still donate food and clothing for peo...,1,employment,between jobs,sometimes_euph,I applied for temporary assistance when I was ...,0
1585,collateral damage,They didn't have him offed like they had Vince...,0,death,collateral damage,sometimes_euph,But the ambassador and his three protectors di...,1
203,armed conflict,A 2005 RAND Corp study found the UN to be succ...,1,politics,armed conflict,always_euph,It compared UN nation-building efforts to thos...,1


In [25]:
test_set = pd.concat([unvague_1s_sample, unvague_0s_sample])
test_set = test_set.sample(frac=1)
test_set

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,is_vague
16,undocumented immigrants,The Arizona Republic reports that young <undoc...,1,politics,undocumented immigrant,always_euph,The Arizona Republic reports that young undocu...,0
682,advanced age,These include the receptors for insulin and fo...,1,physical/mental attributes,advanced age,always_euph,Rapamycin is an antimicrobial that was recentl...,0
1487,aging,I thought they made the right move when they t...,0,physical/mental attributes,aging,sometimes_euph,And they rebuilt well; they sold off aging vet...,0
1091,mixed up,Im white and a woman and in my sixtys. I do nt...,1,physical/mental attributes,mixed up,sometimes_euph,he got mixed up in drugs,0
1105,well off,How do you prove honest @ @ @ @ @ @ @ @ @ @ Br...,1,employment,well off,sometimes_euph,he gets matrimonial offers from even well off ...,0
...,...,...,...,...,...,...,...,...
488,substance abusers,"Indeed, it's useful to compare the Hollywood o...",1,substances,substance abuser,always_euph,Both had lots of substance abusers divorce and...,0
1801,exterminate,They had long ago interacted with earth and ou...,0,politics,exterminate,sometimes_euph,Voltar and his people felt that man was being ...,0
1249,to go to heaven,The short version I tried to commit suicide be...,1,death,to go to heaven,sometimes_euph,A couple days before halloween 2006 I laid dow...,0
1310,late,That was really bad for public health. In Nige...,1,death,late,sometimes_euph,In Nigeria that situation was what obtained be...,0


In [28]:
# note, for huggingface training you need to remove all columns except "text" and "label" - do this manually
train_set.to_csv('Vagueness_Splits/Test_0/unvague_only_test/hf_train.csv')
test_set.to_csv('Vagueness_Splits/Test_0/unvague_only_test/hf_test.csv')

### TEST_1 - TRAIN: Only Vague, TEST: Only Vague/Non-Vague

#### Vague-only Training and Test Set

In [29]:
vet_corpus = pd.read_csv('VET_Corpus_0.1.csv', index_col = 0, encoding= 'utf-8') # corpus containing strong assumption PET-generalization

vague_examples = vet_corpus.loc[vet_corpus['is_vague']==1]
# unvague_examples = vet_corpus.loc[vet_corpus['is_vague']==0]
vague_1s = vague_examples.loc[vague_examples['is_euph']==1]
vague_0s = vague_examples.loc[vague_examples['is_euph']==0]

# construct the test set
vague_1s_sample = vague_1s.sample(77)
vague_0s_sample = vague_0s.sample(77)
vague_test = pd.concat([vague_1s_sample, vague_0s_sample]).sample(frac=1)
vague_test

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,is_vague
1331,troubled,"In Youngstown, the Corrections Corporation of ...",1,physical/mental attributes,troubled,sometimes_euph,It found them in the troubled Lorton prison in...,1
1049,special needs,Hill that year was a group of LDS Scouts who w...,1,physical/mental attributes,special needs,sometimes_euph,Boy Scout Troop 601 a Church-sponsored unit fr...,1
1088,invalid,"She did 143 paintings, more than a third of th...",1,physical/mental attributes,invalid,sometimes_euph,Her brief life is connected with her solipsist...,1
551,custodians,"Those include $ 700,000 slashed from instructi...",1,employment,custodians,always_euph,The district also will reduce bus transportati...,1
1040,disabled,The whole point of insurance is to attain it b...,1,physical/mental attributes,disabled,sometimes_euph,You can argue that the government ie taxpayers...,1
...,...,...,...,...,...,...,...,...
519,sanitation workers,In much of the world @ @ @ @ @ @ @ @ @ @ the f...,1,employment,sanitation worker,always_euph,In much of the world the front lines of the cr...,1
1631,weed,Glyphosate is a strong organic phosphate chela...,0,substances,weed,sometimes_euph,It is this ability to shut down physiological ...,1
1530,demise,"Speaking of doing exactly what people expect, ...",0,death,demise,sometimes_euph,Since I started these pieces Nebraska has mana...,1
1699,special needs,She's still in the hospital. Baby Nozomi is st...,0,physical/mental attributes,special needs,sometimes_euph,Baby Nozomi is still in the NICU while big sis...,1


In [30]:
# construct the training set
vague_1s = vague_1s.drop(vague_1s_sample.index) # remove the selected ones from the vague examples (the remaining will serve as training examples)
vague_0s = vague_0s.drop(vague_0s_sample.index)
vague_train = pd.concat([vague_1s, vague_0s]).sample(frac=1) # note the second .sample(frac=1) is to shuffle rows
vague_train

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,is_vague
758,golden years,All three came from situations of terrible neg...,1,physical/mental attributes,golden years,always_euph,We hope to find more families like the Johnson...,1
1426,pass away,"Just as man, by such precepts, impresses a kin...",0,death,pass away,sometimes_euph,As the psalmist says He has made a decree whic...,1
118,comfort women,"By the way, Why the Chinese act so strongly to...",1,sexual activity,comfort women,always_euph,More than and more than 200 thousand Chinese w...,1
1878,lay off,We do NOT need some self-imposed book cop tell...,0,employment,lay off,sometimes_euph,Lay off,1
82,freedom fighters,"In recent years, ultra-Orthodox Charedi Jews i...",1,politics,freedom fighter,always_euph,And now as the freedom fighters did in the Ame...,1
...,...,...,...,...,...,...,...,...
1666,disabled,"In particular, one important task which the li...",0,physical/mental attributes,disabled,sometimes_euph,When this function is disabled by brain damage...,1
49,mentally disabled,In order to sin you have to have a certain amo...,1,physical/mental attributes,mentally disabled,always_euph,These things are all true of angels and humans...,1
1247,laid off,And layoff thousands and remove millions from ...,1,employment,lay off,sometimes_euph,But when he laid off workers at Bain to make m...,1
1608,weed,"In some ways, cultivating for <weed> control i...",0,substances,weed,sometimes_euph,In some ways cultivating for weed control is a...,1


In [35]:
vague_train.to_csv('Vagueness_Splits/Test_1/vague_only_test/train.csv')
# vague_test.to_csv('Vagueness_Splits/Test_1/vague_only_test/test.csv')

#### Non-vague-only Test Set (154 examples for comparison with Vague-only Test Set)

In [32]:
unvague_examples = vet_corpus.loc[vet_corpus['is_vague']==0]
unvague_1s = unvague_examples.loc[unvague_examples['is_euph']==1]
unvague_0s = unvague_examples.loc[unvague_examples['is_euph']==0]
# construct the test set - NOTE that the train set should be the same (615 rows, made in the previous chunk); it is not constructed here
unvague_1s_sample = unvague_1s.sample(77)
unvague_0s_sample = unvague_0s.sample(77)
unvague_test = pd.concat([unvague_1s_sample, unvague_0s_sample]).sample(frac=1)
unvague_test

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,is_vague
1641,same-sex,Males try to hold females during copulation wi...,0,sexual activity,same-sex,sometimes_euph,Males are infected at both positions left post...,0
774,making love,"Cooking, gardening, <making love>-I love these...",1,sexual activity,make love,always_euph,Cooking gardening making love-I love these ele...,0
1464,overweight,It's not OK because when kids don't eat well o...,0,physical/mental attributes,overweight,sometimes_euph,It's not OK because when kids don't eat well o...,0
240,pro-life,"Not enough, certainly; and some of the most pr...",1,politics,pro-life,always_euph,Finally I think many pro-life people are polit...,0
899,demise,"We are setting forth hopefully, a blueprint fo...",1,death,demise,sometimes_euph,There are some who say it is a coroner's repor...,0
...,...,...,...,...,...,...,...,...
1363,weed,Archived Andrew reflected on what it's like to...,1,substances,weed,sometimes_euph,Noah Feldman tried to imagine how the SCOTUS w...,0
1913,late,"Access to Ontario Works, housing registry, bui...",0,death,late,sometimes_euph,Late payments for the April 2 tax due date wil...,0
1842,expecting,"He had put in almost 20 years at NBC, devoting...",0,physical/mental attributes,expecting,sometimes_euph,He recalled how when his Late Night finally bu...,0
1457,overweight,I also eat well and exercise 5-6 times per wee...,0,physical/mental attributes,overweight,sometimes_euph,I do not know what it is like to be overweight...,0


In [33]:
unvague_test.to_csv('Vagueness_Splits/Test_1/unvague_only_test/test.csv')

### TEST_2 - TRAIN: Only Non-Vague, TEST: Only Vague/Non-Vague
- to hopefully verify the vagueness results from TEST_1/TEST_0
- as such, the test sets from the previous TEST are used; this code is only used to produce the non-vague train set

In [15]:
# We make use of the same test data in Test_1, but we have to make sure our training non-vague examples do not clash
vague_test = pd.read_csv('Vagueness_Splits/Test_1/vague_only_test/test.csv', index_col=0)
unvague_test = pd.read_csv('Vagueness_Splits/Test_1/unvague_only_test/test.csv', index_col=0)
vet_corpus = pd.read_csv('VET_Corpus_0.1.csv', index_col = 0, encoding= 'utf-8') # corpus containing strong assumption PET-generalization

In [17]:
vet_corpus = vet_corpus.drop(vague_test.index)
vet_corpus = vet_corpus.drop(unvague_test.index)

unvague_examples = vet_corpus.loc[vet_corpus['is_vague']==0]

unvague_1s = unvague_examples.loc[unvague_examples['is_euph']==1] # there's a bunch of these but let's only take 477 of them to make the training set 615 examples, as it was for the previous test
unvague_1s_sample = unvague_1s.sample(477)
unvague_0s = unvague_examples.loc[unvague_examples['is_euph']==0] # we want to use all 138 of these

unvague_train = pd.concat([unvague_1s_sample, unvague_0s]).sample(frac=1)
unvague_train

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,is_vague
1497,chest,"By his late teens, his bulk had become a menac...",0,physical/mental attributes,chest,sometimes_euph,One player knocked himself out running into Sh...,0
67,correctional facility,Ray Reyna Jr. said Los Banos Police will aid t...,1,employment,correctional facility,always_euph,This was such a tragic case and and booked on ...,0
1557,dismissed,"Men: wear a tie. If you don't look sharp, you'...",0,employment,dismissed,sometimes_euph,If you don't look sharp you're making it all t...,0
781,let go of,"At the end of the day, Steven Sinofsky is a hu...",1,employment,let go of,sometimes_euph,At the end of the day Steven Sinofsky is a hum...,0
470,senior citizen,It is obscene what goes on in that arena. I ha...,1,physical/mental attributes,senior citizen,always_euph,I have client's who are far younger and more a...,0
...,...,...,...,...,...,...,...,...
165,elderly,My father's mother lived with us at the time a...,1,physical/mental attributes,elderly,always_euph,I was paid for the weekend to go get her paper...,0
234,pro-life,When I first started blogging a year and a hal...,1,politics,pro-life,always_euph,I insisted that the pro-life movement wasn't a...,0
427,capital punishment,"My opinions were as ecumenical as my larder, t...",1,politics,capital punishment,always_euph,I had no problem with abortion but abhorred ca...,0
872,aging,"Not enough for long term, but beyond the 72-ho...",1,physical/mental attributes,aging,sometimes_euph,Some family lives only miles from us but my ag...,0


In [18]:
unvague_train.to_csv('Vagueness_Splits/Test_2/vague_only_test/train.csv')
unvague_train.to_csv('Vagueness_Splits/Test_2/unvague_only_test/train.csv')

#### Test Analysis

In [2]:
# Check le PET distrbutions
test1_vague_train = pd.read_csv('Vagueness_Splits/Test_1/vague_only_test/train.csv', index_col=0)
test1_vague_test = pd.read_csv('Vagueness_Splits/Test_1/vague_only_test/test.csv', index_col=0)
test1_unvague_test = pd.read_csv('Vagueness_Splits/Test_1/unvague_only_test/test.csv', index_col=0)

print("TEST_1:")
print('Number of PETs in train:', len(test1_vague_train['type'].unique()))
print('Number of PETs in VAGUE test set:', len(test1_vague_test['type'].unique()))
print('Number of PETs in UNVAGUE test set:', len(test1_unvague_test['type'].unique()))

TEST_1:
Number of PETs in train: 58
Number of PETs in VAGUE test set: 43
Number of PETs in UNVAGUE test set: 58


In [5]:
pd.DataFrame(test1_vague_train['type'].value_counts())
pd.DataFrame(test1_vague_test['type'].value_counts())

Unnamed: 0,type
disabled,13
collateral damage,10
weed,8
underdeveloped,8
lay off,8
accident,7
demise,6
developed/ing country,6
perish,6
inner city,6


In [6]:
test2_unvague_train = pd.read_csv('Vagueness_Splits/Test_2/vague_only_test/train.csv', index_col=0)
test2_vague_test = pd.read_csv('Vagueness_Splits/Test_2/vague_only_test/test.csv', index_col=0)
test2_unvague_test = pd.read_csv('Vagueness_Splits/Test_2/unvague_only_test/test.csv', index_col=0)

print("TEST_2:")
print('Number of PETs in train:', len(test2_unvague_train['type'].unique()))
print('Number of PETs in VAGUE test set:', len(test2_vague_test['type'].unique()))
print('Number of PETs in UNVAGUE test set:', len(test2_unvague_test['type'].unique()))

TEST_2:
Number of PETs in train: 85
Number of PETs in VAGUE test set: 43
Number of PETs in UNVAGUE test set: 58


In [183]:
vet_corpus = pd.read_csv('VET_Corpus_0.1.csv', index_col=0)
vague_examples = vet_corpus.loc[vet_corpus['is_vague']==1]
vague_always_PETs = vague_examples.loc[vague_examples['euph_status']=='always_euph']
vague_sometimes_PETs = vague_examples.loc[vague_examples['euph_status']=='sometimes_euph']

unvague_examples = vet_corpus.loc[vet_corpus['is_vague']==0]
unvague_always_PETs = unvague_examples.loc[unvague_examples['euph_status']=='always_euph']
unvague_sometimes_PETs = unvague_examples.loc[unvague_examples['euph_status']=='sometimes_euph']

print(len(vague_examples['type'].unique()))
print(len(unvague_examples['type'].unique()))

print("{} DIFFERENT VAGUE ALWAYS_EUPHS TOTALING TO {} EXAMPLES".format(len(vague_always_PETs['type'].unique()), len(vague_always_PETs)))
print("{} DIFFERENT VAGUE SOMETIMES_EUPHS TOTALING TO {} EXAMPLES".format(len(vague_sometimes_PETs['type'].unique()), len(vague_sometimes_PETs)))
print("{} DIFFERENT UNVAGUE ALWAYS_EUPHS TOTALING TO {} EXAMPLES".format(len(unvague_always_PETs['type'].unique()), len(unvague_always_PETs)))
print("{} DIFFERENT UNVAGUE SOMETIMES_EUPHS TOTALING TO {} EXAMPLES".format(len(unvague_sometimes_PETs['type'].unique()), len(unvague_sometimes_PETs)))

# print(len(vague_sometimes_PETs['type'].unique()))
# print(len(unvague_always_PETs['type'].unique()))
# print(len(unvague_sometimes_PETs['type'].unique()))

60
89
23 DIFFERENT VAGUE ALWAYS_EUPHS TOTALING TO 209 EXAMPLES
37 DIFFERENT VAGUE SOMETIMES_EUPHS TOTALING TO 560 EXAMPLES
48 DIFFERENT UNVAGUE ALWAYS_EUPHS TOTALING TO 568 EXAMPLES
41 DIFFERENT UNVAGUE SOMETIMES_EUPHS TOTALING TO 628 EXAMPLES


In [15]:
# Generate lists of PETs: vague-euph, nonvague-euph, vague-lit, nonvague-lit
import os
vet_corpus = pd.read_csv('VET_Corpus_0.1.csv', index_col = 0, encoding= 'utf-8') # corpus containing strong assumption PET-generalization
vague_examples = vet_corpus.loc[vet_corpus['is_vague']==1]
unvague_examples = vet_corpus.loc[vet_corpus['is_vague']==0]

vague_1s = vague_examples.loc[vague_examples['is_euph']==1]
vague_0s = vague_examples.loc[vague_examples['is_euph']==0]
unvague_1s = unvague_examples.loc[unvague_examples['is_euph']==1]
unvague_0s = unvague_examples.loc[unvague_examples['is_euph']==0]

# vague_1_PETs = pd.DataFrame({'vague_euph': vague_1s['type'].unique().tolist()})
# vague_0_PETs = pd.DataFrame({'vague_lits': vague_0s['type'].unique().tolist()})
# unvague_1_PETs = pd.DataFrame({'unvague_euphs': unvague_1s['type'].unique().tolist()})
# unvague_0_PETs = pd.DataFrame({'unvague_lits': unvague_0s['type'].unique().tolist()})

set(unvague_0s['type'].unique()).intersection(unvague_1s['type'].unique())
    
# data = pd.concat([vague_1_PETs, vague_0_PETs, unvague_1_PETs, unvague_0_PETs], axis=1)
# data 
# data.to_csv('VET_List_0.1.csv')

{'aging',
 'between jobs',
 'chest',
 'deprived',
 'dismissed',
 'expecting',
 'exterminate',
 'gluteus maximus',
 'got clean',
 'late',
 'let [pro] go',
 'oldest profession',
 'overweight',
 'plump',
 'same-sex',
 'seasoned',
 'sleep with',
 'sober',
 'stout',
 'wealthy',
 'with child'}

In [19]:
a = vague_0s['type'].unique()
print(len(a))
u = vague_0s.loc[vague_0s['euph_status']=='sometimes_euph']
t = u['type'].unique()
print(len(t))

34
34


### TEST_3 - TESTS 1 and 2, but 10 random splits each

In [27]:
# This function, HuggingFace-ify, takes a sample of euphemism corpus and it makes into an appropriate format for the HuggingFace Trainer class
def hfify(df):
    df = df.drop(['keyword', 'category', 'type', 'euph_status', 'sentence', 'is_vague'], axis=1)
    df = df.rename(columns={'edited_text':'text', 'is_euph':'label'})
    return df

In [3]:
# make the 10 splits for the training_vague_only test
import os
import pandas as pd

vet_corpus = pd.read_csv('VET_Corpus_0.1.csv', index_col = 0, encoding= 'utf-8') # corpus containing strong assumption PET-generalization
vague_examples = vet_corpus.loc[vet_corpus['is_vague']==1]
unvague_examples = vet_corpus.loc[vet_corpus['is_vague']==0]

vague_1s = vague_examples.loc[vague_examples['is_euph']==1]
vague_0s = vague_examples.loc[vague_examples['is_euph']==0]
unvague_1s = unvague_examples.loc[unvague_examples['is_euph']==1]
unvague_0s = unvague_examples.loc[unvague_examples['is_euph']==0]

for x in range(0, 10):
    vague_test_folder = 'Vagueness_Splits/Test_3/train_vague_only/test_vague_only/test' + str(x)
    unvague_test_folder = 'Vagueness_Splits/Test_3/train_vague_only/test_unvague_only/test' + str(x)
    os.mkdir(vague_test_folder)
    os.mkdir(unvague_test_folder)
    
    # create the vague-only training sample for the vague-only test sample
    vague_1s_sample = vague_1s.sample(324)
    vague_0s_sample = vague_0s.sample(291)
    vague_train = pd.concat([vague_1s_sample, vague_0s_sample]).sample(frac=1)
    vague_train.to_csv(vague_test_folder + '/train.csv')
    vague_train = hfify(vague_train)
    vague_train.to_csv(vague_test_folder + '/hf_train.csv', index=False)
    
    # create the corresponding vague-only test sample
    rem_vague_1s = vague_1s.drop(vague_1s_sample.index) # we already took some vague examples for the train set; only keep the remaining once
    rem_vague_0s = vague_0s.drop(vague_0s_sample.index)
    vague_1s_sample = rem_vague_1s.sample(77)
    vague_0s_sample = rem_vague_0s.sample(77)
    vague_test = pd.concat([vague_1s_sample, vague_0s_sample]).sample(frac=1)
    vague_test.to_csv(vague_test_folder + '/test.csv')
    vague_test = hfify(vague_test)
    vague_test.to_csv(vague_test_folder + '/hf_test.csv', index=False)
    
    # create another randomly sampled vague-only training sample for the unvague-only test sample
    vague_1s_sample = vague_1s.sample(324)
    vague_0s_sample = vague_0s.sample(291)
    vague_train = pd.concat([vague_1s_sample, vague_0s_sample]).sample(frac=1)
    vague_train.to_csv(unvague_test_folder + '/train.csv')
    vague_train = hfify(vague_train)
    vague_train.to_csv(unvague_test_folder + '/hf_train.csv', index=False)
    
    # create the corresponding unvague-only test sample
    unvague_1s_sample = unvague_1s.sample(77)
    unvague_0s_sample = unvague_0s.sample(77)
    unvague_test = pd.concat([unvague_1s_sample, unvague_0s_sample]).sample(frac=1)
    unvague_test.to_csv(unvague_test_folder + '/test.csv')
    unvague_test = hfify(unvague_test)
    unvague_test.to_csv(unvague_test_folder + '/hf_test.csv', index=False)

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'Vagueness_Splits/Test_3/train_vague_only/test_vague_only/test0'

In [4]:
print("VAGUE 1s", len(vague_1s))
print("VAGUE 0s", len(vague_0s))
print("UNVAGUE 1s", len(unvague_1s))
print("UNVAGUE 0s", len(unvague_0s))

VAGUE 1s 401
VAGUE 0s 368
UNVAGUE 1s 981
UNVAGUE 0s 215


In [6]:
# make the 10 splits for the training_UNvague_only test
import os

vet_corpus = pd.read_csv('VET_Corpus_0.1.csv', index_col = 0, encoding= 'utf-8') # corpus containing strong assumption PET-generalization
vague_examples = vet_corpus.loc[vet_corpus['is_vague']==1]
unvague_examples = vet_corpus.loc[vet_corpus['is_vague']==0]

vague_1s = vague_examples.loc[vague_examples['is_euph']==1]
vague_0s = vague_examples.loc[vague_examples['is_euph']==0]
unvague_1s = unvague_examples.loc[unvague_examples['is_euph']==1]
unvague_0s = unvague_examples.loc[unvague_examples['is_euph']==0]

for x in range(0, 10):
    vague_test_folder = 'Vagueness_Splits/Test_3/train_unvague_only/test_vague_only/test' + str(x)
    unvague_test_folder = 'Vagueness_Splits/Test_3/train_unvague_only/test_unvague_only/test' + str(x)
    os.mkdir(vague_test_folder)
    os.mkdir(unvague_test_folder)
    
    # create the UNvague-only training sample for the vague-only test sample
    unvague_1s_sample = unvague_1s.sample(477)
    unvague_0s_sample = unvague_0s.sample(138)
    unvague_train = pd.concat([unvague_1s_sample, unvague_0s_sample]).sample(frac=1)
    unvague_train.to_csv(unvague_test_folder + '/train.csv')
    unvague_train = hfify(unvague_train)
    unvague_train.to_csv(unvague_test_folder + '/hf_train.csv', index=False)
    
    # create the corresponding UNvague-only test sample
    rem_unvague_1s = unvague_1s.drop(unvague_1s_sample.index) # we already took some vague examples for the train set; only keep the remaining once
    rem_unvague_0s = unvague_0s.drop(unvague_0s_sample.index)
    unvague_1s_sample = rem_unvague_1s.sample(77)
    unvague_0s_sample = rem_unvague_0s.sample(77)
    unvague_test = pd.concat([unvague_1s_sample, unvague_0s_sample]).sample(frac=1)
    unvague_test.to_csv(unvague_test_folder + '/test.csv')
    unvague_test = hfify(unvague_test)
    unvague_test.to_csv(unvague_test_folder + '/hf_test.csv', index=False)
    
    # create another randomly sampled UNvague-only training sample for the vague-only test sample
    unvague_1s_sample = unvague_1s.sample(477)
    unvague_0s_sample = unvague_0s.sample(138)
    unvague_train = pd.concat([unvague_1s_sample, unvague_0s_sample]).sample(frac=1)
    unvague_train.to_csv(vague_test_folder + '/train.csv')
    unvague_train = hfify(unvague_train)
    unvague_train.to_csv(vague_test_folder + '/hf_train.csv', index=False)
    
    # create the corresponding VAGUE-only test sample
    vague_1s_sample = vague_1s.sample(77)
    vague_0s_sample = vague_0s.sample(77)
    vague_test = pd.concat([vague_1s_sample, vague_0s_sample]).sample(frac=1)
    vague_test.to_csv(vague_test_folder + '/test.csv')
    vague_test = hfify(vague_test)
    vague_test.to_csv(vague_test_folder + '/hf_test.csv', index=False)

In [None]:
# make the 10 splits for the training_MIXED

In [12]:
# code to pull tests out of each folder and put them into one
import shutil

for x in range(0, 10):
    vague_unvague_folder = 'Vagueness_Splits/Test_3/train_vague_only/test_unvague_only/test' + str(x)
    # unvague_test_folder = 'Vagueness_Splits/Test_3/train_unvague_only/test_unvague_only/test' + str(x)
    shutil.copyfile(vague_unvague_folder + '/hf_train.csv', 'Vagueness_Splits/Test_3/train_vague_only/test_unvague_only/hf_train_' + str(x) + '.csv')
    shutil.copyfile(vague_unvague_folder + '/hf_test.csv', 'Vagueness_Splits/Test_3/train_vague_only/test_unvague_only/hf_test_' + str(x) + '.csv')

#### Test Analysis

In [46]:
import pandas as pd
# Investigate the PET distribution in the best vague-vague run

best_run_fp = 'Vagueness_Splits/Test_3/train_unvague_only/test_unvague_only/test0'
train_df = pd.read_csv(best_run_fp + '/train.csv', index_col=0)
test_df = pd.read_csv(best_run_fp + '/test.csv', index_col=0)

In [47]:
print(len(train_df['type'].unique()))
print(len(test_df['type'].unique()))

81
61


In [2]:
l = []

for PET in train_df['type'].unique():
    rows = train_df.loc[train_df['type'] == PET]
    l.append((PET, len(rows)))
    
sorted(l, key=lambda tup: tup[1], reverse=True)

[('aging', 39),
 ('late', 34),
 ('expecting', 24),
 ('weed', 20),
 ('overweight', 20),
 ('sober', 15),
 ('demise', 13),
 ('plump', 12),
 ('capital punishment', 12),
 ('low-income', 12),
 ('exterminate', 12),
 ('dismissed', 12),
 ('ethnic cleansing', 12),
 ('perish', 11),
 ('undocumented immigrant', 11),
 ('less fortunate', 11),
 ('sex worker', 11),
 ('detainee', 11),
 ('advanced age', 11),
 ('economical', 10),
 ('elderly', 10),
 ('substance abuse', 9),
 ('correctional facility', 9),
 ('inebriated', 9),
 ('underprivileged', 9),
 ('same-sex', 9),
 ('indigent', 9),
 ('make love', 8),
 ('between jobs', 8),
 ('pro-choice', 8),
 ('deceased', 8),
 ('chest', 8),
 ('pro-life', 8),
 ('pass away', 7),
 ('homemaker', 7),
 ('psychiatric hospital', 7),
 ('droppings', 7),
 ('senior citizen', 7),
 ('let [pro] go', 7),
 ('a certain age', 7),
 ('slim', 7),
 ('people/persons of color', 7),
 ('rear end', 7),
 ('portly', 6),
 ('fatality', 6),
 ('sleep with', 6),
 ('wealthy', 6),
 ('targeted killing', 5),
 

In [26]:
l=[]

for PET in test_df['type'].unique():
    rows = test_df.loc[test_df['type'] == PET]
    l.append((PET, len(rows)))
    
sorted(l, key=lambda tup: tup[1], reverse=True)

[('late', 16),
 ('dismissed', 10),
 ('expecting', 8),
 ('aging', 8),
 ('overweight', 8),
 ('exterminate', 7),
 ('plump', 6),
 ('sober', 5),
 ('let [pro] go', 5),
 ('same-sex', 4),
 ('droppings', 3),
 ('homemaker', 3),
 ('detainee', 3),
 ('elderly', 3),
 ('weed', 3),
 ('perish', 3),
 ('undocumented workers', 3),
 ('targeted killing', 3),
 ('substance abuse', 3),
 ('capital punishment', 3),
 ('detention camp', 3),
 ('chest', 2),
 ('seasoned', 2),
 ('birds and the bees', 2),
 ('pro-life', 2),
 ('stout', 2),
 ('a certain age', 2),
 ('pass away', 2),
 ('correctional facility', 2),
 ('pro-choice', 2),
 ('sleep with', 2),
 ('advanced age', 2),
 ('enhanced interrogation techniques', 2),
 ('people/persons of color', 2),
 ('custodian', 1),
 ('put to sleep', 1),
 ('ethnic cleansing', 1),
 ('undocumented immigrant', 1),
 ('senior citizen', 1),
 ('pass on', 1),
 ('between jobs', 1),
 ('low-income', 1),
 ('mixed up', 1),
 ('underprivileged', 1),
 ('invalid', 1),
 ('less fortunate', 1),
 ('deprived',

In [29]:
print(set(train_df['type'].unique()).intersection(test_df['type'].unique()))

{'developmentally disabled', 'income inequality', 'slim', 'mentally disabled', 'intoxicated', 'special needs', 'long sleep', 'underdeveloped', 'regime change', 'accident', 'perish', 'demise', 'experienced', 'collateral damage', 'getting clean', 'weed', 'let go of', 'well off', 'inner city', 'global south', 'golden years', 'armed conflict', 'over the hill', 'freedom fighter', 'custodian', 'custodians', 'a certain age', 'neutralize', 'negative cash flow', 'troubled', 'sanitation worker', 'outspoken', 'mixed up', 'pass on', 'disabled', 'lay off', 'economical', 'pre-owned', 'go all the way', 'mentally challenged', 'disadvantaged', 'developed/ing country', 'pass away'}


In [22]:
# Investigate the PET distribution in the best vague-unvague run

best_run_fp = 'Vagueness_Splits/Test_3/train_vague_only/test_unvague_only/test0'
train_df = pd.read_csv(best_run_fp + '/train.csv', index_col=0)
test_df = pd.read_csv(best_run_fp + '/test.csv', index_col=0)

l = []

for PET in train_df['type'].unique():
    rows = train_df.loc[train_df['type'] == PET]
    l.append((PET, len(rows)))
    
sorted(l, key=lambda tup: tup[1], reverse=True)

[('disabled', 46),
 ('collateral damage', 45),
 ('lay off', 30),
 ('troubled', 24),
 ('disadvantaged', 24),
 ('weed', 24),
 ('accident', 24),
 ('demise', 22),
 ('underdeveloped', 18),
 ('inner city', 18),
 ('special needs', 18),
 ('income inequality', 18),
 ('sanitation worker', 16),
 ('freedom fighter', 15),
 ('developed/ing country', 15),
 ('armed conflict', 15),
 ('pass away', 14),
 ('economical', 13),
 ('perish', 13),
 ('golden years', 13),
 ('intoxicated', 12),
 ('mentally challenged', 12),
 ('slim', 12),
 ('regime change', 11),
 ('over the hill', 10),
 ('well off', 9),
 ('mentally disabled', 9),
 ('mixed up', 8),
 ('neutralize', 8),
 ('go all the way', 8),
 ('a certain age', 7),
 ('global south', 7),
 ('venereal disease', 6),
 ('custodian', 6),
 ('experienced', 6),
 ('pre-owned', 5),
 ('able-bodied', 5),
 ('pass on', 5),
 ('let go of', 4),
 ('outspoken', 4),
 ('downsize', 4),
 ('seeing someone/each other', 4),
 ('comfort women', 3),
 ('invalid', 3),
 ('put to sleep', 3),
 ('negat

In [23]:
l=[]

for PET in test_df['type'].unique():
    rows = test_df.loc[test_df['type'] == PET]
    l.append((PET, len(rows)))
    
sorted(l, key=lambda tup: tup[1], reverse=True)
print(*l)

[('late', 16),
 ('dismissed', 10),
 ('expecting', 8),
 ('aging', 8),
 ('overweight', 8),
 ('exterminate', 7),
 ('plump', 6),
 ('sober', 5),
 ('let [pro] go', 5),
 ('same-sex', 4),
 ('droppings', 3),
 ('homemaker', 3),
 ('detainee', 3),
 ('elderly', 3),
 ('weed', 3),
 ('perish', 3),
 ('undocumented workers', 3),
 ('targeted killing', 3),
 ('substance abuse', 3),
 ('capital punishment', 3),
 ('detention camp', 3),
 ('chest', 2),
 ('seasoned', 2),
 ('birds and the bees', 2),
 ('pro-life', 2),
 ('stout', 2),
 ('a certain age', 2),
 ('pass away', 2),
 ('correctional facility', 2),
 ('pro-choice', 2),
 ('sleep with', 2),
 ('advanced age', 2),
 ('enhanced interrogation techniques', 2),
 ('people/persons of color', 2),
 ('custodian', 1),
 ('put to sleep', 1),
 ('ethnic cleansing', 1),
 ('undocumented immigrant', 1),
 ('senior citizen', 1),
 ('pass on', 1),
 ('between jobs', 1),
 ('low-income', 1),
 ('mixed up', 1),
 ('underprivileged', 1),
 ('invalid', 1),
 ('less fortunate', 1),
 ('deprived',

In [24]:
print(set(train_df['type'].unique()).intersection(test_df['type'].unique()))

{'invalid', 'custodian', 'weed', 'pass on', 'mixed up', 'a certain age', 'perish', 'pass away', 'put to sleep'}


### TEST_4 - Same as TEST_3, but restricting the sets to the same number of PETs

In [49]:
VET_corpus = pd.read_csv('VET_Corpus_0.1.csv', index_col=0)
l = []
unvague_examples = VET_corpus.loc[VET_corpus['is_vague']==0]
for PET in unvague_examples['type'].unique():
    rows = unvague_examples.loc[unvague_examples['type'] == PET]
    l.append((PET, len(rows)))
l = sorted(l, key=lambda tup: tup[1], reverse=True)
print(*l)

('aging', 60) ('late', 60) ('expecting', 46) ('overweight', 38) ('exterminate', 30) ('weed', 30) ('demise', 28) ('dismissed', 26) ('sober', 22) ('undocumented immigrant', 20) ('sex worker', 20) ('detainee', 20) ('ethnic cleansing', 20) ('elderly', 20) ('deceased', 20) ('pro-life', 20) ('substance abuse', 20) ('senior citizen', 20) ('people/persons of color', 20) ('fatality', 20) ('pro-choice', 20) ('low-income', 20) ('less fortunate', 20) ('perish', 20) ('chest', 20) ('plump', 20) ('homemaker', 19) ('capital punishment', 19) ('advanced age', 19) ('correctional facility', 18) ('indigent', 18) ('droppings', 18) ('pass away', 18) ('inebriated', 16) ('same-sex', 16) ('intoxicated', 16) ('let [pro] go', 14) ('economical', 14) ('between jobs', 14) ('undocumented workers', 13) ('stout', 12) ('sleep with', 12) ('psychiatric hospital', 11) ('underprivileged', 11) ('substance abuser', 11) ('targeted killing', 11) ('make love', 11) ('slim', 11) ('mixed up', 11) ('well off', 11) ('a certain age', 

In [168]:
import random
import pandas as pd

VET_corpus = pd.read_csv('VET_Corpus_0.1.csv', index_col=0)
l = []

# Here, we form the components for an unvague-unvague test.
unvague_examples = VET_corpus.loc[VET_corpus['is_vague']==0]
unvague_1s = unvague_examples.loc[unvague_examples['is_euph']==1]
unvague_0s = unvague_examples.loc[unvague_examples['is_euph']==0]

# form 1s
sample_vague_PETs = random.sample(unvague_1s['type'].unique().tolist(), 55)
PET_controlled_unvague_1s = unvague_1s.loc[unvague_1s['type'].isin(sample_vague_PETs)]
sample_unvague_1s = PET_controlled_unvague_1s.sample(400)
train_1s = sample_unvague_1s.sample(350)
test_1s = sample_unvague_1s.drop(train_1s.index)

# form 0s
sample_unvague_0s = unvague_0s.sample(200)
train_0s = sample_unvague_0s.sample(150)
test_0s = sample_unvague_0s.drop(train_0s.index)
# vague_train = hfify(vague_train)

train = pd.concat([train_1s, train_0s]).sample(frac=1)
test = pd.concat([test_1s, test_0s]).sample(frac=1)
print(len(train))
print(len(test))
# print(len(train_1s))
# print(len(test_1s))
# print(len(train_0s))
# print(len(test_0s))
# Let's double check
print(len(train['type'].unique()))
print(len(test['type'].unique()))


500
100
60
42


In [169]:
# # Let's double check
# # print(len(train.loc[train['is_vague']==0]))
import os
path = 'Vagueness_Splits/Test_4/unvague_unvague_test'
train.to_csv(path + '/train.csv')
hfify(train).to_csv(path + '/hf_train.csv')
test.to_csv(path + '/test.csv')
hfify(test).to_csv(path + '/hf_test.csv')
# train = VET_corpus.loc[l, :]

In [178]:
# Here, we form the components for a vague-vague-test, for comparison. Will it still do a lot beter? 
VET_corpus = pd.read_csv('VET_Corpus_0.1.csv', index_col=0)
vague_examples = VET_corpus.loc[VET_corpus['is_vague']==1]
vague_1s = vague_examples.loc[vague_examples['is_euph']==1]
vague_0s = vague_examples.loc[vague_examples['is_euph']==0]

train_1s = vague_1s.sample(350)
test_1s = vague_1s.drop(train_1s.index).sample(50)
train_0s = vague_0s.sample(150)
test_0s = vague_0s.drop(train_0s.index).sample(50)

train = pd.concat([train_1s, train_0s]).sample(frac=1)
test = pd.concat([test_1s, test_0s]).sample(frac=1)
print(len(train))
print(len(test))
print(len(train['type'].unique()))
print(len(test['type'].unique()))

500
100
59
41


In [179]:
import os
path = 'Vagueness_Splits/Test_4/vague_vague_test'
train.to_csv(path + '/train.csv')
hfify(train).to_csv(path + '/hf_train.csv')
test.to_csv(path + '/test.csv')
hfify(test).to_csv(path + '/hf_test.csv')

In [70]:
train.index.intersection(test.index)

Int64Index([], dtype='int64')

In [10]:
# Here, we get an idea of the number of examples per PET. Idk, just cuz.
VET_Corpus = pd.read_csv('VET_Corpus_0.1.csv', index_col=0)
l = []
VET_Corpus = VET_Corpus.loc[VET_Corpus['is_vague']==1]
for PET in VET_Corpus['type'].unique():
    rows = VET_Corpus.loc[VET_Corpus['type'] == PET]
    l.append((PET, len(rows)))
l = sorted(l, key=lambda tup: tup[1], reverse=True)
print(*l)

('disabled', 60) ('collateral damage', 52) ('lay off', 36) ('accident', 32) ('troubled', 30) ('weed', 30) ('disadvantaged', 28) ('demise', 28) ('special needs', 26) ('underdeveloped', 26) ('freedom fighter', 20) ('armed conflict', 20) ('income inequality', 20) ('inner city', 20) ('sanitation worker', 20) ('perish', 20) ('developed/ing country', 19) ('pass away', 18) ('mentally challenged', 17) ('golden years', 17) ('intoxicated', 16) ('regime change', 14) ('economical', 14) ('slim', 13) ('over the hill', 12) ('mentally disabled', 11) ('mixed up', 11) ('well off', 11) ('a certain age', 11) ('go all the way', 10) ('global south', 8) ('neutralize', 8) ('pre-owned', 7) ('able-bodied', 7) ('venereal disease', 6) ('experienced', 6) ('outspoken', 6) ('pass on', 6) ('custodian', 6) ('let go of', 5) ('downsize', 4) ('outlived [pro] usefulness', 4) ('seeing someone/each other', 4) ('comfort women', 3) ('negative cash flow', 3) ('invalid', 3) ('put to sleep', 3) ('differently-abled', 2) ('develop

### TEST_5 - Mixed-mixed Somewhat Balanced (for comparison with Balanced mixed-mixed, that was made on the server)

In [4]:
# we will cut down on #examples from the UV1 category, 981, to 381. Also, cut down on 368 V0s to 218 (to compensate for lack of UV0s). If we do this, then:
# there are 1212 examples total, 20% of which is about 240 for the test set. 
# We're interested in evaluating vague/unvague performance in the test set, so it will be balanced by taking 240/4=60 examples from each of V1/V0/UV1/UV0.
# Then, in the training set, there will be 401-60=341 V1s, 218-60=158 V0s, 381-60=321 UV1s, and 215-60=155 UV0s.  

import os

vet_corpus = pd.read_csv('VET_Corpus_0.1.csv', index_col = 0, encoding= 'utf-8') # corpus containing strong assumption PET-generalization
vague_examples = vet_corpus.loc[vet_corpus['is_vague']==1]
unvague_examples = vet_corpus.loc[vet_corpus['is_vague']==0]
vague_1s = vague_examples.loc[vague_examples['is_euph']==1]
vague_0s = vague_examples.loc[vague_examples['is_euph']==0]
unvague_1s = unvague_examples.loc[unvague_examples['is_euph']==1]
unvague_0s = unvague_examples.loc[unvague_examples['is_euph']==0]

for x in range(0, 10):
    dest_path = 'Vagueness_Splits/Test_5/PET_less_balanced_mixed_mixed/'
    test_V1s = vague_1s.sample(60)
    test_V0s = vague_0s.sample(60)
    test_UV1s = unvague_1s.sample(60)
    test_UV0s = unvague_0s.sample(60)
    # now form the training set
    train_V1s = vague_1s.drop(test_V1s.index).sample(314)
    train_V0s = vague_0s.drop(test_V0s.index).sample(158)
    train_UV1s = unvague_1s.drop(test_UV1s.index)
    train_UV0s = unvague_0s.drop(test_UV0s.index)
    # now combine the sets
    train_set = pd.concat([train_V1s, train_V0s, train_UV1s, train_UV0s]).sample(frac=1)
    test_set = pd.concat([test_V1s, test_V0s, test_UV1s, test_UV0s]).sample(frac=1)
    train_set.to_csv(dest_path + 'train' + str(x) + '.csv')
    test_set.to_csv(dest_path + 'test' + str(x) + '.csv')
    hfify(train_set).to_csv(dest_path + 'hf_train_' + str(x) + '.csv', index=False)
    hfify(test_set).to_csv(dest_path + 'hf_test_' + str(x) + '.csv', index=False)

### TEST_5 - Mini-test on the 1st annotation sample only

In [7]:
import pandas as pd

df = pd.read_csv('Annotation_Task1_Analysis_v2.csv', index_col=0)

vague = df.loc[df['is_vague'] == 1]
vague_euphs = vague.loc[vague['label'] == 1]
vague_noneuphs = vague.loc[vague['label'] == 0]
print(len(vague_euphs))
print(len(vague_noneuphs))

unvague = df.loc[df['is_vague'] == 0]
unvague_euphs = unvague.loc[unvague['label'] == 1]
unvague_noneuphs = unvague.loc[unvague['label'] == 0]
print(len(unvague_euphs))
print(len(unvague_noneuphs))

43
28
86
30


In [10]:
import os
# make the 10 splits for the training_vague_only test
vague_vague_folder = 'Vagueness_Splits/Test_4/vague_vague_test' 
vague_unvague_folder = 'Vagueness_Splits/Test_4/vague_unvague_test'
unvague_vague_folder = 'Vagueness_Splits/Test_4/unvague_vague_test' 
unvague_unvague_folder = 'Vagueness_Splits/Test_4/unvague_unvague_test' 

os.mkdir(vague_vague_folder)
os.mkdir(vague_unvague_folder)
os.mkdir(unvague_vague_folder)
os.mkdir(unvague_unvague_folder)

for x in range(0, 10): # create 10 copies of each test
    # create a vague-vague test
    s1 = vague_euphs.sample(28)
    s2 = vague_noneuphs.sample(28)
    train = pd.concat([s1, s2]).sample(frac=1)
    s3 = vague_euphs.drop(s1.index).sample(7)
    s4 = vague_noneuphs.drop(s2.index).sample(7)
    test = pd.concat([s3, s4]).sample(frac=1)
    # normally, we'd want to save the train and test subsets before removing columns, but we'll skip for now...
    train = hfify(train)
    test = hfify(test)
    train.to_csv(vague_vague_folder + '/hf_train_' + str(x) + '.csv', index=False)
    train.to_csv(vague_vague_folder + '/hf_test_' + str(x) + '.csv', index=False)
    
    # create a vague-unvague test
    s1 = vague_euphs.sample(28)
    s2 = vague_noneuphs.sample(28)
    train = pd.concat([s1, s2]).sample(frac=1)
    s3 = unvague_euphs.sample(7)
    s4 = unvague_noneuphs.sample(7)
    test = pd.concat([s3, s4]).sample(frac=1)
    # normally, we'd want to save the train and test subsets before removing columns, but we'll skip for now...
    train = hfify(train)
    test = hfify(test)
    train.to_csv(vague_unvague_folder + '/hf_train_' + str(x) + '.csv', index=False)
    train.to_csv(vague_unvague_folder + '/hf_test_' + str(x) + '.csv', index=False)
    
    # create an unvague-vague test
    s1 = unvague_euphs.sample(28)
    s2 = unvague_noneuphs.sample(28)
    train = pd.concat([s1, s2]).sample(frac=1)
    s3 = vague_euphs.sample(7)
    s4 = vague_noneuphs.sample(7)
    test = pd.concat([s3, s4]).sample(frac=1)
    # normally, we'd want to save the train and test subsets before removing columns, but we'll skip for now...
    train = hfify(train)
    test = hfify(test)
    train.to_csv(unvague_vague_folder + '/hf_train_' + str(x) + '.csv', index=False)
    train.to_csv(unvague_vague_folder + '/hf_test_' + str(x) + '.csv', index=False)
    
    # create an unvague-unvague test
    s1 = unvague_euphs.sample(28)
    s2 = unvague_noneuphs.sample(28)
    train = pd.concat([s1, s2]).sample(frac=1)
    s3 = unvague_euphs.drop(s1.index).sample(7)
    s4 = unvague_noneuphs.drop(s2.index).sample(7)
    test = pd.concat([s3, s4]).sample(frac=1)
    # normally, we'd want to save the train and test subsets before removing columns, but we'll skip for now...
    train = hfify(train)
    test = hfify(test)
    train.to_csv(unvague_unvague_folder + '/hf_train_' + str(x) + '.csv', index=False)
    train.to_csv(unvague_unvague_folder + '/hf_test_' + str(x) + '.csv', index=False)

ValueError: a must be greater than 0 unless no samples are taken

## Analyses

### TEST_5 (Mixed-Mixed)

In [37]:
import ast
import evaluate
import pandas as pd

path = 'Vagueness_Splits/Test_5/PET_balanced_mixed_mixed'
df = pd.read_csv(path + '/PET_balanced_mixed_mixed_results_unscrambled.csv', index_col=0)
results = pd.DataFrame(columns=['F1', 'P', 'R', 'tn', 'fp', 'fn', 'tp', 'F1-V', 'P-V', 'R-V', 'F1-UV', 'P-UV', 'R-UV'])

# for each test, select the row with the best F1, then evaluate separate F1s for vague vs unvague examples
for x in range(0, 10):
    test = df.loc[10*x:10*x+9]
    max_f1 = test.loc[test['f1'].idxmax()] # this is the best row from this test
    
    best_preds = max_f1['preds'].replace(' ', ', ') # the labels don't have a comma between them...
    best_preds = ast.literal_eval(best_preds)
    
    ref_df = pd.read_csv(path + '/test_' + str(x) + '.csv')
    # print(ref_df['is_euph'].tolist())
    # approach: get row IDs of vague/unvague rows from test file, then pick the preds at the corresponding indices for evaluation
    vagues = ref_df.loc[ref_df['is_vague']==1]
    unvagues = ref_df.loc[ref_df['is_vague']==0]
    # get predictions
    vague_preds = [best_preds[i] for i in vagues.index.tolist()]
    unvague_preds = [best_preds[i] for i in unvagues.index.tolist()]
    # get labels
    vague_labels = vagues['is_euph'].tolist()
    unvague_labels = unvagues['is_euph'].tolist()
    # load metrics
    # metric_acc = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")
    metric_p = evaluate.load("precision")
    metric_r = evaluate.load("recall")
    # vague examples
    vague_f1 = metric_f1.compute(predictions=vague_preds, 
                       references=vague_labels, 
                       average='macro')['f1']
    vague_p = metric_p.compute(predictions=vague_preds, 
                          references=vague_labels)['precision']
    vague_r = metric_r.compute(predictions=vague_preds, 
                          references=vague_labels)['recall']

    # unvague examples
    unvague_f1 = metric_f1.compute(predictions=unvague_preds, 
                       references=unvague_labels, 
                       average='macro')['f1']
    unvague_p = metric_p.compute(predictions=unvague_preds, 
                          references=unvague_labels)['precision']
    unvague_r = metric_r.compute(predictions=unvague_preds, 
                          references=unvague_labels)['recall']

    # all examples, just to make sure it matches original scores
    # vague_acc = metric_acc.compute(predictions=best_preds, 
    #                           references=ref_df['is_euph'].tolist())
    # vague_f1 = metric_f1.compute(predictions=best_preds, 
    #                    references=ref_df['is_euph'].tolist(), 
    #                    average='macro')
    # print(vague_f1)
    # print(vague_acc)
    # print()
    # add to an overall dataframe
    stats = max_f1[0:7].tolist() # take the base stats from the best row
    stats.extend([vague_f1, vague_p, vague_r, unvague_f1, unvague_p, unvague_r]) # add on the vague/unvague ones
    results.loc[len(results.index)] = stats
results

Unnamed: 0,F1,P,R,tn,fp,fn,tp,F1-V,P-V,R-V,F1-UV,P-UV,R-UV
0,0.754959,0.823899,0.935714,32.0,9.0,28.0,131.0,0.737762,0.802326,0.985714,0.7669,0.849315,0.885714
1,0.796491,0.863946,0.907143,40.0,13.0,20.0,127.0,0.85119,0.891892,0.942857,0.742363,0.835616,0.871429
2,0.768964,0.837662,0.921429,35.0,11.0,25.0,129.0,0.797057,0.855263,0.928571,0.740125,0.820513,0.914286
3,0.694157,0.786982,0.95,24.0,7.0,36.0,133.0,0.737762,0.802326,0.985714,0.652343,0.771084,0.914286
4,0.786667,0.8375,0.957143,34.0,6.0,26.0,134.0,0.870062,0.884615,0.985714,0.698465,0.792683,0.928571
5,0.721003,0.803681,0.935714,28.0,9.0,32.0,131.0,0.717674,0.797619,0.957143,0.723648,0.810127,0.914286
6,0.768277,0.873134,0.835714,43.0,23.0,17.0,117.0,0.786834,0.853333,0.914286,0.748881,0.898305,0.757143
7,0.774295,0.846667,0.907143,37.0,13.0,23.0,127.0,0.828925,0.860759,0.971429,0.723524,0.830986,0.842857
8,0.751984,0.837838,0.885714,36.0,16.0,24.0,124.0,0.836991,0.88,0.942857,0.668752,0.794521,0.828571
9,0.789446,0.841772,0.95,35.0,7.0,25.0,133.0,0.870062,0.884615,0.985714,0.706667,0.8,0.914286


In [7]:
# Correct hf_train and hf_test, idk lol
import pandas as pd

corpus = pd.read_csv('Euphemism_Corpus_v2.0.csv')
train_df = pd.read_csv('BERT/hf_train.csv')
test_df = pd.read_csv('BERT/hf_test.csv')

counter = 0
# i know this loop is an efficiency nightmare but i am too lazy and it prob doesn't matter
for i, row in corpus.iterrows():
    text = corpus.loc[i]['edited_text']
    label = corpus.loc[i]['is_euph']
    for j, row in train_df.iterrows():
        a = train_df.loc[j]['text']
        if (text == a):
            l = train_df.loc[j]['label']
            if (l != label):
                train_df.loc[j, 'label'] = label
                counter += 1
            break
    for k, row in test_df.iterrows():
        b = test_df.loc[k]['text']
        if (text == b):
            l = test_df.loc[k]['label']
            if (l != label):
                test_df.loc[k, 'label'] = label
                counter += 1
            break
    if (i % 100 == 0):
        print(i)
            
print(counter)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
21


In [8]:
train_df.to_csv('hf_train_2.csv')
test_df.to_csv('hf_test_2.csv')

# Using Previous Annotator Data

In [3]:
import pandas as pd

euph_corpus = pd.read_csv("Euphemism_Corpus_v2.0.csv", index_col=0)
annotator_data = pd.read_csv("All_Annotator_Data_Fall_2021.csv", index_col=False)
annotator_data

Unnamed: 0,keyword,edited_text,MP_is_euph,kira_is_euph,kira_diff,kira_interpretation,kira_confidence,raz_is_euph,raz_diff,raz_interpretation,...,kenna_diff,kenna_interpretation,kenna_confidence,kelly_is_euph,kelly_diff,kelly_interpretation,sum diff,abs_diff,kelly_confidence,sum
0,disabled,"I was homeschooled up until 5th grade, which i...",1,0,1,disabled,3,0,1,disabled is the most commonly accepted term by...,...,1,impaired health condition,3,0,1,individuals with mental disability,0,0,2,1
1,detainees,1. (U ) The US Army Criminal Investigation Com...,1,0,1,those who are detained,3,0,1,those in detainment,...,1,people held by the police,3,0,1,person held in custody literal meaning,0,0,2,1
2,disabled,No no no no. I'm in the same situation-- <disa...,1,0,1,disabled,3,0,1,having a disability,...,1,physically or mentally impaired,3,0,1,incapacitated in some way,0,0,3,1
3,latrine,Access to basic sanitation includes safety and...,1,0,1,latrine,3,0,1,a pit in the earth for defacation,...,1,bathroom,2,0,1,latrine - not bathroom or lavatory,0,0,3,1
4,disabled,Got all the working people and stay-home moms ...,1,0,1,disabled,3,0,1,not ablebodied,...,1,physically or mentally impaired,3,0,1,incapacitated in some way,0,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,plump,"Hello, Mr. Biddle! chirped a beautiful blonde ...",1,1,0,round,3,1,0,fat,...,0,round,3,1,0,chunky,4,4,3,5
496,armed conflict,"The ""Durand Line"" that @ @ @ @ @ @ @ @ @ @ lar...",1,1,0,war,3,1,0,war,...,0,war,3,1,0,war,4,4,3,5
497,less fortunate,We are excited as a church to partner with Uni...,1,1,0,poorer,3,1,0,poor,...,0,poor,3,1,0,more poor,4,4,3,5
498,overweight,As someone who struggles with their weight at ...,1,1,0,overweight or fat,3,1,0,fat,...,0,fat,3,1,0,fat,4,4,2,5


In [33]:
high_agreement = annotator_data.loc[annotator_data['sum'].isin([0, 5])]
mp_wrong_1s = high_agreement.loc[high_agreement['sum'] == 1].loc[high_agreement['MP_is_euph'] == 1]
mp_wrong_0s = high_agreement.loc[high_agreement['sum'] == 4].loc[high_agreement['MP_is_euph'] == 0]
high_agreement = high_agreement.drop(mp_wrong_1s.index)
high_agreement = high_agreement.drop(mp_wrong_0s.index)
print(len(high_agreement['keyword'].unique()))
print(high_agreement['keyword'].unique().tolist())
# low_agreement = annotator_data.loc[annotator_data['sum'].isin([2, 3])]
# print(len(set(high_agreement['keyword'].unique()).difference(set(low_agreement['keyword'].unique()))))
# print(set(high_agreement['keyword'].unique()).difference(set(low_agreement['keyword'].unique())))
# high_agreement.intersection(low_agreement)

97
['slim', 'between jobs', 'accident', 'late', 'number one', 'sleep with', 'seasoned', 'wealthy', 'over the hill', 'plump', 'let go of', 'go all the way', 'overweight', 'sober', 'number two', 'slept with', 'dismissed', 'let them go', 'aging', 'expecting', 'stout', 'troubled', 'with child', 'invalid', 'experienced', 'getting clean', 'custodian', 'got clean', 'long sleep', 'mixed up', 'chest', 'same-sex', 'economical', 'passing on', 'neutralize', 'outspoken', 'gluteus maximus', 'sleep around', 'pass on', 'disabled', 'special needs', 'pass away', 'a certain age', 'well off', 'less fortunate', 'mistruths', 'droppings', 'lose your lunch', 'pregnancy termination', 'let him go', 'golden years', 'mentally challenged', 'tinkle', 'demise', 'drinking problem', 'indigent', 'detainee', 'advanced age', 'comfort women', 'time of the month', 'pass gas', 'portly', 'went to heaven', 'venereal disease', 'put to sleep', 'mistruth', 'differently-abled', 'intoxicated', 'economical with the truth', 'lavator

In [28]:
print(set(low_agreement['keyword'].unique()))
print(len(set(low_agreement['keyword'].unique())))

{'stout', 'homemaker', 'dismissed', 'special needs', 'targeted killings', 'elderly', 'put to sleep', 'psychiatric hospital', 'running behind', 'same-sex', 'same sex', 'underdeveloped', 'outlived her usefulness', 'negative cash flow', 'collateral damage', 'weed', 'income inequality', 'intoxicated', 'latrine', 'long sleep', 'aging', 'let him go', 'downsize', 'custodians', 'perished', 'developing country', 'overweight', 'underprivileged', 'demise', 'global south', 'pass on', 'disadvantaged', 'undocumented immigrants', 'troubled', 'outspoken', 'undocumented workers', 'wealthy', 'chest', 'developed country', 'economical', 'mixed up', 'invalid', 'seasoned', 'developmentally disabled', 'accident', 'pass away', 'let her go', 'venereal disease', 'a certain age', 'with child', 'expecting', 'laying off', 'regime change', 'disabled', 'well off', 'able-bodied', 'sober', 'pro-choice', 'passed away', 'deprived', 'seeing someone', 'outlived his usefulness', 'freedom fighter', 'sanitation worker', 'ext