## Testing on euph corpus

#### Load and preprocess euph sentences

In [9]:
import pandas as pd
import re

euph_corpus = pd.read_csv('data/Euphemism_Corpus_2-24.csv', index_col=0, encoding='utf-8')

In [10]:
def preprocess(s):
    s = s.strip()
    s = re.sub(r'(##\d*\W)|<\w>|,|;|:|--|\(|\)|#|%|\\|\/|\.|\*|\+|@', '', s)
    s = re.sub(r'\s\s+', ' ', s)
    s = s.lower()
    return s

In [11]:
# preprocess sentences
for i, row in euph_corpus.iterrows():
    text = euph_corpus.loc[i, 'sentence']
    euph_corpus.loc[i, 'sentence'] = preprocess(text)

In [12]:
# phrasify the sentences
from gensim.models.phrases import Phraser, Phrases

bigram_phraser = Phraser.load('data/bigram_phraser_7')
trigram_phraser = Phraser.load('data/trigram_phraser_7')
euph_corpus['phrases'] = ""
data = [] # holds phrased input sentences to update wv model with

for i, row in euph_corpus.iterrows():
    text = euph_corpus.loc[i, 'sentence']
    euph_corpus.at[i, 'phrases'] = bigram_phraser[text.split()] # use phraser to detect phrases in text
    euph_corpus.at[i, 'phrases'] = trigram_phraser[euph_corpus.loc[i, 'phrases']]
    data.append(euph_corpus.loc[i, 'phrases'])

In [13]:
# confirm the phraser is still working
trigram_phraser[['are', 'we', 'talking', 'the', 'merits', "of'", 'enhanced_interrogation', 'techniques', 'or', 'the', 'definition', 'of', 'torture']]

['are',
 'we',
 'talking',
 'the',
 'merits',
 "of'",
 'enhanced_interrogation_techniques',
 'or',
 'the',
 'definition_of_torture']

#### Define topic similarity function, topic list and stopwords

In [14]:
def sum_similarity(phrase, topic_list):
    score = 0
    for topic in topic_list:
        try:
            similarity = model.wv.similarity(phrase, topic)
            if (similarity > 0):
                score += similarity
        except:
            score += 0
    return score

In [15]:
# define topic list and stopwords
topic_list = ['politics', 'death', 'kill', 'crime',
               'drugs', 'alcohol', 'fat', 'old', 'poor', 'cheap',
               'sex', 'sexual',
               'employment', 'job', 'disability', 'disabled', 
               'accident', 'pregnant', 'poop', 'sickness', 'race', 'racial', 'vomit'
              ]

stopwords = []
#['the', 'a', 'to', 'him', 'her', 'them', 'me', 'you', 'of', 'with']

with open('data/stopwords.txt','rb') as f:
    content = f.read()
    content = content.split(b'\r\n')
    for line in content:
        stopwords.append(line.decode('utf-8'))

#### Perform topic filtering and evaluation

In [16]:
# define model and train on new data
from gensim.models import Word2Vec
model = Word2Vec.load("data/wv_model_7") # typically takes 45-90 seconds
# train model on input data 
model.train(data, total_examples=len(data), epochs=10)

(346521, 451540)

In [17]:
# TESTING for similarities to topic words on a single phrase
test_phrase = 'laid_off'
similar_topics = []
score = 0
for topic in topic_list:
    similarity = model.wv.similarity(test_phrase, topic)
    if (similarity > 0.24):
        similar_topics.append(topic)
    if (similarity > 0):
        score += similarity
    print('{}: {}'.format(topic, similarity))

print('SIMILAR TOPICS: {}'.format(similar_topics))
print('TOTAL SCORE: {}'.format(score))

politics: -0.09195461869239807
death: -0.0504598394036293
kill: 0.044105637818574905
crime: -0.005749249830842018
drugs: 0.09508519619703293
alcohol: 0.002513594925403595
fat: 0.17617137730121613
old: 0.20360994338989258
poor: 0.23568153381347656
cheap: 0.10365447402000427
sex: -0.1314983069896698
sexual: -0.20578521490097046
employment: 0.4224083721637726
job: 0.4618397355079651
disability: 0.2803969383239746
disabled: 0.39359474182128906
accident: 0.24307262897491455
pregnant: 0.3209178149700165
poop: 0.0008516162633895874
sickness: 0.1409498155117035
race: -0.1082506850361824
racial: -0.17647254467010498
vomit: -0.07262822240591049
SIMILAR TOPICS: ['employment', 'job', 'disability', 'disabled', 'accident', 'pregnant']
TOTAL SCORE: 3.1248534210026264


In [18]:
THRESHOLD = 1.5
score = 0

successes = []
partial_successes = []
failures = []
topically_filtered_euphs = []
quality_phrase_count = 0
filtered = []

euph_corpus['quality_phrases'] = ""

for i, row in euph_corpus.iterrows():
    text = euph_corpus.loc[i, 'sentence']
    phrases = euph_corpus.loc[i, 'phrases']
    euph = euph_corpus.loc[i, 'keyword']
    quality_phrases = []
    for phrase in phrases:
        if (phrase in stopwords):
            continue
        similarity = sum_similarity(phrase, topic_list)
        if (similarity > THRESHOLD and phrase not in quality_phrases):
            quality_phrases.append(phrase)
        elif (similarity < THRESHOLD and euph == re.sub(r'_', ' ', phrase)):
            if euph not in topically_filtered_euphs:
                topically_filtered_euphs.append(euph)
        else:
            filtered.append(phrase)
    # add the quality phrases to the column
    euph_corpus.at[i, 'quality_phrases'] = quality_phrases
    
    # now check if the euph in the sentence is retained in the list of quality phrases
    quality_phrases = [re.sub(r'_', ' ', p) for p in quality_phrases]
    quality_phrase_count += len(quality_phrases)
    
    if euph in quality_phrases:
        score += 1
        if euph not in successes:
            successes.append(euph)
    else:
        partial_success = False
        for p in quality_phrases: # check if phrase output contains euphemism
            if euph in p:
                score += 1
                if euph not in partial_successes:
                    partial_successes.append(euph)
                    partial_success = True
                    break
        if (partial_success == False): 
            if euph not in failures:
                failures.append(euph)

            # check failures for a particular phrase
            # if (euph == "ethnic cleansing"):
            #     print("TEXT: {}".format(text))
            #     print("PHRASES: {}".format(phrases))
            #     print("QUALITY PHRASES: {}".format(quality_phrases))
            #     print()

print("Retained the euphemism in {} out of {} sentences".format(score, len(euph_corpus)))
print("{} quality phrases retained overall".format(quality_phrase_count))
print("Filtered {} non-keywords out".format(len(filtered)))
print()
print("EXACT SUCCESSES: {}".format(successes))
print()
print("PARTIAL SUCCESSES: {}".format(partial_successes))
print()
print("FAILURES: {}".format(failures))
print()
print("FALSE NEGATIVES of TOPIC FILTERING: {}".format(topically_filtered_euphs))

Retained the euphemism in 1626 out of 1965 sentences
14787 quality phrases retained overall
Filtered 7034 non-keywords out

EXACT SUCCESSES: ['tinkle', 'undocumented immigrants', 'undocumented immigrant', 'venereal diseases', 'venereal disease', 'sex workers', 'sex worker', 'mentally disabled', 'correctional facilities', 'correctional facility', 'freedom fighters', 'freedom fighter', 'detainees', 'detainee', 'psychiatric hospital', 'ethnic cleansing', 'ethnically cleansed', 'enhanced interrogation techniques', 'mistruths', 'mistruth', 'elderly', 'armed conflict', 'drinking problem', 'deceased', 'pro-life', 'income inequality', 'rear end', 'lavatory', 'birds and the bees', 'inner city', 'developing country', 'developed country', 'substance abuse', 'global south', 'underprivileged', 'inebriated', 'homemaker', 'capital punishment', 'differently-abled', 'indigent', 'detention camp', 'pass gas', 'dearly departed', 'terminating a pregnancy', 'pregnancy termination', 'senior citizen', 'senior

## Sentiment

In [2]:
# load checkpoint containing quality phrases
euph_corpus = pd.read_csv('Euphemism_Corpus_with_Quality_Phrases_1.csv', encoding='utf-8', index_col = 0)

#### roBERTa Sentiment

In [20]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

def load_roberta_sentiment():
    # Tasks:
    # emoji, emotion, hate, irony, offensive, sentiment
    # stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

    task='sentiment'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # download label mapping
    labels=[]
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as f:
        html = f.read().decode('utf-8').split("\n")
        csvreader = csv.reader(html, delimiter='\t')
    labels = [row[1] for row in csvreader if len(row) > 1]

    # pretrained
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    model.save_pretrained(MODEL)
    tokenizer.save_pretrained(MODEL)
    
    return labels, model, tokenizer

#### roBERTa Offensive

In [21]:
def load_roberta_offensive():
    task='offensive'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # download label mapping
    labels=[]
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as f:
        html = f.read().decode('utf-8').split("\n")
        csvreader = csv.reader(html, delimiter='\t')
    labels = [row[1] for row in csvreader if len(row) > 1]

    # PT
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    model.save_pretrained(MODEL)
    tokenizer.save_pretrained(MODEL)
    
    return labels, model, tokenizer

In [22]:
# functions for using the roberta models
def get_sentiment(s, labels, model, tokenizer):
    encoded_input = tokenizer(s, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    # ranking = np.argsort(scores)
    # ranking = ranking[::-1]
    # for i in range(scores.shape[0]):
    #     l = labels[ranking[i]]
    #     s = scores[ranking[i]]
        # print(f"{i+1}) {l} {np.round(float(s), 4)}")
    return scores

def get_offensive(s, labels, model, tokenizer):
    encoded_input = tokenizer(s, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    # ranking = np.argsort(scores)
    # ranking = ranking[::-1]
    # for i in range(0, 2):
    #     l = labels[ranking[i]]
    #     s = scores[ranking[i]]
        # print(f"{i+1}) {l} {np.round(float(s), 4)}")
    return scores

#### Run sentiment/offensive analysis on euph corpus

In [27]:
'''
needs functions load_roberta_sentiment(), load_roberta_offensive(), get_sentiment() and get_offensive()
'''
def get_top_euph_candidates(text, phrases, num_paraphrases, wv_model, sentiment_pack, offensive_pack, show_stats=False):
    
    sentiment_labels, sentiment_model, sentiment_tokenizer = sentiment_pack[0], sentiment_pack[1], sentiment_pack[2]
    offensive_labels, offensive_model, offensive_tokenizer = offensive_pack[0], offensive_pack[1], offensive_pack[2]
    
    orig_scores = list(get_sentiment(text, sentiment_labels, sentiment_model, sentiment_tokenizer))
    orig_scores = orig_scores + list(get_offensive(text, offensive_labels, offensive_model, offensive_tokenizer))
    if show_stats == True: print('SENTIMENT OF ORIGINAL SENTENCE: {}'.format(orig_scores))
    phrase_scores = []

    for q in phrases:
        paraphrases = []
        if show_stats == True: print('\n'+q)
        paraphrases = wv_model.wv.most_similar(q, topn = num_paraphrases) # can swap out
        
        # print(paraphrases)
        
        # various sentiment statistics
        sentiment_shift = [0, 0, 0, 0, 0]
        max_inc = [0, 0, 0, 0, 0]
        max_inc_para = ["", "", "", "", ""]
        tot_neg_inc = 0
        tot_neu_inc = 0
        tot_off_inc = 0
        tot_noff_inc = 0
        
        for p in paraphrases:
            p_string = re.sub(r'_', ' ', p[0]) # the underscores are removed for sentiment computation - experiment?
            q_string = re.sub(r'_', ' ', q)
            
            if (q_string in p_string):
#                 print("Paraphrase is superstring, skipping!")
#                 print()
                continue

            # replacement
            pattern = re.compile(r'\b'+q_string+r'\b', re.IGNORECASE)
            new_sentence = pattern.sub(p_string, text)
            # at this point, we could check the integrity of the paraphrase

            # get the sentiment/offensive scores for this paraphrase
            scores = list(get_sentiment(new_sentence, sentiment_labels, sentiment_model, sentiment_tokenizer))
            scores = scores + list(get_offensive(new_sentence, offensive_labels, offensive_model, offensive_tokenizer))

            # update the quality phrase's sentiment statistics with the sentiment shifts from this paraphrase
            shifts = [0, 0, 0, 0, 0]
            for i in range(0, len(scores)):
                shifts[i] = scores[i] - orig_scores[i]
                sentiment_shift[i] += shifts[i]
                if (shifts[i] > max_inc[i]):
                    max_inc[i] = shifts[i]
                    max_inc_para[i] = p_string

            # update the relevant scores for detection
            if (shifts[0] > 0):
                tot_neg_inc += shifts[0]
            if (shifts[1] > 0):
                tot_neu_inc += shifts[1]
            if (shifts[3] > 0):
                tot_noff_inc += shifts[3]
            if (shifts[4] > 0):
                tot_off_inc += shifts[4]
        
        for val in sentiment_shift:
            val /= num_paraphrases
        if (show_stats == True):
            print("AVERAGE SENTIMENT SHIFTS: {}".format(sentiment_shift))
            print("MAX INCREASE FROM A PHRASE: {}".format(max_inc))
            print("PHRASES THAT CAUSED EACH ^: {}".format(max_inc_para))
            print("TOTAL NEGATIVE INCREASE: {}".format(tot_neg_inc))
            print("TOTAL NEUTRAL INCREASE: {}".format(tot_neu_inc))
            print("TOTAL NEUTRAL INCREASE: {}".format(tot_noff_inc))
            print("TOTAL OFFENSIVE INCREASE: {}".format(tot_off_inc))

        phrase_scores.append((q_string, tot_neg_inc + tot_neu_inc + 2*(tot_noff_inc + tot_off_inc)))
        # alternate scoring schemes
        # phrase_scores.append((q_string, max_inc[0] + 2*max_inc[4]))
        # phrase_scores.append((q_string, tot_neg_inc + tot_neu_inc + 2*(tot_off_inc)))
    phrase_scores = list(sorted(phrase_scores, key=lambda x: x[1], reverse=True))
    return phrase_scores

In [24]:
# load the models
sentiment_labels, sentiment_model, sentiment_tokenizer = load_roberta_sentiment()
offensive_labels, offensive_model, offensive_tokenizer = load_roberta_offensive()

sentiment_pack = [sentiment_labels, sentiment_model, sentiment_tokenizer]
offensive_pack = [offensive_labels, offensive_model, offensive_tokenizer]

In [28]:
import re

num_paraphrases = 25
score = 0
k = 2 # check the top k candidates for the PET -> success
euph_corpus['candidates'] = ""
euph_corpus['top_2'] = 0

In [29]:
from tqdm import tqdm

for i, row in tqdm(euph_corpus.iterrows(), total=euph_corpus.shape[0]):
# uncomment below if resuming from checkpoint
#     if (0 < i < 600):
#         continue
    phrases = euph_corpus.loc[i, 'quality_phrases']
    
    # Converting string to list IF READING FROM CSV as checkpoint
    # phrases = ast.literal_eval(phrases)
    
    text = euph_corpus.loc[i, 'sentence']
    euph = euph_corpus.loc[i, 'keyword']
    
    top_candidates = get_top_euph_candidates(text, phrases, num_paraphrases, model, 
                                             sentiment_pack, offensive_pack, show_stats=False)
#     print(top_candidates)
#     print()
    euph_corpus.at[i, 'candidates'] = top_candidates
    
    # check the top k candidates - this code could use cleaning up
    for x in range(0, k):
        if (len(top_candidates) == 0):
            break
        if (len(top_candidates) == 1):
            candidate = top_candidates[0][0]
            if euph in candidate:
                score += 1
                if (score % 50 == 0):
                    print(score)
                euph_corpus.loc[i, 'top_2'] = 1
            break
        candidate = top_candidates[x][0]
        if euph in candidate:
            score += 1
            if (score % 50 == 0):
                print(score)
            euph_corpus.loc[i, 'top_2'] = 1
            break

    if (i == 1382):
        break
print("Euphemism detected in {} out of {} sentences".format(score, 1382))

  5%|▍         | 89/1965 [16:21<5:42:08, 10.94s/it] 

50


  9%|▉         | 173/1965 [29:28<4:20:42,  8.73s/it] 

100


 13%|█▎        | 256/1965 [43:10<2:59:04,  6.29s/it] 

150


 17%|█▋        | 336/1965 [54:12<3:26:44,  7.61s/it]

200


 21%|██        | 416/1965 [1:09:17<2:05:25,  4.86s/it]

250


 26%|██▌       | 508/1965 [1:23:53<3:05:35,  7.64s/it] 

300


 31%|███       | 611/1965 [1:43:39<3:49:19, 10.16s/it]

350


 36%|███▌      | 698/1965 [1:56:12<3:18:02,  9.38s/it]

400


 42%|████▏     | 819/1965 [2:14:39<2:22:22,  7.45s/it]

450


 45%|████▌     | 888/1965 [2:23:00<2:08:47,  7.18s/it]

500


 50%|████▉     | 980/1965 [2:36:04<2:48:58, 10.29s/it]

550


 55%|█████▌    | 1083/1965 [2:52:20<2:39:49, 10.87s/it]

600


 61%|██████    | 1195/1965 [3:07:49<1:24:29,  6.58s/it]

650


 68%|██████▊   | 1329/1965 [3:26:49<3:05:32, 17.50s/it]

700


 70%|███████   | 1382/1965 [3:36:39<1:31:24,  9.41s/it]

Euphemism detected in 725 out of 1382 sentences





In [30]:
num_correct = 0
for x in euph_corpus['top_2'].tolist():
    if (x == 1):
        num_correct += 1
print(num_correct)

725


In [36]:
euph_corpus.to_csv('results_8.3.csv')

## Analytics

In [1]:
import pandas as pd

euph_corpus = pd.read_csv('results_8.3.csv', index_col=0)

#### Print number of 1st, 2nd, and 3rd place PET rankings

In [37]:
import ast # this package is helpful for parsing lists stored in CSV files; which contain the literal characters [, ], etc.
num_first_place = 0
num_second_place = 0
num_third_place = 0
for i, row in euph_corpus.iterrows():
    if (i > 1382):
        continue
    top_2 = euph_corpus.loc[i, 'top_2']
    keyword = euph_corpus.loc[i, 'keyword']
    candidates = euph_corpus.loc[i, 'candidates']
    # Converting string to list
    # candidates = ast.literal_eval(candidates)
    if (top_2 == 1):
        if (keyword in candidates[0][0]):
            num_first_place += 1
        elif (keyword in candidates[1][0]):
            num_second_place += 1
    elif (len(candidates) > 2):
        if (keyword in candidates[2][0]):
            num_third_place += 1

print(num_first_place)
print(num_second_place)
print(num_third_place)

468
257
166


#### Print number of phrase candidates and target PETs retained after Phrase Extraction

In [39]:
import re
count = 0
tot_p = 0
# denote rows where keyword was present in REGULAR phrases
for i, row in euph_corpus.iterrows():
    if (euph_corpus.loc[i, "is_euph"] == 0):
        continue
    phrases = euph_corpus.loc[i, "phrases"]
    # Converting string to list
    # phrases = ast.literal_eval(phrases)
    tot_p += len(phrases)
    keyword = euph_corpus.loc[i, 'keyword']
    for p in phrases:
        p_string = re.sub(r'_', ' ', p)
        if keyword in p_string:
            #euph_corpus.loc[i, 'keyword_present'] = 1
            count += 1
            break
            
print(count) # PETs retained after Phrase Extraction
print(tot_p) # total number of phrases remaining

1251
31348


#### Print number of phrase candidates and target PETs retained after Phrase Filtering

In [41]:
import re
count = 0
tot_q = 0
# denote rows where keyword was present in quality phrases
euph_corpus['keyword_present'] = 0
for i, row in euph_corpus.iterrows():
    if (euph_corpus.loc[i, "is_euph"] == 0):
        continue
    quality_phrases = euph_corpus.loc[i, "quality_phrases"]
    # Converting string to list
    # quality_phrases = ast.literal_eval(quality_phrases)
    tot_q += len(quality_phrases)
    keyword = euph_corpus.loc[i, 'keyword']
    for q in quality_phrases:
        q_string = re.sub(r'_', ' ', q)
        if keyword in q_string:
            euph_corpus.loc[i, 'keyword_present'] = 1
            count += 1
            break

print(count) # PETs retained after Phrase Filtering
print(tot_q) # total number of phrases remaining 

1198
10503


In [9]:
# append whether or not the PET is present in quality_phrases (prior to ranking stage) as a column
euph_corpus.to_csv('results_8.2.1.csv')

#### Print number of phrase candidates and target PETs retained after Phrase Ranking

In [45]:
tot_top_2 = 0
for i, row in euph_corpus.iterrows():
    if (euph_corpus.loc[i, "is_euph"] == 0):
        continue
    phrases = euph_corpus.loc[i, "candidates"]
    if (len(phrases) == 0):
        continue
    if (len(phrases) == 1):
        tot_top_2 += 1
    else: 
        tot_top_2 += 2
    # Converting string to list
    # phrases = ast.literal_eval(phrases)
    # tot_top_2 += len(phrases)
print(tot_top_2) # total number of phrases that are in the top 1-2

2728
