<a href="https://colab.research.google.com/github/marsgav/PETDetection/blob/main/Euph_Detection_on_Euph_Corpus_4_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Testing on euph corpus

#### Load and preprocess euph sentences

In [None]:
import pandas as pd
import re

euph_corpus = pd.read_csv('Euphemism_Corpus_2-24.csv', index_col=0, encoding='utf-8')

In [None]:
def preprocess(s):
    s = s.strip()
    s = re.sub(r'(##\d*\W)|<\w>|,|;|:|--|\(|\)|#|%|\\|\/|\.|\*|\+|@', '', s)
    s = re.sub(r'\s\s+', ' ', s)
    s = s.lower()
    return s

In [None]:
# preprocess sentences
for i, row in euph_corpus.iterrows():
    text = euph_corpus.loc[i, 'sentence']
    euph_corpus.loc[i, 'sentence'] = preprocess(text)

In [None]:
# phrasify the sentences
from gensim.models.phrases import Phraser, Phrases

bigram_phraser = Phraser.load('bigram_phraser_5')
trigram_phraser = Phraser.load('trigram_phraser_5')
euph_corpus['phrases'] = ""
data = [] # holds phrased input sentences to update wv model with

for i, row in euph_corpus.iterrows():
    text = euph_corpus.loc[i, 'sentence']
    euph_corpus.at[i, 'phrases'] = bigram_phraser[text.split()] # use phraser to detect phrases in text
    euph_corpus.at[i, 'phrases'] = trigram_phraser[euph_corpus.loc[i, 'phrases']]
    data.append(euph_corpus.loc[i, 'phrases'])

In [None]:
# confirm the phraser is still working
# trigram_phraser[['are', 'we', 'talking', 'the', 'merits', "of'", 'enhanced_interrogation', 'techniques', 'or', 'the', 'definition', 'of', 'torture']]

In [None]:
# temporary - seeing results for a particular keyword
selection = euph_corpus[euph_corpus['keyword'] == 'disabled']
for i, row in selection.iterrows():
    print(selection.loc[i, 'phrases'])
    print(' '.join(selection.loc[i, 'phrases']))

["i'm", 'in', 'the', 'same', 'situation', 'disabled', 'chronic_pain', 'artist', 'no', 'visible', 'disability', 'even', 'when', "i'm", 'in', 'my', 'chair', 'and', 'nobody_understands', 'that', 'it', 'takes', 'us', 'longer', 'to', 'do', 'everything']
i'm in the same situation disabled chronic_pain artist no visible disability even when i'm in my chair and nobody_understands that it takes us longer to do everything
['whether', 'they', 'take', 'the', 'form', 'of', 'the', "neurologist's", 'multimodal', 'sensory', 'integrations', 'or', 'the', 'alternative', 'sense', 'organs', 'described', 'by', 'disabled', 'writers', 'and', 'performers', 'these', 're-organized', 'sensations', 'suggest', 'a', 'need', 'for', 'interdisciplinary', 'anti-disciplinary']
whether they take the form of the neurologist's multimodal sensory integrations or the alternative sense organs described by disabled writers and performers these re-organized sensations suggest a need for interdisciplinary anti-disciplinary
['the'

#### Define topic similarity function, topic list and stopwords

In [None]:
def sum_similarity(phrase, topic_list):
    score = 0
    for topic in topic_list:
        try:
            similarity = model.wv.similarity(phrase, topic)
            # EXPERIMENTAL - to "reward" the phrases with a high similarity to a particular category, but maybe not others
            if (similarity > 0.50):
                # print("{} has a high similarity with {}".format(phrase, topic))
                return 1.51
            if (similarity > 0):
                score += similarity
        except:
            score += 0
    return score

In [None]:
# define topic list and stopwords
topic_list = ['politics', 'death', 'kill', 'crime',
               'drugs', 'alcohol', 'fat', 'old', 'poor', 'cheap',
               'sex', 'sexual',
               'employment', 'job', 'disability', 'disabled', 
               'accident', 'pregnant', 'poop', 'sickness', 'race', 'racial', 'vomit'
              ]

stopwords = []
#['the', 'a', 'to', 'him', 'her', 'them', 'me', 'you', 'of', 'with']

with open('stopwords.txt','rb') as f:
    content = f.read()
    content = content.split(b'\r\n')
    for line in content:
        stopwords.append(line.decode('utf-8'))

#### Perform topic filtering and evaluation

In [None]:
# define model and train on new data
from gensim.models import Word2Vec
model = Word2Vec.load("TEMP")
# train model on input data - does number of epochs matter?
model.train(data, total_examples=len(data), epochs=10)

(393254, 528250)

In [None]:
import tqdm

THRESHOLD = 1.45
score = 0

successes = []
partial_successes = []
failures = []
topically_filtered_euphs = []
quality_phrase_count = 0
filtered = []

euph_corpus['quality_phrases'] = ""

for i, row in euph_corpus.iterrows():
    text = euph_corpus.loc[i, 'sentence']
    #phrases = phraser[text.split()] # use phraser to detect phrases in text
    phrases = euph_corpus.loc[i, 'phrases']
    euph = euph_corpus.loc[i, 'keyword']
    quality_phrases = []
    for phrase in phrases:
        if (phrase in stopwords):
            continue
        similarity = sum_similarity(phrase, topic_list)
        if (similarity > THRESHOLD and phrase not in quality_phrases):
            quality_phrases.append(phrase)
        elif (similarity < THRESHOLD and euph == re.sub(r'_', ' ', phrase)):
            if euph not in topically_filtered_euphs:
                topically_filtered_euphs.append(euph)
        else:
            filtered.append(phrase)
    # add the quality phrases to the column
    euph_corpus.at[i, 'quality_phrases'] = quality_phrases
    
    # now check if the euph in the sentence is retained in the list of quality phrases
    quality_phrases = [re.sub(r'_', ' ', p) for p in quality_phrases]
    quality_phrase_count += len(quality_phrases)
    
    if euph in quality_phrases:
        score += 1
        if euph not in successes:
            successes.append(euph)
    else:
        partial_success = False
        for p in quality_phrases: # check if phrase output contains euphemism
            if euph in p:
                score += 1
                if euph not in partial_successes:
                    partial_successes.append(euph)
                    partial_success = True
                    break
        if (partial_success == False): 
            if euph not in failures:
                failures.append(euph)

            # check failures for a particular phrase
            # if (euph == "ethnic cleansing"):
            #     print("TEXT: {}".format(text))
            #     print("PHRASES: {}".format(phrases))
            #     print("QUALITY PHRASES: {}".format(quality_phrases))
            #     print()

print("Retained the euphemism in {} out of {} sentences".format(score, len(euph_corpus)))
print("{} quality phrases retained overall".format(quality_phrase_count))
print("Filtered {} non-keywords out".format(len(filtered)))
print()
print("EXACT SUCCESSES: {}".format(successes))
print()
print("PARTIAL SUCCESSES: {}".format(partial_successes))
print()
print("FAILURES: {}".format(failures))
print()
print("FALSE NEGATIVES of TOPIC FILTERING: {}".format(topically_filtered_euphs))

Retained the euphemism in 1401 out of 1965 sentences
13532 quality phrases retained overall
Filtered 7243 non-keywords out

EXACT SUCCESSES: ['tinkle', 'undocumented immigrants', 'undocumented immigrant', 'venereal diseases', 'venereal disease', 'sex workers', 'sex worker', 'mentally disabled', 'correctional facilities', 'correctional facility', 'freedom fighters', 'freedom fighter', 'detainees', 'detainee', 'psychiatric hospital', 'ethnic cleansing', 'ethnically cleansed', 'enhanced interrogation techniques', 'mistruths', 'elderly', 'armed conflict', 'deceased', 'pro-life', 'income inequality', 'rear end', 'lavatory', 'birds and the bees', 'inner city', 'substance abuse', 'underprivileged', 'inebriated', 'homemaker', 'capital punishment', 'indigent', 'detention camp', 'dearly departed', 'terminating a pregnancy', 'pregnancy termination', 'senior citizen', 'senior citizens', 'substance abuser', 'substance abusers', 'undocumented workers', 'pre-owned', 'sanitation workers', 'latrine', '

In [None]:
# testing - for topic similarity queries on a single phrase
test_phrase = 'differently-abled'
similar_topics = []
score = 0
for topic in topic_list:
    similarity = model.wv.similarity(test_phrase, topic)
    if (similarity > 0.24):
        similar_topics.append(topic)
    if (similarity > 0):
        score += similarity
    print('{}: {}'.format(topic, similarity))

print('SIMILAR TOPICS: {}'.format(similar_topics))
print('TOTAL SCORE: {}'.format(score))

KeyError: "Key 'differently-abled' not present"

## Sentiment

In [None]:
euph_corpus

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,phrases,quality_phrases
0,tinkle,We're just getting back what was TAKEN from us...,1,bodily functions,tinkle,always_euph,we're just getting back what was taken from us...,"[we're, just, getting, back, what, was, taken,...","[greedy, mind, economy, tinkle, bush, mccain]"
1,tinkle,I think AB390 will pass next year now that the...,1,bodily functions,tinkle,always_euph,i think ab390 will pass next year now that the...,"[i, think, ab390, will, pass, next_year, now, ...","[pass, protection, fired, tinkle, positive]"
2,undocumented immigrants,"Singled Out Think Like a Man, the new movie ba...",1,politics,undocumented immigrant,always_euph,anything but secure a federal program designed...,"[anything, but, secure, a, federal, program, d...","[federal, program, immigrants, criminal_record..."
3,undocumented immigrants,"Not to be outdone, Sen. Rand Paul (R-Ky. ), so...",1,politics,undocumented immigrant,always_euph,in a post-election interview with politico pau...,"[in, a, post-election_interview, with, politic...","[marijuana_laws, immigrants]"
4,undocumented immigrants,The law has also galvanized the growing immigr...,1,politics,undocumented immigrant,always_euph,aside from undocumented immigrants the america...,"[aside, from, undocumented_immigrants, the, am...","[undocumented_immigrants, american_citizens, u..."
...,...,...,...,...,...,...,...,...,...
1960,sleep with,There were other photos she wanted me to see: ...,0,sexual activity,sleep with,sometimes_euph,there were other photos she wanted me to see b...,"[there, were, other, photos, she, wanted, me, ...","[white, lap, fending_off, sleep, gummy, smile]"
1961,sleep with,I am relieved to see two pup tents marked STAF...,0,sexual activity,sleep with,sometimes_euph,thank god i don't have to sleep with ace wands,"[thank, god, i, don't, have, to, sleep, with, ...","[god, sleep, wands]"
1962,sleep around,"Nothing serious, just long nights of me hackin...",0,sexual activity,sleep around,sometimes_euph,with all my caterwauling it's a wonder anyone ...,"[with, all, my, caterwauling, it's, a, wonder,...","[caterwauling, sleep]"
1963,with child,sounds more like Jonestown. They cant leave @ ...,0,physical/mental attributes,with child,sometimes_euph,they cant leave best advice i can give them is...,"[they, cant, leave, best, advice, i, can, give...","[leave, advice, feel, children, danger, phone,..."


#### roBERTa Sentiment

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

def load_roberta_sentiment():
    # Tasks:
    # emoji, emotion, hate, irony, offensive, sentiment
    # stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

    task='sentiment'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # download label mapping
    labels=[]
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as f:
        html = f.read().decode('utf-8').split("\n")
        csvreader = csv.reader(html, delimiter='\t')
    labels = [row[1] for row in csvreader if len(row) > 1]

    # pretrained
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    model.save_pretrained(MODEL)
    tokenizer.save_pretrained(MODEL)
    
    return labels, model, tokenizer

#### roBERTa Offensive

In [None]:
def load_roberta_offensive():
    task='offensive'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # download label mapping
    labels=[]
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as f:
        html = f.read().decode('utf-8').split("\n")
        csvreader = csv.reader(html, delimiter='\t')
    labels = [row[1] for row in csvreader if len(row) > 1]

    # PT
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    model.save_pretrained(MODEL)
    tokenizer.save_pretrained(MODEL)
    
    return labels, model, tokenizer

In [None]:
#EXPERIMENTAL - function for calculating sentiment score

def get_sentiment(s, labels, model, tokenizer):
    encoded_input = tokenizer(s, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = labels[ranking[i]]
        s = scores[ranking[i]]
        # print(f"{i+1}) {l} {np.round(float(s), 4)}")
    return scores

def get_offensive(s, labels, model, tokenizer):
    encoded_input = tokenizer(s, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(0, 2):
        l = labels[ranking[i]]
        s = scores[ranking[i]]
        # print(f"{i+1}) {l} {np.round(float(s), 4)}")
    return scores

[0.4909721 0.5090279]
[0.5206446  0.47935542]


#### Testing sentence similarity (unfortunately, laid_off has a high similarity score with 'rehired')

In [None]:
import spacy
from spacy.lang.en import English

# we disable all the annotators except the tokenizer so its fast
nlp = English(disable=['tagger', 'parser', 'ner'])

def tokenize(text):
  return [t.text.lower() for t in nlp(text)]  

In [None]:
s1 = tokenize('laid_off')
s2 = tokenize('rehired')
model.wv.n_similarity(s1, s2)

0.7555735

#### Testing sentiment shifts on a single sentence

In [None]:
# 815 - passed away
# 1232 - laid off
# 289 - birds and the bees
# 48 - mentally disabled
# 61 - correctional facility
# 77 - freedom fighters
# 155 - enhanced interrogation techniques (! - a diff word is 1st place)
# 165 - elderly (! - a diff word is 1st place)
# 392 - homemaker (! - actual euph is last place, but PET is 1st place)
# 1000 - same-sex (fails)
# 1010 - go all the way (fails because intended euph is not detected)
# 600 - fatality (! - "died" is 1st place, consider weighging scores differently? off > neg > neu)

i = 600 # index of a sentence from euph corpus 
sample_quality_phrases = euph_corpus.loc[i, 'quality_phrases']
sample_sentence = euph_corpus.loc[i, 'sentence']
sample_keyword = euph_corpus.loc[i, 'keyword']
num_paraphrases = 25

# load the models
sentiment_labels, sentiment_model, sentiment_tokenizer = load_roberta_sentiment()
offensive_labels, offensive_model, offensive_tokenizer = load_roberta_offensive()

# get the scores for the original sentence
print(sample_sentence)
orig_scores = list(get_sentiment(sample_sentence, sentiment_labels, sentiment_model, sentiment_tokenizer))
orig_scores = orig_scores + list(get_offensive(sample_sentence, offensive_labels, offensive_model, offensive_tokenizer))
print(orig_scores)

output = []

for q in sample_quality_phrases:
    sample_paraphrases = []
    print()
    print(q)
    sample_paraphrases = model.wv.most_similar(q, topn = num_paraphrases) # can swap out
    
    # various sentiment statistics
    sentiment_shift = [0, 0, 0, 0, 0]
    max_inc = [0, 0, 0, 0, 0]
    max_inc_para = ["", "", "", "", ""]
    tot_neg_inc = 0
    tot_neu_inc = 0
    tot_off_inc = 0
    
    for p in sample_paraphrases:
        p_string = re.sub(r'_', ' ', p[0]) # the underscores are removed for sentiment computation - experiment?
        q_string = re.sub(r'_', ' ', q)
        # replacement
        pattern = re.compile(r'\b'+q_string+r'\b', re.IGNORECASE)
        new_sentence = pattern.sub(p_string, sample_sentence)
        
        # print(new_sentence)
        # at this point, we could check the integrity of the paraphrase
        
        # get the sentiment/offensive scores for this paraphrase
        scores = list(get_sentiment(new_sentence, sentiment_labels, sentiment_model, sentiment_tokenizer))
        scores = scores + list(get_offensive(new_sentence, offensive_labels, offensive_model, offensive_tokenizer))
        
        # update the quality phrase's sentiment statistics with the sentiment shifts from this paraphrase
        shifts = [0, 0, 0, 0, 0]
        for i in range(0, len(scores)):
            shifts[i] = scores[i] - orig_scores[i]
            sentiment_shift[i] += shifts[i]
            if (shifts[i] > max_inc[i]):
                max_inc[i] = shifts[i]
                max_inc_para[i] = p_string
        
        # update the relevant scores for detection
        if (shifts[0] > 0):
            tot_neg_inc += shifts[0]
        if (shifts[1] > 0):
            tot_neu_inc += shifts[1]
        if (shifts[3] > 0):
            tot_off_inc += shifts[3]
        
    for val in sentiment_shift:
        val /= num_paraphrases
    print("AVERAGE SENTIMENT SHIFTS: {}".format(sentiment_shift))
    print("MAX INCREASE FROM A PHRASE: {}".format(max_inc))
    print("PHRASES THAT CAUSED EACH ^: {}".format(max_inc_para))
    print("TOTAL NEGATIVE INCREASE: {}".format(tot_neg_inc))
    print("TOTAL NEUTRAL INCREASE: {}".format(tot_neu_inc))
    print("TOTAL OFFENSIVE INCREASE: {}".format(tot_off_inc))
    
    output.append((q_string, tot_neg_inc + tot_neu_inc + tot_off_inc))

output = list(sorted(output, key=lambda x: x[1], reverse=True))
print()
print(output)

india registered a fifth fatality due to novel coronavirus on friday after an italian national died of covid-19 in rajasthan's jaipur
[0.7230875, 0.27123973, 0.005672735, 0.73014456, 0.26985538]

india
AVERAGE SENTIMENT SHIFTS: [0.26794058084487915, -0.26496194303035736, -0.0029774424619972706, -0.15483760833740234, 0.15483880043029785]
MAX INCREASE FROM A PHRASE: [0.046844244, 0.047704726, 0.0011110385, 0.00830394, 0.021167517]
PHRASES THAT CAUSED EACH ^: ['south africa', 'germany', 'brazil', 'zimbabwe', 'south africa']
TOTAL NEGATIVE INCREASE: 0.4220535159111023
TOTAL NEUTRAL INCREASE: 0.15132296085357666
TOTAL OFFENSIVE INCREASE: 0.034567296504974365

registered
AVERAGE SENTIMENT SHIFTS: [-0.9280001521110535, 0.8991212546825409, 0.028880521655082703, 0.11024391651153564, -0.11024245619773865]
MAX INCREASE FROM A PHRASE: [0.054001927, 0.1613546, 0.0055799475, 0.022153616, 0.006734699]
PHRASES THAT CAUSED EACH ^: ['backordered', 'certifying', 'legally qualified', 're-registered', 'cer