## Testing on euph corpus

#### Load and preprocess euph sentences

In [2]:
import pandas as pd
import re

euph_corpus = pd.read_csv('Euphemism_Corpus_2-24.csv', index_col=0, encoding='utf-8')

In [5]:
%pip install --upgrade pandas

Collecting pandas
  Downloading pandas-1.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[K     |████████████████████████████████| 11.7 MB 29.0 MB/s eta 0:00:01████▏                    | 4.1 MB 29.0 MB/s eta 0:00:01██████████████████████▏       | 8.8 MB 29.0 MB/s eta 0:00:01
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.2.4
    Uninstalling pandas-1.2.4:
      Successfully uninstalled pandas-1.2.4
Successfully installed pandas-1.4.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
def preprocess(s):
    s = s.strip()
    s = re.sub(r'(##\d*\W)|<\w>|,|;|:|--|\(|\)|#|%|\\|\/|\.|\*|\+|@', '', s)
    s = re.sub(r'\s\s+', ' ', s)
    s = s.lower()
    return s

In [12]:
# preprocess sentences
for i, row in euph_corpus.iterrows():
    text = euph_corpus.loc[i, 'sentence']
    euph_corpus.loc[i, 'sentence'] = preprocess(text)

In [13]:
# phrasify the sentences
from gensim.models.phrases import Phraser, Phrases

bigram_phraser = Phraser.load('bigram_phraser_7')
trigram_phraser = Phraser.load('trigram_phraser_7')
euph_corpus['phrases'] = ""
data = [] # holds phrased input sentences to update wv model with

for i, row in euph_corpus.iterrows():
    text = euph_corpus.loc[i, 'sentence']
    euph_corpus.at[i, 'phrases'] = bigram_phraser[text.split()] # use phraser to detect phrases in text
    euph_corpus.at[i, 'phrases'] = trigram_phraser[euph_corpus.loc[i, 'phrases']]
    data.append(euph_corpus.loc[i, 'phrases'])

In [14]:
# confirm the phraser is still working
trigram_phraser[['are', 'we', 'talking', 'the', 'merits', "of'", 'enhanced_interrogation', 'techniques', 'or', 'the', 'definition', 'of', 'torture']]

['are',
 'we',
 'talking',
 'the',
 'merits',
 "of'",
 'enhanced_interrogation_techniques',
 'or',
 'the',
 'definition_of_torture']

#### Define topic similarity function, topic list and stopwords

In [15]:
def sum_similarity(phrase, topic_list):
    score = 0
    for topic in topic_list:
        try:
            similarity = model.wv.similarity(phrase, topic)
            if (similarity > 0):
                score += similarity
        except:
            score += 0
    return score

In [16]:
# define topic list and stopwords
topic_list = ['politics', 'death', 'kill', 'crime',
               'drugs', 'alcohol', 'fat', 'old', 'poor', 'cheap',
               'sex', 'sexual',
               'employment', 'job', 'disability', 'disabled', 
               'accident', 'pregnant', 'poop', 'sickness', 'race', 'racial', 'vomit'
              ]

stopwords = []
#['the', 'a', 'to', 'him', 'her', 'them', 'me', 'you', 'of', 'with']

with open('stopwords.txt','rb') as f:
    content = f.read()
    content = content.split(b'\r\n')
    for line in content:
        stopwords.append(line.decode('utf-8'))

#### Perform topic filtering and evaluation

In [3]:
# define model and train on new data
from gensim.models import Word2Vec
model = Word2Vec.load("data/wv_model_7") # typically takes 45-90 seconds
# train model on input data 
model.train(data, total_examples=len(data), epochs=10)

NameError: name 'data' is not defined

In [18]:
THRESHOLD = 1.5
score = 0

successes = []
partial_successes = []
failures = []
topically_filtered_euphs = []
quality_phrase_count = 0
filtered = []

euph_corpus['quality_phrases'] = ""

for i, row in euph_corpus.iterrows():
    text = euph_corpus.loc[i, 'sentence']
    phrases = euph_corpus.loc[i, 'phrases']
    euph = euph_corpus.loc[i, 'keyword']
    quality_phrases = []
    for phrase in phrases:
        if (phrase in stopwords):
            continue
        similarity = sum_similarity(phrase, topic_list)
        if (similarity > THRESHOLD and phrase not in quality_phrases):
            quality_phrases.append(phrase)
        elif (similarity < THRESHOLD and euph == re.sub(r'_', ' ', phrase)):
            if euph not in topically_filtered_euphs:
                topically_filtered_euphs.append(euph)
        else:
            filtered.append(phrase)
    # add the quality phrases to the column
    euph_corpus.at[i, 'quality_phrases'] = quality_phrases
    
    # now check if the euph in the sentence is retained in the list of quality phrases
    quality_phrases = [re.sub(r'_', ' ', p) for p in quality_phrases]
    quality_phrase_count += len(quality_phrases)
    
    if euph in quality_phrases:
        score += 1
        if euph not in successes:
            successes.append(euph)
    else:
        partial_success = False
        for p in quality_phrases: # check if phrase output contains euphemism
            if euph in p:
                score += 1
                if euph not in partial_successes:
                    partial_successes.append(euph)
                    partial_success = True
                    break
        if (partial_success == False): 
            if euph not in failures:
                failures.append(euph)

            # check failures for a particular phrase
            # if (euph == "ethnic cleansing"):
            #     print("TEXT: {}".format(text))
            #     print("PHRASES: {}".format(phrases))
            #     print("QUALITY PHRASES: {}".format(quality_phrases))
            #     print()

print("Retained the euphemism in {} out of {} sentences".format(score, len(euph_corpus)))
print("{} quality phrases retained overall".format(quality_phrase_count))
print("Filtered {} non-keywords out".format(len(filtered)))
print()
print("EXACT SUCCESSES: {}".format(successes))
print()
print("PARTIAL SUCCESSES: {}".format(partial_successes))
print()
print("FAILURES: {}".format(failures))
print()
print("FALSE NEGATIVES of TOPIC FILTERING: {}".format(topically_filtered_euphs))

Retained the euphemism in 1626 out of 1965 sentences
14787 quality phrases retained overall
Filtered 7034 non-keywords out

EXACT SUCCESSES: ['tinkle', 'undocumented immigrants', 'undocumented immigrant', 'venereal diseases', 'venereal disease', 'sex workers', 'sex worker', 'mentally disabled', 'correctional facilities', 'correctional facility', 'freedom fighters', 'freedom fighter', 'detainees', 'detainee', 'psychiatric hospital', 'ethnic cleansing', 'ethnically cleansed', 'enhanced interrogation techniques', 'mistruths', 'mistruth', 'elderly', 'armed conflict', 'drinking problem', 'deceased', 'pro-life', 'income inequality', 'rear end', 'lavatory', 'birds and the bees', 'inner city', 'developing country', 'developed country', 'substance abuse', 'global south', 'underprivileged', 'inebriated', 'homemaker', 'capital punishment', 'differently-abled', 'indigent', 'detention camp', 'pass gas', 'dearly departed', 'terminating a pregnancy', 'pregnancy termination', 'senior citizen', 'senior

In [19]:
# testing - for topic similarity queries on a single phrase
test_phrase = 'weed'
similar_topics = []
score = 0
for topic in topic_list:
    similarity = model.wv.similarity(test_phrase, topic)
    if (similarity > 0.24):
        similar_topics.append(topic)
    if (similarity > 0):
        score += similarity
    print('{}: {}'.format(topic, similarity))

print('SIMILAR TOPICS: {}'.format(similar_topics))
print('TOTAL SCORE: {}'.format(score))

politics: 0.11407020688056946
death: 0.08639270067214966
kill: 0.31013041734695435
crime: 0.3427277207374573
drugs: 0.5708686113357544
alcohol: 0.6496191024780273
fat: 0.5302789211273193
old: 0.24798649549484253
poor: 0.1948791742324829
cheap: 0.41431498527526855
sex: 0.30003926157951355
sexual: 0.15527567267417908
employment: -0.014575891196727753
job: 0.12978608906269073
disability: -0.025316180661320686
disabled: -0.03400751203298569
accident: 0.17667508125305176
pregnant: 0.2841845154762268
poop: 0.6066109538078308
sickness: 0.3730947971343994
race: 0.10498347878456116
racial: 0.1140114963054657
vomit: 0.6100114583969116
SIMILAR TOPICS: ['kill', 'crime', 'drugs', 'alcohol', 'fat', 'old', 'cheap', 'sex', 'pregnant', 'poop', 'sickness', 'vomit']
TOTAL SCORE: 6.315941140055656


## Sentiment

#### Load up the checkpoint

In [1]:
# load checkpoint containing quality phrases
import pandas as pd
import ast

euph_corpus = pd.read_csv('Euphemism_Corpus_with_Quality_Phrases_1.csv', encoding='utf-8', index_col = 0)
euph_corpus = euph_corpus.drop(euph_corpus[euph_corpus.is_euph == 0].index) # drop all non-euph rows
euph_corpus = euph_corpus.sample(frac=1) # randomize row order

import re
euph_corpus['keyword_present'] = 0
# denote rows where keyword was present in REGULAR phrases
for i, row in euph_corpus.iterrows():
    if (euph_corpus.loc[i, "is_euph"] == 0):
        continue
    quality_phrases = euph_corpus.loc[i, "quality_phrases"]
    # Converting string to list
    quality_phrases = ast.literal_eval(quality_phrases)
    keyword = euph_corpus.loc[i, 'keyword']
    for q in quality_phrases:
        q_string = re.sub(r'_', ' ', q)
        if keyword in q_string:
            euph_corpus.loc[i, 'keyword_present'] = 1
            # count += 1
            break

euph_corpus

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,phrases,quality_phrases,keyword_present
554,same sex,"Multiple ""partners"" as you call it was NEVER i...",1,physical/mental attributes,same sex,always_euph,multiple partners and same sex partners was ne...,"['multiple_partners', 'and', 'same_sex', 'part...","['multiple_partners', 'same_sex', 'partners', ...",1
67,correctional facility,Ray Reyna Jr. said Los Banos Police will aid t...,1,employment,correctional facility,always_euph,this was such a tragic case and and booked on ...,"['this', 'was', 'such', 'a', 'tragic_case', 'a...","['tragic_case', 'suspicion_of_felony', 'dui', ...",1
97,detainees,"The disclosure that the detainee, Adnan Farhan...",1,politics,detainee,always_euph,instead the detainees remain stuck in legal li...,"['instead', 'the', 'detainees', 'remain_stuck'...","['detainees', 'remain_stuck', 'legal_limbo']",1
713,mentally challenged,Society as a whole has the respect for life sl...,1,physical/mental attributes,mentally challenged,always_euph,had down syndrome or was crippled mentally cha...,"['had', 'down_syndrome', 'or', 'was', 'cripple...","['down_syndrome', 'crippled', 'mentally_challe...",1
754,golden years,The new figures come out of BC's Center for Re...,1,physical/mental attributes,golden years,always_euph,only about 48 percent of current working house...,"['only', 'about', '48_percent', 'of', 'current...","['48_percent', 'working_households', 'ready', ...",1
...,...,...,...,...,...,...,...,...,...,...
1184,custodian,Ten dollars can help a kindergartner @ @ @ @ @...,1,employment,custodian,sometimes_euph,his journey has included a stint playing on hi...,"['his_journey', 'has', 'included_a_stint', 'pl...","['his_journey', 'included_a_stint', 'playing',...",1
212,deceased,This Must Be the Place-- Sean Penn stars in th...,1,death,deceased,always_euph,this must be the place sean penn stars in this...,"['this', 'must_be', 'the', 'place', 'sean_penn...","['sean_penn', 'comedy', 'retired', 'rock_star'...",1
978,collateral damage,"Arkin then turns to Afghanistan, arguing that ...",1,death,collateral damage,sometimes_euph,hence we need not be overly concerned about ci...,"['hence', 'we_need', 'not', 'be', 'overly_conc...","['overly_concerned', 'collateral_damage', ""'""]",1
278,rear end,Howard said he might be. Roger said that there...,1,physical/mental attributes,rear end,always_euph,roger said that there were people who came on ...,"['roger_said', 'that', 'there_were', 'people_w...","['people_who', 'touch', 'rear_end']",1


In [2]:
# load w2v model if using checkpoint
data = [] # holds phrased input sentences to update wv model with

for i, row in euph_corpus.iterrows():
    data.append(euph_corpus.loc[i, 'phrases'])
    
# define model and train on new data
from gensim.models import Word2Vec
model = Word2Vec.load("data/wv_model_7") # typically takes 45-90 seconds
# train model on input data 
model.train(data, total_examples=len(data), epochs=10)

(2350434, 3241180)

#### roBERTa Sentiment

In [6]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

def load_roberta_sentiment():
    # Tasks:
    # emoji, emotion, hate, irony, offensive, sentiment
    # stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

    task='sentiment'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # download label mapping
    labels=[]
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as f:
        html = f.read().decode('utf-8').split("\n")
        csvreader = csv.reader(html, delimiter='\t')
    labels = [row[1] for row in csvreader if len(row) > 1]

    # pretrained
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    model.save_pretrained(MODEL)
    tokenizer.save_pretrained(MODEL)
    
    return labels, model, tokenizer

#### roBERTa Offensive

In [7]:
def load_roberta_offensive():
    task='offensive'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # download label mapping
    labels=[]
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as f:
        html = f.read().decode('utf-8').split("\n")
        csvreader = csv.reader(html, delimiter='\t')
    labels = [row[1] for row in csvreader if len(row) > 1]

    # PT
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    model.save_pretrained(MODEL)
    tokenizer.save_pretrained(MODEL)
    
    return labels, model, tokenizer

In [8]:
# functions for using the roberta models
def get_sentiment(s, labels, model, tokenizer):
    encoded_input = tokenizer(s, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    # ranking = np.argsort(scores)
    # ranking = ranking[::-1]
    # for i in range(scores.shape[0]):
    #     l = labels[ranking[i]]
    #     s = scores[ranking[i]]
        # print(f"{i+1}) {l} {np.round(float(s), 4)}")
    return scores

def get_offensive(s, labels, model, tokenizer):
    encoded_input = tokenizer(s, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    # ranking = np.argsort(scores)
    # ranking = ranking[::-1]
    # for i in range(0, 2):
    #     l = labels[ranking[i]]
    #     s = scores[ranking[i]]
        # print(f"{i+1}) {l} {np.round(float(s), 4)}")
    return scores

#### Run sentiment/offensive analysis on euph corpus

In [23]:
'''
needs functions load_roberta_sentiment(), load_roberta_offensive(), get_sentiment() and get_offensive()
'''

# TEMP EXPERIMENT CHUNK
from difflib import SequenceMatcher
def get_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

def get_top_euph_candidates(text, phrases, num_paraphrases, wv_model, sentiment_pack, offensive_pack, show_stats=False):
    
    sentiment_labels, sentiment_model, sentiment_tokenizer = sentiment_pack[0], sentiment_pack[1], sentiment_pack[2]
    offensive_labels, offensive_model, offensive_tokenizer = offensive_pack[0], offensive_pack[1], offensive_pack[2]
    
    orig_scores = list(get_sentiment(text, sentiment_labels, sentiment_model, sentiment_tokenizer))
    orig_scores = orig_scores + list(get_offensive(text, offensive_labels, offensive_model, offensive_tokenizer))
    if show_stats == True: print('SENTIMENT OF ORIGINAL SENTENCE: {}'.format(orig_scores))
    phrase_scores = []

    for q in phrases:
        paraphrases = []
        if show_stats == True: print('\n'+q)
        paraphrases = wv_model.wv.most_similar(q, topn = num_paraphrases) # can swap out
        
        # print(paraphrases)
        
        # various sentiment statistics
        sentiment_shift = [0, 0, 0, 0, 0]
        max_inc = [0, 0, 0, 0, 0]
        max_inc_para = ["", "", "", "", ""]
        tot_neg_inc = 0
        tot_neu_inc = 0
        tot_pos_inc = 0
        tot_off_inc = 0
        tot_noff_inc = 0
        
        # for length ratio feature
        length_ratio = 0
        tot_para_length = 0
        num_para = 0
        
        for p in paraphrases:
            p_string = re.sub(r'_', ' ', p[0]) # the underscores are removed for sentiment computation - experiment?
            q_string = re.sub(r'_', ' ', q)
            
            '''FILTERING PARAPHRASES'''
            # filtering out paraphrases if they're a superstring
            if (q_string in p_string):
#                 print("Paraphrase is superstring, skipping!")
#                 print()
                continue
    
            # filtering out paraphrases if they're too similar
            if (get_similarity(q_string, p_string) > 0.5):
                continue
                
            if ('fuck' in p_string):
                continue
            
            if (wv_model.wv.get_vecattr(p[0], 'count') < 5):
                continue
                
            '''END PARAPHRASE FILTERING'''
            
            # replacement
            pattern = re.compile(r'\b'+q_string+r'\b', re.IGNORECASE)
            new_sentence = pattern.sub(p_string, text)
            # at this point, we could check the integrity of the paraphrase

            # get the sentiment/offensive scores for this paraphrase
            scores = list(get_sentiment(new_sentence, sentiment_labels, sentiment_model, sentiment_tokenizer))
            scores = scores + list(get_offensive(new_sentence, offensive_labels, offensive_model, offensive_tokenizer))

            # update the quality phrase's sentiment statistics with the sentiment shifts from this paraphrase
            shifts = [0, 0, 0, 0, 0]
            for i in range(0, len(scores)):
                shifts[i] = scores[i] - orig_scores[i]
                sentiment_shift[i] += shifts[i]
                if (shifts[i] > max_inc[i]):
                    max_inc[i] = shifts[i]
                    max_inc_para[i] = p_string

            # update the relevant scores for detection
            if (shifts[0] > 0):
                tot_neg_inc += shifts[0]
            if (shifts[1] > 0):
                tot_neu_inc += shifts[1]
            if (shifts[2] > 0):
                tot_pos_inc += shifts[2]
            if (shifts[3] > 0):
                tot_noff_inc += shifts[3]
            if (shifts[4] > 0):
                tot_off_inc += shifts[4]
                
            # update counts for length ratio
            num_para += 1
            tot_para_length += len(p_string)
        
        # compute length ratio feature
        if (num_para != 0):
            avg_para_length = tot_para_length / num_para
            length_ratio = len(q_string) / avg_para_length
#         print(length_ratio)
#         break
        
        for val in sentiment_shift:
            val /= num_paraphrases
        if (show_stats == True):
            print("AVERAGE SENTIMENT SHIFTS: {}".format(sentiment_shift))
            print("MAX INCREASE FROM A PHRASE: {}".format(max_inc))
            print("PHRASES THAT CAUSED EACH ^: {}".format(max_inc_para))
            print("TOTAL NEGATIVE INCREASE: {}".format(tot_neg_inc))
            print("TOTAL NEUTRAL INCREASE: {}".format(tot_neu_inc))
            print("TOTAL NEUTRAL INCREASE: {}".format(tot_noff_inc))
            print("TOTAL OFFENSIVE INCREASE: {}".format(tot_off_inc))

        # phrase_scores.append((q_string, tot_neg_inc + tot_neu_inc + 2*(tot_noff_inc + tot_off_inc)))
        # phrase_scores.append((q_string, 0.02337676*tot_neg_inc + 0.02267515*tot_neu_inc + 0.09802046*tot_noff_inc + 0.14442855*tot_off_inc))
        # 0.01125929*tot_pos_inc
        phrase_scores.append((q_string, 0.02183689*tot_neg_inc + 0.01949368*tot_neu_inc + 0.09130243*tot_noff_inc + 0.12983809 *tot_off_inc + 0.15030182*length_ratio))
            
        # phrase_scores.append((q_string, max_inc[0] + 2*max_inc[4]))
        # phrase_scores.append((q_string, tot_neg_inc + tot_neu_inc + 2*(tot_off_inc)))
    phrase_scores = list(sorted(phrase_scores, key=lambda x: x[1], reverse=True))
    return phrase_scores

In [10]:
# load the models
sentiment_labels, sentiment_model, sentiment_tokenizer = load_roberta_sentiment()
offensive_labels, offensive_model, offensive_tokenizer = load_roberta_offensive()

sentiment_pack = [sentiment_labels, sentiment_model, sentiment_tokenizer]
offensive_pack = [offensive_labels, offensive_model, offensive_tokenizer]

In [19]:
import re

num_paraphrases = 25
score = 0
k = 2 # check the top k candidates for the PET -> success
euph_corpus['candidates'] = ""
euph_corpus['top_2'] = 0

In [24]:
from tqdm import tqdm
import ast

for i, row in tqdm(euph_corpus.iterrows(), total=euph_corpus.shape[0]):
# uncomment below if resuming from checkpoint
#     if (0 < i < 600):
#         continue
    phrases = euph_corpus.loc[i, 'quality_phrases']
    
    # Converting string to list IF READING FROM CSV as checkpoint
    phrases = ast.literal_eval(phrases)
    
    text = euph_corpus.loc[i, 'sentence']
    euph = euph_corpus.loc[i, 'keyword']
    
    top_candidates = get_top_euph_candidates(text, phrases, num_paraphrases, model, 
                                             sentiment_pack, offensive_pack, show_stats=False)
    
#     print(top_candidates)
#     print()
    euph_corpus.at[i, 'candidates'] = top_candidates
    
    # check the top k candidates - this code could use cleaning up
    for x in range(0, k):
        if (len(top_candidates) == 0):
            break
        if (len(top_candidates) == 1):
            candidate = top_candidates[0][0]
            if euph in candidate:
                score += 1
                if (score % 50 == 0):
                    print(score)
                euph_corpus.loc[i, 'top_2'] = 1
            break
        candidate = top_candidates[x][0]
        if euph in candidate:
            score += 1
            if (score % 50 == 0):
                print(score)
            euph_corpus.loc[i, 'top_2'] = 1
            break

    if (i == 691):
        euph_corpus.to_csv('CHECKPOINT.csv')
    if (i == 1382):
        break
print("Euphemism detected in {} out of {} sentences".format(score, 1382))

  6%|▋         | 87/1382 [10:23<1:39:43,  4.62s/it]

50


 13%|█▎        | 182/1382 [23:54<2:28:50,  7.44s/it]

100


 19%|█▉        | 269/1382 [35:59<2:16:42,  7.37s/it]

150


 26%|██▌       | 358/1382 [46:29<2:10:46,  7.66s/it]

200


 32%|███▏      | 441/1382 [57:23<1:20:02,  5.10s/it]

250


 39%|███▉      | 539/1382 [1:10:37<1:07:56,  4.84s/it]

300


 46%|████▌     | 630/1382 [1:24:06<1:40:42,  8.04s/it]

350


 52%|█████▏    | 716/1382 [1:34:52<1:26:01,  7.75s/it]

400


 58%|█████▊    | 806/1382 [1:46:34<59:16,  6.18s/it]  

450


 64%|██████▍   | 888/1382 [1:57:49<45:01,  5.47s/it]  

500


 71%|███████   | 979/1382 [2:07:45<51:46,  7.71s/it]  

550


 77%|███████▋  | 1064/1382 [2:18:03<30:30,  5.76s/it]  

600


 83%|████████▎ | 1150/1382 [2:29:53<21:39,  5.60s/it]  

650


 89%|████████▊ | 1225/1382 [2:38:11<17:58,  6.87s/it]

700


 94%|█████████▍| 1305/1382 [2:47:04<06:51,  5.34s/it]

750


100%|██████████| 1382/1382 [2:58:11<00:00,  7.74s/it]

Euphemism detected in 798 out of 1382 sentences





In [30]:
num_correct = 0
for x in euph_corpus['top_2'].tolist():
    if (x == 1):
        num_correct += 1
print(num_correct)

725


In [25]:
euph_corpus.to_csv('results_10.3.csv')

## Analytics

In [1]:
import pandas as pd

euph_corpus = pd.read_csv('results_8.3.csv', index_col=0)

#### Print number of 1st, 2nd, and 3rd place PET rankings

In [26]:
import ast # this package is helpful for parsing lists stored in CSV files; which contain the literal characters [, ], etc.
num_first_place = 0
num_second_place = 0
num_third_place = 0
for i, row in euph_corpus.iterrows():
    if (i > 1382):
        continue
    top_2 = euph_corpus.loc[i, 'top_2']
    keyword = euph_corpus.loc[i, 'keyword']
    candidates = euph_corpus.loc[i, 'candidates']
    # Converting string to list
    # candidates = ast.literal_eval(candidates)
    if (top_2 == 1):
        if (keyword in candidates[0][0]):
            num_first_place += 1
        elif (keyword in candidates[1][0]):
            num_second_place += 1
    elif (len(candidates) > 2):
        if (keyword in candidates[2][0]):
            num_third_place += 1

print(num_first_place)
print(num_second_place)
print(num_third_place)

555
243
145


#### Print number of phrase candidates and target PETs retained after Phrase Extraction

In [16]:
import re
count = 0
tot_p = 0
# denote rows where keyword was present in REGULAR phrases
for i, row in euph_corpus.iterrows():
    if (euph_corpus.loc[i, "is_euph"] == 0):
        continue
    phrases = euph_corpus.loc[i, "phrases"]
    # Converting string to list
    phrases = ast.literal_eval(phrases)
    tot_p += len(phrases)
    keyword = euph_corpus.loc[i, 'keyword']
    for p in phrases:
        p_string = re.sub(r'_', ' ', p)
        if keyword in p_string:
            euph_corpus.loc[i, 'keyword_present'] = 1
            count += 1
            break
            
print(count)
print(tot_p)

1251
31348


#### Print number of phrase candidates and target PETs retained after Phrase Filtering

In [17]:
import re
count = 0
tot_q = 0
# denote rows where keyword was present in quality phrases
euph_corpus['keyword_present'] = 0
for i, row in euph_corpus.iterrows():
    if (euph_corpus.loc[i, "is_euph"] == 0):
        continue
    quality_phrases = euph_corpus.loc[i, "quality_phrases"]
    # Converting string to list
    quality_phrases = ast.literal_eval(quality_phrases)
    tot_q += len(quality_phrases)
    keyword = euph_corpus.loc[i, 'keyword']
    for q in quality_phrases:
        q_string = re.sub(r'_', ' ', q)
        if keyword in q_string:
            euph_corpus.loc[i, 'keyword_present'] = 1
            count += 1
            break

print(count)
print(tot_q)

1199
10492


In [9]:
# append whether or not the PET is present in quality_phrases (prior to ranking stage) as a column
euph_corpus.to_csv('results_8.2.1.csv')

#### Print number of phrase candidates and target PETs retained after Phrase Ranking

In [15]:
tot_top_2 = 0
for i, row in euph_corpus.iterrows():
    if (euph_corpus.loc[i, "is_euph"] == 0):
        continue
    phrases = euph_corpus.loc[i, "candidates"]
    if (len(phrases) == 0):
        continue
    if (len(phrases) == 1):
        tot_top_2 += 1
    else: 
        tot_top_2 += 2
    # Converting string to list
    # phrases = ast.literal_eval(phrases)
    # tot_top_2 += len(phrases)
print(tot_top_2)

2727
