<a href="https://colab.research.google.com/github/marsgav/TowardTacklingEuphemisms/blob/main/MGEuphemisms_AutoPhrase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Toward Processing Euphemisms in Natural Language Processing
Martha Gavidia
APLN 606

This notebook contains the code used to run the experiment in "Toward Processing Euphemisms in Natural Language Processing" for APLN 606.

The code base comes from "Searching for PETs: Using Distributional and Sentiment-Based Methods to Find Potentially Euphemistic Terms":

Lee P., Gavidia M., Feldman A. and J. Peng. “Searching for PETs: Using Distributional and Sentiment-Based Methods to Find Potentially Euphemistic Terms”. In Proceedings of the 2nd Workshop on Understanding Implicit and Underspecified Language, NAACL 2022, Seattle

The changes in this notebook come at the phrase extraction stage: where Lee et al. use Phrases to find phrases within the text, this notebook experiments with AutoPhrase. The AutoPhrase output is used in conjunction with spaCy's PhraseMatcher, and the rest of the algorithm is then run with this new phrase extraction method.

# Load Euphemism Corpus

In [None]:
import pandas as pd
import re

# Load the euphemism corpus from the working directory; the first CSV column
# becomes the row index (index_col=0).
euph_corpus = pd.read_csv('Euphemism_Corpus_2-24.csv', index_col=0, encoding='utf-8')

In [None]:
euph_corpus.head()


Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence
0,tinkle,We're just getting back what was TAKEN from us...,1,bodily functions,tinkle,always_euph,We're just getting back what was TAKEN from us...
1,tinkle,I think AB390 will pass next year now that the...,1,bodily functions,tinkle,always_euph,I think AB390 will pass next year now that the...
2,undocumented immigrants,"Singled Out Think Like a Man, the new movie ba...",1,politics,undocumented immigrant,always_euph,Anything but Secure A federal program designed...
3,undocumented immigrants,"Not to be outdone, Sen. Rand Paul (R-Ky. ), so...",1,politics,undocumented immigrant,always_euph,In a post-election interview with POLITICO Pau...
4,undocumented immigrants,The law has also galvanized the growing immigr...,1,politics,undocumented immigrant,always_euph,Aside from undocumented immigrants the America...


In [None]:
def preprocess(s):
    """Normalize a raw sentence string.

    Steps, in order: trim surrounding whitespace; delete `##<digits>` markers
    (together with the one non-word character that follows them),
    single-character `<x>` tags and a fixed set of punctuation characters;
    collapse every run of two or more whitespace characters into one space;
    lower-case the result.
    """
    noise = r'(##\d*\W)|<\w>|,|;|:|--|\(|\)|#|%|\\|\/|\.|\*|\+|@'
    without_noise = re.sub(noise, '', s.strip())
    single_spaced = re.sub(r'\s\s+', ' ', without_noise)
    return single_spaced.lower()

In [None]:
# preprocess sentences
# Clean the whole column in one pass instead of writing cell-by-cell inside an
# iterrows() loop; every value of 'sentence' is replaced by preprocess(value).
euph_corpus['sentence'] = euph_corpus['sentence'].apply(preprocess)

In [None]:
euph_corpus.head()

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence
0,tinkle,We're just getting back what was TAKEN from us...,1,bodily functions,tinkle,always_euph,we're just getting back what was taken from us...
1,tinkle,I think AB390 will pass next year now that the...,1,bodily functions,tinkle,always_euph,i think ab390 will pass next year now that the...
2,undocumented immigrants,"Singled Out Think Like a Man, the new movie ba...",1,politics,undocumented immigrant,always_euph,anything but secure a federal program designed...
3,undocumented immigrants,"Not to be outdone, Sen. Rand Paul (R-Ky. ), so...",1,politics,undocumented immigrant,always_euph,in a post-election interview with politico pau...
4,undocumented immigrants,The law has also galvanized the growing immigr...,1,politics,undocumented immigrant,always_euph,aside from undocumented immigrants the america...


# Autophrase

In [None]:
# create phrase column with found phrases through AutoPhrase

In [None]:
# Directory that holds the AutoPhrase output files. Change this single line to
# run the notebook on another machine.
AUTOPHRASE_DIR = '/home/gavidiam1/Downloads'


def _read_autophrase_file(path, phrase_column):
    """Read one AutoPhrase output file into a DataFrame.

    The file is read as tab-separated text without a header row; the two
    columns are named 'score' and `phrase_column`.

    The phrase column is forced to `str` and pandas' default NA detection is
    switched off, so tokens such as "null", "nan" or purely numeric tokens
    stay the literal strings found in the file instead of turning into
    NaN/floats (which would later break pattern creation for the matcher).
    Leading/trailing whitespace is stripped from every phrase.
    """
    frame = pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=['score', phrase_column],
        dtype={phrase_column: str},
        keep_default_na=False,
    )
    frame[phrase_column] = frame[phrase_column].str.strip()
    return frame


multidata = _read_autophrase_file(f'{AUTOPHRASE_DIR}/euphcorpus_AutoPhrase_multi-words.txt', 'mwe')
multi = multidata['mwe'].to_list()

singledata = _read_autophrase_file(f'{AUTOPHRASE_DIR}/euphcorpus_AutoPhrase_single-word.txt', 'phrase')
single = singledata['phrase'].to_list()

#combine all
autophrases = (multi+single)

# PhraseMatcher

In [None]:
import spacy
from spacy.matcher import PhraseMatcher
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
import spacy.cli
# Download the 'en_core_web_md' pipeline that the PhraseMatcher cell loads with
# spacy.load("en_core_web_md").
spacy.cli.download('en_core_web_md')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
#Find Autophrase phrases in text
from tqdm import tqdm
nlp = spacy.load("en_core_web_md")
# attr="LOWER" makes the matcher compare the lower-cased form of each token.
matcher = PhraseMatcher(nlp.vocab,attr="LOWER")
terms = autophrases
# Patterns only need tokenization, so make_doc() is used instead of the full pipeline.
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

# Build one list of matched phrase strings per sentence, in corpus order.
# Every row gets a list -- an empty one when nothing matched -- so later cells
# can iterate over 'autophrase' without hitting NaN. (Previously the column was
# written from inside the per-match loop, which skipped rows with no matches
# and rewrote the cell once per match.)
matched_phrases = []
for sentence in tqdm(euph_corpus['sentence'], total=len(euph_corpus)):
    doc = nlp(sentence)
    matched_phrases.append([doc[start:end].text for _, start, end in matcher(doc)])

# Wrapping in an object Series aligned on the frame's index stores each list as
# a single cell value.
euph_corpus['autophrase'] = pd.Series(matched_phrases, index=euph_corpus.index, dtype=object)

100%|██████████| 1965/1965 [00:26<00:00, 74.80it/s]


In [None]:
euph_corpus.head()

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,autophrase
0,tinkle,We're just getting back what was TAKEN from us...,1,bodily functions,tinkle,always_euph,we're just getting back what was taken from us...,"[we, we're, just, getting, back, what, was, ta..."
1,tinkle,I think AB390 will pass next year now that the...,1,bodily functions,tinkle,always_euph,i think ab390 will pass next year now that the...,"[i, think, will, pass, next, year, now, that, ..."
2,undocumented immigrants,"Singled Out Think Like a Man, the new movie ba...",1,politics,undocumented immigrant,always_euph,anything but secure a federal program designed...,"[anything, but, a, program, to, undocumented, ..."
3,undocumented immigrants,"Not to be outdone, Sen. Rand Paul (R-Ky. ), so...",1,politics,undocumented immigrant,always_euph,in a post-election interview with politico pau...,"[in, a, election, with, said, he, wants, to, t..."
4,undocumented immigrants,The law has also galvanized the growing immigr...,1,politics,undocumented immigrant,always_euph,aside from undocumented immigrants the america...,"[from, undocumented, undocumented immigrants, ..."


In [None]:
euph_corpus['autophrase'][4]

['from',
 'undocumented',
 'undocumented immigrants',
 'immigrants',
 'the',
 'the american',
 'american',
 'citizens',
 'who',
 'make',
 'up',
 'what',
 'i',
 'call',
 'the',
 'of',
 'people',
 'like',
 'who',
 'for',
 'immigration',
 'even',
 'though',
 'her',
 'a',
 'in',
 'was',
 'killed',
 'by',
 'an',
 'undocumented',
 'are',
 'up',
 'and',
 'out']

In [None]:
# Join the words of each matched phrase with underscores; later cells turn the
# underscores back into spaces when comparing against the keyword.
# Rows for which the matcher cell stored no list (it leaves NaN when a sentence
# has no match) become an empty list instead of raising a TypeError here.
euph_corpus['autophrase'] = euph_corpus['autophrase'].apply(
    lambda phrases: [p.replace(' ', '_') for p in phrases] if isinstance(phrases, list) else []
)

In [None]:
euph_corpus

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,autophrase
0,tinkle,We're just getting back what was TAKEN from us...,1,bodily functions,tinkle,always_euph,we're just getting back what was taken from us...,"[we, we're, just, getting, back, what, was, ta..."
1,tinkle,I think AB390 will pass next year now that the...,1,bodily functions,tinkle,always_euph,i think ab390 will pass next year now that the...,"[i, think, will, pass, next, year, now, that, ..."
2,undocumented immigrants,"Singled Out Think Like a Man, the new movie ba...",1,politics,undocumented immigrant,always_euph,anything but secure a federal program designed...,"[anything, but, a, program, to, undocumented, ..."
3,undocumented immigrants,"Not to be outdone, Sen. Rand Paul (R-Ky. ), so...",1,politics,undocumented immigrant,always_euph,in a post-election interview with politico pau...,"[in, a, election, with, said, he, wants, to, t..."
4,undocumented immigrants,The law has also galvanized the growing immigr...,1,politics,undocumented immigrant,always_euph,aside from undocumented immigrants the america...,"[from, undocumented, undocumented_immigrants, ..."
...,...,...,...,...,...,...,...,...
1960,sleep with,There were other photos she wanted me to see: ...,0,sexual activity,sleep with,sometimes_euph,there were other photos she wanted me to see b...,"[there, were, other, she, wanted, me, to, to_s..."
1961,sleep with,I am relieved to see two pup tents marked STAF...,0,sexual activity,sleep with,sometimes_euph,thank god i don't have to sleep with ace wands,"[god, i, do, don't, have, to, to_sleep, sleep,..."
1962,sleep around,"Nothing serious, just long nights of me hackin...",0,sexual activity,sleep around,sometimes_euph,with all my caterwauling it's a wonder anyone ...,"[with, all, my, it, it's, a, anyone, gets, any..."
1963,with child,sounds more like Jonestown. They cant leave @ ...,0,physical/mental attributes,with child,sometimes_euph,they cant leave best advice i can give them is...,"[they, leave, best, i, can, give, them, is, if..."


In [None]:
# Checkpoint the segmented corpus. List-valued cells (e.g. 'autophrase') are
# written as text, so they must be parsed with ast.literal_eval after the CSV
# is read back, as the Analytics cells do.
euph_corpus.to_csv('euph_corpus_segmented.csv')

## Similarity measure

In [None]:
def sum_similarity(phrase, topic_list, wv_model=None):
    """Sum the positive similarities between `phrase` and every topic word.

    Args:
        phrase: vocabulary key to score.
        topic_list: iterable of topic words to compare `phrase` against.
        wv_model: object exposing `.wv.similarity(a, b)`. Defaults to the
            notebook-level `model`, which keeps existing two-argument calls
            working.

    Returns:
        The sum of all similarities that are greater than 0. A topic whose
        lookup raises KeyError (phrase or topic not in the vocabulary)
        contributes nothing; any other exception is propagated instead of
        being silently swallowed.
    """
    if wv_model is None:
        wv_model = model
    score = 0
    for topic in topic_list:
        try:
            similarity = wv_model.wv.similarity(phrase, topic)
        except KeyError:
            # Out-of-vocabulary phrase/topic: treat as "no similarity".
            continue
        if similarity > 0:
            score += similarity
    return score

In [None]:
# define topic list and stopwords
topic_list = ['politics', 'death', 'kill', 'crime',
               'drugs', 'alcohol', 'fat', 'old', 'poor', 'cheap',
               'sex', 'sexual',
               'employment', 'job', 'disability', 'disabled', 
               'accident', 'pregnant', 'poop', 'sickness', 'race', 'racial', 'vomit'
              ]

# Location of the stopword file (one stopword per line).
STOPWORDS_PATH = '/home/gavidiam1/stopwords.txt'

# Read in text mode and split with splitlines() so the file works with either
# \r\n or \n line endings. Splitting the raw bytes on b'\r\n' turned a
# \n-terminated file into one giant "stopword", silently disabling the filter.
# Blank lines are skipped so no empty-string stopword is created.
with open(STOPWORDS_PATH, encoding='utf-8') as f:
    stopwords = [line.strip() for line in f.read().splitlines() if line.strip()]

# Train Glowbe with word2vec

In [None]:
# define model and train on new data
from gensim.models import Word2Vec
# Load a previously saved Word2Vec model named "wv_model_7" from the working directory.
model = Word2Vec.load("wv_model_7")
# train model on input data
# NOTE(review): `data` is not defined in any cell of this notebook as shown, so
# this line raises NameError on a fresh kernel -- define the training sentences
# explicitly before this call.
# NOTE(review): no build_vocab(..., update=True) call is visible; presumably
# tokens missing from the saved vocabulary get no vectors -- confirm.
model.train(data, total_examples=len(data), epochs=10)

In [None]:
# Minimum summed topic similarity a phrase needs to be kept as a "quality phrase".
THRESHOLD = 1.5
score = 0

successes = []                 # keywords found verbatim among the quality phrases
partial_successes = []         # keywords found inside a longer quality phrase
failures = []                  # keywords not found in any quality phrase
topically_filtered_euphs = []  # keywords that were matched but fell below THRESHOLD
quality_phrase_count = 0
filtered = []                  # every other phrase occurrence dropped by the threshold

euph_corpus['quality_phrases'] = ""

# Set membership is O(1); the stopword list itself is left untouched.
stopword_set = set(stopwords)
# The same phrase occurs in many sentences, so score each distinct phrase once.
similarity_cache = {}

for i, row in euph_corpus.iterrows():
    euph = row['keyword']
    phrases = row['autophrase']
    # Rows without any matched phrase may hold NaN instead of a list.
    if not isinstance(phrases, list):
        phrases = []

    quality_phrases = []
    for phrase in phrases:
        if phrase in stopword_set:
            continue
        if phrase not in similarity_cache:
            similarity_cache[phrase] = sum_similarity(phrase, topic_list)
        similarity = similarity_cache[phrase]

        if similarity > THRESHOLD:
            # A repeated quality phrase is kept once; it is NOT counted as
            # filtered (the previous elif/else chain sent repeats to `filtered`).
            if phrase not in quality_phrases:
                quality_phrases.append(phrase)
        elif euph == phrase.replace('_', ' '):
            if euph not in topically_filtered_euphs:
                topically_filtered_euphs.append(euph)
        else:
            filtered.append(phrase)

    # add the quality phrases to the column (underscore form)
    euph_corpus.at[i, 'quality_phrases'] = quality_phrases

    # now check if the euph in the sentence is retained in the list of quality phrases
    readable_phrases = [p.replace('_', ' ') for p in quality_phrases]
    quality_phrase_count += len(readable_phrases)

    if euph in readable_phrases:
        score += 1
        if euph not in successes:
            successes.append(euph)
    elif any(euph in p for p in readable_phrases):
        # Partial success: count the sentence exactly once. Previously the
        # flag and `break` were only reached the first time a keyword was seen,
        # so later sentences were counted once per containing phrase and the
        # keyword was also recorded as a failure.
        score += 1
        if euph not in partial_successes:
            partial_successes.append(euph)
    elif euph not in failures:
        failures.append(euph)

print("Retained the euphemism in {} out of {} sentences".format(score, len(euph_corpus)))
print("{} quality phrases retained overall".format(quality_phrase_count))
print("Filtered {} non-keywords out".format(len(filtered)))
print("EXACT SUCCESSES: {}".format(successes))
print("PARTIAL SUCCESSES: {}".format(partial_successes))
print("FAILURES: {}".format(failures))
print("FALSE NEGATIVES of TOPIC FILTERING: {}".format(topically_filtered_euphs))

'Retained the euphemism in 1769 out of 1965 sentences'
'45349 quality phrases retained overall'
'Filtered 1646 non-keywords out'
("EXACT SUCCESSES: ['tinkle', 'undocumented immigrants', 'undocumented "
 "immigrant', 'venereal diseases', 'venereal disease', 'sex workers', 'sex "
 "worker', 'mentally disabled', 'correctional facilities', 'correctional "
 "facility', 'freedom fighters', 'freedom fighter', 'detainees', 'detainee', "
 "'comfort women', 'psychiatric hospital', 'ethnic cleansing', 'ethnically "
 "cleansed', 'enhanced interrogation techniques', 'mistruths', 'elderly', "
 "'armed conflict', 'drinking problem', 'deceased', 'income inequality', 'rear "
 "end', 'lavatory', 'inner city', 'developing country', 'developed country', "
 "'substance abuse', 'global south', 'underprivileged', 'inebriated', "
 "'homemaker', 'capital punishment', 'indigent', 'detention camp', 'pass gas', "
 "'dearly departed', 'pregnancy termination', 'senior citizen', 'senior "
 "citizens', 'substance abu

# Sentiment

## roBERTa Sentiment

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

def load_roberta_sentiment():
    """Load the cardiffnlp Twitter RoBERTa model for the 'sentiment' task.

    Loads the tokenizer, downloads the task's label mapping file and keeps the
    second tab-separated field of each row as a label, then loads the
    sequence-classification model. Both model and tokenizer are saved with
    save_pretrained() under the model name.

    Returns:
        (labels, model, tokenizer)
    """
    # Other tasks offered under the same naming scheme:
    # emoji, emotion, hate, irony, offensive, sentiment
    # stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary
    task = 'sentiment'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # download label mapping
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as response:
        mapping_lines = response.read().decode('utf-8').split("\n")

    labels = []
    for fields in csv.reader(mapping_lines, delimiter='\t'):
        if len(fields) > 1:
            labels.append(fields[1])

    # pretrained
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    model.save_pretrained(MODEL)
    tokenizer.save_pretrained(MODEL)

    return labels, model, tokenizer

## Offensive sentiment

In [None]:
def load_roberta_offensive():
    """Load the cardiffnlp Twitter RoBERTa model for the 'offensive' task.

    Same procedure as the sentiment loader: tokenizer first, then the label
    mapping (second tab-separated field of every row that has one), then the
    sequence-classification model; model and tokenizer are saved with
    save_pretrained() under the model name.

    Returns:
        (labels, model, tokenizer)
    """
    task = 'offensive'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # download label mapping
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as f:
        raw_mapping = f.read()
    mapping_rows = csv.reader(raw_mapping.decode('utf-8').split("\n"), delimiter='\t')
    labels = [fields[1] for fields in mapping_rows if len(fields) > 1]

    # PT
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    model.save_pretrained(MODEL)
    tokenizer.save_pretrained(MODEL)

    return labels, model, tokenizer

In [None]:
def _softmax_class_scores(s, model, tokenizer):
    """Tokenize `s`, run `model` on it and return the softmax of the first output row."""
    encoded_input = tokenizer(s, return_tensors='pt')
    raw_scores = model(**encoded_input)[0][0].detach().numpy()
    return softmax(raw_scores)


def get_sentiment(s, labels, model, tokenizer):
    """Return the softmax-normalized class scores of `model` for the text `s`.

    `labels` is accepted for interface symmetry with the loaders but is not
    used by the computation.
    """
    return _softmax_class_scores(s, model, tokenizer)


def get_offensive(s, labels, model, tokenizer):
    """Return the softmax-normalized class scores of `model` for the text `s`.

    Identical computation to get_sentiment(); `labels` is not used.
    """
    return _softmax_class_scores(s, model, tokenizer)

In [None]:
def get_top_euph_candidates(text, phrases, num_paraphrases, wv_model, sentiment_pack, offensive_pack, show_stats=False):
    '''
    Rank `phrases` by how much replacing them in `text` shifts the sentiment/offensive scores.

    needs functions load_roberta_sentiment(), load_roberta_offensive(), get_sentiment() and get_offensive()

    For every phrase, its `num_paraphrases` nearest neighbours are taken from
    `wv_model.wv.most_similar`; each neighbour is substituted for the phrase in
    `text` (whole-word, case-insensitive) and the new sentence is re-scored.
    The score vector is the sentiment scores followed by the offensive scores;
    the code assumes five entries in total and uses positions 0, 1, 3 and 4
    (named negative, neutral, not-offensive and offensive by the variables
    below -- the actual label order comes from the downloaded mappings).

    Args:
        text: the sentence containing the phrases.
        phrases: iterable of phrases (underscore-joined) to evaluate; each must
            be a key of `wv_model.wv`.
        num_paraphrases: number of neighbours to substitute per phrase.
        wv_model: model exposing `.wv.most_similar(key, topn=...)`.
        sentiment_pack: (labels, model, tokenizer) for the sentiment model.
        offensive_pack: (labels, model, tokenizer) for the offensive model.
        show_stats: when True, print per-phrase statistics.

    Returns:
        List of (phrase_with_spaces, score) tuples sorted by score, highest
        first, where score = negative increase + neutral increase
        + 2 * (not-offensive increase + offensive increase), each summed over
        the paraphrases that increased the respective score.
    '''
    sentiment_labels, sentiment_model, sentiment_tokenizer = sentiment_pack[0], sentiment_pack[1], sentiment_pack[2]
    offensive_labels, offensive_model, offensive_tokenizer = offensive_pack[0], offensive_pack[1], offensive_pack[2]

    def score_sentence(sentence):
        """Concatenate the sentiment and offensive score vectors for `sentence`."""
        combined = list(get_sentiment(sentence, sentiment_labels, sentiment_model, sentiment_tokenizer))
        return combined + list(get_offensive(sentence, offensive_labels, offensive_model, offensive_tokenizer))

    orig_scores = score_sentence(text)
    if show_stats: print('SENTIMENT OF ORIGINAL SENTENCE: {}'.format(orig_scores))
    phrase_scores = []

    for q in phrases:
        if show_stats: print('\n'+q)
        paraphrases = wv_model.wv.most_similar(q, topn = num_paraphrases) # can swap out

        # The underscores are removed for sentiment computation. Computed once
        # per phrase, before the loop, so it is defined even when there are no
        # paraphrases. re.escape keeps regex metacharacters inside the phrase
        # from being interpreted as a pattern.
        q_string = q.replace('_', ' ')
        pattern = re.compile(r'\b' + re.escape(q_string) + r'\b', re.IGNORECASE)

        # various sentiment statistics
        sentiment_shift = [0, 0, 0, 0, 0]
        max_inc = [0, 0, 0, 0, 0]
        max_inc_para = ["", "", "", "", ""]
        tot_neg_inc = 0
        tot_neu_inc = 0
        tot_off_inc = 0
        tot_noff_inc = 0

        for p in paraphrases:
            p_string = p[0].replace('_', ' ')
            # A function replacement inserts p_string literally (no backslash
            # or group-reference expansion).
            new_sentence = pattern.sub(lambda _match: p_string, text)
            # at this point, we could check the integrity of the paraphrase

            # get the sentiment/offensive scores for this paraphrase
            scores = score_sentence(new_sentence)

            # update the quality phrase's sentiment statistics with the sentiment shifts from this paraphrase
            shifts = [0, 0, 0, 0, 0]
            for i in range(0, len(scores)):
                shifts[i] = scores[i] - orig_scores[i]
                sentiment_shift[i] += shifts[i]
                if (shifts[i] > max_inc[i]):
                    max_inc[i] = shifts[i]
                    max_inc_para[i] = p_string

            # update the relevant scores for detection (only increases count)
            if (shifts[0] > 0):
                tot_neg_inc += shifts[0]
            if (shifts[1] > 0):
                tot_neu_inc += shifts[1]
            if (shifts[3] > 0):
                tot_noff_inc += shifts[3]
            if (shifts[4] > 0):
                tot_off_inc += shifts[4]

        # Turn the summed shifts into averages. The previous
        # `for val in sentiment_shift: val /= n` only rebound the loop variable
        # and left the list unchanged.
        if paraphrases:
            sentiment_shift = [total / len(paraphrases) for total in sentiment_shift]
        if (show_stats == True):
            print("AVERAGE SENTIMENT SHIFTS: {}".format(sentiment_shift))
            print("MAX INCREASE FROM A PHRASE: {}".format(max_inc))
            print("PHRASES THAT CAUSED EACH ^: {}".format(max_inc_para))
            print("TOTAL NEGATIVE INCREASE: {}".format(tot_neg_inc))
            print("TOTAL NEUTRAL INCREASE: {}".format(tot_neu_inc))
            print("TOTAL NOT-OFFENSIVE INCREASE: {}".format(tot_noff_inc))
            print("TOTAL OFFENSIVE INCREASE: {}".format(tot_off_inc))

        phrase_scores.append((q_string, tot_neg_inc + tot_neu_inc + 2*(tot_noff_inc + tot_off_inc)))

    return sorted(phrase_scores, key=lambda x: x[1], reverse=True)

In [None]:
# load the models
# Each loader returns (labels, model, tokenizer); keep both the packed lists
# (passed to get_top_euph_candidates) and the individual names.
sentiment_pack = list(load_roberta_sentiment())
offensive_pack = list(load_roberta_offensive())

sentiment_labels, sentiment_model, sentiment_tokenizer = sentiment_pack
offensive_labels, offensive_model, offensive_tokenizer = offensive_pack

In [None]:
import re

# Number of word2vec neighbours substituted for each quality phrase.
num_paraphrases = 25
# Running count of sentences whose keyword appears among the top-k candidates.
score = 0
# How many of the highest-ranked candidates are checked for the keyword.
k = 2
# Per-row result columns filled in by the scoring loop: the ranked candidate
# list, and a 0/1 flag set to 1 when the keyword is found among the top k.
euph_corpus['candidates'] = ""
euph_corpus['top_2'] = 0

In [None]:
from tqdm import tqdm

# Index label of the last row to score. The run is stopped here on purpose
# (it takes many hours); set to None to process the whole corpus.
LAST_ROW_TO_SCORE = 1382

rows_scored = 0
for i, row in tqdm(euph_corpus.iterrows(), total=euph_corpus.shape[0]):
# uncomment below if resuming from checkpoint
#     if (0 < i < 600):
#         continue
    phrases = euph_corpus.loc[i, 'quality_phrases']

    # Converting string to list IF READING FROM CSV as checkpoint
    # phrases = ast.literal_eval(phrases)

    text = euph_corpus.loc[i, 'sentence']
    euph = euph_corpus.loc[i, 'keyword']

    top_candidates = get_top_euph_candidates(text, phrases, num_paraphrases, model,
                                             sentiment_pack, offensive_pack, show_stats=False)
    euph_corpus.at[i, 'candidates'] = top_candidates
    rows_scored += 1

    # The sentence counts as detected when the keyword is contained in any of
    # the k best candidates. Slicing handles rows with fewer than k candidates
    # (including none), which the index-based loop had to special-case.
    if any(euph in candidate for candidate, _ in top_candidates[:k]):
        score += 1
        if (score % 50 == 0):
            print(score)
        euph_corpus.loc[i, 'top_2'] = 1

    if LAST_ROW_TO_SCORE is not None and i == LAST_ROW_TO_SCORE:
        break

# Report against the rows actually scored, not the whole corpus: the loop may
# stop early at LAST_ROW_TO_SCORE.
print("Euphemism detected in {} out of {} sentences".format(score, rows_scored))

  6%|▌         | 113/1965 [1:14:34<15:49:16, 30.75s/it]

50


 14%|█▍        | 279/1965 [2:59:48<12:54:19, 27.56s/it] 

100


 25%|██▌       | 495/1965 [5:05:11<16:02:03, 39.27s/it] 

150


 36%|███▌      | 702/1965 [7:13:27<6:19:51, 18.05s/it]  

200


 42%|████▏     | 827/1965 [8:13:37<6:35:54, 20.87s/it] 

250


 49%|████▉     | 960/1965 [9:13:45<4:19:51, 15.51s/it] 

300


 59%|█████▊    | 1153/1965 [10:57:15<6:27:20, 28.62s/it] 

350


 67%|██████▋   | 1324/1965 [12:20:26<3:28:08, 19.48s/it] 

400


 70%|███████   | 1382/1965 [12:59:30<5:28:50, 33.84s/it]  

'Euphemism detected in 420 out of 1965 sentences'





# Analytics

In [None]:
# Persist the results (all columns, including the list-valued ones) to the
# working directory so the analytics below can run without repeating the
# scoring loop.
euph_corpus.to_csv('results_mar.csv')

In [None]:
import pandas as pd

# Reload the saved results; the first CSV column is restored as the index.
# List-valued columns come back as strings and are parsed with
# ast.literal_eval in the cells that follow.
euph_corpus = pd.read_csv('results_mar.csv', index_col=0)

In [None]:
import ast
# TODO: compute 1st place 2nd place
# Count how often the keyword is contained in the 1st, 2nd or 3rd ranked
# candidate. Only rows with index label <= 1382 are considered.
place_counts = [0, 0, 0]
scored_rows = euph_corpus[euph_corpus.index <= 1382]
for top_2, keyword, raw_candidates in zip(scored_rows['top_2'],
                                          scored_rows['keyword'],
                                          scored_rows['candidates']):
    # Converting string to list
    candidates = ast.literal_eval(raw_candidates)
    if top_2 == 1:
        # Keyword was flagged as being in the top two: decide which slot.
        if keyword in candidates[0][0]:
            place_counts[0] += 1
        elif keyword in candidates[1][0]:
            place_counts[1] += 1
    elif len(candidates) > 2 and keyword in candidates[2][0]:
        place_counts[2] += 1

num_first_place, num_second_place, num_third_place = place_counts

print(num_first_place)
print(num_second_place)
print(num_third_place)

282
138
91


In [None]:
import re
count = 0
tot_p = 0
# denote rows where keyword was present in REGULAR phrases
# Rows whose is_euph is 0 are skipped; for the rest, count the row once if any
# extracted phrase (underscores read as spaces) contains the keyword.
euph_rows = euph_corpus[euph_corpus["is_euph"] != 0]
for keyword, raw_phrases in zip(euph_rows['keyword'], euph_rows['autophrase']):
    # Converting string to list
    phrases = ast.literal_eval(raw_phrases)
    tot_p += len(phrases)
    if any(keyword in re.sub(r'_', ' ', p) for p in phrases):
        count += 1

print(count)
print(tot_p)

1228
57297


In [None]:
import re
count = 0
tot_q = 0
# denote rows where keyword was present in quality phrases
euph_corpus['keyword_present'] = 0
# Rows whose is_euph is 0 are skipped; a remaining row is flagged (and counted
# once) when any quality phrase, with underscores read as spaces, contains the
# keyword.
euph_rows = euph_corpus[euph_corpus["is_euph"] != 0]
for i, keyword, raw_quality in zip(euph_rows.index,
                                   euph_rows['keyword'],
                                   euph_rows['quality_phrases']):
    # Converting string to list
    quality_phrases = ast.literal_eval(raw_quality)
    tot_q += len(quality_phrases)
    if any(keyword in re.sub(r'_', ' ', q) for q in quality_phrases):
        euph_corpus.loc[i, 'keyword_present'] = 1
        count += 1

print(count)
print(tot_q)

1228
32359
