# Test a sentence input
Requires these files in the same folder as this notebook:
- a bigram_phraser
- a trigram_phraser
- a word2vec model (3 files: model, syn1neg, and vectors)
- a list of stopwords (to ignore as potential euphemisms)

Required packages:
- gensim
- transformers

In [1]:
from gensim.models.phrases import Phraser, Phrases
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
from tqdm import tqdm
import re

In [8]:
class Euph_Detection:
    def __init__(self, bigram_phraser, trigram_phraser, w2v_model, stopwords_text, sentiment, offensive):
        self.bigram_phraser = Phraser.load(bigram_phraser)
        self.trigram_phraser = Phraser.load(trigram_phraser)
        self.model = Word2Vec.load(w2v_model)
        # self.model = KeyedVectors.load(w2v_model)
        self.topic_list = ['politics', 'death', 'kill', 'crime',
               'drugs', 'alcohol', 'fat', 'old', 'poor', 'cheap',
               'sex', 'sexual',
               'employment', 'job', 'disability', 'disabled', 
               'accident', 'pregnant', 'poop', 'sickness', 'race', 'racial', 'vomit'
              ]
        self.stopwords = self.read_stopwords(stopwords_text)
        # load the sentiment models
        # sentiment_labels, sentiment_model, sentiment_tokenizer = self.load_roberta(sentiment)
        # offensive_labels, offensive_model, offensive_tokenizer = self.load_roberta(offensive)
        # pack them together - just for conciseness
        self.sentiment_pack = [x for x in self.load_roberta(sentiment)]
        self.offensive_pack = [x for x in self.load_roberta(offensive)]
    
    def preprocess(self, s):
        s = s.strip()
        s = re.sub(r'(##\d*\W)|<\w>|,|;|:|--|\(|\)|#|%|\\|\/|\.|\*|\+|@', '', s)
        s = re.sub(r'\s\s+', ' ', s)
        s = s.lower()
        return s

    def get_phrases(self, s):
        bigrammed_phrases = self.bigram_phraser[s.split()]
        trigrammed_phrases = self.trigram_phraser[bigrammed_phrases]

    def sum_similarity(self, phrase, topic_list):
        score = 0
        for topic in topic_list:
            try:
                similarity = self.model.wv.similarity(phrase, topic)
                # EXPERIMENTAL - to "reward" the phrases with a high similarity to a particular category, but maybe not others
                if (similarity > 0.50):
                    # print("{} has a high similarity with {}".format(phrase, topic))
                    return 1.51
                if (similarity > 0):
                    score += similarity
            except:
                score += 0
        return score
    
    def read_stopwords(self, text):
        stopwords = []
        with open(text,'rb') as f:
            content = f.read()
            content = content.split(b'\r\n')
            for line in content:
                stopwords.append(line.decode('utf-8'))
        return stopwords

    def topically_filter_phrases(self, phrases, topic_list, stopwords, THRESHOLD, show_stats=False):
        quality_phrases = []
        filtered = []

        for phrase in phrases:
            if (phrase in stopwords):
                continue
            similarity = self.sum_similarity(phrase, topic_list)

            if (show_stats == True):
                print("{} has a relevance score of {}".format(phrase, similarity)) #table?

            if (similarity > THRESHOLD and phrase not in quality_phrases):
                quality_phrases.append(phrase)
            else:
                filtered.append(phrase)

        # if (show_stats == True):
        #     print("\nRELEVANT PHRASES: {}".format(quality_phrases))
        #     print("IRRELEVANT PHRASES: {}".format(filtered))
        return quality_phrases

    def load_roberta(self, task):
        # Tasks:
        # emoji, emotion, hate, irony, offensive, sentiment
        # stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

        # task='sentiment' or 'offensive'
        MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

        tokenizer = AutoTokenizer.from_pretrained(MODEL)

        # download label mapping
        labels=[]
        mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
        with urllib.request.urlopen(mapping_link) as f:
            html = f.read().decode('utf-8').split("\n")
            csvreader = csv.reader(html, delimiter='\t')
        labels = [row[1] for row in csvreader if len(row) > 1]

        # pretrained
        model = AutoModelForSequenceClassification.from_pretrained(MODEL)
        model.save_pretrained(MODEL)
        tokenizer.save_pretrained(MODEL)

        return labels, model, tokenizer
        
    '''
    functions for getting the sentiment 
    '''
    def get_sentiment(self, s, pack):
        labels, model, tokenizer = pack[0], pack[1], pack[2]
        encoded_input = tokenizer(s, return_tensors='pt')
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        return scores
    
    '''
    needs functions load_roberta_sentiment(), load_roberta_offensive(), get_sentiment() and get_offensive()
    '''
    def get_top_euph_candidates(self, text, phrases, num_paraphrases, wv_model, sentiment_pack, offensive_pack, show_stats=False):
        orig_scores = list(self.get_sentiment(text, sentiment_pack))
        orig_scores = orig_scores + list(self.get_sentiment(text, offensive_pack))
        if show_stats == True: print('SENTIMENT OF ORIGINAL SENTENCE: {}'.format(orig_scores))
        phrase_scores = []

        for q in tqdm(phrases):
            paraphrases = []
            if show_stats == True: print('\n'+q)
            paraphrases = wv_model.wv.most_similar(q, topn = num_paraphrases) # can swap out
            # print(q)
            # print(paraphrases)
            
            # various sentiment statistics
            sentiment_shift = [0, 0, 0, 0, 0] # [neg, neu, pos, off, n-off]
            max_inc = [0, 0, 0, 0, 0]
            max_inc_para = ["", "", "", "", ""]
            tot_neg_inc = 0
            tot_neu_inc = 0
            tot_pos_inc = 0
            tot_noff_inc = 0
            tot_off_inc = 0
            
            for p in paraphrases:
                p_string = re.sub(r'_', ' ', p[0]) # the underscores are removed for sentiment computation - experiment?
                q_string = re.sub(r'_', ' ', q)

                # replacement
                pattern = re.compile(r'\b'+q_string+r'\b', re.IGNORECASE)
                new_sentence = pattern.sub(p_string, text)
                # at this point, we could check the integrity of the paraphrase

                # get the sentiment/offensive scores for this paraphrase
                scores = list(self.get_sentiment(new_sentence, sentiment_pack))
                scores = scores + list(self.get_sentiment(new_sentence, offensive_pack))

                # update the quality phrase's sentiment statistics with the sentiment shifts from this paraphrase
                shifts = [0, 0, 0, 0, 0]
                for i in range(0, len(scores)):
                    shifts[i] = scores[i] - orig_scores[i]
                    sentiment_shift[i] += shifts[i]
                    if (shifts[i] > max_inc[i]):
                        max_inc[i] = shifts[i]
                        max_inc_para[i] = p_string

                # update the relevant scores for detection
                if (shifts[0] > 0):
                    tot_neg_inc += shifts[0]
                if (shifts[1] > 0):
                    tot_neu_inc += shifts[1]
                if (shifts[2] > 0):
                    tot_pos_inc += shifts[2]
                if (shifts[3] > 0):
                    tot_noff_inc += shifts[3]
                if (shifts[4] > 0):
                    tot_off_inc += shifts[4]
                
                # print(p_string)
                # print(shifts)

            for val in sentiment_shift:
                val /= num_paraphrases
            if (show_stats == True):
                print("AVERAGE SENTIMENT SHIFTS: {}".format(sentiment_shift))
                print("MAX INCREASE FROM A PHRASE: {}".format(max_inc))
                print("PHRASES THAT CAUSED EACH ^: {}".format(max_inc_para))
                print("TOTAL NEGATIVE INCREASE: {}".format(tot_neg_inc))
                print("TOTAL NEUTRAL INCREASE: {}".format(tot_neu_inc))
                print("TOTAL POSITIVE INCREASE: {}".format(tot_pos_inc))
                print("TOTAL NON-OFFENSIVE INCREASE: {}".format(tot_noff_inc))
                print("TOTAL OFFENSIVE INCREASE: {}".format(tot_off_inc))

            # score = tot_neg_inc + tot_neu_inc + + tot_pos_inc + tot_noff_inc + tot_off_inc
            score = tot_neg_inc + tot_neu_inc + tot_pos_inc + 2*(tot_noff_inc + tot_off_inc)
            # score = sentiment_shift[0]+sentiment_shift[1]+sentiment_shift[3]
            # score = (tot_off_inc/tot_neg_inc)*(tot_neg_inc + tot_neu_inc + tot_off_inc)
            phrase_scores.append((q_string, score))

        phrase_scores = list(sorted(phrase_scores, key=lambda x: x[1], reverse=True))
        return phrase_scores
    
    def detect_euphs(self, s, topic_threshold, num_paraphrases, show_stats=False):
        s = self.preprocess(s)
        print(s)

        bigrammed_phrases = self.bigram_phraser[s.split()]
        trigrammed_phrases = self.trigram_phraser[bigrammed_phrases]
        print("\nDETECTED PHRASES: {}".format(trigrammed_phrases))

        data = []
        data.append(trigrammed_phrases)
        # train model on input data
        self.model.train(data, total_examples=len(data), epochs=10)

        quality_phrases = self.topically_filter_phrases(trigrammed_phrases, self.topic_list, self.stopwords, topic_threshold, show_stats)
        print("\nRELEVANT PHRASES: {}".format(quality_phrases))

        candidate_list = self.get_top_euph_candidates(s, quality_phrases, num_paraphrases, 
                                            self.model, self.sentiment_pack, self.offensive_pack, 
                                            show_stats)
        return candidate_list

In [9]:
euph_detector = Euph_Detection('bigram_phraser_7', 'trigram_phraser_7', 'wv_model_7', 'stopwords.txt', 'sentiment', 'offensive')

#### Input your sentence below

In [11]:
s = 'cant you tell she has a bun in the oven, no not a hot dog bun arnold'

candidate_ranking = euph_detector.detect_euphs(s, topic_threshold=1.45, num_paraphrases=25, show_stats=False)

print("\nEUPH CANDIDATE RANKING: {}".format(candidate_ranking))

cant you tell she has a bun in the oven no not a hot dog bun arnold

DETECTED PHRASES: ['cant', 'you', 'tell', 'she_has', 'a', 'bun_in_the_oven', 'no', 'not', 'a', 'hot_dog_bun', 'arnold']

RELEVANT PHRASES: ['she_has', 'bun_in_the_oven', 'hot_dog_bun']


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.27s/it]


EUPH CANDIDATE RANKING: [('bun in the oven', 7.889018282294273), ('hot dog bun', 4.71666407212615), ('she has', 2.7513428330421448)]





## Analyzing the Process
After running Euph_Detection on a sentence, you can further look at the intermediate outputs for a specific candidate phrase:

#### Topic Relevance

In [13]:
test_phrase = 'bun_in_the_oven'
similar_topics = []

topic_list = euph_detector.topic_list
model = euph_detector.model

score = 0
for topic in topic_list:
    similarity = model.wv.similarity(test_phrase, topic)
    if (similarity > 0.25):
        similar_topics.append(topic)
    if (similarity > 0):
        score += similarity
    print('{}: {}'.format(topic, similarity))

print('SIMILAR TOPICS: {}'.format(similar_topics))
print('TOTAL SCORE: {}'.format(score))

politics: -0.10268321633338928
death: 0.04841664433479309
kill: 0.14225205779075623
crime: -0.07305203378200531
drugs: 0.007137034088373184
alcohol: 0.002544146031141281
fat: 0.21911388635635376
old: 0.047357894480228424
poor: 0.0035524219274520874
cheap: 0.02102913334965706
sex: 0.09789717197418213
sexual: 0.04411625862121582
employment: -0.06391395628452301
job: 0.1524340808391571
disability: 0.03895317390561104
disabled: 0.06526339799165726
accident: 0.01644478738307953
pregnant: 0.2601613998413086
poop: 0.24594008922576904
sickness: 0.04668925330042839
race: -0.03729817643761635
racial: -0.06883380562067032
vomit: 0.15760526061058044
SIMILAR TOPICS: ['pregnant']
TOTAL SCORE: 1.6169080920517445


#### Sentiment

In [43]:
text = s
q = 'bun_in_the_oven'

sentiment_pack = euph_detector.sentiment_pack
offensive_pack = euph_detector.offensive_pack

orig_scores = list(euph_detector.get_sentiment(s, sentiment_pack))
orig_scores = orig_scores + list(euph_detector.get_sentiment(text, offensive_pack))
print('SENTIMENT OF ORIGINAL SENTENCE: {}'.format(orig_scores))

num_paraphrases=25
paraphrases = []
print('\n'+q)
paraphrases = model.wv.most_similar(q, topn = num_paraphrases) # can swap out

# various sentiment statistics
sentiment_shift = [0, 0, 0, 0, 0] # [neg, neu, pos, off, n-off]
max_inc = [0, 0, 0, 0, 0]
max_inc_para = ["", "", "", "", ""]
tot_neg_inc = 0
tot_neu_inc = 0
tot_pos_inc = 0
tot_noff_inc = 0
tot_off_inc = 0

for p in tqdm(paraphrases):
    p_string = re.sub(r'_', ' ', p[0]) # the underscores are removed for sentiment computation - experiment?
    q_string = re.sub(r'_', ' ', q)

    # replacement
    pattern = re.compile(r'\b'+q_string+r'\b', re.IGNORECASE)
    new_sentence = pattern.sub(p_string, text)
    print(p_string)
    # at this point, we could check the integrity of the paraphrase

    # get the sentiment/offensive scores for this paraphrase
    scores = list(euph_detector.get_sentiment(new_sentence, sentiment_pack))
    scores = scores + list(euph_detector.get_sentiment(new_sentence, offensive_pack))

    # update the quality phrase's sentiment statistics with the sentiment shifts from this paraphrase
    shifts = [0, 0, 0, 0, 0]
    for i in range(0, len(scores)):
        shifts[i] = scores[i] - orig_scores[i]
        sentiment_shift[i] += shifts[i]
        if (shifts[i] > max_inc[i]):
            max_inc[i] = shifts[i]
            max_inc_para[i] = p_string
    # print(shifts)
    # update the relevant scores for detection
    if (shifts[0] > 0):
        tot_neg_inc += shifts[0]
    if (shifts[1] > 0):
        tot_neu_inc += shifts[1]
    if (shifts[2] > 0):
        tot_pos_inc += shifts[2]
    if (shifts[3] > 0):
        tot_noff_inc += shifts[3]
    if (shifts[4] > 0):
        tot_off_inc += shifts[4]     

    for val in sentiment_shift:
        val /= num_paraphrases
    
print("AVERAGE SENTIMENT SHIFTS: {}".format(sentiment_shift))
print("MAX INCREASE FROM A PHRASE: {}".format(max_inc))
print("PHRASES THAT CAUSED EACH ^: {}".format(max_inc_para))
print("TOTAL NEGATIVE INCREASE: {}".format(tot_neg_inc))
print("TOTAL NEUTRAL INCREASE: {}".format(tot_neu_inc))
print("TOTAL POSITIVE INCREASE: {}".format(tot_pos_inc))
print("TOTAL NON-OFFENSIVE INCREASE: {}".format(tot_noff_inc))
print("TOTAL OFFENSIVE INCREASE: {}".format(tot_off_inc))

# score = 0.5*tot_neg_inc + 0.25*tot_neu_inc + 1.5*tot_off_inc
score = tot_neg_inc + tot_neu_inc + tot_pos_inc + 2*(tot_noff_inc + tot_off_inc)
print((q_string, score))
# print((q_string, sentiment_shift[0]+sentiment_shift[1]+sentiment_shift[3]))

SENTIMENT OF ORIGINAL SENTENCE: [0.15074506, 0.766159, 0.08309597, 0.66507876, 0.33492124]

bun_in_the_oven


  0%|                                                                                           | 0/25 [00:00<?, ?it/s]

kyle richards
battle aura


  8%|██████▋                                                                            | 2/25 [00:00<00:02, 11.10it/s]

been spayed


 16%|█████████████▎                                                                     | 4/25 [00:00<00:01, 11.73it/s]

trachiniae
choreographer/partner
sash-bearing


 24%|███████████████████▉                                                               | 6/25 [00:00<00:01, 11.90it/s]

crown-wearing
african violets


 32%|██████████████████████████▌                                                        | 8/25 [00:00<00:01, 11.93it/s]

2938490toolong


 40%|████████████████████████████████▊                                                 | 10/25 [00:00<00:01, 11.96it/s]

mint basil
narrows her eyes
rynestead


 48%|███████████████████████████████████████▎                                          | 12/25 [00:01<00:01, 11.47it/s]

sunchoke
dollop of whipped


 56%|█████████████████████████████████████████████▉                                    | 14/25 [00:01<00:00, 11.48it/s]

bettie page


 64%|████████████████████████████████████████████████████▍                             | 16/25 [00:01<00:00, 11.65it/s]

gadhia-smith
cooked shrimp
holosericea


 72%|███████████████████████████████████████████████████████████                       | 18/25 [00:01<00:00, 11.62it/s]

pas trop
wear a neck brace


 80%|█████████████████████████████████████████████████████████████████▌                | 20/25 [00:01<00:00, 11.92it/s]

school/district


 88%|████████████████████████████████████████████████████████████████████████▏         | 22/25 [00:01<00:00, 12.29it/s]

tall athletic
big chin
manchego cheese


100%|██████████████████████████████████████████████████████████████████████████████████| 25/25 [00:02<00:00, 11.77it/s]

eight-months-pregnant
AVERAGE SENTIMENT SHIFTS: [0.31104613095521927, -0.4423208236694336, 0.13127442076802254, -0.8401100635528564, 0.8401102125644684]
MAX INCREASE FROM A PHRASE: [0.13473663, 0.048584163, 0.07316646, 0.04799378, 0.125934]
PHRASES THAT CAUSED EACH ^: ['been spayed', 'cooked shrimp', 'mint basil', 'battle aura', 'big chin']
TOTAL NEGATIVE INCREASE: 0.6099239438772202
TOTAL NEUTRAL INCREASE: 0.11755067110061646
TOTAL POSITIVE INCREASE: 0.23823228478431702
TOTAL NON-OFFENSIVE INCREASE: 0.14372360706329346
TOTAL OFFENSIVE INCREASE: 0.9838335514068604
('bun in the oven', 3.2208212167024612)



