# Assigning sentiment scores
This code takes a euphemism corpus and assigns each example sentiment/offensiveness scores. This data can then be used for experimentation with BERT.

First, the sentiment packages are loaded using the function below.

In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import urllib.request
import csv
from scipy.special import softmax
import re

# download the roberta models/data, if it's not already there, and then load the labels, model, and tokenizer
def load_roberta(task):
    # Tasks:
    # emoji, emotion, hate, irony, offensive, sentiment
    # stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

    # task='sentiment' or 'offensive'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # download label mapping
    labels=[]
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as f:
        html = f.read().decode('utf-8').split("\n")
        csvreader = csv.reader(html, delimiter='\t')
    labels = [row[1] for row in csvreader if len(row) > 1]

    # pretrained
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    model.save_pretrained(MODEL)
    tokenizer.save_pretrained(MODEL)

    return labels, model, tokenizer

In [2]:
# puts the labels, model, and tokenizer from each roberta model into a list (more concise) 
sentiment_pack = [x for x in load_roberta('sentiment')]
offensive_pack = [x for x in load_roberta('offensive')]

# actually use the labels, model and tokenizer to generate a sentiment/offensiveness score
def get_sentiment(s, pack):
    labels, model, tokenizer = pack[0], pack[1], pack[2]
    encoded_input = tokenizer(s, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    return scores

# get the sentiment/offensive scores for this paraphrase
s = "You stink!"
scores = list(get_sentiment(s, sentiment_pack))
scores = scores + list(get_sentiment(s, offensive_pack))
print(scores) # in order: NEGATIVE, NEUTRAL, POSITIVE, NON-OFFENSIVE, OFFENSIVE

[0.9507522, 0.039947942, 0.00929983, 0.19243449, 0.80756557]


In [27]:
import pandas as pd
from tqdm.notebook import tqdm

euph_corpus = pd.read_csv('Euphemism_Corpus_v2.1.csv', index_col=0)
sent_corpus = euph_corpus.copy() # initialize new corpus and new columns
sent_corpus['neg_PET'], sent_corpus['neu_PET'], sent_corpus['pos_PET'], sent_corpus['noff_PET'], sent_corpus['off_PET'], sent_corpus['neg_TEXT'], sent_corpus['neu_TEXT'], sent_corpus['pos_TEXT'], sent_corpus['noff_TEXT'], sent_corpus['off_TEXT']  = -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

# for now, let's only work with PETs that have shown to have high agreement in some examples
high_agreement_PETs = ['slim', 'between jobs', 'accident', 'late', 'number one', 'sleep with', 'seasoned', 'wealthy', 'over the hill', 'plump', 'let go of', 'go all the way', 'overweight', 'sober', 'number two', 'slept with', 'dismissed', 'let them go', 'aging', 'expecting', 'stout', 'troubled', 'with child', 'invalid', 'experienced', 'getting clean', 'custodian', 'got clean', 'long sleep', 'mixed up', 'chest', 'same-sex', 'economical', 'passing on', 'neutralize', 'outspoken', 'gluteus maximus', 'sleep around', 'pass on', 'disabled', 'special needs', 'pass away', 'a certain age', 'well off', 'less fortunate', 'mistruths', 'droppings', 'lose your lunch', 'pregnancy termination', 'let him go', 'golden years', 'mentally challenged', 'tinkle', 'demise', 'drinking problem', 'indigent', 'detainee', 'advanced age', 'comfort women', 'time of the month', 'pass gas', 'portly', 'went to heaven', 'venereal disease', 'put to sleep', 'mistruth', 'differently-abled', 'intoxicated', 'economical with the truth', 'lavatory', 'birds and the bees', 'deceased', 'terminating a pregnancy', 'inebriated', 'inner city', 'regime change', 'enhanced interrogation techniques', 'adult beverages', 'to go to heaven', 'dearly departed', 'passed away', 'downsize', 'ethnic cleansing', 'substance abusers', 'broken home', 'made love', 'plus-sized', 'underprivileged', 'rear end', 'armed conflict', 'substance abuse', 'disadvantaged', 'neutralized', 'capital punishment', 'street person', 'making love', 'freedom fighters']
sent_corpus = sent_corpus.loc[sent_corpus['keyword'].isin(high_agreement_PETs)]

for i, row in tqdm(sent_corpus.iterrows()):
    # i = 300 # 300, 1300, 1000
    PET = sent_corpus.loc[i, 'type']
    scores = list(get_sentiment(PET, sentiment_pack))
    scores = scores + list(get_sentiment(PET, offensive_pack))
    sent_corpus.loc[i, 'neg_PET'] = float(scores[0])
    sent_corpus.loc[i, 'neu_PET'] = float(scores[1])
    sent_corpus.loc[i, 'pos_PET'] = float(scores[2])
    sent_corpus.loc[i, 'noff_PET'] = float(scores[3])
    sent_corpus.loc[i, 'off_PET'] = float(scores[4])
    
    text = sent_corpus.loc[i, 'sentence']
    scores = list(get_sentiment(text, sentiment_pack))
    scores = scores + list(get_sentiment(text, offensive_pack))
    sent_corpus.loc[i, 'neg_SENT'] = float(scores[0])
    sent_corpus.loc[i, 'neu_SENT'] = float(scores[1])
    sent_corpus.loc[i, 'pos_SENT'] = float(scores[2])
    sent_corpus.loc[i, 'noff_SENT'] = float(scores[3])
    sent_corpus.loc[i, 'off_SENT'] = float(scores[4])
    
sent_corpus

0it [00:00, ?it/s]

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,neg_PET,neu_PET,pos_PET,...,neg_TEXT,neu_TEXT,pos_TEXT,noff_TEXT,off_TEXT,neg_SENT,neu_SENT,pos_SENT,noff_SENT,off_SENT
0,tinkle,We're just getting back what was TAKEN from us...,1,body functions/parts,tinkle,always_euph,We're just getting back what was TAKEN from us...,0.215733,0.593470,0.190797,...,-1,-1,-1,-1,-1,0.633027,0.327682,0.039291,0.561716,0.438283
1,tinkle,I think AB390 will pass next year now that the...,1,body functions/parts,tinkle,always_euph,I think AB390 will pass next year now that the...,0.215733,0.593470,0.190797,...,-1,-1,-1,-1,-1,0.513002,0.423893,0.063104,0.887314,0.112686
23,venereal disease,He's being sued by a woman who claims he gave ...,1,sexual activity,venereal disease,always_euph,And Kris Humphries plans on being' relentless'...,0.717446,0.265028,0.017526,...,-1,-1,-1,-1,-1,0.191349,0.667033,0.141619,0.810757,0.189243
24,venereal disease,The pair then allegedly went to Humphries' hot...,1,sexual activity,venereal disease,always_euph,The court documents state the two' had sexual ...,0.717446,0.265028,0.017526,...,-1,-1,-1,-1,-1,0.681177,0.307028,0.011795,0.556158,0.443842
25,venereal disease,For hemorrhoids take tow and put salt on it an...,1,sexual activity,venereal disease,always_euph,For a man who suffers from swelling and from v...,0.717446,0.265028,0.017526,...,-1,-1,-1,-1,-1,0.769499,0.216996,0.013505,0.632338,0.367662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1960,sleep with,There were other photos she wanted me to see: ...,0,sexual activity,sleep with,sometimes_euph,There were other photos she wanted me to see B...,0.145681,0.730261,0.124058,...,-1,-1,-1,-1,-1,0.017071,0.460846,0.522083,0.755889,0.244111
1961,sleep with,I am relieved to see two pup tents marked STAF...,0,sexual activity,sleep with,sometimes_euph,Thank God I don't have to sleep with Ace Wands,0.145681,0.730261,0.124058,...,-1,-1,-1,-1,-1,0.022057,0.206293,0.771650,0.832420,0.167580
1962,sleep around,"Nothing serious, just long nights of me hackin...",0,sexual activity,sleep around,sometimes_euph,With all my caterwauling it's a wonder anyone ...,0.139588,0.747621,0.112791,...,-1,-1,-1,-1,-1,0.725939,0.246083,0.027978,0.604972,0.395028
1963,with child,sounds more like Jonestown. They cant leave @ ...,0,physical/mental attributes,with child,sometimes_euph,They cant leave best advice I can give them is...,0.182822,0.721971,0.095207,...,-1,-1,-1,-1,-1,0.876646,0.117473,0.005881,0.706067,0.293933


In [28]:
sent_corpus.to_csv("Sentiment_Corpus_v1.2.csv")

In [29]:
import numpy as np
import pandas as pd
sent_corpus = pd.read_csv("Sentiment_Corpus_v1.2.csv", index_col=0)

# analysis to generalize labels in the Excel file
PROPERTY = "neu_PET"
scores = sent_corpus[PROPERTY].unique()
mean_value = np.mean(scores)
median_value = np.median(scores)
print(mean_value)
print(median_value)
THRESHOLD = mean_value # how to determine the quality (choose one that results in best split)

pos = sent_corpus.loc[sent_corpus[PROPERTY] > THRESHOLD]
neg = sent_corpus.loc[sent_corpus[PROPERTY] < THRESHOLD]

pos_1s = pos.loc[pos['is_euph']==1]
pos_0s = pos.loc[pos['is_euph']==0]
neg_1s = neg.loc[neg['is_euph']==1]
neg_0s = neg.loc[neg['is_euph']==0]

print(len(pos_1s))
print(len(pos_0s))
print(len(neg_1s))
print(len(neg_0s))

0.5637176243025203
0.5891607105731964
304
138
540
310


In [30]:
# actually put in the label
# sent_corpus = pd.read_csv('Sentiment_Corpus_v1.0.csv', index_col=0)

sent_corpus['is_' + PROPERTY] = -1
for i, row in sent_corpus.iterrows():
    if (sent_corpus.loc[i, PROPERTY] >= THRESHOLD):
        sent_corpus.loc[i, 'is_' + PROPERTY] = 1
    else:
        sent_corpus.loc[i, 'is_' + PROPERTY] = 0
sent_corpus

Unnamed: 0,keyword,edited_text,is_euph,category,type,euph_status,sentence,neg_PET,neu_PET,pos_PET,...,neu_TEXT,pos_TEXT,noff_TEXT,off_TEXT,neg_SENT,neu_SENT,pos_SENT,noff_SENT,off_SENT,is_neu_PET
0,tinkle,We're just getting back what was TAKEN from us...,1,body functions/parts,tinkle,always_euph,We're just getting back what was TAKEN from us...,0.215733,0.593470,0.190797,...,-1,-1,-1,-1,0.633027,0.327682,0.039291,0.561716,0.438283,1
1,tinkle,I think AB390 will pass next year now that the...,1,body functions/parts,tinkle,always_euph,I think AB390 will pass next year now that the...,0.215733,0.593470,0.190797,...,-1,-1,-1,-1,0.513002,0.423893,0.063104,0.887314,0.112686,1
23,venereal disease,He's being sued by a woman who claims he gave ...,1,sexual activity,venereal disease,always_euph,And Kris Humphries plans on being' relentless'...,0.717446,0.265028,0.017526,...,-1,-1,-1,-1,0.191349,0.667033,0.141619,0.810757,0.189243,0
24,venereal disease,The pair then allegedly went to Humphries' hot...,1,sexual activity,venereal disease,always_euph,The court documents state the two' had sexual ...,0.717446,0.265028,0.017526,...,-1,-1,-1,-1,0.681177,0.307028,0.011795,0.556158,0.443842,0
25,venereal disease,For hemorrhoids take tow and put salt on it an...,1,sexual activity,venereal disease,always_euph,For a man who suffers from swelling and from v...,0.717446,0.265028,0.017526,...,-1,-1,-1,-1,0.769499,0.216996,0.013505,0.632338,0.367662,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1960,sleep with,There were other photos she wanted me to see: ...,0,sexual activity,sleep with,sometimes_euph,There were other photos she wanted me to see B...,0.145681,0.730261,0.124058,...,-1,-1,-1,-1,0.017071,0.460846,0.522083,0.755889,0.244111,1
1961,sleep with,I am relieved to see two pup tents marked STAF...,0,sexual activity,sleep with,sometimes_euph,Thank God I don't have to sleep with Ace Wands,0.145681,0.730261,0.124058,...,-1,-1,-1,-1,0.022057,0.206293,0.771650,0.832420,0.167580,1
1962,sleep around,"Nothing serious, just long nights of me hackin...",0,sexual activity,sleep around,sometimes_euph,With all my caterwauling it's a wonder anyone ...,0.139588,0.747621,0.112791,...,-1,-1,-1,-1,0.725939,0.246083,0.027978,0.604972,0.395028,1
1963,with child,sounds more like Jonestown. They cant leave @ ...,0,physical/mental attributes,with child,sometimes_euph,They cant leave best advice I can give them is...,0.182822,0.721971,0.095207,...,-1,-1,-1,-1,0.876646,0.117473,0.005881,0.706067,0.293933,1


In [32]:
d = {}
for PET in sent_corpus['type'].unique():
    PET_ex = sent_corpus.loc[sent_corpus['type']==PET].reset_index(drop=True)
    # display(PET_ex)
    # break
    is_neu = PET_ex.loc[0, 'is_neu_PET']
    d[PET] = is_neu

In [33]:
from collections import Counter
Counter(d.values())

Counter({1: 51, 0: 35})

In [34]:
sent_corpus.to_csv("Sentiment_Corpus_v1.2.csv")

In [15]:
# ANalysis of Sentiment Corpus
import pandas as pd

sent_corpus = pd.read_csv("VET_Corpus_0.1.csv", index_col=0)
sent_corpus

euph_examples = sent_corpus.loc[sent_corpus['is_euph'] == 1]
noneuph_examples = sent_corpus.loc[sent_corpus['is_euph'] == 0]

print(euph_examples['is_vague'].sum())
print(len(euph_examples))
# print(euph_examples['is_vagu'].mean())
# print(euph_examples['off_PET'].mean())
# print(euph_examples['off_TEXT'].mean())
print()
print(noneuph_examples['is_vague'].sum())
print(len(noneuph_examples))
# print(noneuph_examples['neg_TEXT'].mean())
# print(noneuph_examples['off_PET'].mean())
# print(noneuph_examples['off_TEXT'].mean())

401
1382

368
583
