This notebook does the following:
- crops each dataset text to the sentence containing the PET (potentially euphemistic term), to better focus the sentiment analysis
- creates a separate file where PETs are replaced with their literal meanings, based on the list of PETs used
- uses roBERTa to compute and compare sentiment and offensiveness scores for the euphemistic and literal versions
- groups the resulting score differences by keyword

In [None]:
import pandas as pd
import re
from utils import *
import warnings
warnings.filterwarnings('ignore')

## Cropping the texts to a single sentence before analysis

In [None]:
# Load the raw corpus, report its size, then keep only the keyword and the
# original text (exposed under the shorter column name 'text').
df = pd.read_csv("total_1s_max30.csv", encoding="utf-8")
print('There are', len(df), 'total lines in the data')

df = df.loc[:, ['keyword', 'orig_text']].rename(columns={'orig_text': 'text'})
df.head()

In [None]:
# NOTE: `regex=` is passed explicitly on every call. pandas >= 2.0 treats the
# pattern as a literal string by default, which would silently turn the
# regex-based replacements below into no-ops. Patterns are raw strings so the
# backslash escapes reach the regex engine untouched.

# Remove HTML tags (literal or entity-escaped) and @ signs
df['text'] = df['text'].str.replace(r'( <.*?>|&lt;.*?&gt;)', '', regex=True)
df['text'] = df['text'].str.replace('@', '', regex=False)

# Replace corpus question marks occurring between a lowercase letter (or "I")
# and a lowercase letter with an apostrophe, e.g. "don ? t" -> "don 't"
df['text'] = df['text'].str.replace(r'(?<=([a-z]|I)) \? (?=[a-z])', " '", regex=True)

# Replace isolated periods, question marks and exclamation marks with a
# sentence boundary <s>. A mark counts as a boundary when it is followed by
# the end of the text or by an (optionally quoted) capital letter.
df['text'] = df['text'].str.replace(r' \.( |$)(?="?([A-Z]|$))', ' . <s> ', regex=True)
df['text'] = df['text'].str.replace(r' \?( |$)(?="?([A-Z]|$))', ' ? <s> ', regex=True)
# (the original ran this exclamation-mark substitution twice; the second pass
# could never match anything new, so it is applied once here)
df['text'] = df['text'].str.replace(r' \!( |$)(?="?([A-Z]|$))', ' ! <s> ', regex=True)

# Treat hyphens and slashes as separate tokens (e.g. to identify "chest-thumping" or "overweight/obese")
df['text'] = df['text'].str.replace('-', ' - ', regex=False)
df['text'] = df['text'].str.replace('/', ' / ', regex=False)

pd.set_option('display.max_colwidth', 0) # Wrap text when viewing df

# df # shows the preprocessed / sentence-separated text

# The actual cropping: each text is replaced by whatever
# get_single_sentence_context (from utils) returns for that text and its keyword.
df['text'] = [
    get_single_sentence_context(full_text, pet)
    for full_text, pet in zip(df['text'], df['keyword'])
]

# df # shows the cropped and tagged text

# The code below removes the sentence boundary tags that were put in, undoes the preprocessing tasks, and
# attempts to clean up spacing (however, the spacing can remain messy in some of the cases).
# `regex=True` is explicit because pandas >= 2.0 would otherwise treat these
# patterns as literal strings and leave the text unchanged.

# remove <s> tags
df['text'] = df['text'].str.replace(r' <s>', r'', regex=True)

# remove opening/closing spaces between parens/quotes, and before punctuation marks
df['text'] = df['text'].str.replace(r'\( (.*?) \)', r'(\1)', regex=True)
df['text'] = df['text'].str.replace(r'"\s(.*?)\s"', r'"\1"', regex=True)
df['text'] = df['text'].str.replace(r'\s([.,?!:;\'])', r'\1', regex=True)

# remove spaces before contractions (the lookahead skips tokens starting with "I", e.g. "I'm")
df['text'] = df['text'].str.replace(r' (?!I)([A-Za-z]\'[A-Za-z]+)', r'\1', regex=True)

# undo spaces around hyphens and slashes
df['text'] = df['text'].str.replace(r'\s-\s', r'-', regex=True)
df['text'] = df['text'].str.replace(r'\s/\s', r'/', regex=True)

df

In [None]:
# Write the cropped texts to disk. At this point, a few texts may still need
# to be cropped manually in the saved file.
df.to_csv(path_or_buf='total_1s_max30_cropped.csv')

## Substituting in literal meanings for sentiment analysis

In [None]:
df = pd.read_csv('total_1s_max30_cropped.csv', index_col = 0, encoding = "utf-8")
euph_list = pd.read_csv("combined_euphs_1-6.csv", encoding = "utf-8")

# Keywords with multiple literal meanings: their rows keep an empty 'literal'
# and must be filled in manually in the exported file.
ambiguous_keywords = {"accident", "put to sleep", "seeing someone"}

# Supply each row with the literal meaning of its keyword, based off euph_list.
# The lookup is done once per distinct keyword rather than once per row.
df['literal'] = ""
for keyword in df['keyword'].unique():
    if keyword in ambiguous_keywords:
        continue
    meanings = euph_list.loc[euph_list['euphemism'] == keyword, 'real_meaning']
    if len(meanings) != 1:
        # Same exception type that `.item()` raises on a non-unique lookup,
        # but naming the offending keyword.
        raise ValueError(
            f"expected exactly one literal meaning for keyword {keyword!r} "
            f"in combined_euphs_1-6.csv, found {len(meanings)}"
        )
    df.loc[df['keyword'] == keyword, 'literal'] = meanings.item()

df.to_csv('with_keywords_temp.csv') # in this file, supply the literal meanings of certain phrases

In [None]:
df = pd.read_csv("with_keywords_temp.csv", index_col = 0, encoding = "utf-8") # AFTER labelling the literal meanings

# Replace every (case-insensitive) occurrence of the keyword with its literal
# meaning. The index of each row whose text ends up unchanged is printed so
# that it can be fixed by hand.
substituted_texts = []
for i, text, keyword, literal in zip(df.index, df['text'], df['keyword'], df['literal']):
    new_text = text
    changed = False
    # A literal left blank in the CSV is read back as NaN, not as a string;
    # such rows are left untouched and reported like any other unchanged row.
    if isinstance(text, str) and isinstance(keyword, str) and isinstance(literal, str):
        # re.escape: the keyword is a plain phrase, not a regular expression.
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
        # A callable replacement inserts the literal verbatim (no backslash or
        # group-reference processing of its contents).
        new_text = pattern.sub(lambda _match, repl=literal: repl, text)
        changed = new_text != text
    if not changed:
        print(i)
    substituted_texts.append(new_text)
df['text'] = substituted_texts

df.to_csv('with_substitutions_temp.csv') # The examples at the indices listed need their literal meanings manually subbed

## Loading the roBERTa model for sentiment analysis

In [None]:
# Load the euphemistic texts and their literal-substituted counterparts,
# using the first CSV column as the index in both cases.
euphs, literals = (
    pd.read_csv(csv_name, index_col=0, encoding="utf-8")
    for csv_name in ("euph_texts_1-6.csv", "literal_texts_1-6.csv")
)

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

def load_roberta_sentiment():
    """Load the cardiffnlp Twitter roBERTa checkpoint for the 'sentiment' task.

    Downloads the task's tab-separated label mapping from the tweeteval
    repository, loads the tokenizer and the PyTorch model, and calls
    ``save_pretrained`` on both using the model name as the target path.

    Returns:
        tuple: ``(labels, model, tokenizer)`` where ``labels`` is the list of
        label names taken from the second field of each mapping line.
    """
    # Tasks available under the same naming scheme:
    # emoji, emotion, hate, irony, offensive, sentiment
    # stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary
    task = 'sentiment'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # download label mapping
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as response:
        mapping_lines = response.read().decode('utf-8').split("\n")

    labels = []
    for fields in csv.reader(mapping_lines, delimiter='\t'):
        # skip blank or malformed lines that lack a label field
        if len(fields) > 1:
            labels.append(fields[1])

    # PT
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    for artifact in (model, tokenizer):
        artifact.save_pretrained(MODEL)

    return labels, model, tokenizer

### Using the sentiment analysis on the euphs and literals

In [None]:
# Score columns start out as floats: the softmax probabilities written below
# are floats, and writing floats into integer columns is deprecated in recent
# pandas releases.
euphs['neutral'] = 0.0
euphs['positive'] = 0.0
euphs['negative'] = 0.0

labels, model, tokenizer = load_roberta_sentiment()

for i, text in euphs['text'].items():
    encoded_input = tokenizer(text, return_tensors='pt')

    output = model(**encoded_input)
    # First output element, first (only) item of the batch -> raw class scores;
    # softmax turns them into probabilities.
    scores = softmax(output[0][0].detach().numpy())

    # labels[k] names the class of scores[k]; store each probability in the
    # column named after its label.
    for label, score in zip(labels, scores):
        euphs.loc[i, label] = score

euphs

In [None]:
# Same scoring as for the euphemistic texts, reusing the `labels`, `model` and
# `tokenizer` already loaded in this session. Columns start out as floats
# because the probabilities written below are floats (writing floats into
# integer columns is deprecated in recent pandas releases).
literals['neutral'] = 0.0
literals['positive'] = 0.0
literals['negative'] = 0.0

for i, text in literals['text'].items():
    encoded_input = tokenizer(text, return_tensors='pt')

    output = model(**encoded_input)
    # First output element, first (only) item of the batch -> raw class scores;
    # softmax turns them into probabilities.
    scores = softmax(output[0][0].detach().numpy())

    # labels[k] names the class of scores[k]
    for label, score in zip(labels, scores):
        literals.loc[i, label] = score

literals

In [None]:
# Checkpoint both score tables to disk before running the offensiveness model.
for scored_frame, checkpoint_path in (
    (euphs, 'euphs_roBERTa_checkpoint.csv'),
    (literals, 'literals_roBERTa_checkpoint.csv'),
):
    scored_frame.to_csv(checkpoint_path)

## Loading roBERTa for offensiveness analysis

In [None]:
# Reload the sentiment checkpoints and renumber the rows 0..n-1 in both tables.
euphs = pd.read_csv("euphs_roBERTa_checkpoint.csv", index_col=0, encoding="utf-8")
euphs = euphs.reset_index(drop=True)

literals = pd.read_csv("literals_roBERTa_checkpoint.csv", index_col=0, encoding="utf-8")
literals = literals.reset_index(drop=True)

In [None]:
def load_roberta_offensive():
    """Load the cardiffnlp Twitter roBERTa checkpoint for the 'offensive' task.

    Downloads the task's tab-separated label mapping from the tweeteval
    repository, loads the tokenizer and the PyTorch model, and calls
    ``save_pretrained`` on both using the model name as the target path.

    Returns:
        tuple: ``(labels, model, tokenizer)`` where ``labels`` is the list of
        label names taken from the second field of each mapping line.
    """
    task = 'offensive'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # download label mapping
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as response:
        mapping_lines = response.read().decode('utf-8').split("\n")

    labels = []
    for fields in csv.reader(mapping_lines, delimiter='\t'):
        # skip blank or malformed lines that lack a label field
        if len(fields) > 1:
            labels.append(fields[1])

    # PT
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    for artifact in (model, tokenizer):
        artifact.save_pretrained(MODEL)

    return labels, model, tokenizer

### Using the offensiveness analysis on the euphs and literals

In [None]:
# Score columns start out as floats: the softmax probabilities written below
# are floats, and writing floats into integer columns is deprecated in recent
# pandas releases.
euphs['offensive'] = 0.0
euphs['not-offensive'] = 0.0

labels, model, tokenizer = load_roberta_offensive()

for i, text in euphs['text'].items():
    encoded_input = tokenizer(text, return_tensors='pt')

    output = model(**encoded_input)
    # First output element, first (only) item of the batch -> raw class scores;
    # softmax turns them into probabilities.
    scores = softmax(output[0][0].detach().numpy())

    # labels[k] names the class of scores[k]; store each probability in the
    # column named after its label.
    for label, score in zip(labels, scores):
        euphs.loc[i, label] = score

euphs

In [None]:
# Same scoring as for the euphemistic texts, reusing the `labels`, `model` and
# `tokenizer` already loaded in this session. Columns start out as floats
# because the probabilities written below are floats (writing floats into
# integer columns is deprecated in recent pandas releases).
literals['offensive'] = 0.0
literals['not-offensive'] = 0.0

for i, text in literals['text'].items():
    encoded_input = tokenizer(text, return_tensors='pt')

    output = model(**encoded_input)
    # First output element, first (only) item of the batch -> raw class scores;
    # softmax turns them into probabilities.
    scores = softmax(output[0][0].detach().numpy())

    # labels[k] names the class of scores[k]
    for label, score in zip(labels, scores):
        literals.loc[i, label] = score

literals


### For computing differences in sentiment/offensiveness values when substituting literal meanings

In [None]:
# Start from an independent copy of the identifying columns, so that adding
# columns never touches (or warns about) the `euphs` frame it was taken from.
differences = euphs[['keyword','text', 'literal']].copy()

# One relative-change column per score; floats from the start because the
# values stored in them are ratios.
for diff_col in ('neu_diff', 'pos_diff', 'neg_diff', 'off_diff', 'n-off_diff'):
    differences[diff_col] = 0.0
differences

In [None]:
# Relative change of each score when the euphemism is replaced by its literal
# meaning: (literal - euph) / euph, as a fraction (0.5 means +50%).
score_to_diff = {
    'neutral': 'neu_diff',
    'positive': 'pos_diff',
    'negative': 'neg_diff',
    'offensive': 'off_diff',
    'not-offensive': 'n-off_diff',
}

# Rows are matched by index label, exactly as the per-row lookups did;
# `.loc` raises KeyError if a label is missing from either frame.
row_labels = differences.index
for score_col, diff_col in score_to_diff.items():
    literal_scores = literals.loc[row_labels, score_col]
    euph_scores = euphs.loc[row_labels, score_col]
    differences[diff_col] = (literal_scores - euph_scores) / euph_scores

differences

In [None]:
# Report the mean of every relative-change column.
# NOTE: the *_diff columns hold (literal - euph) / euph as a fraction, so a
# printed value of 0.5 corresponds to a +50% change.
summary_lines = (
    ('Mean % change in neutral scores: ', 'neu_diff'),
    ('Mean % change in positive scores: ', 'pos_diff'),
    ('Mean % change in negative scores: ', 'neg_diff'),
    ('Mean % change in offensive scores: ', 'off_diff'),
    ('Mean % change in not-offensive scores: ', 'n-off_diff'),
)
for caption, diff_col in summary_lines:
    print(caption, differences[diff_col].mean())

In [None]:
# Save the per-text relative changes for the grouped analysis.
differences.to_csv(path_or_buf='sentiment_diffs.csv')

### Looking at sentiment/offensiveness differences by keyword

In [None]:
diff_keywords = pd.read_csv('sentiment_diffs.csv', index_col = 0, encoding = 'utf-8')

diff_cols = ['neu_diff', 'pos_diff', 'neg_diff', 'off_diff', 'n-off_diff']

# NOTE(review): the CSV written by this notebook contains 'keyword', 'text',
# 'literal' and the *_diff columns, but no 'type' column. 'type' is used when
# present (e.g. added to the file by hand); otherwise the grouping falls back
# to 'keyword'. Confirm which grouping is intended.
group_col = 'type' if 'type' in diff_keywords.columns else 'keyword'

# Columns are selected with a list: selecting several columns from a groupby
# with a bare tuple is rejected by pandas >= 2.0.
diff_keywords = diff_keywords.groupby(group_col)[diff_cols].mean()
display(diff_keywords)
diff_keywords.to_csv('sentiment_diffs_by_type.csv')