## Getting Started with the Project

In [None]:
# modeling
import tensorflow as tf
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

# data preprocessing
from sklearn.model_selection import train_test_split

# data wrangling
import numpy as np
import pandas as pd

#important libraries for preprocessing using NLTK
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK corpora used further down: 'wordnet' backs the
# WordNetLemmatizer and 'stopwords' provides the English stop-word list.
# (The bare `nltk.download` expression that used to sit here did nothing.)
nltk.download('wordnet')
nltk.download('stopwords')

# string manipulation
import re

# hashtag segmenter
import preprocessor as p

## Tweets Preprocessing
One challenge of this project is preprocessing the Twitter dataset so that DistilBERT can be used for toxicity identification. To that end, the preprocessing is split into three stages: hashtag extraction, tweet cleaning, and removal of stop words, punctuation, etc.

In [None]:
# read the balanced tweet dataset and discard the 'Unnamed: 0' column
# (presumably a row index that was written out with the CSV - verify)
df = pd.read_csv("FinalBalancedDataset.csv").drop(columns=['Unnamed: 0'])
df.head()

### Tweet cleaning
The removal of URLs and @user mentions is done and a new cleaned text is added as a feature.

In [None]:
# forming a separate feature for cleaned tweets
# p.clean is applied to every tweet in a single pass; this replaces the
# row-by-row df.loc writes, which were slow and relied on the frame having
# a default 0..n-1 index for the df["tweet"][i] lookups to hit the right row
df["text"] = df["tweet"].apply(p.clean)
df.head()

### Removal of stop words, punctuations etc...
Now, we can preprocess the cleaned up tweets in the text column further.

In [None]:
def preprocess_data(data):
  """Turn a Series of tweets into lists of cleaned, lemmatized word tokens.

  Every entry is cast to ``str``, stripped of digits, lower-cased, split on
  whitespace, stripped of punctuation and then lemmatized word by word.

  Args:
    data: pandas Series holding the tweet texts.

  Returns:
    A single-column pandas DataFrame (same index as ``data``) whose cells
    are lists of token strings.
  """
  # removes numbers
  # regex=True is required: recent pandas versions treat the pattern as a
  # literal string by default, which would leave every digit in place
  data = data.astype(str).str.replace(r'\d+', '', regex=True)
  lower_text = data.str.lower()
  lemmatizer = nltk.stem.WordNetLemmatizer()

  def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that end up empty."""
    stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [word for word in stripped if word != '']

  def lemmatize_text(text):
    """Split the text into words, clean them and lemmatize each one.

    The tweet has to be tokenised first: handing the whole sentence to the
    lemmatizer treats it as a single (unknown) word, so nothing would be
    lemmatized and later per-word filtering would never match anything.
    """
    # punctuation is removed before lemmatizing so that "dogs," is seen
    # by the lemmatizer as "dogs"
    return [lemmatizer.lemmatize(word) for word in remove_punctuation(text.split())]

  words = lower_text.apply(lemmatize_text)
  return pd.DataFrame(words)

# run the text normalisation and store the resulting token lists on the frame
pre_tweets = preprocess_data(df['text'])
df['text'] = pre_tweets

# discard English stop words from every token list
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

# working copies of the model inputs (token lists) and the labels
x = df['text'].copy()
y = df['Toxicity'].copy()

# one space-joined string per tweet and one boolean label per tweet
x_str = [' '.join(str(token) for token in x[row]) for row in range(len(x))]
y_bool = [bool(y[row]) for row in range(len(y))]

# character length of every joined tweet
xlen = list(map(len, x_str))
df.head()

## Transformer-based Toxicity Classification Model - DISTILBERT
Using DistilBERT, the model is trained on the preprocessed dataset and now we can give an input to it, and get the probability of toxicity back!

### Prerequisites to running the model
**IMPORTANT:** If you want to skip the training step and use the saved model instead, run the first 3 cells below as prerequisites and then skip ahead to loading the saved model. To do this, you must either have downloaded the saved model to the appropriate directory, or have already run the training so that the model is saved in the appropriate place.

Download the saved model from [here](https://drive.google.com/drive/folders/1HB82YQb-W0qGW-ltXTuaoD8tCaj8xyo_?usp=drive_link). Make sure it is under the following directory in your Google Drive: ./gdrive/My Drive/model

In [None]:
import math
import torch
import torch.nn as nn
#import torchtext
from tensorflow.keras import activations, optimizers, losses
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import pickle

In [None]:
# choose the compute device in order of preference: Apple MPS, then CUDA,
# and finally the CPU when neither accelerator is reported as available
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(device)

In [None]:
def construct_encodings(x, tkzr, max_len, trucation=True, padding=True):
  """Call the tokenizer ``tkzr`` on ``x`` and return whatever it produces.

  ``max_len``, ``trucation`` and ``padding`` are forwarded to the tokenizer
  as its ``max_length``, ``truncation`` and ``padding`` keyword arguments.
  """
  tokenizer_options = {
      "max_length": max_len,
      "truncation": trucation,
      "padding": padding,
  }
  return tkzr(x, **tokenizer_options)

def construct_tfdataset(encodings, y=None):
    """Wrap tokenizer encodings (and optional labels) in a tf.data.Dataset.

    Args:
        encodings: mapping-like tokenizer output; it is converted with ``dict``.
        y: optional sequence of labels, one per encoded text. ``None`` or an
            empty sequence produces a features-only dataset.

    Returns:
        A dataset of ``(features, label)`` pairs when labels are given,
        otherwise a dataset of feature dicts.
    """
    features = dict(encodings)
    # test for None / emptiness explicitly: a bare `if y:` raises ValueError
    # for NumPy arrays and pandas Series (ambiguous truth value)
    if y is not None and len(y) > 0:
        return tf.data.Dataset.from_tensor_slices((features, y))
    return tf.data.Dataset.from_tensor_slices(features)

def create_predictor(model, model_name, max_len):
  """Build a single-text toxicity scorer around a sequence classifier.

  Args:
    model: classifier whose ``predict`` output exposes ``.logits``.
    model_name: name or path handed to ``DistilBertTokenizer.from_pretrained``.
    max_len: maximum token length used when encoding the text.

  Returns:
    A function ``predict_proba(text)`` that returns the softmax probability
    of class 1 (toxic) for one input string.
  """
  tkzr = DistilBertTokenizer.from_pretrained(model_name)

  def predict_proba(text):
    """Return the probability that ``text`` is toxic."""
    encodings = construct_encodings([text], tkzr, max_len=max_len)
    tfdataset = construct_tfdataset(encodings).batch(1)

    logits = model.predict(tfdataset).logits
    probs = activations.softmax(tf.convert_to_tensor(logits)).numpy()
    # the labels are bool(Toxicity), so column 1 is the toxic class; the
    # previous `[0][0]` returned the non-toxic probability, unlike the
    # LIME class order and the Gradio output, which both use index 1
    return probs[0][1]

  return predict_proba

### Training and saving the model (optional - no need if you have the model as specified above)

In [None]:
# set distilbert model name
MODEL_NAME = 'distilbert-base-uncased'
MAX_LEN = 20

# try and see how distilbert tokenization works on input text
# (x_str holds the space-joined tweet strings; x holds token lists, which
# the tokenizer would interpret as a batch of separate texts)
tweet0 = x_str[0]
tkzr = DistilBertTokenizer.from_pretrained(MODEL_NAME)
inputs = tkzr(tweet0, max_length=MAX_LEN, truncation=True, padding=True)

print(f'first tweet: \'{tweet0}\'')
print(f'input ids: {inputs["input_ids"]}')
print(f'attention mask: {inputs["attention_mask"]}')

# -------

# prepare dataset into training and test
TEST_SPLIT = 0.2
BATCH_SIZE = 2
SHUFFLE_SEED = 42

encodings = construct_encodings(x_str, tkzr, max_len=MAX_LEN)
tfdataset = construct_tfdataset(encodings, y_bool)

train_size = int(len(x_str) * (1-TEST_SPLIT))

# shuffle exactly once with a fixed order: by default tf.data reshuffles on
# every iteration, which would make take()/skip() select different samples
# each epoch and let test samples leak into the training set
tfdataset = tfdataset.shuffle(
    len(x_str), seed=SHUFFLE_SEED, reshuffle_each_iteration=False
)
tfdataset_train = tfdataset.take(train_size)
tfdataset_test = tfdataset.skip(train_size)

# the training samples alone are still reshuffled at the start of each epoch
tfdataset_train = tfdataset_train.shuffle(train_size).batch(BATCH_SIZE)
tfdataset_test = tfdataset_test.batch(BATCH_SIZE)

# -------

# distilbert fine-tuning (training)
N_EPOCHS = 2

model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
optimizer = optimizers.Adam(learning_rate=3e-5)
loss = losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# the datasets are already batched, so batch_size is not passed to
# fit/evaluate (Keras asks for it to be omitted for dataset inputs)
model.fit(tfdataset_train, epochs=N_EPOCHS)

# ---------

# get accuracy score as evaluation result
benchmarks = model.evaluate(tfdataset_test, return_dict=True)
print(benchmarks)

# ---------

clf = create_predictor(model, MODEL_NAME, MAX_LEN)

# ---------
# test the toxicity identification model by giving text input
print(clf('I do not care who wins the election, I do not like politics'))

# ---------
# save the model to drive
model.save_pretrained('model/toxicityidentifier')
# the tokenizer name and max length are stored next to the weights so the
# predictor can be rebuilt without re-running this cell
with open('model/info.pkl', 'wb') as f:
    pickle.dump((MODEL_NAME, MAX_LEN), f)

### Load the saved model

In [None]:
# reload the saved model
new_model = TFDistilBertForSequenceClassification.from_pretrained('model/toxicityidentifier')

# NOTE: unpickling can execute arbitrary code - only load an info.pkl that
# you created yourself or obtained from a source you trust
# (the `with` block also makes sure the file handle is closed again)
with open('model/info.pkl', 'rb') as f:
    model_name, max_len = pickle.load(f)

clf = create_predictor(new_model, model_name, max_len)

## Explainability with LIME
LIME gives a blackbox model explanation. It works by removing words in the text to check how effective these words are in model's toxicity classification ability. With LIME, we can highlight toxic words in a given text to explain to the user which words strongly suggested toxic speech in a tweet.

In [None]:
from lime import lime_text
from lime.lime_text import LimeTextExplainer
#1 - toxic, 0 - nontoxic
class_names = ["non-toxic", "toxic"]
explainer = LimeTextExplainer(class_names=class_names)

# alternative function for lime to accept
def predict_proba_lime(x, batch_size=32):
  """Return class probabilities for a collection of texts.

  Args:
    x: iterable of strings (a single string is also accepted and treated
      as a collection of one).
    batch_size: number of texts sent through the model per forward pass.

  Returns:
    NumPy array with one row per text, holding the softmax over the
    model's logits.
  """
  # a bare string would otherwise be encoded without a batch dimension and
  # then be sliced token by token when the dataset is built
  texts = [x] if isinstance(x, str) else list(x)

  tkzr = DistilBertTokenizer.from_pretrained(model_name)
  encodings = construct_encodings(texts, tkzr, max_len=max_len)
  # explain_instance below asks for 1000 samples; predicting them one at a
  # time is needlessly slow, so several are sent through the model per step
  tfdataset = construct_tfdataset(encodings).batch(batch_size)

  preds = new_model.predict(tfdataset).logits
  return activations.softmax(tf.convert_to_tensor(preds)).numpy()


str_to_predict = "i hate people when they do that, shut up stupid people, fuck you"
exp = explainer.explain_instance(str_to_predict, predict_proba_lime, num_features=10, num_samples=1000)
exp.show_in_notebook(text=str_to_predict)

## Toxicity Category Assignment with roBERTa
Next, we can give more explainability to toxicity classification with roBERTa that is pre-trained on the Jigsaw Dataset. This model categorizes the toxic text into why it is toxic.

In [None]:
import pandas as pd
from detoxify import Detoxify

def return_key_largest(input_text, toxicity_rate):
    """Name the toxicity category that best explains why a text is toxic.

    Args:
        input_text: the text to categorise.
        toxicity_rate: toxicity probability already computed for the text.

    Returns:
        ``"all_good"`` when ``toxicity_rate`` is below 0.50; otherwise the
        highest-scoring category among obscene, identity_attack, insult,
        threat and sexual_explicit.
    """
    # decide before touching Detoxify, so that non-toxic inputs never pay
    # for loading the model or running a prediction whose result is unused
    if toxicity_rate < 0.50:
        return "all_good"

    # build the Detoxify model once and keep it on the function object;
    # constructing it again on every call is slow
    detox_model = getattr(return_key_largest, "_model", None)
    if detox_model is None:
        detox_model = Detoxify('unbiased')
        return_key_largest._model = detox_model

    results = detox_model.predict(input_text)

    # pick the categories by name instead of by position (`[2:]`), so the
    # general 'toxicity' / 'severe_toxicity' scores are excluded no matter
    # how the result dictionary happens to be ordered
    categories = ("obscene", "identity_attack", "insult", "threat", "sexual_explicit")
    sent_dict = {key: score for key, score in results.items() if key in categories}
    return max(sent_dict, key=sent_dict.get)

# categories are: obscene, identity_attack, insult, threat, sexual_explicit

## Detoxification Prompting - Different Prompts for Different Categories
The prompt is written according to the toxicity category assigned by the roBERTa model, i.e. the reason why the text is toxic.

**IMPORTANT**: For security reasons, we cannot provide an API key. Please obtain your own key, then uncomment the `api_key` line in the cell below and replace "ENTER-YOUR-API-KEY-HERE" with it.

In [None]:
from openai import OpenAI

# set your API key
#api_key = 'ENTER-YOUR-API-KEY-HERE'

# input: text by user
# output: prompt by GPT-4o-mini

# Using OpenAI API to answer a single prompt
def response(prompt, model="gpt-4o-mini"):
    """Send one user prompt to the OpenAI Responses API and return the reply text.

    Args:
        prompt: text sent as the content of a single user message.
        model: name of the OpenAI model to query.

    Returns:
        The ``output_text`` of the API response.
    """
    # use the notebook-level `api_key` when it has been set; otherwise pass
    # None so that the OpenAI client falls back to the OPENAI_API_KEY
    # environment variable instead of this cell failing with a NameError
    client = OpenAI(api_key=globals().get("api_key"))
    messages = [{"role": "user", "content": prompt}]
    completion = client.responses.create(
        model=model,
        input=messages,
        temperature=0.4, # this is the degree of randomness of the model's output
        max_output_tokens=512
    )
    return completion.output_text

## Detoxification Prompting Additional Experiment - Parallel Examples
The prompt includes 10 examples of how to detoxify a text (parallel examples), and GPT is asked to produce the 11th detoxified text, i.e. the detoxified version of the user input. This part is not included in the final system (UI); the previous prompting technique is used there instead because it works better and is informed by the toxicity category. This piece of code is kept for the sake of completeness.

In [None]:
from datasets import load_dataset

def init(n_rows=5000):
  """Load ParaDetox pairs and format them as shuffled few-shot examples.

  Args:
    n_rows: number of leading training rows to turn into examples.

  Returns:
    A DataFrame with a single ``texts`` column, one formatted
    toxic/neutral example per row, in random order with a fresh
    0..n-1 index.
  """
  dataset = load_dataset("s-nlp/paradetox")
  # one sliced read yields the columns as lists, replacing thousands of
  # per-row lookups; a slice also copes with a split shorter than n_rows
  batch = dataset['train'][:n_rows]
  rows = [
      "Instead of saying " + '"' + str(toxic) + '"' + ", i can express this in other way like " + '"' + str(neutral) + '"'
      for toxic, neutral in zip(batch['en_toxic_comment'], batch['en_neutral_comment'])
  ]
  df = pd.DataFrame(rows, columns=['texts'])
  # sample() keeps the original row labels; without renumbering, a lookup
  # such as df['texts'][0] would still return the first unshuffled row
  return df.sample(frac = 1).reset_index(drop=True)

def add_few_shot(df, given_text, size=10):
  """Build a few-shot detoxification prompt and return the model's completion.

  Args:
    df: DataFrame with a ``texts`` column of formatted examples.
    given_text: the text to detoxify.
    size: number of examples to place in the prompt (capped at ``len(df)``).

  Returns:
    The text generated for the final, incomplete example.
  """
  # positional access: the first `size` rows in the frame's current order,
  # whatever its index labels are
  examples = df['texts'].iloc[:min(size, len(df))]

  text = "Complete the last one according to examples given: " + "\n\n"
  # append every example; assigning with `=` here kept only the last one
  # and threw away the instruction line above
  text += "".join(example + "\n" for example in examples)
  promptWithEx = text + "Instead of saying " + '"' + given_text + '"' + ", i can express this in other way like "

  # go through the notebook's `response` helper: the `openai` module itself
  # is never imported here, and the old `openai.Completion` endpoint is not
  # part of the client interface this notebook uses
  return response(promptWithEx)

def runDetoxFewShot(given_text):
  """Run the few-shot pipeline: load the examples, then prompt with ``given_text``."""
  return add_few_shot(init(), given_text)

## GRADIO for User Interface
A simple UI that takes the user's text and displays four outputs: the toxicity rate from DistilBERT, the text highlighted by LIME for explainability, the toxicity category assigned by roBERTa, and the detoxified version of the input suggested by GPT. After running, you will see the UI, which you can interact with.

In [None]:
import gradio as gr
import pandas as pd
import numpy as np

# alternative function for gradio to accept
def predict_proba_gradio(x):
  """Score, explain, categorise and detoxify one user-supplied text.

  Args:
    x: the text entered in the UI.

  Returns:
    A 4-tuple: the probability of the toxic class, the LIME explanation
    as returned by ``as_list()``, a sentence explaining the assigned
    category, and a detoxified rewrite (the input itself when it is not
    toxic).
  """
  x_org = x
  tkzr = DistilBertTokenizer.from_pretrained(model_name)
  encodings = construct_encodings([x], tkzr, max_len=max_len)
  tfdataset = construct_tfdataset(encodings).batch(1)

  preds = new_model.predict(tfdataset).logits
  preds = activations.softmax(tf.convert_to_tensor(preds)).numpy()
  exp = explainer.explain_instance(x, predict_proba_lime, num_features=10, num_samples=1000)
  highlighted_list = exp.as_list()

  # detoxify for categorization
  reason = return_key_largest(x_org, preds[0][1])

  # one description per category, shared by the explanation and the prompt
  # so the two can no longer drift apart (the insult prompt used to claim
  # the text contained obscenity)
  category_descriptions = {
      "obscene": "obscenity",
      "identity_attack": "an attack on a specific identity",
      "insult": "insult",
      "threat": "at least one threat",
      "sexual_explicit": "sexually explicit content",
  }
  rephrase_request = "Please rephrase this sentence while keeping the content and style preserved."

  if reason == "all_good":
    explanation_reason = "No serious toxicity was found in this text, all good!"
    detox_1 = x_org
  else:
    description = category_descriptions.get(reason)
    if description is None:
      # unexpected category: still return a complete result instead of
      # failing on unassigned variables
      explanation_reason = "This text is deemed as toxic."
      prompt = f"The following text is deemed as toxic: '{x_org}'. {rephrase_request}"
    else:
      explanation_reason = f"This text contains {description}, therefore it is deemed as toxic."
      prompt = f"The following text contains {description}, therefore it is deemed as toxic: '{x_org}'. {rephrase_request}"
    detox_1 = response(prompt)

  return preds[0][1], highlighted_list, explanation_reason, detox_1

In [None]:
# gradio elements to display such as user text input, and the mentioned outputs of the models
# it is advised to follow the running localhost to open in full web page mode
# (the input box is not named `input`, which would shadow Python's builtin
# for every cell executed afterwards)
input_box = gr.Textbox(label='Enter Your Text Here')
toxic_output = gr.Textbox(label='Toxicity Rate')
highlighted_text = gr.HighlightedText(
    label="Where might be the toxicity?: purple - nontoxic | red - toxic",
    show_legend=True,
)
reason_toxic = gr.Textbox(label='What might be the reason?')
# shows the text itself if the text is not toxic
detox_suggestion_1 = gr.Textbox(label='Does this sound better?')
gr.Interface(
    fn=predict_proba_gradio,
    inputs=input_box,
    outputs=[toxic_output, highlighted_text, reason_toxic, detox_suggestion_1],
    title="DETOXPLAIN",
).launch()