### Importing Libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, TextVectorization

import torch
from torch.utils.data import DataLoader, TensorDataset

from transformers import BertTokenizer, BertForSequenceClassification, AdamW

import warnings
warnings.filterwarnings('ignore')

### Verifying GPU Usage

In [None]:
# List the GPUs TensorFlow can see; an empty list means the notebook runs on CPU.
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
# Enable on-demand memory growth on every visible GPU. Iterating instead of
# indexing [0] avoids an IndexError when no GPU is present and keeps the
# setting uniform across devices.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

### Reading Dataset

In [None]:
# Load the labelled comments and hold out the last 20% of rows for testing.
data = pd.read_csv("train.csv")
# Derive the split point from the actual row count rather than a hard-coded
# dataset size, so the 80/20 split stays correct if the CSV changes.
split_at = int(0.8 * len(data))
train_data = data[:split_at]
test_data = data[split_at:]

In [None]:
train_data[train_data["threat"]==1][:10]

### Visualization

In [None]:
# Histogram of comment lengths (in characters) for the training split.
comment_lengths = list(map(len, train_data["comment_text"]))
plt.hist(comment_lengths)

In [None]:
# Every column after the first two is treated as a binary label column.
column_labels = list(train_data.columns)[2:]
# Positive count per label, sorted ascending for plotting.
label_counts = train_data[column_labels].sum().sort_values()

plt.figure(figsize=(6, 4))
ax = sns.barplot(
    x=label_counts.values,
    y=label_counts.index,
    palette='viridis',
)
ax.set_xlabel('Number of Occurrences')
ax.set_ylabel('Labels')
ax.set_title('Distribution of Label Occurrences')
plt.show()

In [None]:
# A comment counts as "toxic" when at least one label column is set.
labels_per_comment = train_data[column_labels].sum(axis=1)
train_toxic = train_data[labels_per_comment > 0]
train_clean = train_data[labels_per_comment == 0]

num_toxic, num_clean = len(train_toxic), len(train_clean)

plot_data = pd.DataFrame({
    'Category': ['Toxic', 'Clean'],
    'Count': [num_toxic, num_clean],
})

plt.figure(figsize=(6, 4))
ax = sns.barplot(data=plot_data, x='Count', y='Category', palette='viridis')
ax.set_xlabel('Number of Comments')
ax.set_ylabel('Category')
ax.set_title('Distribution of Toxic and Clean Comments')
ax.tick_params()
plt.show()

In [None]:
# Pairwise correlation between the six label columns, drawn as an annotated heatmap.
labels_list = ['toxic', 'severe_toxic', 'obscene',
               'threat', 'insult', 'identity_hate']
label_corr = train_data[labels_list].corr()

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title('Correlation of Toxicity Criteria with Each Other')
sns.heatmap(label_corr, cmap='rocket_r', annot=True, ax=ax)

In [None]:
# Balance the training split on the `toxic` column: keep every toxic row and
# draw an equally sized, seeded random sample of non-toxic rows.
toxic_rows = train_data[train_data['toxic'] == 1]
non_toxic_rows = train_data[train_data['toxic'] == 0]
non_toxic_undersample = non_toxic_rows.sample(n=len(toxic_rows), random_state=201)
# Note: this rebinds `train_data` to the balanced subset.
train_data = pd.concat([toxic_rows, non_toxic_undersample])

In [None]:
# Recompute the toxic/clean counts on the current `train_data` and plot them.
any_label_set = train_data[column_labels].sum(axis=1) > 0
no_label_set = train_data[column_labels].sum(axis=1) == 0
train_toxic = train_data[any_label_set]
train_clean = train_data[no_label_set]

num_toxic = len(train_toxic)
num_clean = len(train_clean)

plot_data = pd.DataFrame({
    'Category': ['Toxic', 'Clean'],
    'Count': [num_toxic, num_clean],
})

plt.figure(figsize=(6, 4))
ax = sns.barplot(data=plot_data, x='Count', y='Category', palette='viridis')
ax.set_xlabel('Number of Comments')
ax.set_ylabel('Category')
ax.set_title('Distribution of Toxic and Clean Comments')
ax.tick_params()
plt.show()

### Preprocessing

In [None]:
# Features are the raw comment strings; targets are all columns from the third onward.
x_train = train_data["comment_text"]
y_train = train_data.iloc[:, 2:].to_numpy()
y_train

In [None]:
# Same feature/target layout as the training split, taken from the held-out rows.
x_test = test_data["comment_text"]
y_test = test_data.iloc[:, 2:].to_numpy()
y_test

In [None]:
MAX_FEATURES = 200000 #Number of words in vocab

In [None]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [None]:
vectorizer.adapt(x_train.values)
vectorized_train_text = vectorizer(x_train.values)

In [None]:
# Encode the test comments with the vocabulary learned from the training text.
# adapt() must NOT be called again here: it would rebuild the vocabulary from
# the test comments, so the same word would map to a different integer id than
# it did in `vectorized_train_text` (and test statistics would leak into
# preprocessing).
vectorized_test_text = vectorizer(x_test.values)

In [None]:
#MCSHBAP: map, cache, shuffle, batch, prefetch
train_dataset = tf.data.Dataset.from_tensor_slices((vectorized_train_text,y_train))
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(160000)
train_dataset = train_dataset.batch(32)
train_dataset = train_dataset.prefetch(8) #Prevents bottleneck
train_dataset.as_numpy_iterator().next()

In [None]:
#MCSHBAP: map, cache, shuffle, batch, prefetch
test_dataset = tf.data.Dataset.from_tensor_slices((vectorized_test_text,y_test))
test_dataset = test_dataset.cache()
test_dataset = test_dataset.shuffle(160000)
test_dataset = test_dataset.batch(32)
test_dataset = test_dataset.prefetch(8) #Prevents bottleneck
test_dataset.as_numpy_iterator().next()

In [None]:
x_batch, y_batch = train_dataset.as_numpy_iterator().next()
x_batch.shape,y_batch.shape

In [None]:
x_batch, y_batch = test_dataset.as_numpy_iterator().next()
x_batch.shape,y_batch.shape

### Model Creation (Bidirectional LSTM)


In [None]:
# Embedding -> bidirectional LSTM -> two dense layers -> six sigmoid outputs.
model = Sequential([
    Embedding(MAX_FEATURES+1, 32),
    Bidirectional(LSTM(32, activation="tanh")),
    Dense(256, activation="relu"),
    Dense(128, activation="relu"),
    Dense(6, activation="sigmoid"),
])

In [None]:
model.summary()

In [None]:
model.compile(loss = tf.keras.losses.BinaryCrossentropy(),
               optimizer="adam",
               metrics=["accuracy"])

In [None]:
# Stop when the validation loss has not improved for 3 epochs and roll back to
# the best weights. Monitoring training accuracy (as opposed to a validation
# metric) cannot detect overfitting, because it keeps rising while
# generalization degrades.
# NOTE(review): `test_dataset` doubles as the validation set here, so the
# later test metrics are not fully independent of model selection — consider
# a separate validation split.
stop_early = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                              patience=3,
                                              restore_best_weights=True)
model_history = model.fit(x = train_dataset,
                          epochs=10,
                          validation_data=test_dataset,
                          callbacks = [stop_early]
                          )

In [None]:
# Training vs. validation accuracy per epoch, taken from the Keras History object.
fig, ax = plt.subplots(figsize=(10, 6))
for history_key, curve_label in (('accuracy', 'Accuracy'),
                                 ('val_accuracy', 'Validation Accuracy')):
    ax.plot(model_history.history[history_key], label=curve_label)
ax.set_title('Model Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Training vs. validation loss per epoch, taken from the Keras History object.
fig, ax = plt.subplots(figsize=(10, 6))
for history_key, curve_label in (('loss', 'Loss'),
                                 ('val_loss', 'Validation Loss')):
    ax.plot(model_history.history[history_key], label=curve_label)
ax.set_title('Model Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
model.save("LSTM-Model")

### Using the model to predict

In [None]:
# Predict on the held-out batches. The name `val` is never defined in this
# notebook; the evaluation data lives in `test_dataset`.
# NOTE: the true labels are gathered by iterating `test_dataset` again, so
# both passes must see rows in the same order — `test_dataset` must not be shuffled.
y_pred = model.predict(test_dataset)
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 decisions per label.
y_pred = (y_pred>0.5).astype(int)

In [None]:
# Gather the label half of every (features, labels) batch into one 2-D array.
y_true = np.concatenate(
    [batch[1] for batch in test_dataset.as_numpy_iterator()],
    axis=0,
)

In [None]:
# First label column of the predictions, as a standalone 1-D array.
cm_y_pred = np.array([row[0] for row in y_pred])

In [None]:
# First label column of the ground truth, as a standalone 1-D array.
cm_y_true = np.array([row[0] for row in y_true])

In [None]:
confusion_matrix = metrics.confusion_matrix(cm_y_true,cm_y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix,display_labels=["Not Toxic","Toxic"])
cm_display.plot()

In [None]:
MAX_FEATURES = 200000
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [None]:
train_data = pd.read_csv("train.csv")
x = train_data["comment_text"]

In [None]:
# Build the vectorizer vocabulary from every comment in train.csv.
# NOTE(review): the saved LSTM was trained on ids from a vectorizer adapted on
# different text (the train/test splits in the Preprocessing section), so the
# word -> id mapping produced here may not match the ids the embedding layer
# learned. Confirm, and preferably persist and reuse the training vocabulary.
vectorizer.adapt(x.values)
# NOTE(review): `vectorized_text` is not referenced again in the visible notebook.
vectorized_text = vectorizer(x.values)

In [None]:
model1 = tf.keras.models.load_model('LSTM-Model')

In [None]:
input_text = vectorizer("Fuck your nigger ass")

In [None]:
res = model1.predict(np.expand_dims(input_text,0))

In [None]:
output = (res>0.5).astype(int)

In [None]:
col = train_data.columns[2:]
col = col.to_numpy()
col.shape

In [None]:
# Print the name of every label predicted as 1, or "Not Toxic" when none fired.
flagged = [col[position[1]]
           for position, value in np.ndenumerate(output)
           if value == 1]
count = len(flagged)
for label_name in flagged:
  print(label_name)
if count == 0:
  print("Not Toxic")

### BERT

In [None]:
data = pd.read_csv("train.csv")
print(data.head())

In [None]:
column_labels = data.columns.tolist()[2:]
label_counts = data[column_labels].sum().sort_values()

In [None]:
train_toxic = data[data[column_labels].sum(axis=1) > 0]
train_clean = data[data[column_labels].sum(axis=1) == 0]

num_toxic = len(train_toxic)
num_clean = len(train_clean)

In [None]:
# Balance the classes: sample as many clean comments as there are toxic ones.
# The size is derived from the data (capped by the number of clean rows)
# instead of a hard-coded count, so it stays correct if the CSV changes.
n_clean_sampled = min(len(train_toxic), len(train_clean))
train_clean_sampled = train_clean.sample(n=n_clean_sampled, random_state=42)
dataframe = pd.concat([train_toxic, train_clean_sampled], axis=0)
# Shuffle the rows (seeded) so toxic and clean comments are interleaved.
dataframe = dataframe.sample(frac=1, random_state=42)

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
	dataframe['comment_text'], dataframe.iloc[:, 2:], test_size=0.25, random_state=42)

In [None]:
test_texts, val_texts, test_labels, val_labels = train_test_split(
	test_texts, test_labels, test_size=0.5, random_state=42)

In [None]:
def tokenize_and_encode(tokenizer, comments, labels, max_length=128):
	"""Tokenize comments and return model-ready tensors together with their labels.

	Args:
		tokenizer: Hugging Face tokenizer, called as ``tokenizer(texts, **kwargs)``.
		comments: Iterable of comment strings.
		labels: Array-like of numeric labels; converted as-is to a float32 tensor.
		max_length: Every sequence is padded or truncated to exactly this many tokens.

	Returns:
		Tuple ``(input_ids, attention_masks, labels)`` of torch tensors.
	"""
	encoded = tokenizer(
		list(comments),
		# Keep [CLS]/[SEP]: the classification head reads the first-token
		# representation, and the inference path tokenizes with special
		# tokens enabled (the tokenizer default), so training must match.
		add_special_tokens=True,
		max_length=max_length,
		# Explicit replacements for the deprecated `pad_to_max_length=True`
		# and the implicit truncation it relied on.
		padding='max_length',
		truncation=True,
		return_attention_mask=True,
		return_tensors='pt',
	)

	input_ids = encoded['input_ids']
	attention_masks = encoded['attention_mask']
	labels = torch.tensor(labels, dtype=torch.float32)

	return input_ids, attention_masks, labels

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=6)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

In [None]:
input_ids, attention_masks, labels = tokenize_and_encode(
	tokenizer,
	train_texts,
	train_labels.values
)
test_input_ids, test_attention_masks, test_labels = tokenize_and_encode(
	tokenizer,
	test_texts,
	test_labels.values
)
val_input_ids, val_attention_masks, val_labels = tokenize_and_encode(
	tokenizer,
	val_texts,
	val_labels.values
)

print('Training Comments :',train_texts.shape)
print('Input Ids		 :',input_ids.shape)
print('Attention Mask :',attention_masks.shape)
print('Labels		 :',labels.shape)

In [None]:
batch_size = 16
train_dataset = TensorDataset(input_ids, attention_masks, labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Use PyTorch's AdamW; the `transformers.AdamW` class is deprecated.
# eps and weight_decay are set explicitly to the values the transformers
# implementation used by default (1e-6 and 0.0), since torch's own defaults
# differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [None]:
def _mean_batch_loss(model, loader, device, optimizer=None):
	"""Run one pass over `loader` and return the mean per-batch loss.

	With an `optimizer` the pass trains the model (train mode, backward, step);
	without one it only evaluates (eval mode, gradients disabled).
	"""
	training = optimizer is not None
	if training:
		model.train()
	else:
		model.eval()

	total_loss = 0
	with torch.set_grad_enabled(training):
		for batch in loader:
			input_ids, attention_mask, labels = [t.to(device) for t in batch]

			if training:
				optimizer.zero_grad()

			# Passing `labels` makes the model compute the loss itself.
			outputs = model(
				input_ids, attention_mask=attention_mask, labels=labels)
			loss = outputs.loss
			total_loss += loss.item()

			if training:
				loss.backward()
				optimizer.step()

	return total_loss / len(loader)


def _plot_loss_curves(loss_line, val_loss_line):
	"""Plot per-epoch training and validation loss on one figure."""
	plt.figure(figsize=(10, 6))
	plt.plot(loss_line, label='Loss')
	plt.plot(val_loss_line, label='Validation Loss')
	plt.title('Model Loss')
	plt.xlabel('Epoch')
	plt.ylabel('Loss')
	plt.legend()
	plt.grid(True)
	plt.show()


def train_model(model, train_loader, optimizer, device, num_epochs, val_loader=None):
	"""Fine-tune `model`, printing and plotting training/validation loss per epoch.

	Args:
		model: Model whose forward pass accepts ``labels=`` and returns an
			object with a ``.loss`` attribute.
		train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
		optimizer: Optimizer over the model's parameters.
		device: Device the batches are moved to.
		num_epochs: Number of passes over `train_loader`.
		val_loader: Validation batches in the same format. When omitted, the
			notebook-level `val_loader` is used, so existing five-argument
			calls keep working.

	Raises:
		NameError: If `val_loader` is neither passed nor defined at notebook level.
	"""
	if val_loader is None:
		# Make the dependency on the notebook-level loader explicit.
		val_loader = globals().get("val_loader")
		if val_loader is None:
			raise NameError(
				"name 'val_loader' is not defined; pass val_loader= explicitly")

	loss_line = []
	val_loss_line = []

	for epoch in range(num_epochs):
		train_loss = _mean_batch_loss(model, train_loader, device, optimizer)
		val_loss = _mean_batch_loss(model, val_loader, device)

		print(f'Epoch {epoch+1}, Training Loss: {train_loss},Validation loss:{val_loss}')
		loss_line.append(train_loss)
		val_loss_line.append(val_loss)

	_plot_loss_curves(loss_line, val_loss_line)

train_model(model, train_loader, optimizer, device, num_epochs=10)

In [None]:
def evaluate_model(model, test_loader, device):
	"""Print accuracy, micro-averaged precision and micro-averaged recall on `test_loader`.

	Logits are passed through a sigmoid and thresholded at 0.5 to obtain
	0/1 predictions, which are then compared with the true labels.
	"""
	model.eval()

	label_chunks = []
	prob_chunks = []

	with torch.no_grad():
		for ids, mask, targets in test_loader:
			ids, mask, targets = ids.to(device), mask.to(device), targets.to(device)

			logits = model(ids, attention_mask=mask).logits
			prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
			label_chunks.append(targets.cpu().numpy())

	true_labels = np.concatenate(label_chunks, axis=0)
	predicted_probs = np.concatenate(prob_chunks, axis=0)
	predicted_labels = (predicted_probs > 0.5).astype(int)

	# Compute all three scores before printing anything.
	scores = (
		('Accuracy', accuracy_score(true_labels, predicted_labels)),
		('Precision', precision_score(true_labels, predicted_labels, average='micro')),
		('Recall', recall_score(true_labels, predicted_labels, average='micro')),
	)
	for metric_name, value in scores:
		print(f'{metric_name}: {value:.4f}')

evaluate_model(model, test_loader, device)

In [None]:
output_dir = "BERT-Model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

### Prediction using BERT

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_name = "BERT-Model"
Bert_Tokenizer = BertTokenizer.from_pretrained(model_name)
Bert_Model = BertForSequenceClassification.from_pretrained(model_name).to(device)

In [None]:
def predict_user_input(input_text, model=Bert_Model, tokenizer=Bert_Tokenizer, device=device):
	"""Classify one piece of text and return a ``{label name: 0/1}`` dict.

	The text is tokenized as a batch of one, run through `model` without
	gradients, and each sigmoid output is thresholded at 0.5.
	"""
	encoded = tokenizer(
		[input_text], truncation=True, padding=True, return_tensors="pt")
	input_ids = encoded['input_ids'].to(device)
	attention_mask = encoded['attention_mask'].to(device)

	model.eval()
	with torch.no_grad():
		logits = model(input_ids, attention_mask=attention_mask).logits
		predictions = torch.sigmoid(logits)

	predicted_labels = (predictions.cpu().numpy() > 0.5).astype(int)
	labels_list = ['toxic', 'severe_toxic', 'obscene',
				'threat', 'insult', 'identity_hate']
	# Row 0 is the only row: the batch holds exactly one input.
	return dict(zip(labels_list, predicted_labels[0]))

In [None]:
predict_user_input(input_text="Your mother")