<a href="https://colab.research.google.com/github/nnikolovskiii/machine-learning-with-graphs/blob/master/Offensive_text_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Preprocessing

In [3]:
import pandas as pd

In [4]:
!gdown 1fDeVA12hIHufN3VnbtqcZ4ODB9XnJztn

Downloading...
From: https://drive.google.com/uc?id=1fDeVA12hIHufN3VnbtqcZ4ODB9XnJztn
To: /content/train_en.txt
  0% 0.00/221k [00:00<?, ?B/s]100% 221k/221k [00:00<00:00, 16.2MB/s]


In [5]:
# Replace 'data.txt' with the actual path to your .txt file if it's not in the same directory.
df = pd.read_csv('/content/train_en.txt', sep='\t')

In [6]:
df.head()

Unnamed: 0,Sentence,Label
0,Pussy nobody asked for your input.,1
1,"Ok, this makes no sense. This will create vigi...",1
2,so fucking true. the amount of up and coming r...,1
3,Go f yourself Republican scum who put us here ...,1
4,Dumb fucking take. People want to do things.,1


In [7]:
!pip install transformers



In [8]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')

In [9]:
from torchtext.vocab import build_vocab_from_iterator

def sentence_generator():
    for sentence in df["Sentence"]:
        yield tokenizer(sentence)

vocab = build_vocab_from_iterator(sentence_generator())

In [10]:
for i,key in enumerate(vocab.get_stoi()):
  print(f"Key: {key}, Value: {vocab.get_stoi()[key]}")
  if i >5:
    break

Key: 🤦‍♂️, Value: 4363
Key: 😭😭😭😭what, Value: 4360
Key: 😂😂, Value: 4358
Key: 💡, Value: 4357
Key: 👍, Value: 4356
Key: …, Value: 4354
Key: “unarmed, Value: 4350


In [11]:
#transforming data to iter
data = [(row['Label'], row['Sentence']) for index, row in df.iterrows()]

In [12]:
len(data)

3168

In [13]:
total_length = 0
for _, sentence in data:
    total_length += len(sentence)  # Split sentence into words and count them

avg_length = total_length // len(data)

avg_length

65

In [14]:
text_transform = lambda x: [vocab[token] for token in tokenizer(x)]

In [15]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence


def collate_batch(batch, avg_length):
    label_list, text_list = [], []
    for (label, text) in batch:
        label_list.append(label)
        text = torch.tensor(text_transform(text))
        text_list.append(text[:avg_length])  # Trim or pad to avg_length
    return torch.tensor(label_list), pad_sequence(text_list, padding_value=3.0)

#creating a DataLoader
#we will later test with differnet batch sizes
batch_size = 8
shuffle = True
train_dataloader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=lambda batch: collate_batch(batch, avg_length))


In [16]:
vocab_size = len(vocab)
emb_dim = 100
n_layers = 2
dropout = 0.2
output_dim = 1 #beacuse of binary classification
hid_dim = 256

#Creating the model

In [17]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, output_dim, n_layers, dropout):
        super(RNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, emb_dim)

        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)



    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        #output is hidden layers at each time frame
        output, (hidden, cell) = self.lstm(embedded)

        # Apply dropout
        hidden = self.dropout(hidden[-1, :, :])  # Take the hidden state of the last layer

        # hidden: [batch_size, hid_dim]

        # Fully connected layer
        output = self.fc_out(hidden)

        return output

#Training the model

In [21]:
# Instantiating the model
model = RNN(vocab_size, emb_dim, hid_dim, output_dim, n_layers, dropout)
criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [22]:
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_dataloader:
        optimizer.zero_grad()

        labels,token_ids = batch

        predictions = model(token_ids)

        labels = labels.view(-1, 1)

        loss = criterion(predictions, labels.float())

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    correct_predictions = (predictions > 0.5).eq(labels.byte())

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch [{epoch + 1}/{num_epochs}]: Avg. Loss: {avg_loss:.4f}")


Epoch [1/5]: Avg. Loss: 0.6928
Epoch [2/5]: Avg. Loss: 0.6896
Epoch [3/5]: Avg. Loss: 0.6878
Epoch [4/5]: Avg. Loss: 0.6805
Epoch [5/5]: Avg. Loss: 0.6346
