In [2]:
import re
import string

import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Remove stopwords, perform stemming and vectorize

    Args:
        df (pd.DataFrame): Input dataframe

    Returns:
        pd.DataFrame: Cleaned ready to be ML'd into next week
    """

    # Set up stopword and stem vars
    swords = '|'.join(set(stopwords.words('english')))
    swords_re = f"\\b({swords})\\b"
    stemmer = PorterStemmer()

    # Remove stopwords
    def remove_words(text):
        text = re.sub(swords_re, "", text)  # remove all stopwords
        text = re.sub(f"[{string.punctuation}]", "", text) # remove all punc
        text = re.sub("[0-9]+", "", text) # remove all numbers
        text = re.sub("[^a-zA-Z ]", "", text) # remove all non alpha
        text = re.sub(" +", " ", text) # keep everything at 1 space
        text = text.replace(r'\n', '')
        return text
    df["comment_core"] = df["comment_text"].apply(remove_words)

    # Keep only stems
    def stem_words(text):
        words = text.split(' ')
        # Very few words this long
        stems = [stemmer.stem(word) for word in words if len(word)<=14]
        return " ".join(stems)
    df["comment_core_stem"] = df["comment_core"].apply(stem_words)

    # Get tokenizer
    tokenizer = Tokenizer(num_words=20_000)
    tokenizer.fit_on_texts(df["comment_core_stem"])

    # Convert text to vectors and pad
    comments_int = tokenizer.texts_to_sequences(df["comment_core_stem"])
    comments_pad = pad_sequences(comments_int, maxlen=687)#(max([len(x) for x in comments_int])//2))
    comments_pad = pd.DataFrame(comments_pad)

    comments = df[["comment_text","comment_core_stem"]]
    cleaned = df.drop(["id", "comment_text", "comment_core", "comment_core_stem"], axis=1)

    return cleaned.join(comments_pad), comments, tokenizer

class CommentDataset(Dataset):

    def __init__(self, df):
        targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
        self.df = df
        X = df.drop(targets, axis=1)
        y = df[targets]
        self.y = torch.from_numpy(y.values).to(device)
        self.X = torch.from_numpy(X.values).to(device)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):

        return self.X[idx], self.y[idx]

In [4]:
df = pd.read_csv('../data/train.csv')
cleaned, comments, tokenizer = preprocess(df)
vocabulary = tokenizer.index_word
train_data, test_data, _, _ = train_test_split(cleaned, cleaned, test_size=0.2, random_state=2022)
train_data, valid_data, _, _ = train_test_split(train_data, train_data, test_size=0.2, random_state=2022)
train_dataset = CommentDataset(train_data)
valid_dataset = CommentDataset(valid_data)
test_dataset = CommentDataset(test_data)

del df, tokenizer, train_data, test_data, valid_data, stopwords, cleaned

In [5]:
class RNN(torch.nn.Module):

    def __init__(self, embed_size, hidden_size, vocab_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.word_embeddings = torch.nn.Embedding(vocab_size, embed_size)
        self.i2h = torch.nn.Linear(embed_size + hidden_size, hidden_size)
        self.i2o = torch.nn.Linear(embed_size + hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):

        # print(f"Input size: {input.size()}")
        # print(f"Hidden Size: {hidden.size()}")
        embeds = self.word_embeddings(input)
        # print(f"Embedded size: {embeds.size()}")
        combined = torch.cat((embeds.view(1, -1), hidden), 1)
        # print(f"Combined size: {combined.size()}")
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)


In [6]:
# Set Device

print(f"Using Device: {device}")

# Hyperparams
learning_rate = 0.01
n_hidden = 64
embed_size = 48
rnn = RNN(
    embed_size=embed_size,
    hidden_size=n_hidden,
    output_size=6,
    vocab_size=len(vocabulary)
).to(device)

loss_function = torch.nn.NLLLoss()
optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)
print(f"Vocab Size: {len(vocabulary)}")
print(rnn)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)
# train_feats, train_labels = next(iter(train_dataloader))


Using Device: cuda
Vocab Size: 185669
RNN(
  (word_embeddings): Embedding(185669, 48)
  (i2h): Linear(in_features=112, out_features=64, bias=True)
  (i2o): Linear(in_features=112, out_features=6, bias=True)
  (softmax): LogSoftmax(dim=1)
)


In [12]:
train_losses = []
val_losses = []
epoch_loss = []
epochs = 10
for epoch in range(epochs):
    print(f'Starting on epoch: {epoch+1}')
    for step, (X, y) in enumerate(train_dataloader):
        print(f'\tStarting on step: {step+1}')

        for i in range(len(X)):
            if i%16==0:
                print(f'\t\tStarting on point: {i} of {len(X)}')
            sentence = X[i]
            targets = y[i]

            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            # Also, we need to clear out the hidden state of the LSTM,
            # detaching it from its history on the last instance.
            hidden = rnn.init_hidden()
            rnn.zero_grad()
            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.

            # Step 3. Run our forward pass.
            for i in range(len(sentence)):
                class_scores, hidden = rnn(sentence[i].to(device), hidden.to(device))

            # Class scores are for the last node
            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(class_scores[0].to(device), targets.to(device))
            print(f"Training Loss: {loss}")
            loss.backward()
            optimizer.step()
            break
        break

    with torch.no_grad():
        # Calculate validation at each epoch
        print('\tCalculating Validation Loss')
        for step, (X, y) in enumerate(valid_dataloader):
            if step%100!=0:
                continue

            for i in range(len(X)):
                sentence = X[i]
                targets = y[i]

                for i in range(len(sentence)):
                    class_scores, _ = rnn(sentence[i].to(device), hidden.to(device))

                loss = loss_function(class_scores[0].to(device), targets.to(device))
                val_losses.append(loss)
            print(f"\tGuess: {class_scores}/{targets}")

        print('Total loss for epoch:', loss)
        epoch_loss.append(loss)


Starting on epoch: 1
	Starting on step: 1
		Starting on point: 0 of 32
Loss: 2.327928066253662
Calculating Validation Loss
Total loss for epoch: tensor(2.1802, device='cuda:0')
Starting on epoch: 2
	Starting on step: 1
		Starting on point: 0 of 32
Loss: 2.3719096183776855
Calculating Validation Loss
Total loss for epoch: tensor(1.5853, device='cuda:0')
Starting on epoch: 3
	Starting on step: 1
		Starting on point: 0 of 32
Loss: 1.4840929508209229
Calculating Validation Loss
Total loss for epoch: tensor(1.7042, device='cuda:0')
Starting on epoch: 4
	Starting on step: 1
		Starting on point: 0 of 32
Loss: 2.2166247367858887
Calculating Validation Loss
Total loss for epoch: tensor(2.4714, device='cuda:0')
Starting on epoch: 5
	Starting on step: 1
		Starting on point: 0 of 32
Loss: 1.5216482877731323
Calculating Validation Loss
Total loss for epoch: tensor(1.7088, device='cuda:0')
Starting on epoch: 6
	Starting on step: 1
		Starting on point: 0 of 32
Loss: 1.5628044605255127
Calculating Val

In [7]:
# Convert the sentiment_class from set to list
y_pred = []
y_actual = []
with torch.no_grad():
    for step, (X, y) in enumerate(test_dataloader):
        for i in range(len(X)):
            smp = 14
            sentence = X[smp]
            targets = y[smp]
            hidden = rnn.init_hidden()
            for i in range(len(sentence)):
                class_scores, hidden = rnn(sentence[i].to(device), hidden.to(device))
            print(targets)
            print(class_scores)
            break
        break

tensor([0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([[-1.7920, -1.8169, -1.5521, -1.9179, -2.6748, -1.4139]],
       device='cuda:0')
