In [1]:
import re
import string

import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Remove stopwords, perform stemming and vectorize

    Args:
        df (pd.DataFrame): Input dataframe

    Returns:
        pd.DataFrame: Cleaned ready to be ML'd into next week
    """

    # Set up stopword and stem vars
    swords = '|'.join(set(stopwords.words('english')))
    swords_re = f"\\b({swords})\\b"
    stemmer = PorterStemmer()

    # Remove stopwords
    def remove_words(text):
        text = re.sub(swords_re, "", text)  # remove all stopwords
        text = re.sub(f"[{string.punctuation}]", "", text) # remove all punc
        text = re.sub("[0-9]+", "", text) # remove all numbers
        text = re.sub("[^a-zA-Z ]", "", text) # remove all non alpha
        text = re.sub(" +", " ", text) # keep everything at 1 space
        text = text.replace(r'\n', '')
        return text
    df["comment_core"] = df["comment_text"].apply(remove_words)

    # Keep only stems
    def stem_words(text):
        words = text.split(' ')
        # Very few words this long
        stems = [stemmer.stem(word) for word in words if len(word)<=14]
        return " ".join(stems)
    df["comment_core_stem"] = df["comment_core"].apply(stem_words)

    # Get tokenizer
    tokenizer = Tokenizer(num_words=20_000)
    tokenizer.fit_on_texts(df["comment_core_stem"])

    # Convert text to vectors and pad
    comments_int = tokenizer.texts_to_sequences(df["comment_core_stem"])
    comments_pad = pad_sequences(comments_int, maxlen=687)#(max([len(x) for x in comments_int])//2))
    comments_pad = pd.DataFrame(comments_pad)

    comments = df[["comment_text","comment_core_stem"]]
    cleaned = df.drop(["id", "comment_text", "comment_core", "comment_core_stem"], axis=1)

    return cleaned.join(comments_pad), comments, tokenizer

class CommentDataset(Dataset):

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
        y = self.df.iloc[idx][targets]
        X = self.df.drop(targets, axis=1).iloc[idx]
        y = torch.from_numpy(y.values)
        X = torch.from_numpy(X.values)

        return X, y

In [3]:
df = pd.read_csv('../data/train.csv')
cleaned, comments, tokenizer = preprocess(df)
vocabulary = tokenizer.index_word
train_data, test_data, _, _ = train_test_split(cleaned, cleaned, test_size=0.2, random_state=2022)
train_data, valid_data, _, _ = train_test_split(train_data, train_data, test_size=0.2, random_state=2022)
train_dataset = CommentDataset(train_data)
valid_dataset = CommentDataset(valid_data)
test_dataset = CommentDataset(test_data)

del df, tokenizer, train_data, test_data, valid_data, stopwords, cleaned

In [6]:
class RNN(torch.nn.Module):

    def __init__(self, embed_size, hidden_size, vocab_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.word_embeddings = torch.nn.Embedding(vocab_size, embed_size)
        self.i2h = torch.nn.Linear(embed_size + hidden_size, hidden_size)
        self.i2o = torch.nn.Linear(embed_size + hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):

        print(f"Input size: {input.size()}")
        print(f"Hidden Size: {hidden.size()}")
        embeds = self.word_embeddings(input)
        print(f"Embedded size: {embeds.size()}")
        combined = torch.cat((embeds.view(1, -1), hidden), 1)
        print(f"Combined size: {combined.size()}")
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)


In [8]:
# Set Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using Device: {device}")

# Hyperparams
learning_rate = 0.005
n_hidden = 128
rnn = RNN(
    embed_size=100,
    hidden_size=128,
    output_size=6,
    vocab_size=len(vocabulary)
).to(device)

loss_function = torch.nn.NLLLoss()
optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)
print(f"Vocab Size: {len(vocabulary)}")
print(rnn)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)
# train_feats, train_labels = next(iter(train_dataloader))

epochs = 1
for epoch in range(epochs):
    if epoch % 5 == 0:
        print("Finnished epoch " + str(epoch / 30 * 100)  + "%")
    for step, (X, y) in enumerate(train_dataloader):

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        hidden = rnn.init_hidden()
        rnn.zero_grad()
        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.

        # Step 3. Run our forward pass.
        for i in range(len(X)):
            class_scores, hidden = rnn(X[i].to(device), hidden.to(device))

        # Class scores are for the last node
        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(class_scores, y)
        loss.backward()
        optimizer.step()

Using Device: cuda
Vocab Size: 185669
RNN(
  (word_embeddings): Embedding(185669, 100)
  (i2h): Linear(in_features=228, out_features=128, bias=True)
  (i2o): Linear(in_features=228, out_features=6, bias=True)
  (softmax): LogSoftmax(dim=1)
)
Finnished epoch 0.0%
Input size: torch.Size([687])
Hidden Size: torch.Size([1, 128])
Embedded size: torch.Size([687, 100])
Combined size: torch.Size([1, 68828])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x68828 and 228x128)