# Google Colab

In [None]:
# Upload the dataset when running inside Google Colab; otherwise fall through
# and rely on a local copy of the file.
try:
    from google.colab import files
except ImportError:
    # google.colab is only importable inside a Colab runtime
    print('Running locally')
else:
    uploaded = files.upload()  # upload input.txt

# Load Data

In [23]:
#Packages
import torch
import torch.nn as nn
import torch.nn.functional as F

# Hyperparameters
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Data / batching
max_len = 22            # fixed length every word is padded or truncated to
batch_size = 64         # number of words sampled per batch

# Model size
n_embd = 128            # embedding dimension per character
n_hidden = 64           # LSTM hidden state size
n_layer = 1             # number of stacked LSTM layers

# Optimisation schedule
max_iters = 100         # number of training steps (one batch each)
eval_iters = 100        # batches averaged per split when estimating the loss
eval_interval = 100     # steps between loss reports
learning_rate = 1e-4    # AdamW step size

# Fix the RNG seed so runs are repeatable
torch.manual_seed(1337)

# Load data: every non-empty line of input.txt is "<word>,<integer label>".
# NOTE(review): the file is read as UTF-8 — confirm that matches how it was saved.
with open('input.txt', encoding='utf-8') as f: lines = f.read().splitlines()
words, labels = [], []
for line in lines:
    line = line.strip()
    if not line:
        continue  # tolerate blank lines (e.g. a trailing newline at end of file)
    # Split on the LAST comma only, so a word that itself contains a comma still parses
    word, label = line.rsplit(',', 1)
    words.append(word.lower())
    labels.append(int(label))

# Build vocabulary: characters seen in the words get ids 1..len(chars);
# id 0 is assigned to '.' and is also what encode_word uses for padding
# and for characters missing from the table.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
vocab_size = len(chars) + 1

# Encode a single word
def encode_word(w):
    """Return `w` as a list of exactly `max_len` character ids.

    Characters are looked up in `stoi`; any character not present maps to 0.
    Words longer than `max_len` are truncated, shorter ones are right-padded
    with 0.
    """
    ids = [stoi.get(ch, 0) for ch in w][:max_len]
    padding = [0] * (max_len - len(ids))
    return ids + padding

# Encode data: one row of max_len character ids per word, float labels for the BCE loss
encoded_words = list(map(encode_word, words))
X = torch.tensor(encoded_words, dtype=torch.long)
Y = torch.tensor(labels, dtype=torch.float)

# Split data into train/val subsets: first 90% of rows for training, the rest for validation.
# NOTE(review): rows keep their file order (no shuffle) — confirm input.txt is not sorted by label.
split_idx = int(0.9 * len(X))
X_train, X_val = X[:split_idx], X[split_idx:]
Y_train, Y_val = Y[:split_idx], Y[split_idx:]

# Inspect Data

# Model Class

In [24]:
# Model
class LSTM(nn.Module):
    """Character-level LSTM that emits one logit per input sequence.

    Pipeline: character ids -> embedding -> LSTM -> linear layer applied to
    the last layer's final hidden state.
    """

    def __init__(self, vocab_size, n_embd, n_hidden, n_layer):
        """Create the embedding table, the LSTM stack and the output layer.

        Args:
            vocab_size: number of rows in the embedding table.
            n_embd: embedding dimension per character.
            n_hidden: LSTM hidden state size.
            n_layer: number of stacked LSTM layers.
        """
        super().__init__()
        # Index 0 is the padding id, so its embedding vector is excluded from gradient updates
        self.embedding = nn.Embedding(vocab_size, n_embd, padding_idx=0)
        # batch_first=True: inputs are laid out as (batch, sequence, feature)
        self.lstm = nn.LSTM(n_embd, n_hidden, num_layers=n_layer, batch_first=True)
        # Single output unit -> one logit per sequence
        self.fc = nn.Linear(n_hidden, 1)

    def forward(self, x, targets=None):
        """Return `(logits, loss)` for a batch of encoded words.

        Args:
            x: integer tensor of character ids, one row per word.
            targets: optional float tensor of labels, one per row of `x`.

        Returns:
            logits: one logit per row of `x`.
            loss: binary cross-entropy with logits against `targets`, or
                None when `targets` is not given.
        """
        embedded = self.embedding(x)
        _, (final_hidden, _) = self.lstm(embedded)
        top_layer_state = final_hidden[-1]              # final hidden state of the last LSTM layer
        logits = self.fc(top_layer_state).squeeze(1)    # drop the size-1 output dimension
        if targets is None:
            return logits, None
        return logits, F.binary_cross_entropy_with_logits(logits, targets)

# Init model: build the classifier, move it to the selected device,
# then hand its parameters to an AdamW optimizer
model = LSTM(vocab_size=vocab_size, n_embd=n_embd, n_hidden=n_hidden, n_layer=n_layer)
model = model.to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

# Evaluate Model

In [31]:
# Data chunk loading
def get_batch(split):
    """Sample `batch_size` random (input, label) pairs from one split.

    Args:
        split: 'train' selects the training tensors; any other value selects
            the validation tensors.

    Returns:
        Tuple `(x, y)` of the sampled rows, both moved to `device`. Rows are
        drawn with replacement via `torch.randint`.
    """
    if split == 'train':
        features, targets = X_train, Y_train
    else:
        features, targets = X_val, Y_val
    picks = torch.randint(len(features), (batch_size,))
    return features[picks].to(device), targets[picks].to(device)

# Evaluate model
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    For each split, draws `eval_iters` random batches with `get_batch`, runs
    the model in eval mode without gradient tracking, and averages the batch
    losses.

    Returns:
        Dict with keys 'train' and 'val', each a 0-dim tensor holding the
        mean loss for that split.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            # Average once, after every batch loss for this split has been recorded
            out[split] = losses.mean()
    finally:
        # Always hand the model back in training mode, even if evaluation failed
        model.train()
    return out

# Train Model

In [None]:
# Training loop (the counter is named `step` so the builtin `iter` is not shadowed)
for step in range(max_iters):
    # Report averaged train/val loss every `eval_interval` steps and on the final step
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sample a batch of data
    xb, yb = get_batch('train')

    # Forward pass, then clear stale gradients, backpropagate and update the weights
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Save model weights to the working directory
torch.save(model.state_dict(), 'profanity_model.pt')

# Download model from Google Colab.
# `files` only exists when the google.colab import succeeded; outside Colab the
# name is undefined, which is the one case treated as "running locally".
try:
    files.download('profanity_model.pt')
except NameError:
    print('Running locally')

# Test Model

In [None]:
# Stat counts
bad_count = 0
good_count = 0
bad_correct = 0
bad_wrong = 0
good_correct = 0
good_wrong = 0
threshold = 0.5

# Bulk test over every labelled line of input.txt.
# This is the same file the training split is built from, so the accuracies
# below include examples the model was trained on.
with open('input.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Separate the word from its label so the label text is never fed to
        # the model, and lowercase the word exactly as the training data was.
        word, label = line.rsplit(',', 1)
        label = int(label)
        if label not in (0, 1):
            continue  # only lines labelled 0 (good) or 1 (bad) are scored
        x = torch.tensor(encode_word(word.lower()), dtype=torch.long).unsqueeze(0).to(device) # (1, max_len)
        with torch.no_grad():
            logit, _ = model(x)
        prob = torch.sigmoid(logit).item()
        predicted_bad = prob > threshold
        if label == 1:
            bad_count += 1
            if predicted_bad: bad_correct += 1
            else: bad_wrong += 1
        else:
            good_count += 1
            if predicted_bad: good_wrong += 1
            else: good_correct += 1

# Calculate results (a class with no examples reports 0% instead of dividing by zero)
total_count = good_count + bad_count
good_acc = good_correct / good_count * 100 if good_count else 0.0
bad_acc = bad_correct / bad_count * 100 if bad_count else 0.0
overall_acc = (good_correct + bad_correct) / total_count * 100 if total_count else 0.0
print(f"Bad accuracy: {bad_acc:.2f}%")
print(f"Good accuracy: {good_acc:.2f}%")
print(f"Overall accuracy: {overall_acc:.2f}%")


# Main loop: classify words typed by the user until they enter 'exit'
print("Type a word to check if it's profanity. Type 'exit' to quit.")
while True:
    try:
        word = input("Enter word: ").strip().lower()
    except EOFError:
        break  # stdin closed: treat it like 'exit'
    if word == 'exit': break
    if not word:
        continue  # nothing typed; an all-padding sequence is not worth scoring
    x = torch.tensor(encode_word(word), dtype=torch.long).unsqueeze(0).to(device) # (1, max_len)
    with torch.no_grad():
        logit, _ = model(x)
    prob = torch.sigmoid(logit).item()
    print(prob)
    # Use the same decision threshold as the bulk test above
    print("Profanity" if prob > threshold else "Not profanity")