# BERT + POS tagging for Toxicity Classification 

# In [None]:
import pandas as pd
import numpy as np
import re
import json
import regex
import nltk # TODO: learnable POS encoder to add to model
from transformers import BertTokenizer
from model import CombinedEmbeddingModel
from sklearn.model_selection import train_test_split
from nltk import pos_tag
from transformers import AutoTokenizer
import tensorflow as tf
# Ask TensorFlow to enable memory growth on every GPU it reports
# (set_memory_growth(gpu, True)). When no GPU is listed the loop body never
# runs and nothing is printed.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("ok")  # one "ok" line per detected GPU
    tf.config.experimental.set_memory_growth(gpu, True)
# Download the NLTK resource 'averaged_perceptron_tagger_eng'
# (presumably the model behind nltk.pos_tag used later in this file - confirm).
nltk.download('averaged_perceptron_tagger_eng')

# Read the train/test CSVs; later code reads the 'comment_text', 'target'
# (train) and 'id' (test) columns from these frames.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Preprocessing:
def clean_text(text):
    """Normalize a single comment.

    Non-string input is converted with ``str()``. Every ``<...>`` span on a
    single line (an HTML tag, or anything else between angle brackets) is
    replaced by one space, then the result is lower-cased and stripped of
    leading/trailing whitespace.

    Args:
        text: The raw comment; any object is accepted.

    Returns:
        The cleaned, lower-cased string.
    """
    as_string = text if isinstance(text, str) else str(text)
    # Non-greedy match so two tags on one line are replaced separately.
    without_tags = re.sub(r'<.*?>', ' ', as_string)
    return without_tags.lower().strip()

def load_contractions(file_path="./contractions.json"):
    """Load the contraction -> expansion mapping from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist or does not contain valid JSON. In the fallback case a
        RuntimeWarning is emitted so the missing mapping does not go
        unnoticed.
    """
    import warnings

    try:
        # JSON text is UTF-8; do not depend on the platform default encoding
        # (keys may contain typographic apostrophes).
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except (json.JSONDecodeError, FileNotFoundError) as err:
        warnings.warn(
            f"Could not load contractions from {file_path!r} ({err}); "
            "falling back to an empty mapping.",
            RuntimeWarning,
            stacklevel=2,
        )
        return {}

contractions = load_contractions()
# Longest keys first: regex alternation takes the first alternative that
# matches, so a short key such as "can't" would otherwise win over a longer
# key it is a prefix of (e.g. "can't've") and leave the remainder unexpanded.
_contraction_keys = sorted(contractions, key=len, reverse=True)
# With an empty mapping the pattern r'\b()\b' would match the empty string at
# every word boundary and the lookup contractions[''] would raise KeyError.
# r'(?!)' never matches, so substitution becomes a no-op instead.
contractions_re = re.compile(
    r'\b(' + '|'.join(re.escape(key) for key in _contraction_keys) + r')\b'
    if _contraction_keys
    else r'(?!)'
)

def expand_contractions(text):
    """Return `text` with each match of `contractions_re` replaced.

    The replacement for a match is looked up in the module-level
    `contractions` mapping using the exact matched text as the key.
    """
    def _expansion_for(match):
        return contractions[match.group(0)]

    return contractions_re.sub(_expansion_for, text)

def process_dataframe(frame):
    """Drop rows without a comment and expand contractions in the rest.

    Args:
        frame: DataFrame with a 'comment_text' column.

    Returns:
        A new DataFrame without the rows whose 'comment_text' is missing and
        with `expand_contractions` applied to every remaining comment. The
        input frame is not modified.
    """
    # Explicit copy: the column assignment below then writes to a frame this
    # function owns, which avoids pandas' SettingWithCopyWarning on the result
    # of dropna() and guarantees the caller's frame is left untouched.
    frame = frame.dropna(subset=['comment_text']).copy()
    frame["comment_text"] = frame["comment_text"].apply(expand_contractions)
    return frame

# Characters that remove_chars replaces with a single space.
# The backslash before '×' is written as '\\' - the bare '\×' used previously
# is an invalid escape sequence (SyntaxWarning on recent Python versions);
# the resulting string value is unchanged.
TO_REMOVE = '"()+,-./:;<=>[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—'
# Characters that remove_chars deletes without inserting a space
# (presumably the symbols used to mask letters in obscenities - confirm).
OBSCENITY = '!#$%&*?@'

def remove_chars(text):
    """Strip punctuation and symbols from `text`.

    Every character listed in `TO_REMOVE` becomes a single space; every
    character listed in `OBSCENITY` is then deleted outright.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # re.escape keeps characters such as ']' '^' '-' and '\\' literal inside
    # the character class.
    spaced = re.sub("[" + re.escape(TO_REMOVE) + "]", " ", text)
    return re.sub("[" + re.escape(OBSCENITY) + "]", "", spaced)
# Matches http:// and https:// URLs up to the next whitespace. The previous
# pattern r'\bhttp?\S+\b' made the final "p" optional (not the "s"), so any
# token starting with "htt" was replaced, and its trailing \b left the final
# "/" of a URL behind.
_URL_PATTERN = r'\bhttps?://\S+'


def _normalize_comments(frame):
    """Run the full text normalization on a frame's 'comment_text' column.

    Steps, in order: process_dataframe (drop missing comments, expand
    contractions), replace URLs by the word 'link', clean_text, remove_chars.
    Returns a new frame; the same steps are applied to train and test data.
    """
    # NOTE(review): contractions are expanded before clean_text lower-cases
    # the text, so capitalized contractions are only expanded if the mapping
    # has matching keys - confirm against contractions.json.
    frame = process_dataframe(frame)
    cleaned = (
        frame['comment_text']
        .str.replace(_URL_PATTERN, 'link', regex=True)
        .apply(clean_text)
        .apply(remove_chars)
    )
    return frame.assign(comment_text=cleaned)


df_train = _normalize_comments(df_train)
df_test = _normalize_comments(df_test)

# Train test splitting

# Features are the cleaned comments as str, labels the 'target' column as float.
train_text = df_train["comment_text"].astype(str)
train_labels = df_train["target"].astype(float)
# Hold out 10% of the rows for validation; random_state=7 fixes the shuffle.
# Note that train_labels is rebound here to the training part of the split.
train_texts, val_texts, train_labels, val_labels = train_test_split(train_text, train_labels, test_size=0.1, random_state=7)
# Tokenizer for the "bert-base-uncased" checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
print(tokenizer.is_fast)  # reports whether a fast tokenizer was actually loaded

# Function for POS tagging
# Tokens that get the fixed tag 'SPECIAL' instead of being sent to the tagger.
_SPECIAL_TOKENS = frozenset(['[CLS]', '[SEP]', '[PAD]'])
# word -> (word, tag) results already computed by pos_tag_tokens. Every word
# piece is tagged on its own (as a one-word sentence), so the result depends
# only on the word and can be reused for every later occurrence.
_POS_TAG_CACHE = {}


def pos_tag_tokens(tokens, tokenizer):
    """POS-tag a sequence of token ids, one word piece at a time.

    Args:
        tokens: Sequence of token ids understood by `tokenizer`.
        tokenizer: Object providing ``convert_ids_to_tokens(ids)``.

    Returns:
        A list with one ``(token, tag)`` tuple per input id. '[CLS]', '[SEP]'
        and '[PAD]' are tagged 'SPECIAL'. Every other token has its '##'
        word-piece marker removed and is tagged by ``pos_tag`` as a single
        word, so the tuple holds the stripped word and its tag.
    """
    posTags = []
    for token in tokenizer.convert_ids_to_tokens(tokens):
        if token in _SPECIAL_TOKENS:
            posTags.append((token, 'SPECIAL'))
            continue
        word = token.replace('##', '')
        tagged = _POS_TAG_CACHE.get(word)
        if tagged is None:
            # First time this word is seen: tag it once and remember the
            # result, avoiding one tagger call per token occurrence.
            tagged = pos_tag([word])[0]
            _POS_TAG_CACHE[word] = tagged
        posTags.append(tagged)
    return posTags
# POS tag names that receive an id of their own.
tags = [
    'LS', 'TO', 'VBN', "''", 'WP', 'UH', 'VBG', 'JJ', 'VBZ', '--',
    'VBP', 'NN', 'DT', 'PRP', ':', 'WP$', 'NNPS', 'PRP$', 'WDT', '(',
    ')', '.', ',', '``', '$', 'RB', 'RBR', 'RBS', 'VBD', 'IN',
    'FW', 'RP', 'JJR', 'JJS', 'PDT', 'MD', 'VB', 'WRB', 'NNP', 'EX',
    'NNS', 'SYM', 'CC', 'CD', 'POS',
]
# Ids 0-5 are fixed by hand; every remaining tag is numbered from 6 upwards
# in the order it appears in `tags`.
taggerToId = {'SPECIAL': 0, 'NN': 1, 'VB': 2, 'JJ': 3, 'RB': 4, 'IN': 5}
maxInd = max(taggerToId.values())
for tag in tags:
    if tag in taggerToId:
        continue  # already has one of the fixed ids
    maxInd += 1
    taggerToId[tag] = maxInd
def convertToInd(tags, taggerToId):
    """Map ``(word, tag)`` pairs to integer tag ids.

    Args:
        tags: Iterable of ``(word, tag)`` pairs; the word is ignored.
        taggerToId: Mapping from tag name to integer id.

    Returns:
        A list with one id per pair; tags missing from `taggerToId` map to 0.
    """
    indices = []
    for _word, tag_name in tags:
        indices.append(taggerToId.get(tag_name, 0))
    return indices

# Prepare a tokenizer with batching
def batch_tokenize_and_tag(texts, tokenizer, tokType, batch_size=32, max_length=512):
    """Tokenize `texts` in batches and compute a POS-tag id for every token.

    Each batch is tokenized with truncation and padding to `max_length`, so
    every row is expected to hold exactly `max_length` entries. Each row of
    token ids is then passed through `pos_tag_tokens` and `convertToInd`
    (with the module-level `taggerToId`). A progress line is printed after
    every batch.

    Args:
        texts: Sliceable sequence of strings (a list in this file).
        tokenizer: Callable tokenizer returning a mapping with 'input_ids'
            and 'attention_mask' (assumed to be nested Python lists since no
            tensor format is requested - confirm), and providing
            ``convert_ids_to_tokens``.
        tokType: Label of the data split ("train", "val", "test"). Kept for
            interface compatibility; it is not used by the computation.
        batch_size: Number of texts tokenized per call.
        max_length: Padding/truncation length of every sequence.

    Returns:
        Dict with the tensors "input_ids", "attention_mask" and "pos_tags"
        (int32), each with one row per input text. For empty `texts` all
        three are empty tensors of shape ``(0, max_length)``.
    """
    # len() instead of truthiness so array-like inputs do not raise here.
    if len(texts) == 0:
        # tf.concat cannot concatenate an empty list of batches.
        empty = tf.zeros((0, max_length), dtype=tf.int32)
        return {"input_ids": empty, "attention_mask": empty, "pos_tags": empty}

    input_ids_list = []
    attention_masks_list = []
    pos_tag_list = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        # Tokenize the batch with padding and truncation
        encoded_batch = tokenizer(
            batch_texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        batch_ids = encoded_batch['input_ids']
        input_ids_list.append(tf.convert_to_tensor(batch_ids))
        attention_masks_list.append(tf.convert_to_tensor(encoded_batch['attention_mask']))

        # POS tagging works on the plain id lists directly (no tensor round
        # trip). The ids of each batch are packed into an int32 tensor right
        # away instead of keeping every row as a Python list until the end.
        batch_pos = [
            convertToInd(pos_tag_tokens(ids, tokenizer), taggerToId)
            for ids in batch_ids
        ]
        pos_tag_list.append(tf.convert_to_tensor(batch_pos, dtype=tf.int32))
        print(f"Sample number {i}")

    # Concatenate the per-batch tensors along the sample axis
    return {
        "input_ids": tf.concat(input_ids_list, axis=0),
        "attention_mask": tf.concat(attention_masks_list, axis=0),
        "pos_tags": tf.concat(pos_tag_list, axis=0),
    }
# Tokenize and tag data in batches
# Tokenize and POS-tag in chunks of 2048 texts.
train_data = batch_tokenize_and_tag(train_texts.tolist()[:204800], tokenizer, "train", batch_size=2048) # only the first 204,800 training texts are used
val_data = batch_tokenize_and_tag(val_texts.tolist()[:50000], tokenizer, "val", batch_size=2048) # only the first 50,000 validation texts are used

# Split each result into the BERT inputs (ids + attention mask) and the POS-tag ids
train_toks = {"input_ids": train_data["input_ids"], "attention_mask": train_data["attention_mask"]}
val_toks = {"input_ids": val_data["input_ids"], "attention_mask": val_data["attention_mask"]}
trainTagInds = train_data["pos_tags"]
valTagInds = val_data["pos_tags"]

# Start building model 

# One embedding row per POS-tag id; derived from the tag table (46 entries,
# ids 0..45) instead of repeating the number by hand.
num_secondary_embeddings = len(taggerToId)
embedding_dim = 16
dropout_rate = 0.3
model = CombinedEmbeddingModel(num_secondary_embeddings, embedding_dim, dropout_rate)
# Keep exactly as many labels as samples were tokenized, so the label count
# follows the tokenized subset rather than a second copy of its size.
train_labels = tf.convert_to_tensor(train_labels[:trainTagInds.shape[0]])
val_labels = tf.convert_to_tensor(val_labels[:valTagInds.shape[0]])
# from_logits=False: the model output is treated as a probability.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
# NOTE(review): if 'target' holds fractional scores rather than 0/1 labels,
# BinaryAccuracy and AUC may not be meaningful as-is - confirm.
metrics = [tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()]
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), loss=loss, metrics=metrics)
# Model inputs are (input_ids, attention_mask, pos_tag_ids) tuples.
val_ins = (val_toks["input_ids"], val_toks["attention_mask"], valTagInds)
train_ins = (train_toks["input_ids"], train_toks["attention_mask"], trainTagInds)
# `checkpoint` was passed to fit() but never defined in this file, which
# raises NameError only after the expensive tokenization. Keep the weights of
# the epoch with the lowest validation loss.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="best_model.weights.h5",
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True,
)
model.fit(x=train_ins, y=train_labels, validation_data=(val_ins, val_labels), epochs=10, batch_size = 64, callbacks=[checkpoint], verbose=1)
# Tokenize and POS-tag every test comment (no subsampling here) and assemble
# the same (input_ids, attention_mask, pos_tag_ids) input tuple used for training.
test_text = df_test["comment_text"].astype(str)
test_data = batch_tokenize_and_tag(test_text.tolist(), tokenizer, "test", batch_size=2048)
test_toks = {"input_ids": test_data["input_ids"], "attention_mask": test_data["attention_mask"]}
testTagInds = test_data["pos_tags"]
test_ins = (test_toks["input_ids"], test_toks["attention_mask"], testTagInds)
def save_submission(final_prediction, test_df):
    """Write the predictions to 'submission.csv' in the working directory.

    Args:
        final_prediction: One prediction per row of `test_df`.
        test_df: DataFrame whose `id` column identifies the rows.

    The file has the two columns 'id' and 'prediction' and no index column.
    """
    columns = {
        'id': test_df.id,
        'prediction': final_prediction,
    }
    pd.DataFrame(columns).to_csv('submission.csv', index=False)
# Predict on the test inputs, flatten the model output to one value per row,
# and write it next to the test ids.
preds = model.predict(test_ins, verbose=1).flatten()
save_submission(preds, df_test)
