In [None]:
import pandas as pd
import numpy as np
import re
import json
import regex
import nltk # TODO: learnable POS encoder to add to model
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from nltk import pos_tag
from transformers import AutoTokenizer

In [None]:
# Read the train and test splits (in that order) from the local data/ folder.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [None]:
def clean_text(text):
    """Normalise one comment: blank out HTML tags, lowercase, trim.

    Anything that is not already a ``str`` is converted with ``str()`` first.
    Every ``<...>`` span is replaced by a single space; the match is
    non-greedy and, because ``.`` is used without ``re.DOTALL``, never crosses
    a newline. The result is lowercased and stripped of leading/trailing
    whitespace.
    """
    raw = text if isinstance(text, str) else str(text)
    without_tags = re.sub(r'<.*?>', ' ', raw)
    return without_tags.lower().strip()
def load_contractions(file_path="./contractions.json"):
    """Load the contraction lookup table from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content (the rest of the notebook uses it as a dict
        whose keys are contractions and whose values are their expansions),
        or ``None`` if the file does not exist or is not valid JSON. A warning
        describing the failure is emitted in that case so the ``None`` is not
        silent.
    """
    import warnings  # local import keeps this definition self-contained

    try:
        # Decode explicitly as UTF-8 instead of the platform default, so
        # non-ASCII keys (e.g. curly apostrophes) load the same everywhere.
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except (json.JSONDecodeError, FileNotFoundError) as e:
        warnings.warn(f"Could not load contractions from {file_path!r}: {e}")
        return None
contractions = load_contractions()
if contractions is None:
    # load_contractions() returns None for a missing or malformed file; stop
    # here with a clear message instead of failing on `.keys()` below.
    raise RuntimeError(
        "Contractions could not be loaded; check that ./contractions.json "
        "exists and contains valid JSON."
    )

# Regex alternation takes the first alternative that matches, not the longest,
# so list longer keys first: otherwise a key that is a prefix of another one
# (followed by an apostrophe) would win and leave the remainder unexpanded.
_contraction_keys = sorted(contractions, key=len, reverse=True)
contractions_re = re.compile(
    r'\b(' + '|'.join(re.escape(key) for key in _contraction_keys) + r')\b'
    if _contraction_keys
    # With no keys the group would be empty and match at every word boundary;
    # use a pattern that never matches instead.
    else r'(?!)'
)
def expand_contractions(text):
    """Return ``text`` with every match of ``contractions_re`` expanded.

    Each matched substring is looked up verbatim in the module-level
    ``contractions`` mapping and replaced by the value found there.
    """
    def _expansion_for(match):
        # The matched text itself is the lookup key.
        return contractions[match.group(0)]

    return contractions_re.sub(_expansion_for, text)

def process_dataframe(frame):
    """Drop rows without a comment and expand contractions in the rest.

    Args:
        frame: DataFrame containing a ``comment_text`` column.

    Returns:
        A new DataFrame without the rows whose ``comment_text`` is missing and
        with ``comment_text`` passed through ``expand_contractions``. The
        original index labels of the kept rows are preserved and the input
        frame is left unmodified.
    """
    kept = frame.dropna(subset=['comment_text'])
    # Build the result with assign() rather than writing a column into the
    # dropna() result, which can trigger pandas' SettingWithCopyWarning.
    # astype(str) mirrors clean_text's tolerance of non-string values, which
    # would otherwise make the regex substitution raise.
    return kept.assign(
        comment_text=kept["comment_text"].astype(str).apply(expand_contractions)
    )
# Characters that remove_chars() replaces with a single space. The backslash
# is listed once (as `\\`); no other character here needs a backslash in front.
TO_REMOVE = '"()+,-./:;<=>[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€×™√²—'
# Characters that remove_chars() deletes without leaving a space behind
# (presumably so masked words such as "f**k" collapse into one token - confirm).
OBSCENITY = '!#$%&*?@'

# Compile both character classes once instead of on every call.
_TO_REMOVE_RE = re.compile(f"[{re.escape(TO_REMOVE)}]")
_OBSCENITY_RE = re.compile(f"[{re.escape(OBSCENITY)}]")


def remove_chars(text):
    """Strip punctuation and symbols from ``text``.

    Every character listed in ``TO_REMOVE`` becomes a space, then every
    character listed in ``OBSCENITY`` is deleted outright.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so adjacent removed
        characters leave adjacent spaces.
    """
    spaced = _TO_REMOVE_RE.sub(" ", text)
    return _OBSCENITY_RE.sub("", spaced)


In [None]:
# Matches http:// and https:// URLs: the optional character is the "s".
# The trailing \b ends the match at the last word character, so punctuation
# after the URL is left for the later cleaning steps.
URL_PATTERN = r'\bhttps?://\S+\b'


def preprocess_comments(frame):
    """Run the whole comment-cleaning pipeline on one DataFrame.

    Steps, in order: ``process_dataframe`` (drop missing comments, expand
    contractions), replace URLs with the word ``link``, ``clean_text``
    (strip HTML tags, lowercase, trim) and ``remove_chars`` (strip
    punctuation/symbols).

    NOTE(review): contractions are expanded before the text is lowercased, so
    capitalised contractions are only expanded if contractions.json lists
    them in that casing - confirm against the file.

    Args:
        frame: DataFrame containing a ``comment_text`` column.

    Returns:
        A new DataFrame with the cleaned ``comment_text`` column.
    """
    frame = process_dataframe(frame)
    comments = (
        frame['comment_text']
        .str.replace(URL_PATTERN, 'link', regex=True)
        .apply(clean_text)
        .apply(remove_chars)
    )
    return frame.assign(comment_text=comments)


# Same pipeline for both splits.
df_train = preprocess_comments(df_train)
df_test = preprocess_comments(df_test)

In [None]:
# Inputs are the cleaned comments as str; targets are cast to float.
train_text = df_train["comment_text"].astype(str)
train_labels = df_train["target"].astype(float)

# Hold out 10% of the rows for validation; the fixed random_state makes the
# split repeatable. Note that train_labels is rebound here: before this line
# it holds every label, afterwards only those of the training portion.
(train_texts, val_texts, train_labels, val_labels) = train_test_split(
    train_text,
    train_labels,
    test_size=0.1,
    random_state=7,
)
# Results of tagging a single word piece. Each piece is tagged on its own (as
# a one-element list), so the result is reused whenever the same piece appears
# again. Assumes pos_tag() is deterministic for a given one-word input.
_POS_TAG_CACHE = {}


def pos_tag_tokens(tokens, tokenizer):
    """POS-tag every token of one encoded sequence.

    Args:
        tokens: Token ids accepted by ``tokenizer.convert_ids_to_tokens``.
        tokenizer: Tokenizer used to turn the ids back into token strings.
            If it exposes ``all_special_tokens``, all of those tokens are
            treated as special.

    Returns:
        A list with one ``(word, tag)`` tuple per token, in input order.
        Special tokens (always ``[CLS]`` and ``[SEP]``, plus whatever the
        tokenizer declares, e.g. padding) get the tag ``'SPECIAL'`` and are
        never sent to the tagger. Every other token has its ``##``
        continuation marker removed and is tagged in isolation, i.e. without
        sentence context.
    """
    special_tokens = {'[CLS]', '[SEP]'}
    special_tokens.update(getattr(tokenizer, 'all_special_tokens', ()))

    posTags = []
    for token in tokenizer.convert_ids_to_tokens(tokens):
        if token in special_tokens:
            # Same (word, tag) order as pos_tag() output, so consumers can
            # unpack every entry the same way.
            posTags.append((token, 'SPECIAL'))
            continue
        word = token.replace('##', '')
        if word not in _POS_TAG_CACHE:
            _POS_TAG_CACHE[word] = pos_tag([word])[0]
        posTags.append(_POS_TAG_CACHE[word])

    return posTags
# POS tags to assign ids to.
tags = ['LS', 'TO', 'VBN', "''", 'WP', 'UH', 'VBG', 'JJ', 'VBZ', '--', 'VBP', 'NN', 'DT', 'PRP', ':', 'WP$', 'NNPS', 'PRP$', 'WDT', '(', ')', '.', ',', '``', '$', 'RB', 'RBR', 'RBS', 'VBD', 'IN', 'FW', 'RP', 'JJR', 'JJS', 'PDT', 'MD', 'VB', 'WRB', 'NNP', 'EX', 'NNS', 'SYM', 'CC', 'CD', 'POS']
# Tag -> integer id. Ids 0-5 are fixed up front; 0 is the id of 'SPECIAL'.
taggerToId = {'SPECIAL': 0, 'NN': 1, 'VB': 2, 'JJ': 3, 'RB': 4, 'IN': 5}
# Highest id handed out so far.
maxInd = 5
for tag in tags:
    if tag not in taggerToId:
        # Give every remaining tag the next free integer id (not the tag
        # string itself), so all values in the mapping are ints.
        maxInd += 1
        taggerToId[tag] = maxInd
def convertToInd(tags):
    """Translate a sequence of two-element entries into integer tag ids.

    The second element of each entry is looked up in ``taggerToId``; labels
    that are not in the mapping fall back to id 0.
    """
    indices = []
    for _word, label in tags:
        indices.append(taggerToId.get(label, 0))
    return indices


In [None]:
# Load the bert-base-uncased tokenizer, asking for the fast implementation,
# and print whether that is what was actually returned.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    use_fast=True,
)
print(tokenizer.is_fast)

# Encode each split in one batch: sequences are truncated to at most 512
# tokens and padded (padding=True) so every sequence in a split has the
# same length.
train_toks = tokenizer(
    train_texts.tolist(),
    truncation=True,
    padding=True,
    max_length=512,
)
val_toks = tokenizer(
    val_texts.tolist(),
    truncation=True,
    padding=True,
    max_length=512,
)

In [None]:
# One list of POS-tag tuples per encoded sequence, for each split.
train_tags = [pos_tag_tokens(ids, tokenizer) for ids in train_toks['input_ids']]
val_tags = [pos_tag_tokens(ids, tokenizer) for ids in val_toks['input_ids']]

# The same sequences expressed as integer tag ids.
trainTagInds = [convertToInd(seq_tags) for seq_tags in train_tags]
valTagInds = [convertToInd(seq_tags) for seq_tags in val_tags]

# Preview only the first tokens of the first two sequences; displaying the
# full train_tags list would write the tags of the entire training set into
# the cell output.
[seq_tags[:20] for seq_tags in train_tags[:2]]
