In [None]:
import pandas as pd
import numpy as np
import re
from google.colab import drive

import torch
from transformers import BertTokenizer, BertModel, BertConfig

from tqdm import tqdm, notebook

In [None]:
# Read the comments file and draw a reproducible random sample of `n` rows.
df = pd.read_csv('toxic_comments.csv')
n = 400
df_bert_400 = (
    df
    .sample(n=n, random_state=42)   # fixed seed -> same rows on every run
    .reset_index(drop=True)         # discard the original row labels
)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Draw a fixed-seed sample from each class: 250 rows with toxic == 1 and
# 200 rows with toxic == 0.
df_bert_b1 = df.loc[df['toxic'] == 1].sample(n=250, random_state=42)
df_bert_b2 = df.loc[df['toxic'] == 0].sample(n=200, random_state=42)

# Stack both samples and shuffle them. `frac` is a fraction of rows, so the
# full shuffle is spelled frac=1 rather than relying on True being coerced to 1.
# NOTE(review): the split is 250/200, not exactly balanced despite the
# variable name - confirm this is intended.
df_bert_balanced = (
    pd.concat([df_bert_b1, df_bert_b2], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Class proportions and frame shape as a quick sanity check.
print(df_bert_balanced['toxic'].value_counts(normalize=True),
      df_bert_balanced.shape)

1    0.555556
0    0.444444
Name: toxic, dtype: float64 (450, 2)


In [None]:
def clean(text):
    """Normalise a comment to lowercase ASCII letters separated by spaces.

    Steps:
      1. Lowercase the text.
      2. Replace every whitespace character with a plain space, so that
         newlines, carriage returns, tabs, etc. all keep words apart.
      3. Delete every character that is not ``a-z`` or a space.
      4. Strip leading and trailing spaces.

    Runs of spaces inside the text are kept as they are.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # Treat *all* whitespace as a separator. Converting only \n and \r would
    # let the letter filter below delete tabs and glue neighbouring words
    # together ("a\tb" -> "ab").
    text = re.sub(r"\s", " ", text)
    # The text is already lowercase, so a-z covers every letter that can remain.
    text = re.sub(r"[^a-z ]+", "", text).strip()

    return text

# Store the cleaned version of each comment in a new column beside the raw
# text, then preview the first rows.
df_bert_balanced['text_clean'] = (
    df_bert_balanced['text'].apply(clean)
)
display(df_bert_balanced.head())

Unnamed: 0,text,toxic,text_clean
0,"""\n\n Good work \n\n The Tireless Contributor...",0,good work the tireless contributor barnsta...
1,"Actually \n\nDear Sir/Madame,\n\nThank you for...",0,actually dear sirmadame thank you for your ...
2,"No wonder I'm being uncivil, when you're stupi...",1,no wonder im being uncivil when youre stupid l...
3,Hey Riana\n\nDoes it suck to know that even st...,1,hey riana does it suck to know that even stra...
4,"""Complaints are repetitive when they are not r...",0,complaints are repetitive when they are not re...


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every cleaned comment to a list of token ids, with special tokens
# added. truncation=True makes the 512-token cap explicit instead of relying
# on the implicit fallback that emits a warning for every run.
tokenized = df_bert_balanced['text_clean'].apply(
    lambda x: tokenizer.encode(
        x, add_special_tokens=True, max_length=512, truncation=True
    )
)

# Longest encoded sequence; default=0 keeps an empty frame from raising.
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Right-pad every sequence with 0 so all rows share the same length.
# NOTE(review): assumes 0 is the tokenizer's padding id - confirm against
# tokenizer.pad_token_id.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

# 1 where a real token id is present, 0 on padding positions.
attention_mask = np.where(padded != 0, 1, 0)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Load the configuration for the 'bert-base-uncased' checkpoint, then build
# the model from the same checkpoint using that configuration.
config = BertConfig.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained(
    'bert-base-uncased',
    config=config,
)

In [None]:
batch_size = 150
embeddings = []

# Ceiling division: a trailing partial batch must be embedded too. Flooring
# here would silently drop the last rows whenever the row count is not a
# multiple of batch_size, leaving the features shorter than the label frame.
n_batches = (padded.shape[0] + batch_size - 1) // batch_size

for i in notebook.tqdm(range(n_batches)):
    # Slicing past the end of the array is safe; the final batch is just shorter.
    batch_rows = slice(batch_size * i, batch_size * (i + 1))
    batch = torch.LongTensor(padded[batch_rows])
    attention_mask_batch = torch.LongTensor(attention_mask[batch_rows])

    # Inference only - no gradients are needed.
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)

    # From the first model output keep, for every sequence, the vector at
    # position 0 (presumably the [CLS] token, since special tokens were added
    # during encoding).
    embeddings.append(batch_embeddings[0][:, 0, :].numpy())

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [None]:
# Join the per-batch arrays row-wise into one feature matrix, then print its
# shape beside the frame's shape so the row counts can be compared.
features = np.concatenate(embeddings, axis=0)
print(features.shape, df_bert_balanced.shape)

(450, 768) (450, 3)


In [None]:
# Write the feature matrix and the sampled frame to CSV files, both without
# the index column.
pd.DataFrame(features).to_csv(
    'features_bert.csv',
    index=False,
)
df_bert_balanced.to_csv(
    'df_bert_balanced.csv',
    index=False,
)