In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import model_selection
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from nltk.stem import PorterStemmer
%matplotlib inline
from tqdm import tqdm, notebook
import warnings
warnings.filterwarnings(action='once')
import pickle
import shutil

In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Directory holding the BERT checkpoint files (config.json must be present
# for from_pretrained to build the model).
MODEL_PATH = "bert/"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and classifier are both read from the same local directory.
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)

# NOTE(review): the recorded output of this cell reports classifier.weight and
# classifier.bias as newly initialized, i.e. the classification head is not
# loaded from this checkpoint -- confirm that is intended before trusting the
# scores produced downstream.

# Place the model on the chosen device and switch to evaluation mode.
model.to(device).eval()

print("BERT Model and Tokenizer Loaded Successfully!")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Model and Tokenizer Loaded Successfully!


In [3]:
def convert_lines(texts, max_seq_length, tokenizer):
    """Encode texts as fixed-length rows of token ids.

    Each text is split with ``tokenizer.tokenize``, truncated so that it fits
    together with the ``[CLS]`` and ``[SEP]`` markers, mapped to ids with
    ``tokenizer.convert_tokens_to_ids`` and right-padded with ``0``.

    Args:
        texts: Iterable of strings to encode.
        max_seq_length: Length of every output row, including the two
            special tokens. Must be at least 2.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.

    Returns:
        A 2-D numpy integer array of shape ``(len(texts), max_seq_length)``.
        Empty input yields an array of shape ``(0, max_seq_length)``.

    Raises:
        ValueError: If ``max_seq_length`` is smaller than 2, since a row could
            not even hold ``[CLS]`` and ``[SEP]``.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be >= 2 to hold [CLS] and [SEP]")

    # Room left for real tokens once [CLS] and [SEP] are accounted for.
    content_length = max_seq_length - 2

    rows = []
    for text in tqdm(texts):
        pieces = tokenizer.tokenize(text)[:content_length]
        ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + pieces + ["[SEP]"])
        # Pad with id 0 up to the full row length.
        # NOTE(review): assumes 0 is this tokenizer's padding id -- confirm.
        rows.append(ids + [0] * (content_length - len(pieces)))

    if not rows:
        # np.array([]) would be 1-D float; keep the 2-D integer contract.
        return np.empty((0, max_seq_length), dtype=int)
    return np.array(rows)

# Smoke test of convert_lines on two short sentences.
# MAX_SEQUENCE_LENGTH is the row length passed to convert_lines here and
# again when the full dataset is encoded.
MAX_SEQUENCE_LENGTH = 220
texts = [
    "This is an example sentence.",
    "Another example sentence.",
]
sequences = convert_lines(
    texts=texts,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    tokenizer=tokenizer,
)

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 946.37it/s]


In [4]:
# Load the toxic-comment dataset from CSV; the preview below shows the
# columns id, comment_text and true_label.
toxiccomment = pd.read_csv("..//Datasets/toxiccomment/toxiccomment.csv")
# Display the first rows as a quick check that the file parsed as expected.
toxiccomment.head()

Unnamed: 0,id,comment_text,true_label
0,55858b89f99e9bda,Hope he dies \n\nNow this Atheist filth's wife...,1
1,425a1dbdf740e9b8,"2006 (UTC)\n\n Removed Merge 17:15, 5 April",0
2,20c81b99f7adf557,John discuss it here \n\nSeems you don't like ...,0
3,af0dce6ce84974ec,"""\nTo answer your question, no. There is no si...",0
4,a069e6d6d1a2348d,"""\n But Arpad can cite any webpage he finds, o...",0


In [5]:
# Encode every comment into a fixed-length row of token ids.
X_val = convert_lines(
    texts=toxiccomment['comment_text'].tolist(),
    max_seq_length=MAX_SEQUENCE_LENGTH,
    tokenizer=tokenizer,
)

100%|█████████████████████████████████████████████████████████████████████████████| 1378/1378 [00:01<00:00, 839.41it/s]


In [6]:
import torch
# Release cached GPU memory that PyTorch's allocator holds but is not
# currently using; memory backing live tensors (e.g. the loaded model) is
# not affected.
torch.cuda.empty_cache()

In [8]:
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from tqdm.notebook import tqdm  # Use tqdm for progress bar
# Batch size shared by the DataLoader and the slice arithmetic below, so the
# two cannot drift apart.
BATCH_SIZE = 16

# Use the device selected when the model was loaded instead of hard-coding
# 'cuda', so the cell also runs on CPU-only machines.
model.to(device)
model.eval()

# Prepare DataLoader; shuffle=False keeps batch order aligned with X_val rows.
valid_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.long))
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

# One score per row of X_val.
valid_preds = np.zeros(len(X_val))

# Inference only: disable autograd so no computation graph is kept per batch.
with torch.no_grad():
    for i, (x_batch,) in enumerate(tqdm(valid_loader)):
        x_batch = x_batch.to(device)

        # Padding positions hold id 0 (see convert_lines), so every
        # non-zero position is a real token the model should attend to.
        pred = model(x_batch, attention_mask=(x_batch > 0))
        logits = pred.logits

        # NOTE(review): this stores the raw first logit, not a probability.
        # Depending on the head size, a sigmoid or softmax is probably
        # needed before interpreting it as one -- confirm.
        start = i * BATCH_SIZE
        valid_preds[start : start + len(x_batch)] = logits[:, 0].cpu().numpy()

print("Inference Completed!")

  0%|          | 0/87 [00:00<?, ?it/s]

Inference Completed!


In [10]:
# Inspect the stored scores (numpy abbreviates long arrays when displayed).
valid_preds

array([0.16908528, 0.32328048, 0.20984346, ..., 0.16901615, 0.10675704,
       0.32662147])

In [12]:
# Attach the scores to the frame, one per row.
# NOTE(review): valid_preds is filled from `logits[:, 0]` in the inference
# cell, i.e. raw model outputs rather than normalised probabilities --
# confirm whether a sigmoid/softmax is intended before treating this column
# as a probability.
toxiccomment['pred_probability'] = valid_preds

In [14]:
# Preview the frame again to check the new pred_probability column.
toxiccomment.head()

Unnamed: 0,id,comment_text,true_label,pred_probability
0,55858b89f99e9bda,Hope he dies \n\nNow this Atheist filth's wife...,1,0.169085
1,425a1dbdf740e9b8,"2006 (UTC)\n\n Removed Merge 17:15, 5 April",0,0.32328
2,20c81b99f7adf557,John discuss it here \n\nSeems you don't like ...,0,0.209843
3,af0dce6ce84974ec,"""\nTo answer your question, no. There is no si...",0,0.16297
4,a069e6d6d1a2348d,"""\n But Arpad can cite any webpage he finds, o...",0,0.068898


In [16]:
# Write the frame, including pred_probability, to CSV in the working
# directory; index=False leaves the DataFrame index out of the file.
toxiccomment.to_csv('bertvanilla_toxiccomment.csv',index=False)