In [1]:
import os

# Move the working directory two levels up from where the notebook was started
# (presumably the project root, since later cells use 'scripts/...' and
# 'data/...' relative paths -- confirm against the repository layout).
#
# The target directory is computed only on the first execution in a kernel
# session and then reused, so re-running this cell re-enters the same
# directory instead of climbing two more levels each time.
if 'NOTEBOOK_PROJECT_ROOT' not in globals():
    NOTEBOOK_PROJECT_ROOT = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir)
    )
os.chdir(NOTEBOOK_PROJECT_ROOT)

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
import matplotlib.pyplot as plt
import seaborn as sns

from scripts.bert_utils import *

# Select the compute device: use the GPU when CUDA is available, otherwise fall
# back to the CPU so the notebook still runs on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Read the preprocessed comments (first CSV column becomes the index) and
# discard every row that contains a missing value, then preview the result.
comments = (
    pd.read_csv('data/comments/preprocessed_comments.csv', index_col=0)
    .dropna()
)
comments.head()

Unnamed: 0,toxic,comment_text
0,0,explanation why the edits make under my userna...
1,0,d'aww ! he match this background colour i be s...
2,0,"hey man , i be really not try to edit war . it..."
3,0,`` more i can not make any real suggestion on ...
4,0,"you , sir , be my hero . any chance you rememb..."


In [4]:
# First split: hold out 20% of the comments as the test set, stratified on the
# 'toxic' label so both parts keep the same class proportions.
temp_text, test_text, temp_labels, test_labels = train_test_split(
    comments['comment_text'],
    comments['toxic'],
    test_size=0.2,
    stratify=comments['toxic'],
    random_state=0,
)

# Second split: carve 20% of the remaining data off as the validation set,
# again stratified on the labels; the rest is the training set.
train_text, val_text, train_labels, val_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.2,
    stratify=temp_labels,
    random_state=0,
)

In [5]:
# Checkpoint identifier shared by the model and its tokenizer.
_bert_checkpoint = 'bert-base-uncased'

# Pretrained BERT-base encoder.
bert = AutoModel.from_pretrained(_bert_checkpoint)

# Matching fast tokenizer for the same checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(_bert_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Tokenizer demo: encode a one-element batch with padding=True and print the
# resulting encoding (input_ids, token_type_ids, attention_mask).
text = ["This is the first sentence. This is the second sentence. This is the third sentence"]
sent_id = tokenizer.batch_encode_plus(
    text,
    padding=True,
)
print(sent_id)

{'input_ids': [[101, 2023, 2003, 1996, 2034, 6251, 1012, 2023, 2003, 1996, 2117, 6251, 1012, 2023, 2003, 1996, 2353, 6251, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [7]:
# Tokenizer demo: same input, but truncated / padded to a fixed length of 10
# tokens, then printed for comparison with the unconstrained encoding.
text = ["This is the first sentence. This is the second sentence. This is the third sentence"]
sent_id = tokenizer.batch_encode_plus(
    text,
    truncation=True,
    padding='max_length',
    max_length=10,
)
print(sent_id)

{'input_ids': [[101, 2023, 2003, 1996, 2034, 6251, 1012, 2023, 2003, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [8]:
# Number of comments labelled non-toxic (toxic == 0).
int(comments['toxic'].eq(0).sum())

144277

In [9]:
# Number of comments labelled toxic (toxic == 1).
int(comments['toxic'].eq(1).sum())

15294