# BERT transformer for binary classification

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from simpletransformers.classification import ClassificationModel
from transformers import AutoTokenizer, BertForSequenceClassification



## Load the data and build X and y

In [1]:
# Load the merged comments dataset from disk
# (the naives_bayes notebook shows what this dataset looks like).
merged_df = pd.read_csv(filepath_or_buffer='merged_all_modified_csv.csv')

In [2]:
# Model input is the comment text column (presumably cleaned and lower-cased,
# going by its name); the target is the label column.
X = merged_df['comment_cleaned_lower']
y = merged_df['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [3]:
# Display the shapes of the train and test feature series.
(X_train.shape, X_test.shape)

((29751,), (12751,))

## DistilBERT: single-comment inference check

In [4]:
# Sanity check: tokenize one training comment and run it through DistilBERT.
# NOTE: "distilbert-base-uncased" is a base checkpoint without a fine-tuned
# classification head, so the head is freshly initialised here and the predicted
# label is not meaningful until the model has been fine-tuned.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Positional access: X_train keeps the shuffled original index after the split, so
# a label lookup such as X_train[42000] raises KeyError whenever that row ended up
# in the test partition. str() guards against non-string cells, and truncation
# keeps long comments within the model's maximum sequence length.
inputs = tokenizer(str(X_train.iloc[0]), truncation=True, return_tensors="pt")

# Inference only, so no gradients are needed.
with torch.no_grad():
    logits = model(**inputs).logits

# Index of the highest-scoring class for the single input, mapped to its label name.
predicted_class_id = logits.argmax(dim=-1).item()
model.config.id2label[predicted_class_id]

2023-02-02 16:07:43.841395: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-02 16:07:43.975755: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-02 16:07:44.010053: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-02-02 16:07:44.690901: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

'LABEL_1'

## BERT for sequence classification

In [5]:
# Disabled counterpart: the first 40000 rows would form the training frame.
# train_df = merged_df[:40000]

# Evaluation frame: every row from position 40000 to the end of the dataset.
eval_df = merged_df.iloc[40000:]

In [22]:
# Disabled experiment: fine-tune RoBERTa through simpletransformers on CPU.
# Every statement below is commented out, so nothing in this cell runs.

# Create a TransformerModel
#model = ClassificationModel('roberta', 'roberta-base', use_cuda=False)

# Train the model on the first 3000 rows (text and label columns only)
#model.train_model(merged_df[['comment_cleaned_lower', 'label']][:3000])

# Evaluate the model on at most the first 3000 rows of eval_df
#result, model_outputs, wrong_predictions = model.eval_model(eval_df[:3000])

In [23]:
# Adapted from the multi-label BertForSequenceClassification example in the
# Hugging Face Transformers documentation.
checkpoint = "textattack/bert-base-uncased-yelp-polarity"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# A single load is enough: passing num_labels=len(model.config.id2label) to a second
# from_pretrained call would only rebuild the same model at twice the loading cost.
model = BertForSequenceClassification.from_pretrained(checkpoint, problem_type="multi_label_classification")
num_labels = len(model.config.id2label)

# .iloc makes the positional slice explicit; str() guards against non-string cells;
# truncation keeps long comments within the model's maximum sequence length and
# padding lets the slice be widened to several comments without further changes.
inputs = tokenizer(
    [str(x) for x in X_train.iloc[:1]],
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Nothing is back-propagated in this cell, so both forward passes run without gradients.
with torch.no_grad():
    logits = model(**inputs).logits

    # Multi-label decision rule: keep every class whose sigmoid score exceeds 0.5.
    predicted_class_ids = torch.arange(0, logits.shape[-1])[torch.sigmoid(logits).squeeze(dim=0) > 0.5]

    # Multi-hot float target of shape (1, num_labels), as required by the
    # multi-label loss.
    # NOTE(review): these targets are built from the model's own predictions, so the
    # resulting loss only measures how confident the model is in itself, not how well
    # it fits y_train. Swap in the true labels once their encoding is confirmed.
    labels = torch.sum(
        torch.nn.functional.one_hot(predicted_class_ids[None, :].clone(), num_classes=num_labels), dim=1
    ).to(torch.float)
    loss = model(**inputs, labels=labels).loss

In [24]:
# Show the scalar loss rounded to two decimals.
round(float(loss), 2)

0.02