<a href="https://colab.research.google.com/github/lyuoveta/git_test_model_1000/blob/master/BERT_1_version/BERT_1_clean(correct_incorrect).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [164]:
!pip install transformers
!pip install pytorch_lightning
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
from torchmetrics.functional import accuracy, auroc
from torchmetrics import F1Score
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import multiprocessing
multiprocessing.freeze_support()
import warnings



In [166]:
url = "https://raw.githubusercontent.com/lyuoveta/git_test_model_1000/master/tagging/tagged_data.csv?token=GHSAT0AAAAAACIBYTVUHIIWV4UMNGLD5CBOZKDQ3TA"
df = pd.read_csv(url)

In [167]:
# deleting additional information
df = df.drop([df.columns[0], df.columns[2], df.columns[3], df.columns[4],df.columns[6], df.columns[7], df.columns[8],
              df.columns[9], df.columns[10], df.columns[11],df.columns[12], df.columns[13], df.columns[14],df.columns[15],
              df.columns[16], df.columns[17], df.columns[18], df.columns[19], df.columns[20], df.columns[21], df.columns[22],
             df.columns[23],df.columns[24], df.columns[25], df.columns[26],df.columns[27], df.columns[28], df.columns[29],df.columns[30],
              df.columns[31], df.columns[32], df.columns[33], df.columns[36]], axis=1)
print(df.head())

   ID                                            reviews  Correct review   \
0   1  I have a bad hair lose, i really concern to lo...                1   
1   2  Biotin has been a game-changer for my hair. Af...                1   
2   3                        Not bad but could be better                1   
3   4  For more than a month, I took Nature Made® Hai...                1   
4   5          This is a test review post done by Amar 4                1   

   Incorrect review   
0                  0  
1                  0  
2                  0  
3                  0  
4                  0  


In [168]:
# split data
train_df, val_df = train_test_split(df, test_size=0.05)
print(train_df.shape, val_df.shape)

(950, 4) (50, 4)


In [169]:
LABEL_COLUMNS = df.columns.tolist()[2:]

In [170]:
train_correct = train_df[train_df.iloc[:, 2] == 1]
train_incorrect = train_df[train_df.iloc[:, 3] == 1]

In [171]:
# output of data quantity
count_ones = len(train_df[train_df.iloc[:, 2] == 1])
count_ones_2 = len(train_df[train_df.iloc[:, 3] == 1])
column_name = train_df.columns[2]
column_name_2 = train_df.columns[3]
print(column_name, count_ones, column_name_2, count_ones_2)

Correct review  788 Incorrect review  163


In [172]:
# sample only 200 correct reviews to combat the imbalance
train_df = pd.concat([
  train_incorrect,
  train_correct.sample(200)
])
print(train_df.shape, val_df.shape)

(363, 4) (50, 4)


In [173]:
MAX_TOKEN_COUNT = 512

In [174]:
# Dataset
warnings.filterwarnings("ignore", category=FutureWarning)


class CorrectReviewsDataset(Dataset):
    def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: BertTokenizer,
      max_token_len: int = 128
    ):
      self.tokenizer = tokenizer
      self.data = data
      self.max_token_len = max_token_len
    def __len__(self):
      return len(self.data)
    def __getitem__(self, index: int):
      data_row = self.data.iloc[index]
      reviews = data_row.reviews
      labels = data_row[LABEL_COLUMNS]
      encoding = self.tokenizer.encode_plus(
        reviews,
        add_special_tokens=True,
        max_length=self.max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
        )
      return dict(
        reviews=reviews,
        input_ids=encoding["input_ids"].flatten(),
        attention_mask=encoding["attention_mask"].flatten(),
        #labels = torch.FloatTensor(labels.iloc[0]) #debug but tensor([])
        #labels = torch.FloatTensor(labels)
        labels=torch.FloatTensor(labels.iloc[:])
      )


In [175]:
BERT_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [176]:
# a sample item from the dataset
train_dataset = CorrectReviewsDataset(
  train_df,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)
sample_item = train_dataset[0]

In [177]:
# Tokenization
BERT_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True).cuda()

In [178]:
train_dl = DataLoader(train_dataset, batch_size=8, num_workers=0)

In [179]:
sample_batch = next(iter(train_dl))

In [180]:
sample_batch["input_ids"]

tensor([[  101, 25118,  7937,  ...,     0,     0,     0],
        [  101,  2066,  1408,  ...,     0,     0,     0],
        [  101,  1144,  1251,  ...,     0,     0,     0],
        ...,
        [  101,  1111,  2393,  ...,     0,     0,     0],
        [  101, 18097,  1106,  ...,     0,     0,     0],
        [  101,  4209,  5076,  ...,     0,     0,     0]])

In [181]:
print(sample_batch["input_ids"].shape, sample_batch["attention_mask"].shape)

torch.Size([8, 512]) torch.Size([8, 512])


In [182]:
# BERT
with torch.no_grad():
    output = bert_model(sample_batch["input_ids"].cuda(), sample_batch["attention_mask"].cuda())
    print(output.last_hidden_state.shape, output.pooler_output.shape)
    print(bert_model.config.hidden_size)

torch.Size([8, 512, 768]) torch.Size([8, 768])
768


In [183]:
# custom dataset into a LightningDataModule
class CorrectReviewsDataModule(pl.LightningDataModule):
  def __init__(self, train_df, test_df, tokenizer, batch_size=8, max_token_len=128):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len
  def setup(self, stage=None):
    self.train_dataset = CorrectReviewsDataset(
      self.train_df,
      self.tokenizer,
      self.max_token_len
    )
    self.test_dataset = CorrectReviewsDataset(
      self.test_df,
      self.tokenizer,
      self.max_token_len
    )
  def train_dataloader(self):
    return DataLoader(
      self.train_dataset,
      batch_size=self.batch_size,
      shuffle=True,
      num_workers=0
    )
  def val_dataloader(self):
    return DataLoader(
      self.test_dataset,
      batch_size=self.batch_size,
      num_workers=0
    )
  def test_dataloader(self):
    return DataLoader(
      self.test_dataset,
      batch_size=self.batch_size,
      num_workers=0
    )

In [184]:
# an instance of data module
N_EPOCHS = 10
BATCH_SIZE = 4
data_module = CorrectReviewsDataModule(
  train_df,
  val_df,
  tokenizer,
  batch_size=BATCH_SIZE,
  max_token_len=MAX_TOKEN_COUNT
)

In [185]:
# MODEL
class CorrectReviewsTagger(pl.LightningModule):
    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()
    def forward(self, input_ids, attention_mask, labels=None):
        with torch.no_grad(): # не сохраняем градиенты
            output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output
    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    def on_train_epoch_end(self):
        '''
        labels = []
        predictions = []
        for output in outputs:
            for out_labels in output["labels"].detach().cpu():
                labels.append(out_labels)
            for out_predictions in output["predictions"].detach().cpu():
                predictions.append(out_predictions)
        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)
        for i, name in enumerate(LABEL_COLUMNS):
            class_roc_auc = auroc(predictions[:, i], labels[:, i])
            self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)
        '''
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
            scheduler=scheduler,
            interval='step'
            )
        )

In [186]:
# Optimizer scheduler
dummy_model = nn.Linear(10, 1)
for param in dummy_model.parameters():
    param.requires_grad = True
optimizer = AdamW(params=dummy_model.parameters(), lr=0.001)
warmup_steps = 20
total_training_steps = 100
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=warmup_steps,
  num_training_steps=total_training_steps
)
learning_rate_history = []
for step in range(total_training_steps):
    optimizer.step()
    scheduler.step()
    learning_rate_history.append(optimizer.param_groups[0]['lr'])

In [187]:
steps_per_epoch = len(train_df) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS
warmup_steps = total_training_steps // 5
print(warmup_steps, total_training_steps)

180 900


In [188]:
# an instance of the model
model: CorrectReviewsTagger = CorrectReviewsTagger(
  n_classes=len(LABEL_COLUMNS),
  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps
)

In [189]:
# Evaluation
criterion = nn.BCELoss()
prediction = torch.FloatTensor(
  [10.95873564, 1.07321467, 1.58524066, 0.03839076, 15.72987556, 1.09513213]
)
labels = torch.FloatTensor(
  [1., 0., 0., 0., 1., 0.]
)
print(torch.sigmoid(prediction))
print(criterion(torch.sigmoid(prediction), labels))

_, predictions = model(sample_batch["input_ids"], sample_batch["attention_mask"])
print(predictions)
print(criterion(predictions, sample_batch["labels"]))

tensor([1.0000, 0.7452, 0.8299, 0.5096, 1.0000, 0.7493])
tensor(0.8725)
tensor([[0.4879, 0.4450],
        [0.4975, 0.3948],
        [0.5140, 0.3662],
        [0.4942, 0.4163],
        [0.4857, 0.4780],
        [0.4957, 0.5250],
        [0.5000, 0.4026],
        [0.4855, 0.3667]], grad_fn=<SigmoidBackward0>)
tensor(0.7739, grad_fn=<BinaryCrossEntropyBackward0>)


In [190]:
# Training

checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [191]:
logger = TensorBoardLogger("lightning_logs", name="correct-reviews")

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

In [192]:
trainer = pl.Trainer(
  logger=logger,
  callbacks=[early_stopping_callback, checkpoint_callback],
  max_epochs=N_EPOCHS,
  accelerator="gpu",
  log_every_n_steps = 5
)


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [193]:
trainer.fit(model, data_module)

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:630: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name       | Type      | Params
-----------------------------------------
0 | bert       | BertModel | 108 M 
1 | classifier | Linear    | 1.5 K 
2 | criterion  | BCELoss   | 0     
-----------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.247   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 91: 'val_loss' reached 0.65496 (best 0.65496), saving model to '/content/checkpoints/best-checkpoint-v2.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 182: 'val_loss' reached 0.64977 (best 0.64977), saving model to '/content/checkpoints/best-checkpoint-v2.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 273: 'val_loss' reached 0.64132 (best 0.64132), saving model to '/content/checkpoints/best-checkpoint-v2.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 3, global step 364: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 4, global step 455: 'val_loss' was not in top 1


In [194]:
trainer.test(model, datamodule=data_module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.6424381136894226}]

In [195]:
# Predictions
trained_model = CorrectReviewsTagger.load_from_checkpoint(
  trainer.checkpoint_callback.best_model_path,
  n_classes=len(LABEL_COLUMNS)
)
trained_model.eval()
trained_model.freeze()

In [196]:
# TEST
test_comment = "I have been taking rhodiola for over a year.  It works great for reducing general anxiety.  If I go longer than two days without it, I get an inner jitteriness and feelings of doom.  I have noticed no side effects."
encoding = tokenizer.encode_plus(
  test_comment,
  add_special_tokens=True,
  max_length=512,
  return_token_type_ids=False,
  padding="max_length",
  return_attention_mask=True,
  return_tensors='pt',
)

In [197]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)

trained_model = trained_model.to(device)

_, test_prediction = trained_model(input_ids, attention_mask)
test_prediction = test_prediction.flatten().cpu().numpy()

for label, prediction in zip(LABEL_COLUMNS, test_prediction):
    print(f"{label}: {prediction}")

Correct review : 0.5514571666717529
Incorrect review : 0.4503340423107147


In [198]:
# TEST 2
THRESHOLD = 0.5
test_comment = "as a nourishment for old dry skin, people can not believe I am 74 years old"
encoding = tokenizer.encode_plus(
  test_comment,
  add_special_tokens=True,
  max_length=512,
  return_token_type_ids=False,
  padding="max_length",
  return_attention_mask=True,
  return_tensors='pt',
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)

trained_model = trained_model.to(device)

with torch.no_grad():
    _, test_prediction = trained_model(input_ids, attention_mask)

test_prediction = test_prediction.flatten().cpu().numpy()

for label, prediction in zip(LABEL_COLUMNS, test_prediction):
  if prediction < THRESHOLD:
    continue
  print(f"{label}: {prediction:.4f}")

Correct review : 0.5591


In [199]:
# Evaluation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)
val_dataset = CorrectReviewsDataset(
  val_df,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)
predictions = []
labels = []
for item in tqdm(val_dataset):
  _, prediction = trained_model(
    item["input_ids"].unsqueeze(dim=0).to(device),
    item["attention_mask"].unsqueeze(dim=0).to(device)
  )
  predictions.append(prediction.flatten())
  labels.append(item["labels"].int())
predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()


  0%|          | 0/50 [00:00<?, ?it/s]

In [207]:
task = 'BINARY'
acc = accuracy(predictions, labels, task, threshold=0.5)
print(f"accuracy: {acc.item():.4f}")


accuracy: 0.7900


In [208]:
#the ROC for each tag
print("AUROC per tag")
for i, name in enumerate(LABEL_COLUMNS):
  tag_auroc = roc_auc_score(labels[:, i], predictions[:, i])
  print(f"{name}: {tag_auroc}")

AUROC per tag
Correct review : 0.18
Incorrect review : 0.42250000000000004


In [209]:
y_pred = predictions.numpy()
y_true = labels.numpy()
upper, lower = 1, 0
y_pred = np.where(y_pred > THRESHOLD, upper, lower)
print(classification_report(
  y_true,
  y_pred,
  target_names=LABEL_COLUMNS,
  zero_division=0
))


                   precision    recall  f1-score   support

  Correct review        0.80      1.00      0.89        40
Incorrect review        0.00      0.00      0.00        10

        micro avg       0.78      0.80      0.79        50
        macro avg       0.40      0.50      0.44        50
     weighted avg       0.64      0.80      0.71        50
      samples avg       0.79      0.80      0.79        50

