In [None]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree       
Reading state information... Done
fonts-nanum is already the newest version (20170925-1).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 10 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/var/cache/fontconfig: cleaning cache directory
/root/.cache/fontconfig: not cleaning non-existent cache directory
/root/.fontconfig: not cleaning non-existent cache directory
fc-cache: succeeded


In [None]:
!pip install transformers==4.12.3 --quiet
!pip install pytorch-lightning==1.5.0 --quiet

In [None]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

import pytorch_lightning as pl
# from pytorch_lightning.metrics.functional import accuracy, f1, auroc
# from torchmetrics import accuracy, auroc, f1
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_format='retina'

RANDOM_SEED = 42

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

pl.seed_everything(RANDOM_SEED)

Global seed set to 42


42

## Data load




In [None]:
df = pd.read_excel('review_df_50.xlsx')
df_unlabeled = pd.read_excel('unlabeled_df_50.xlsx')
del df['Unnamed: 0']
del df_unlabeled['Unnamed: 0']

In [None]:
train_df, val_df = train_test_split(df, test_size=0.3)
train_df.shape, val_df.shape

((8092, 6), (3469, 6))

In [None]:
df_unlabeled.head()

Unnamed: 0,book_origin_title,review_rating,review_en_review,review_helpful,pos,neg
0,난장이가 쏘아올린 작은 공,4,Seoul in the 70's. The difference between urba...,27.0,,
1,흰,3,I think that the sensitive thing is not bad as...,5.0,,
2,모든 순간이 너였다,4,I will feel kind.,25.0,,
3,모든 순간이 너였다,4,I was able to read including the author's feel...,7.0,,
4,모든 순간이 너였다,4,Replace and read it\nI was moved by the number...,8.0,,


In [None]:
df.head()

Unnamed: 0,book_origin_title,review_rating,review_en_review,review_helpful,pos,neg
0,한국 최초의 페미니스트 작가들,5,The stories contained in this book are about w...,0.0,1,0
1,난장이가 쏘아올린 작은 공,5,It was a novel that shifted by ordinary expect...,50.0,1,0
2,난장이가 쏘아올린 작은 공,5,"Anyway, it is a strange sense of reading. I ca...",19.0,1,0
3,수박 수영장,5,I could buy it in summer of my child. It is re...,5.0,1,0
4,수박 수영장,5,○ The sentence is quite small\n○ Illustration ...,5.0,1,0


## Tokenization

In [None]:
BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [None]:
# token_counts = []
# for _, row in train_df.iterrows():
#   token_count = len(tokenizer.encode(row["review"],
#                                      max_length=512,
#                                      truncation=True))
#   token_counts.append(token_count)

In [None]:
# sns.histplot(token_counts)
# plt.xlim([0, 512]);

In [None]:
LABEL_COLUMNS = df.columns.tolist()[-2:]

MAX_TOKEN_COUNT = 400
LABEL_COLUMNS

['pos', 'neg']

## Dataset
레이블->텐서(3차원 이상)로 변환, 토큰화 프로세스를 PyTorch Dataset로 wrapping

In [None]:
class CommentsDataset(Dataset):

  def __init__(
    self,
    data: pd.DataFrame,
    tokenizer: BertTokenizer,
    max_token_len: int = 400
  ):
    self.tokenizer = tokenizer
    self.data = data
    self.max_token_len = max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index: int):
    data_row = self.data.iloc[index]

    comment_text = data_row.review_en_review
    labels = data_row[LABEL_COLUMNS]

    encoding = self.tokenizer.encode_plus(
      comment_text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return dict(
      comment_text=comment_text,
      input_ids=encoding["input_ids"].flatten(),
      attention_mask=encoding["attention_mask"].flatten(),
      labels=torch.FloatTensor(labels)
    )

In [None]:
train_dataset = CommentsDataset(
  train_df,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

sample_item = train_dataset[0]
sample_item.keys()

dict_keys(['comment_text', 'input_ids', 'attention_mask', 'labels'])

BERT 모델을 로드하고 데이터 샘플을 전달

In [None]:
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
sample_batch = next(iter(DataLoader(train_dataset, batch_size=8, num_workers=2)))
sample_batch["input_ids"].shape, sample_batch["attention_mask"].shape

(torch.Size([8, 400]), torch.Size([8, 400]))

In [None]:
output = bert_model(sample_batch["input_ids"], sample_batch["attention_mask"])

In [None]:
output.last_hidden_state.shape, output.pooler_output.shape

(torch.Size([8, 400, 768]), torch.Size([8, 768]))

In [None]:
bert_model.config.hidden_size

768

LightningDataModule에 데이터셋 랩핑

CommentDataModule은 모든 data loading 로직을 캡슐화하고 필요한 data loader를 반환

In [None]:
class CommentDataModule(pl.LightningDataModule):

  def __init__(self, train_df, test_df, tokenizer, batch_size=8, max_token_len=400):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def setup(self, stage=None):
    self.train_dataset = CommentsDataset(
      self.train_df,
      self.tokenizer,
      self.max_token_len
    )

    self.test_dataset = CommentsDataset(
      self.test_df,
      self.tokenizer,
      self.max_token_len
    )

  def train_dataloader(self):
    return DataLoader(
      self.train_dataset,
      batch_size=self.batch_size,
      shuffle=True,
      num_workers=2
    )

  def val_dataloader(self):
    return DataLoader(
      self.test_dataset,
      batch_size=self.batch_size,
      num_workers=2
    )

  def test_dataloader(self):
    return DataLoader(
      self.test_dataset,
      batch_size=self.batch_size,
      num_workers=2
    )

데이터 모듈의 인스턴스를 생성

In [None]:
N_EPOCHS = 10
BATCH_SIZE = 8

data_module = CommentDataModule(
  train_df,
  val_df,
  tokenizer,
  batch_size=BATCH_SIZE,
  max_token_len=MAX_TOKEN_COUNT
)

## Model
Pre-trained Bert Model과 선형 레이어를 이용한 BERT 분류 작업, LightningModule에 wrapping

In [None]:
class CommentTagger(pl.LightningModule):

  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.BCELoss()

  def forward(self, input_ids, attention_mask, labels=None):
    output = self.bert(input_ids, attention_mask=attention_mask)
    output = self.classifier(output.pooler_output)
    output = torch.sigmoid(output)
    loss = 0
    if labels is not None:
        loss = self.criterion(output, labels)
    return loss, output

  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return {"loss": loss, "predictions": outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def training_epoch_end(self, outputs):

    labels = []
    predictions = []
    for output in outputs:
      for out_labels in output["labels"].detach().cpu():
        labels.append(out_labels)
      for out_predictions in output["predictions"].detach().cpu():
        predictions.append(out_predictions)

    labels = torch.stack(labels).int()
    predictions = torch.stack(predictions)

    for i, name in enumerate(LABEL_COLUMNS):
      class_roc_auc = auroc(predictions[:, i], labels[:, i], num_classes=2)
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)


  def configure_optimizers(self):

    optimizer = AdamW(self.parameters(), lr=2e-5)

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=self.n_warmup_steps,
      num_training_steps=self.n_training_steps
    )

    return dict(
      optimizer=optimizer,
      lr_scheduler=dict(
        scheduler=scheduler,
        interval='step'
      )
    )

## Optimizer scheduler
get_linear_schedule_with_warmup로 학습률 조정

In [None]:
dummy_model = nn.Linear(2, 1)

optimizer = AdamW(params=dummy_model.parameters(), lr=0.001)

warmup_steps = 20
total_training_steps = 100

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=warmup_steps,
  num_training_steps=total_training_steps
)

learning_rate_history = []

for step in range(total_training_steps):
  optimizer.step()
  scheduler.step()
  learning_rate_history.append(optimizer.param_groups[0]['lr'])

In [None]:
# plt.plot(learning_rate_history, label="learning rate")
# plt.axvline(x=warmup_steps, color="red", linestyle=(0, (5, 10)), label="warmup end")
# plt.legend()
# plt.xlabel("Step")
# plt.ylabel("Learning rate")
# plt.tight_layout();

-> 총 100단계를 시뮬레이션, 처음 20개의 워밍업동안 초기 고정값(0.001)까지 증가한 다음 0으로 내려감

-> 훈련 및 준비 단계 수를 계산할 필요 o

-> Epoch당 학습 단계 수는 학습데이터/배치 사이즈랑 같음.

-> 총 훈련단계 수 = Epoch당 훈련단계 * Epoch 수

In [None]:
steps_per_epoch=len(train_df) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS

In [None]:
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

(2022, 10110)

Model 생성

In [None]:
model = CommentTagger(
  n_classes=len(LABEL_COLUMNS),
  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


->if you're not using the pooler layer then there's no need to worry about that warning.

https://github.com/huggingface/transformers/issues/5421

## Evaluation
-> multi label 분류는 각 레이블에 대해 이진 분류를 수행하는 것

-> Binary Cross Entropy를 사용해 오류측정, Pytorch의 BCELoss를 쓸 것(+시그모이드 함수)

In [None]:
# criterion = nn.BCELoss()

# prediction = torch.FloatTensor([10.95873564, 1.07321467, 1.58524066, 0.03839076, 15.72987556, 1.09513213])
# labels = torch.FloatTensor([1., 0., 0., 0., 1., 0.])

In [None]:
# torch.sigmoid(prediction)

In [None]:
# criterion(torch.sigmoid(prediction), labels)

In [None]:
# _, predictions = model(sample_batch["input_ids"], sample_batch["attention_mask"])
# predictions

In [None]:
# criterion(predictions, sample_batch["labels"])

## Training

In [None]:
!nvidia-smi

Tue Apr 19 09:22:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!rm -rf lightning_logs/
!rm -rf checkpoints/

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir ./lightning_logs

In [None]:
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [None]:
logger = TensorBoardLogger("lightning_logs", name="comments")

In [None]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

In [None]:
trainer = pl.Trainer(
  logger=logger,
  checkpoint_callback=checkpoint_callback,
  callbacks=[early_stopping_callback],
  max_epochs=N_EPOCHS,
  gpus=1,
  progress_bar_refresh_rate=30
)

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
import torch
from torchmetrics import F1Score
from torchmetrics.functional import accuracy, auroc

In [None]:
trainer.fit(model, data_module)

  f"DataModule.{name} has already been called, so it will not be called again. "
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type      | Params
-----------------------------------------
0 | bert       | BertModel | 109 M 
1 | classifier | Linear    | 1.5 K 
2 | criterion  | BCELoss   | 0     
-----------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.935   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


Training: 0it [00:00, ?it/s]

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"


Validating: 0it [00:00, ?it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection.

Validating: 0it [00:00, ?it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection.

Validating: 0it [00:00, ?it/s]

In [None]:
trainer.test(model, datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.17660553753376007}
--------------------------------------------------------------------------------


[{'test_loss': 0.17660553753376007}]

## Predictions

In [None]:
trained_model = CommentTagger.load_from_checkpoint(
  trainer.checkpoint_callback.best_model_path,
  n_classes=len(LABEL_COLUMNS)
)
trained_model.eval()
trained_model.freeze()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
test_comment = "The characters are attractive. It's one of the best books I've ever read. I am looking forward to the next book by this author."

encoding = tokenizer.encode_plus(
  test_comment,
  add_special_tokens=True,
  max_length=400,
  return_token_type_ids=False,
  padding="max_length",
  return_attention_mask=True,
  return_tensors='pt',
)

_, test_prediction = trained_model(encoding["input_ids"], encoding["attention_mask"])
test_prediction = test_prediction.flatten().numpy()


max=0
max_label=''
for label, prediction in zip(LABEL_COLUMNS, test_prediction):
  if prediction>max:
    max=prediction
    max_label=label

  # print(f"{label}: {prediction}")
print(f"{max_label}: {max}")

pos: 0.9988075494766235


In [None]:
# THRESHOLD = 0.5

test_comment = "Han Kang is the best writer. However, this work was a little dissatisfying. The translation was not smooth and the narrative seemed weak. It would be good to look forward to the next work"
encoding = tokenizer.encode_plus(
  test_comment,
  add_special_tokens=True,
  max_length=400,
  return_token_type_ids=False,
  padding="max_length",
  return_attention_mask=True,
  return_tensors='pt',
)

_, test_prediction = trained_model(encoding["input_ids"], encoding["attention_mask"])
test_prediction = test_prediction.flatten().numpy()



for label, prediction in zip(LABEL_COLUMNS, test_prediction):
  if prediction>max:
    max=prediction
    max_label=label

  # if prediction < THRESHOLD:
  #   continue
  # print(f"{label}: {prediction}")
print(f"{max_label}: {max}")

pos: 0.9988075494766235


## Evaluation

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)

val_dataset = CommentsDataset(
  val_df,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

predictions = []
labels = []


for item in tqdm(val_dataset):
  _, prediction = trained_model(
    item["input_ids"].unsqueeze(dim=0).to(device),
    item["attention_mask"].unsqueeze(dim=0).to(device)
  )
  predictions.append(prediction.flatten())
  labels.append(item["labels"].int())

predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()

  0%|          | 0/3469 [00:00<?, ?it/s]

In [None]:
# accuracy(predictions, labels, threshold=THRESHOLD)
accuracy(predictions, labels)

tensor(0.9644)

In [None]:
print("AUROC per tag")
for i, name in enumerate(LABEL_COLUMNS):
  tag_auroc = auroc(predictions[:, i], labels[:, i], num_classes=2)
  print(f"{name}: {tag_auroc}")

AUROC per tag
pos: 0.9337482452392578
neg: 0.9337649345397949


In [None]:
y_pred = predictions.numpy()
y_true = labels.numpy()

A = np.matrix(y_pred)
indices = A.argmax(axis=1)


A= pd.get_dummies(np.asarray(indices).flatten())
y_pred = A.to_numpy()
# y_true
# y_pred

In [None]:
# upper, lower = 1, 0

# y_pred = np.where(y_pred == np.amax(y_pred,axis=0), upper, lower)

# print(y_pred)
print(classification_report(y_true, y_pred,
                            target_names=LABEL_COLUMNS,
                            zero_division=0))

              precision    recall  f1-score   support

         pos       0.97      1.00      0.98      3247
         neg       0.91      0.50      0.64       222

   micro avg       0.96      0.96      0.96      3469
   macro avg       0.94      0.75      0.81      3469
weighted avg       0.96      0.96      0.96      3469
 samples avg       0.96      0.96      0.96      3469



In [None]:
# y_pred

## Unlabeled data

In [None]:
df_unlabeled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14544 entries, 0 to 14543
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   book_origin_title  14544 non-null  object 
 1   review_rating      14017 non-null  object 
 2   review_en_review   14544 non-null  object 
 3   review_helpful     14541 non-null  float64
 4   pos                0 non-null      float64
 5   neg                0 non-null      float64
dtypes: float64(3), object(3)
memory usage: 681.9+ KB


In [None]:
class Unlabeled_CommentsDataset(Dataset):

  def __init__(
    self,
    data: pd.DataFrame,
    tokenizer: BertTokenizer,
    max_token_len: int = 400
  ):
    self.tokenizer = tokenizer
    self.data = data
    self.max_token_len = max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index: int):
    data_row = self.data.iloc[index]
    # print(data_row)
    comment_text = data_row.review_en_review

    encoding = self.tokenizer.encode_plus(
      comment_text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return dict(
      comment_text=comment_text,
      input_ids=encoding["input_ids"].flatten(),
      attention_mask=encoding["attention_mask"].flatten()
    )

In [None]:
unlabel_dataset = Unlabeled_CommentsDataset(
  df_unlabeled,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

predictions = []

for item in tqdm(unlabel_dataset):
  _, prediction = trained_model(
    item["input_ids"].unsqueeze(dim=0).to(device),
    item["attention_mask"].unsqueeze(dim=0).to(device)
  )
  predictions.append(prediction.flatten())

predictions = torch.stack(predictions).detach().cpu()

  0%|          | 0/14544 [00:00<?, ?it/s]

In [None]:
prediction = predictions.numpy()
prediction

array([[0.9983675 , 0.00181111],
       [0.99708575, 0.00332552],
       [0.9983925 , 0.00183043],
       ...,
       [0.9984718 , 0.00169945],
       [0.9988846 , 0.00139084],
       [0.9986883 , 0.00149161]], dtype=float32)

In [None]:
A = np.matrix(prediction)
indices = A.argmax(axis=1)
indices

matrix([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]])

In [None]:
dd = pd.concat([df_unlabeled, pd.DataFrame(indices)], axis=1)
dd.rename(columns = {0: "sentiment"}, inplace=True)
dd["pos"] = dd["sentiment"].apply(lambda x : 1 if x==0 else 0)
dd["neg"] = dd["sentiment"].apply(lambda x : 1 if x==1 else 0)
dd['sentiment'] = dd["sentiment"].apply(lambda x : 'pos' if x==0 else 'neg')
dd

Unnamed: 0,book_origin_title,review_rating,review_en_review,review_helpful,pos,neg,sentiment
0,난장이가 쏘아올린 작은 공,4,Seoul in the 70's. The difference between urba...,27.0,1,0,pos
1,흰,3,I think that the sensitive thing is not bad as...,5.0,1,0,pos
2,모든 순간이 너였다,4,I will feel kind.,25.0,1,0,pos
3,모든 순간이 너였다,4,I was able to read including the author's feel...,7.0,1,0,pos
4,모든 순간이 너였다,4,Replace and read it\nI was moved by the number...,8.0,1,0,pos
...,...,...,...,...,...,...,...
14539,풀,4,This is a beautiful graphic novel that tells t...,3.0,1,0,pos
14540,풀,4,Thanks to this Graphic Novel I discovered the ...,1.0,1,0,pos
14541,풀,3,The content level book is gorgeous. Highly rec...,,1,0,pos
14542,풀,4,From this Graphic Novel you can expect: Net an...,,1,0,pos


In [None]:
dd[(dd['pos']==1) & (dd['neg']==1)]

Unnamed: 0,book_origin_title,review_rating,review_en_review,review_helpful,pos,neg,sentiment


In [None]:
dd[(dd['pos']==0) & (dd['neg']==0)]

Unnamed: 0,book_origin_title,review_rating,review_en_review,review_helpful,pos,neg,sentiment


In [None]:
df['sentiment'] = df["pos"].apply(lambda x : 'pos' if x==1 else 'neg')
df

Unnamed: 0,book_origin_title,review_rating,review_en_review,review_helpful,pos,neg,sentiment
0,한국 최초의 페미니스트 작가들,5,The stories contained in this book are about w...,0.0,1,0,pos
1,난장이가 쏘아올린 작은 공,5,It was a novel that shifted by ordinary expect...,50.0,1,0,pos
2,난장이가 쏘아올린 작은 공,5,"Anyway, it is a strange sense of reading. I ca...",19.0,1,0,pos
3,수박 수영장,5,I could buy it in summer of my child. It is re...,5.0,1,0,pos
4,수박 수영장,5,○ The sentence is quite small\n○ Illustration ...,5.0,1,0,pos
...,...,...,...,...,...,...,...
11556,풀,5,"A very nice book, tells the western eye a face...",,1,0,pos
11557,풀,5,I gave this assessment because I found this su...,,1,0,pos
11558,풀,5,I really enjoyed this work. The advice,,1,0,pos
11559,풀,5,It is a book that has the violence of a burnin...,4.0,1,0,pos


In [None]:
new_df = pd.concat([df,dd])
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26105 entries, 0 to 14543
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   book_origin_title  26105 non-null  object 
 1   review_rating      25578 non-null  object 
 2   review_en_review   26105 non-null  object 
 3   review_helpful     26081 non-null  float64
 4   pos                26105 non-null  int64  
 5   neg                26105 non-null  int64  
 6   sentiment          26105 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 1.6+ MB


In [None]:
new_df.to_excel('sent_bert_3.xlsx')