<a href="https://colab.research.google.com/github/ojw92/NLP-for-Text-Classification/blob/main/BERT_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### BERT Practice
#### v. 20230118

In [1]:
# pip3 install torch torchvision torchaudio     # for PyTorch without GPU, just CPU


In [2]:
!pip install transformers -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Check if the GPU can be detected
import tensorflow as tf

device_name = tf.test.gpu_device_name()              # '/device:GPU:0' means GPU is enabled
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [4]:
import torch
# torch.cuda.is_available = lambda : False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla T4'

In [5]:
!cat /proc/meminfo            # check memory resources available

MemTotal:       13297200 kB
MemFree:         3972752 kB
MemAvailable:   10334024 kB
Buffers:          361024 kB
Cached:          6131104 kB
SwapCached:            0 kB
Active:           968524 kB
Inactive:        7917196 kB
Active(anon):        912 kB
Inactive(anon):  2379256 kB
Active(file):     967612 kB
Inactive(file):  5537940 kB
Unevictable:           0 kB
Mlocked:               0 kB
SwapTotal:             0 kB
SwapFree:              0 kB
Dirty:              1492 kB
Writeback:             0 kB
AnonPages:       2393672 kB
Mapped:           963384 kB
Shmem:             13516 kB
KReclaimable:     181644 kB
Slab:             227812 kB
SReclaimable:     181644 kB
SUnreclaim:        46168 kB
KernelStack:        4544 kB
PageTables:        14280 kB
NFS_Unstable:          0 kB
Bounce:                0 kB
WritebackTmp:          0 kB
CommitLimit:     6648600 kB
Committed_AS:    4884948 kB
VmallocTotal:   34359738367 kB
VmallocUsed:       58308 kB
VmallocChunk:          0 kB
Percpu:          

In [6]:
# install wandb for tracking data on dashboard
!pip install datasets wandb evaluate -qU
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/text-classification/run_glue.py

--2023-01-26 18:28:36--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/text-classification/run_glue.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27264 (27K) [text/plain]
Saving to: ‘run_glue.py.1’


2023-01-26 18:28:36 (97.6 MB/s) - ‘run_glue.py.1’ saved [27264/27264]



In [7]:
# the run_glue.py script requires transformers dev
!pip install -q git+https://github.com/huggingface/transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [8]:
import wandb

# log in to have data synced to account
wandb.login()

# log every trained model
%env WANDB_LOG_MODEL=true

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mojw92[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_LOG_MODEL=true


In [9]:

# https://github.com/PradipNichite/Youtube-Tutorials/blob/main/FineTune_BERT_Model_Youtube.ipynb



import pandas as pd

# filter training data to desired dates (after 2022.2.22)
df = pd.read_csv('/S22_total.csv', index_col=0).drop_duplicates()

# filter training data to desired dates
from datetime import datetime, timedelta

df = df[df.Date.apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S').date() > datetime(2022,2,21).date())]

# For class imbalance, use roughly same ratio of R & N
# BERT doesn't need class imbalance addressed
# df = pd.concat([df[df.Class=='R'], df[df.Class=='N'].iloc[::3, :]])

# Reduce data to reduce training time to test features
# df22 = df22.sample(frac=0.01, random_state=5)




In [10]:
# Define Title + Content concatenator
from timeit import default_timer as timer
import re

def data_concat(df22, R_known=True):

    # Split the input dataframe into Text (Title + Contents) and Classes dataframes
    # input must have 3 columns of string entries (Title, Contents, and Classes)

    # check if every row entry of each column is string type (some are NaN, so no)
    # print(df22.applymap(lambda x : type(x).__name__).eq({'Title': 'str', 'Content': 'str', 'Class':'str'}))

    # convert NaN to empty strings (NaN -> str)
        # df22.apply(str) converts all columns to str, as well
    df22 = df22.replace(float('nan'), '', regex=True)

    # concatenate strings of title & content with a " " in between (1 body of text)
    df22['Text'] = df22['Title'] + " " + df22['Content']      # slicing DataFrame via .iloc[:,0] makes it a Series
    df22 = df22.loc[: , ['Text', 'Class']]    # so initialize it as a DataFrame. pd.DataFrame(some_Series) works

    if R_known == True:
    # R, r, YR = 1;     N, n, YN = 0
        R_cases = re.compile('R|YR', re.IGNORECASE)
        N_cases = re.compile('N|YN', re.IGNORECASE)
        df22['Class'] = df22['Class'].replace(to_replace=R_cases, value=1)
        df22['Class'] = df22['Class'].replace(to_replace=N_cases, value=0)
    else:
        # R_known == False; prepping not yet classified data
        Y_N_cases = re.compile('Y|N', re.IGNORECASE)
        df22['Class'] = df22['Class'].replace(to_replace=Y_N_cases, value=0)     # all N's for simplicity

    df22['Class'] = df22['Class'].astype('int32')


    return df22


In [11]:
# combine title & content as text22, clean the text, then combine it with labels to a single df  
df = data_concat(df)
df.head()

Unnamed: 0,Text,Class
452,Zfold 3 or S22 ultra Trying to decide between ...,0
453,S22 video cam Anyone tried out the video camer...,0
454,Thinking about trading my S21Ultra for S22+ Ev...,0
455,S21 Ultra vs S22+ Both phones are currently at...,0
456,"S21 or S22 Base/Standard Model Hey All,\n\nLoo...",0


In [12]:
df['Class'].value_counts()

0    17752
1     5218
Name: Class, dtype: int64

In [13]:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer


from transformers import BertTokenizer, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [14]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [15]:
# model = model.to( torch.device('cuda') )     # need NVIDIA driver for 'cuda'; currently have AMD on work laptop
# model = model.to('cpu')         # train on CPU

model = model.to('cuda')          # or  model.cuda()


In [16]:

test_data = ["This is possibly the worst battery I have ever seen on a mobile device",
            "How is my device running so smoothly?"]
tokenizer(test_data, padding=True, truncation=True, max_length=512)

{'input_ids': [[101, 2023, 2003, 4298, 1996, 5409, 6046, 1045, 2031, 2412, 2464, 2006, 1037, 4684, 5080, 102], [101, 2129, 2003, 2026, 5080, 2770, 2061, 15299, 1029, 102, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]}

In [17]:
X = list(df["Text"])
y = list(df["Class"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,stratify=y)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [18]:
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [19]:
print(X_train_tokenized['attention_mask'][0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [20]:
len(X_train),len(X_val)


(18376, 4594)

In [21]:
# Create torch dataset
class VOC_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [22]:
train_dataset = VOC_Dataset(X_train_tokenized, y_train)
val_dataset = VOC_Dataset(X_val_tokenized, y_val)

In [23]:
train_dataset[5]


{'input_ids': tensor([  101,  1055, 19317, 14516,  2630, 19392,  3314,  2006,  2655,  2007,
          3765,  1012,  2026,  3602,  2184,  2499,  2307,  1012,  7172,  2000,
          1996,  1055, 19317,  1998,  2085,  2009,  9010,  1996,  4434,  2000,
          2026,  2482,  2096,  1045,  1005,  1049,  1999,  4455,  1012,  2009,
          2145,  3065,  4198,  2021,  2758,  2655,  3092,  1012,  2009,  2224,
          2000,  2036,  6865,  2039,  2043,  1045,  2699,  2000,  9585,  5882,
          2157,  2044,  2008,  1012, 19102,  2359,  2033,  2000,  4604,  2026,
          3042,  1999,  1998,  2031,  2053,  3042,  2096,  2009,  2001,  2108,
         16330,  2021,  2059,  2310, 21885,  2239,  2056,  2027,  2052, 19948,
          2009,  2007,  1037,  2066,  2047,  3042,  1012,  6635,  2168,  3314,
          2006,  1996,  6110,  1012,  1045,  2031,  2699,  2296,  6749, 13460,
         23416,  2075,  3357,  2013, 19102,  1998,  2310, 21885,  2239,  1012,
          1045,  2036,  2147,  1999,  6

In [24]:
def compute_metrics(m):
    print(type(m))
    pred, labels = m
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
     

In [25]:
# Define Trainer
args = TrainingArguments(
    report_to = 'wandb',                     # enable logging to W&B
    output_dir="output",                     # output directory
    num_train_epochs=4,
    per_device_train_batch_size=8
    # overwrite_output_dir = True,
    # evaluation_strategy = 'steps',          # check evaluation metrics at each epoch
    # learning_rate = 5e-5,                   # we can customize learning rate
    # max_steps = 30000,
    # logging_steps = 100,                    # we will log every 100 steps
    # eval_steps = 5000,                      # we will perform evaluation every 500 steps
    # save_steps = 10000,
    # load_best_model_at_end = True,
    # metric_for_best_model = 'accuracy',
    # run_name = 'custom_training'            # name of the W&B run

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,            # for padding batched data
    compute_metrics=compute_metrics
)

Setting `WANDB_LOG_MODEL` from true to `end` instead


In [29]:
# https://github.com/facebookresearch/fairseq/issues/1933
# https://huggingface.co/spaces/OFA-Sys/OFA-Generic_Interface/blob/8fc1d8aafce5821301443744696303bac6227f52/fairseq/examples/roberta/commonsense_qa/README.md

# PyTorch fairseq

MAX_UPDATES=2300      # Number of training steps, where   training step = training set / batch size. Originally 3000
WARMUP_UPDATES=150    # Linearly increase LR over this many steps
MAX_EPOCH=4           # Number of training epochs.
LR=1e-05              # Peak LR for fixed LR scheduler. 1e-05 default; try 1e-06 as initial LR
NUM_CLASSES=2
MAX_SENTENCES=8       # Batch size per GPU.
UPDATE_FREQ=32        # Accumulate gradients to simulate training on 8 GPUs. 
DATA_DIR='VOC_final_output'
ROBERTA_PATH='/SiERoBERT_large/model.pt'

! CUDA_VISIBLE_DEVICES=0 fairseq-train $DATA_DIR --ddp-backend=no_c10d \
  --restore-file $ROBERTA_PATH \
  --reset-optimizer --reset-dataloader --reset-meters \
  --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
  #--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
  --task sentence_ranking \
  --num-classes $NUM_CLASSES \
  --init-token 0 --separator-token 2 \
  --max-option-length 128 \
  --max-positions 512 \
  --truncate-sequence \
  --arch roberta_large \
  --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
  --criterion sentence_ranking \
  --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \
  --clip-norm 0.0 \
  --lr-scheduler fixed --lr $LR \
  --warmup-updates $WARMUP_UPDATES --total-num-update $MAX_UPDATES \
  --memory-efficient-fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
  --batch-size $MAX_SENTENCES \
  --required-batch-size-multiple 1 \
  --update-freq $UPDATE_FREQ \
  --max-epoch $MAX_EPOCH \
  --log-interval 100 \

# --memory-efficient-fp16 instead of --fp16 solves 'CUDA out of memory' problem, but slow training
# --max-sentences $MAX_SENTENCES , not batch-size
# --fp16-scale-window 128 default

/bin/bash: fairseq-train: command not found


In [30]:
# clear cache before training
torch.cuda.empty_cache()

In [31]:
from timeit import default_timer as timer
start = timer()

trainer.train()


end = timer()
print("%4f seconds, %4f minutes elapsed" % (float(end-start), float((end-start)/60)))

***** Running training *****
  Num examples = 18376
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9188
  Number of trainable parameters = 109483778


Step,Training Loss
500,0.392
1000,0.344
1500,0.331
2000,0.3318
2500,0.2807
3000,0.2571
3500,0.2384
4000,0.2506
4500,0.2434
5000,0.1857


Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-2000
Configuration saved in output/checkpoint-2000/config.json
Model weights saved in output/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to output/checkpoint-2500
Configuration saved in output/checkpoint-2500/config.json
Model weights saved in output/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-3000
Configuration saved in output/checkpoint-3000/config.json
Model weights saved in output/check

7438.645936 seconds, 123.977432 minutes elapsed


In [32]:
trainer.evaluate()


***** Running Evaluation *****
  Num examples = 4594
  Batch size = 8


<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.4679601788520813,
 'eval_accuracy': 0.9040052242054855,
 'eval_precision': 0.7924345295829291,
 'eval_recall': 0.7825670498084292,
 'eval_f1': 0.7874698795180722,
 'eval_runtime': 169.9539,
 'eval_samples_per_second': 27.031,
 'eval_steps_per_second': 3.383,
 'epoch': 4.0}

In [33]:
np.set_printoptions(suppress=True)


In [34]:
text = "Super charging is working very well."
# text = "so many issues with this phone."
inputs = tokenizer(text,padding = True, truncation = True, return_tensors='pt').to('cuda')
outputs = model(**inputs)
print(outputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)    # also gelu(), silu()
print(predictions)
predictions = predictions.cpu().detach().numpy()
predictions

SequenceClassifierOutput(loss=None, logits=tensor([[ 3.0979, -2.3832]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[0.9959, 0.0041]], device='cuda:0', grad_fn=<SoftmaxBackward0>)


array([[0.99585235, 0.00414764]], dtype=float32)

In [35]:
trainer.save_model('BertPractice')

Saving model checkpoint to BertPractice
Configuration saved in BertPractice/config.json
Model weights saved in BertPractice/pytorch_model.bin


In [36]:
# trainer.save_model('/content/drive/MyDrive/Youtube Tutorials/toxic')
# model_2 = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/Youtube Tutorials/toxic")
# model_2.to('cuda')

In [37]:
model_2 = BertForSequenceClassification.from_pretrained('BertPractice')
model_2.to('cuda')

loading configuration file BertPractice/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.27.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file BertPractice/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification w

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [38]:
# text = "Super charging is working very well."
text = "so many issues with this phone."
inputs = tokenizer(text,padding = True, truncation = True, return_tensors='pt').to('cuda')
outputs = model_2(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().detach().numpy()
predictions

array([[0.01567061, 0.98432946]], dtype=float32)

In [39]:
# Make predictions on daily data

def pred_day(voctoday):
  voclist = list(voctoday['Text'])
  predlist = []

  for i in range(len(voclist)):
    inputs = tokenizer(voclist[i], padding = True, truncation = True, return_tensors='pt').to('cuda')
    outputs = model_2(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predictions = predictions.cpu().detach().numpy()
    predlist.append(predictions)

  organizedzip = zip([x[:25] for x in voclist],
                    predlist,
                    voctoday['Class'])
  
  # results as voc text, predictions, and actual value
  vocresult = pd.DataFrame(list(organizedzip), columns=['Text','Prediction','Actual'])

  return vocresult


In [40]:

# test VOC
# import file and combine title & content

#voc0123 = pd.read_csv('/20230123_test.csv', index_col=0).drop_duplicates()
#voc0123 = data_concat(voc0123.loc[: , 'Title':'Class'])

voc0124 = pd.read_csv('/20230124_test.csv', index_col=0).drop_duplicates()
voc0124 = data_concat(voc0124.loc[: , 'Title':'Class'])

voc0125 = pd.read_csv('/20230125_test.csv', index_col=0).drop_duplicates()
voc0125 = data_concat(voc0125.loc[: , 'Title':'Class'])


#vocresult0123 = pred_day(voc0123)
#vocresult0123

In [41]:
vocresult0124 = pred_day(voc0124)
vocresult0124

Unnamed: 0,Text,Prediction,Actual
0,"Switch from Fold2, feel l","[[0.99934095, 0.00065899285]]",1
1,How did you buy / get you,"[[0.9998149, 0.00018515547]]",0
2,Music volume issue? When,"[[0.01084301, 0.98915696]]",1
3,Sometimes 3 random lines,"[[0.011239918, 0.9887601]]",1
4,Difference between adapti,"[[0.99970967, 0.00029036344]]",0
5,Adjust default camera bri,"[[0.99601054, 0.003989435]]",1
6,"Attention Galaxy users, u","[[0.99980754, 0.00019253642]]",0
7,Notifications not vibrati,"[[0.011130082, 0.9888699]]",1
8,Bluetooth Phone Issues [S,"[[0.011285276, 0.98871475]]",1
9,What are the first things,"[[0.9997662, 0.0002338739]]",0


In [42]:
vocresult0125 = pred_day(voc0125)
vocresult0125

Unnamed: 0,Text,Prediction,Actual
0,S22U trouble with Google,"[[0.011216863, 0.9887832]]",1
1,Minimize - maximize requi,"[[0.878022, 0.121977985]]",1
2,Trade in values from 2022,"[[0.9998166, 0.00018347712]]",0
3,Anyone has any idea how t,"[[0.9997528, 0.00024713058]]",0
4,S908U1UES2BWA2 Update and,"[[0.99891615, 0.0010838634]]",0
5,Case for Power Share? Are,"[[0.99980813, 0.00019190264]]",0
6,apps going to sleep didn',"[[0.010677507, 0.9893225]]",1
7,45w small charger for the,"[[0.99976295, 0.00023706724]]",0
8,Plastic film that came wi,"[[0.99979407, 0.0002059807]]",0
9,Not able to make calls wh,"[[0.9953694, 0.0046305773]]",0


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e5caba9f-cd36-4d50-aaa3-2cf59957a2f4' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>