### BERT VOC (Colab local)
#### v. 20230821

In [None]:
# Install necessary packages to avoid import error `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`
# NOTE(review): none of these installs pin a version, so the environment can differ between runs.
# Later cells in this notebook install transformers again (from git, then ==4.27.4).

!pip install -U accelerate
!pip install transformers -U
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117      # install PyTorch

import accelerate
import transformers

# Last expression of the cell: shows the versions that were actually imported.
transformers.__version__, accelerate.__version__

In [None]:
# Confirm that TensorFlow can see the GPU before going any further.
# (The separate tensorflow-gpu package is gone; plain tensorflow provides CUDA support.)
import tensorflow as tf

device_name = tf.test.gpu_device_name()              # '/device:GPU:0' means GPU is enabled
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    # Any other value (including an empty string) is treated as "no usable GPU".
    raise SystemError('GPU device not found')

In [None]:
import torch

# Pick the first CUDA device when PyTorch can see one, otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print(device)

# Round-trip a small tensor to confirm it can be placed on the chosen device.
t1 = torch.zeros(4,3)
print(t1)
t1 = t1.to(device)
print(t1)

In [None]:
# GPUs visible to TensorFlow (uses `tf` imported in the GPU-check cell above).
print(tf.config.list_physical_devices('GPU'))

# device_lib.list_local_devices()

# Last expression of the cell: True when PyTorch can use CUDA.
torch.cuda.is_available()

In [None]:
# Make sure the same device is used for tensor allocation during all operations
# Relies on `t1` and `dev` created in the torch device cell above.

a = t1.get_device()                    # returns the index of the GPU on which the tensor resides
                                       # NOTE(review): for a CPU tensor recent PyTorch returns -1 here — confirm for the installed version
b = torch.tensor(t1.shape).to(dev)     # new tensor holding t1's shape, placed via the `dev` string (not via `a`)

print(a)
print(b)

In [None]:
# Check running environment info regarding Cuda and devices

import sys
print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDA VERSION', )

from subprocess import call
# call(["nvcc", "--version"]) does not work
! nvcc --version
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
print('__Devices')

# call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
print('Active CUDA Device: GPU', torch.cuda.current_device())
print ('Available devices ', torch.cuda.device_count())
print ('Current cuda device ', torch.cuda.current_device())

In [None]:
# and check device information of my graphics driver
# Installs pycuda on the fly and initialises the CUDA driver API.

!pip install pycuda
import pycuda.driver as cuda
cuda.init()

# Get Id of default device
# (this value is not displayed: only the last expression of a cell is shown)
torch.cuda.current_device()
# 0
# Last expression of the cell: name of the device with index 0.
cuda.Device(0).name()         # '0' is the id of my GPU

In [None]:
# Linux-only: reads the kernel's memory statistics file of the host running the kernel.
!cat /proc/meminfo            # check memory resources available

In [None]:
# install wandb for tracking data on dashboard
# (also installs/upgrades `datasets` and `evaluate`; -q keeps pip output quiet)
!pip install datasets wandb evaluate -qU
# Download the Hugging Face GLUE example script into the working directory.
# NOTE(review): run_glue.py is not invoked by any cell visible in this notebook — confirm it is still needed.
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/text-classification/run_glue.py

In [None]:
# the run_glue.py script requires transformers dev
# NOTE(review): this installs the unpinned development head from git; a later cell
# replaces it with transformers==4.27.4, so the two installs compete.
!pip install -q git+https://github.com/huggingface/transformers

In [None]:
import wandb

# log in to have data synced to account
# (interactive: wandb.login() asks for credentials when none are configured)
wandb.login()

# log every trained model
# Sets the environment variable for this kernel process via the %env magic.
%env WANDB_LOG_MODEL=true

In [None]:
import pandas as pd

# Load the labelled VOC export and drop exact duplicate rows.
df = pd.read_csv('/test.csv', index_col=0).drop_duplicates()

# filter training data to desired dates
from datetime import datetime, timedelta

# Keep rows dated 2022-02-22 or later (the same cut as "date > 2022-02-21").
# The Date column is parsed once, vectorised, instead of calling strptime per row.
# Rows whose Date is missing parse to NaT, compare False, and are therefore dropped.
posted_at = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S')
df = df[posted_at >= pd.Timestamp(2022, 2, 22)]

# Consider reducing data to cut training time for quick feature testing
# df = df.sample(frac=0.01, random_state=5)

In [None]:
# Define Title + Content concatenator
from timeit import default_timer as timer
import re

def data_concat(df22, R_known=True):
    """Build a two-column (Text, Class) frame from a raw VOC frame.

    Args:
        df22: DataFrame with at least the columns 'Title', 'Content' and
            'Class'. Missing values in these columns are treated as empty
            strings. The input frame is not modified.
        R_known: True when 'Class' holds real labels. The labels R / YR are
            coded as 1 and N / YN as 0, case-insensitively, ignoring
            surrounding whitespace. When False (data not yet classified),
            every string label containing Y or N is coded as 0.

    Returns:
        A new DataFrame with columns 'Text' (Title + " " + Content) and
        'Class' (int32), keeping the input's index.

    Raises:
        KeyError: if one of the required columns is missing.
        ValueError: if a label is left that cannot be cast to an integer,
            for example an empty or unexpected label.
    """
    # Only the three needed columns are copied and cleaned; other columns
    # (such as dates) are left alone. NaN -> '' so the concatenation works.
    frame = df22.loc[:, ['Title', 'Content', 'Class']].fillna('')

    # concatenate strings of title & content with a " " in between (1 body of text)
    frame['Text'] = frame['Title'] + " " + frame['Content']
    frame = frame.loc[:, ['Text', 'Class']].copy()

    if R_known:
        # R, r, YR = 1;     N, n, YN = 0
        R_cases = re.compile('R|YR', re.IGNORECASE)
        N_cases = re.compile('N|YN', re.IGNORECASE)

        def recode(label):
            # Match the WHOLE label. A substring search would silently turn any
            # label that merely contains an "r" or "n" (e.g. "Neutral") into 1 or 0.
            if isinstance(label, str):
                token = label.strip()
                if R_cases.fullmatch(token):
                    return 1
                if N_cases.fullmatch(token):
                    return 0
            return label
    else:
        # R_known == False; prepping not yet classified data
        Y_N_cases = re.compile('Y|N', re.IGNORECASE)

        def recode(label):
            # all N's for simplicity: every label mentioning Y or N becomes 0
            if isinstance(label, str) and Y_N_cases.search(label):
                return 0
            return label

    # Non-string values pass through unchanged; anything still non-numeric
    # makes the cast fail loudly instead of being mislabelled.
    frame['Class'] = frame['Class'].map(recode).astype('int32')

    return frame


In [None]:
# combine title & content as text22, clean the text, then combine it with labels to a single df
# NOTE: this overwrites `df` with the (Text, Class) frame, so the cell is not re-runnable
# on its own — data_concat needs the Title/Content columns; re-run the loading cell first.
df = data_concat(df)
df.head()

In [None]:
# Class balance of the prepared data (1 = R / YR, 0 = N / YN per data_concat).
df['Class'].value_counts()

In [None]:
# reinstalling packages to get around errors
!pip uninstall tokenizers, transformers
!pip install transformers==4.27.4 -U
!pip install diffusers==0.12.1
import transformers

In [None]:
# Note 20230821:

# Below error kept popping up when trying to run 'from transformers import TrainingArguments, Trainer' and 'from transformers import BertTokenizer, BertForSequenceClassification'
# NOTE(review): this notebook installs several transformers versions in one session
# (latest, git head, then 4.27.4); a mix of old and new modules in the running kernel
# is a plausible cause — confirm by restarting the runtime after the final install.
"""
RuntimeError: Failed to import transformers.training_args because of the following error (look up to see its traceback):
cannot import name 'is_torch_npu_available' from 'transformers.utils' (/usr/local/lib/python3.10/dist-packages/transformers/utils/__init__.py) site:stackoverflow.com
"""
# Temporary workaround was to change BertTokenizer into transformers.BertTokenizer and BertForSequenceClassification to transformers.BertForSequenceClassification

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
#from transformers import TrainingArguments, Trainer


#from transformers import BertTokenizer, BertForSequenceClassification
# Classes are reached through the `transformers` module (workaround described in the note above).
# Both calls fetch 'bert-base-uncased' (download or local cache); num_labels=2 gives a
# two-class classification head to match the 0/1 labels produced by data_concat.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)

In [None]:
# Display the model's module structure (long output).
model

In [None]:
# model = model.to( torch.device('cuda') )     # need NVIDIA driver for 'cuda'; currently have AMD on work laptop
# model = model.to('cpu')         # train on CPU

# Use the `device` chosen in the torch device cell (cuda:0 when available, else cpu)
# instead of hard-coding 'cuda', so this cell also runs on a CPU-only runtime.
model = model.to(device)


In [None]:
# Quick look at what the tokenizer returns for two sample sentences:
# padded to the longer of the two and truncated at 512 tokens.
test_data = ["This is possibly the worst battery I have ever seen on a mobile device",
            "How is my device running so smoothly?"]
tokenizer(test_data, padding=True, truncation=True, max_length=512)

In [None]:
# Fixed seed so the train/validation split (and every metric derived from it)
# is the same on each Restart & Run All.
SPLIT_SEED = 42

X = list(df["Text"])
y = list(df["Class"])
# 80/20 split, stratified so both parts keep the same class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED
)
# padding=True pads every sequence to the longest one in the same call (capped at 512 tokens).
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [None]:
# Names of the fields the tokenizer produced; VOC_Dataset turns each into a tensor per item.
X_train_tokenized.keys()

In [None]:
# Attention mask of the first training example.
print(X_train_tokenized['attention_mask'][0])

In [None]:
# Sizes of the training and validation parts (test_size=0.2 in the split above).
len(X_train),len(X_val)

In [None]:
# Create torch dataset
class VOC_Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer output, with optional labels.

    Args:
        encodings: mapping from field name to a per-example sequence; it must
            contain the key "input_ids", which defines the dataset length.
        labels: optional per-example labels (list, numpy array or tensor).
            When None, items carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors (plus "labels" when labels were given)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None instead of testing truthiness: `if self.labels:` raises
        # "truth value ... is ambiguous" for numpy arrays and multi-element tensors.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])

In [None]:
# Pair the tokenized texts with their labels for the Trainer.
train_dataset = VOC_Dataset(X_train_tokenized, y_train)
val_dataset = VOC_Dataset(X_val_tokenized, y_val)

In [None]:
# Inspect one training item: a dict of tensors, including "labels".
train_dataset[5]

In [None]:
def compute_metrics(m):
    """Compute accuracy, precision, recall and F1 for a binary classifier.

    Args:
        m: a pair (predictions, labels). `predictions` holds per-class scores
            with the classes on axis 1; `labels` holds the true class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    pred, labels = m
    # Predicted class = index of the highest score.
    pred = np.argmax(pred, axis=1)

    # zero_division=0 keeps the score at 0 when a class is never predicted or
    # never present, without emitting a warning on every evaluation.
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, zero_division=0)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


In [None]:
# Define Trainer
# Only the four arguments below are active; everything commented out stays at its default.
args = transformers.TrainingArguments(
    report_to = 'wandb',                      # enable logging to W&B
    output_dir="output",                      # output directory
    num_train_epochs=4,
    per_device_train_batch_size=8
    # overwrite_output_dir = True,
    # evaluation_strategy = 'steps',          # check evaluation metrics at each epoch
    # learning_rate = 5e-5,                   # we can customize learning rate
    # max_steps = 30000,
    # logging_steps = 100,                    # log every 100 steps
    # eval_steps = 5000,                      # perform evaluation every 5000 steps
    # save_steps = 10000,
    # load_best_model_at_end = True,
    # metric_for_best_model = 'accuracy',
    # run_name = 'custom_training'            # name of the W&B run

)
trainer = transformers.Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,                 # held-out split scored by trainer.evaluate()
    compute_metrics=compute_metrics
)

In [None]:
# clear cache before training
# (asks PyTorch to release cached GPU memory that is no longer in use)
torch.cuda.empty_cache()

In [None]:
# train the model
# Long-running cell: fine-tunes `model` on train_dataset for the 4 epochs set in `args`.
trainer.train()

In [None]:
# Score the fine-tuned model on val_dataset; the metrics come from compute_metrics.
trainer.evaluate()

In [None]:
# Print small floats in fixed-point instead of scientific notation (easier to read probabilities).
np.set_printoptions(suppress=True)

In [None]:
text = "Super charging is working very well."
# text = "so many issues with this phone."
# Put the inputs on whatever device the model is on instead of assuming 'cuda'.
inputs = tokenizer(text,padding = True, truncation = True, return_tensors='pt').to(model.device)
model.eval()              # inference mode: dropout off
with torch.no_grad():     # no autograd graph needed for a prediction
    outputs = model(**inputs)
print(outputs)
# Softmax over the class axis turns the logits into class probabilities.
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)    # also gelu(), silu()
print(predictions)
predictions = predictions.cpu().numpy()
predictions

In [None]:
# Persist the fine-tuned model to the 'BertPractice' directory (reloaded in the next cell).
trainer.save_model('BertPractice')

In [None]:
# Reload the saved fine-tuned model from disk as an independent copy.
model_2 = transformers.BertForSequenceClassification.from_pretrained('BertPractice')
# Place it on the `device` chosen in the torch device cell rather than a hard-coded 'cuda'.
model_2.to(device)

In [None]:
# text = "Super charging is working very well."
text = "so many issues with this phone."
# Put the inputs on whatever device model_2 is on instead of assuming 'cuda'.
inputs = tokenizer(text,padding = True, truncation = True, return_tensors='pt').to(model_2.device)
model_2.eval()            # inference mode: dropout off
with torch.no_grad():     # no autograd graph needed for a prediction
    outputs = model_2(**inputs)
# Softmax over the class axis turns the logits into class probabilities.
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().numpy()
predictions

In [None]:
# Define a function to make predictions on daily data

def pred_day(voctoday):
    """Score every row of one day's VOC frame with the reloaded model.

    Uses the notebook-level `tokenizer` and `model_2`.

    Args:
        voctoday: DataFrame with a 'Text' column (strings to classify) and a
            'Class' column (reference labels), e.g. the output of data_concat.

    Returns:
        DataFrame with the columns 'Text' (first 50 characters of each input),
        'Prediction' (numpy array of softmax probabilities for that text) and
        'Actual' (the matching value from voctoday['Class']).
    """
    voclist = list(voctoday['Text'])
    predlist = []

    # Send inputs to wherever the model is, instead of assuming a GPU is present.
    target_device = model_2.device
    model_2.eval()    # inference mode: dropout off

    # No gradients are needed for prediction; skipping them saves memory per row.
    with torch.no_grad():
        for voc_text in voclist:
            inputs = tokenizer(voc_text, padding = True, truncation = True, return_tensors='pt').to(target_device)
            outputs = model_2(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predlist.append(predictions.cpu().numpy())

    organizedzip = zip([x[:50] for x in voclist],
                       predlist,
                       voctoday['Class'])

    # results as voc text, predictions, and actual value
    vocresult = pd.DataFrame(list(organizedzip), columns=['Text','Prediction','Actual'])

    return vocresult


In [None]:
# test VOC for 8/14 (Mon) - 8/21 (Mon) using model trained on ~8/11 (Fri) data

def load_voc_day(csv_path):
    """Read one day's raw VOC export and prepare it with data_concat.

    Duplicate rows are dropped and only the columns from 'Title' through
    'Class' are handed to data_concat.
    """
    # use delimiter ',' for original .csv file created from my test file generator
    raw_day = pd.read_csv(csv_path, index_col=0, delimiter=",").drop_duplicates()
    return data_concat(raw_day.loc[: , 'Title':'Class'])


# One frame per day; the names are kept because the prediction cells below use them.
voc814 = load_voc_day('/814testraw.csv')
voc815 = load_voc_day('/815testraw.csv')
voc816 = load_voc_day('/816testraw.csv')
voc817 = load_voc_day('/817testraw.csv')
voc818 = load_voc_day('/818testraw.csv')
voc821 = load_voc_day('/821testraw.csv')

In [None]:
# Predictions for the 8/14 file; the last line displays the resulting frame.
vocresult814 = pred_day(voc814)
vocresult814

In [None]:
# Predictions for the 8/15 file; the last line displays the resulting frame.
vocresult815 = pred_day(voc815)
vocresult815

In [None]:
# Predictions for the 8/16 file; the last line displays the resulting frame.
vocresult816 = pred_day(voc816)
vocresult816

In [None]:
# Predictions for the 8/17 file; the last line displays the resulting frame.
vocresult817 = pred_day(voc817)
vocresult817

In [None]:
# Predictions for the 8/18 file; the last line displays the resulting frame.
vocresult818 = pred_day(voc818)
vocresult818

In [None]:
# Predictions for the 8/21 file; the last line displays the resulting frame.
vocresult821 = pred_day(voc821)
vocresult821

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e5caba9f-cd36-4d50-aaa3-2cf59957a2f4' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>