In [1]:
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
from transformers import BertConfig
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import TensorDataset, DataLoader, random_split
import pandas as pd
import random
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
import ujson as json
import pandas
import time

# Select the compute device once; later cells move the model and batches to `device`.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device("cuda")
    print("Using CUDA device.")
else:
    device = torch.device("cpu")
    print("CUDA device not found!")

CUDA device not found!


In [None]:
# Load the raw CSV, tokenize the text column, map labels to class ids and
# split the result into train / validation / test subsets.
t0 = time.perf_counter()
# header=None: columns are addressed by position (0 = label, 5 = text below).
df = pandas.read_csv("./data/large_sentiment.csv", encoding="cp1252", header=None)
df = df.to_numpy()

y = list(df[:, 0])
# str() guards against non-string cells (e.g. NaN from an empty field), which
# the tokenizer would reject. Assumes column 5 holds the text -- TODO confirm.
x = [str(text) for text in df[:, 5]]

t1 = time.perf_counter()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# truncation=True caps every sequence at the model's maximum length so that an
# unusually long text cannot overflow BERT's position embeddings.
# NOTE: only 'input_ids' is kept; the attention mask is discarded, so the model
# downstream is called without one.
x = tokenizer(x, return_tensors='pt', padding=True, truncation=True)['input_ids']

# Raw label values -> contiguous class ids expected by the classifier head.
l_dict = {0:0, 2:1, 4:2}
y = torch.tensor([l_dict[int(e)] for e in y])

t2 = time.perf_counter()

print("File Reading Time:",f"{t1-t0:.2f}")
print("Tokenization Time:",f"{t2-t1:.2f}")

dataset = TensorDataset(x, y)
n = len(dataset)
# random_split requires the lengths to sum to exactly n; truncating all three
# fractions with int() can leave a remainder, so the test split absorbs it.
n_train = int(0.8 * n)
n_val = int(0.1 * n)
n_test = n - n_train - n_val
# Fixed seed so the same split is produced on every run.
SPLIT_SEED = 42
train_data, val_data, test_data = random_split(
    dataset,
    [n_train, n_val, n_test],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
def validate(model, val_loader, final=False):
    """Evaluate `model` on every batch of `val_loader` without gradient tracking.

    Args:
        model: module called as ``model(x, labels=y)`` whose first two outputs
            are ``(loss, logits)``; predictions are the argmax of ``logits``
            over dim 1.
        val_loader: iterable of ``(x, y)`` batches that also supports ``len()``.
        final: if True, print a full report (average loss, accuracy,
            macro-averaged F1 and the per-class classification report);
            otherwise print a one-line summary.

    Returns:
        ``(val_loss, accuracy)`` where ``val_loss`` is the mean per-batch loss.

    Raises:
        ValueError: if `val_loader` yields no batches.
    """
    n_batches = len(val_loader)
    if n_batches == 0:
        raise ValueError("val_loader is empty; nothing to evaluate")

    total_loss = 0.0
    y_true, y_pred = [], []

    # Remember the current mode so a caller that is mid-training does not
    # silently continue in eval mode (dropout disabled) after this returns.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)

                loss, logits = model(x, labels=y)[:2]

                y_true.extend(y.cpu().tolist())
                y_pred.extend(torch.argmax(logits, dim=1).cpu().tolist())
                total_loss += loss.item()
    finally:
        model.train(was_training)

    val_loss = total_loss / n_batches
    accuracy = np.mean(np.array(y_true) == np.array(y_pred))

    if final:
        # The default average='binary' raises on multiclass targets; macro
        # averaging weights every class equally.
        f1 = f1_score(y_true, y_pred, average="macro")
        print("Results")
        print("\tAvg Loss:", val_loss)
        print("\tAccuracy:", accuracy)
        print("\tF1 Score:", f1)
        print(classification_report(y_true, y_pred))
    else:
        print("Validation", "Avg Loss:", val_loss, " - Accuracy:", accuracy,"\n\n")

    return val_loss, accuracy

def train_classifier(model, train_loader, val_loader=None, epochs=1,
                     lr=2e-5, val_every=100_000):
    """Fine-tune `model` with Adam and optionally track validation metrics.

    Args:
        model: module called as ``model(x, labels=y)`` whose first two outputs
            are ``(loss, logits)``. It is moved to the global `device`.
        train_loader: iterable of ``(x, y)`` batches that supports ``len()``.
        val_loader: optional loader passed to `validate`; when given, the model
            is validated once before training and then every `val_every`
            batches within each epoch.
        epochs: number of passes over `train_loader`.
        lr: Adam learning rate.
        val_every: number of training batches between mid-epoch validations.

    Returns:
        ``(val_losses, val_accuracy, losses, seconds)``: validation losses and
        accuracies in the order they were measured, the loss of every training
        batch, and the wall-clock duration of each epoch in seconds.
    """
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    val_losses, val_accuracy, losses, seconds = [], [], [], []

    def _record_validation():
        # Run validation and append its metrics to the history lists.
        val_loss, val_acc = validate(model, val_loader)
        val_losses.append(val_loss)
        val_accuracy.append(val_acc)

    # Baseline metrics before any weight update.
    if val_loader is not None:
        _record_validation()

    n_batches = len(train_loader)
    for e in range(epochs):
        model.train()
        epoch_start = time.perf_counter()
        for i, (x, y) in enumerate(train_loader):
            optimizer.zero_grad()

            x, y = x.to(device), y.to(device)

            loss, logits = model(x, labels=y)[:2]

            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            losses.append(batch_loss)

            # Remaining time = average seconds per finished batch * batches left.
            # (i + 1) finished batches, so the first iteration never divides by 0.
            done = i + 1
            elapsed = time.perf_counter() - epoch_start
            eta = elapsed / done * (n_batches - done)
            print(f'\repoch: {e+1}/{epochs} | ETA: {eta:.0f}s | batch: {done}/{n_batches} | loss: {batch_loss:.4f}', end="")

            if done % val_every == 0 and val_loader is not None:
                print()
                _record_validation()
                # validate() switches to eval mode; resume training mode so
                # dropout stays active for the rest of the epoch.
                model.train()
        print()
        #model.save_pretrained("./saves/sent_{}.pt".format(e))
        seconds.append(time.perf_counter() - epoch_start)

    return val_losses, val_accuracy, losses, seconds

In [None]:
# Build the data loaders, fine-tune BERT for one epoch and report final metrics.
batch_size = 5
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
# Evaluation does not depend on batch order, so these loaders are not shuffled.
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, num_workers=4)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4)

# The label mapping produces class ids 0, 1 and 2, so the classification head
# needs three outputs; the library default of two would fail on class id 2.
NUM_LABELS = 3
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUM_LABELS).to(device)
val_losses, val_accuracy, losses, seconds = train_classifier(model, train_loader, val_loader=val_loader, epochs=1)

# Final report on the held-out test split, which training never touches
# (the validation split is already used for monitoring inside train_classifier).
validate(model, test_loader, final=True)

In [None]:
#f = open("./data/sentiment_tweets3.csv", "r")