In [1]:
import torch
from torchsummary import summary
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.functional as F
from sentence_transformers import SentenceTransformer
from transformers import BertForSequenceClassification,BertTokenizer
from transformers import BertForPreTraining

import fasttext.util

from tqdm import tqdm
import re
from pathlib import Path


import pandas as pd
import numpy as np
import os

In [2]:
os.listdir("../data/raw/starwars")

['SW_EpisodeVI.txt', 'wordcloud_masks', 'SW_EpisodeV.txt', 'SW_EpisodeIV.txt']

# StarWars



In [3]:
base_dir = "../data/raw/starwars"

In [4]:
folder_ep4 = os.path.join(base_dir,"SW_EpisodeIV.txt")
folder_ep5 = os.path.join(base_dir,"SW_EpisodeV.txt")
folder_ep6 = os.path.join(base_dir,"SW_EpisodeVI.txt")

In [5]:
df_ep4 = pd.read_csv(folder_ep4, sep =' ', header=0, escapechar='\\')
df_ep5 = pd.read_csv(folder_ep5, sep =' ', header=0, escapechar='\\')
df_ep6 = pd.read_csv(folder_ep6, sep =' ', header=0, escapechar='\\')


In [6]:
Y = pd.concat([df_ep4['character'],df_ep5['character'],df_ep6['character']]).tolist()
X = pd.concat([df_ep4['dialogue'],df_ep5['dialogue'],df_ep6['dialogue']]).tolist()

In [7]:
labels = np.unique(Y)
label_count = [sum(i == np.array(Y)) for i in labels]
for i,(a,b) in enumerate(zip(labels,label_count)):
    if b < 40:
        labels[i] = "Other"
labels = np.unique(labels)


In [8]:
char2ind = {i:j for i,j in zip(labels,range(len(labels)))}
ind2char = {j:i for i,j in zip(labels,range(len(labels)))}

In [9]:
new_x = X.copy()
new_y = []
for idx in range(len(new_x)):
    
    if Y[idx] in labels:
        label_point = char2ind[Y[idx]]
    else:
        label_point = char2ind["Other"]
    new_y.append(label_point)

In [10]:
for each in ind2char.keys():
    print("name: ",ind2char[each], "  count ",np.sum(np.array(new_y)==each))

name:  BEN   count  115
name:  EMPEROR   count  44
name:  HAN   count  459
name:  LANDO   count  101
name:  LEIA   count  227
name:  LUKE   count  494
name:  Other   count  593
name:  THREEPIO   count  301
name:  VADER   count  140
name:  YODA   count  49


In [11]:
import pickle5 as pickle
if os.path.exists("dict_of_words.pickle"):
    with open('dict_of_words.pickle', 'rb') as handle:
        dict_of_synonyms = pickle.load(handle)
else:
    fasttext.util.download_model('en', if_exists='ignore')  # English
    ft = fasttext.load_model('cc.en.300.bin') 
    words = np.unique([j for i in new_x for j in i.split()])
    dict_of_synonyms = {x:ft.get_nearest_neighbors(x) for x in words}

In [12]:
model = SentenceTransformer('bert-base-nli-mean-tokens')


In [13]:
import nltk
from nltk.corpus import wordnet
import random
nltk.download('wordnet')
nltk.download('omw-1.4')
def add_noise(text, p=0.2):
    words = text.split()
    num_noise_words = int(len(words) * p)
    for i in range(num_noise_words):
        idx = random.randint(0, len(words)-1)
        word = words[idx]
        synsets = wordnet.synsets(word)
        if synsets:
            synset = random.choice(synsets)
            synonyms = synset.lemmas()
            if random.random() < p/2:
                synonym = random.choice(synonyms)
                words[idx] = synonym.name()
            # Replace word with nearest neighbor with probability p/2
            else:
                if word in dict_of_synonyms:
                    
                    nn = random.choice(dict_of_synonyms[word])
                    # print("number is nn: ",nn)
                    words[idx] = nn[1]
                else:
                    synonym = random.choice(synonyms)
                    words[idx] = synonym.name()
    # print(words)
    return ' '.join(words)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alenadamyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/alenadamyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [14]:
path = Path("encoded_bertdata_train.csv")

def str_to_float_list(s):
    float_list = [float(x) for x in re.findall(r'[-+]?\d*\.\d+e[-+]?\d+|[-+]?\d+\.\d+|[-+]?\d+', s)]
    return float_list

if not path.exists():
    train_encoded_X = []
    train_encoded_X_y = []
    val_encoded_X = []
    val_encoded_X_y = []

    temp = list(zip(new_x, new_y))
    random.shuffle(temp)
    new_x, new_y = zip(*temp)
    # res1 and res2 come out as tuples, and so must be converted to lists.
    new_x, new_y = list(new_x), list(new_y)
    
    #number of random tests
    for _ in range(10):
        for x,y in zip(new_x[:2000],new_y[:2000]):
            x = model.encode(add_noise(x, p = .7))
            train_encoded_X.append(x)
            train_encoded_X_y.append(y)
        #number of random tests
    for x,y in zip(new_x[2000:],new_y[2000:]):
        x = model.encode(x)
        val_encoded_X.append(x)
        val_encoded_X_y.append(y)
    pd.DataFrame({"X":train_encoded_X,"y":train_encoded_X_y}).to_csv("encoded_bertdata_train.csv")
    pd.DataFrame({"X":val_encoded_X,"y":val_encoded_X_y}).to_csv("encoded_bertdata_val.csv")
else:
    df_train = pd.read_csv("encoded_bertdata_train.csv")     
    df_val = pd.read_csv("encoded_bertdata_val.csv")        
   
    train_encoded_X = df_train["X"].apply(str_to_float_list)
    train_encoded_X_y = df_train["y"]
    val_encoded_X = df_val["X"].apply(str_to_float_list)
    val_encoded_X_y = df_val["y"]

In [15]:
model.encode("hey dude").shape

(768,)

In [16]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
option = 1
class TabularClassifer(nn.Module):
    def __init__(self, num_labels):
        super(TabularClassifer, self).__init__()

        # # Option 1
        # if option == 1:
        #     self.bert_training = BertForSequenceClassification.from_pretrained('bert-base-uncased', problem_type="multi_label_classification").bert
        # elif option == 2:
        #     self.bert_training = SentenceTransformer('bert-base-nli-mean-tokens')

        
        
        # for param in self.bert_training.parameters():
        #     param.requires_grad = False
        self.dropout_rate = 0.1
        self.lin1 = nn.Linear(768, 256)
        self.lin_layers = nn.ModuleList([nn.Linear(256, 256) for i in range(2)])
        self.lin2 = nn.Linear(256, num_labels)

    def forward(self, data):

        x = nn.functional.relu(self.lin1(data))

        x = nn.functional.dropout(x, self.dropout_rate)

        for lin_layer in self.lin_layers:

            x = nn.functional.relu(lin_layer(x))
            x = nn.functional.dropout(x, self.dropout_rate)

        x = self.lin2(x)
        x = nn.functional.softmax(x)

        return x
model = TabularClassifer(len(labels))

In [17]:
model.train()

TabularClassifer(
  (lin1): Linear(in_features=768, out_features=256, bias=True)
  (lin_layers): ModuleList(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): Linear(in_features=256, out_features=256, bias=True)
  )
  (lin2): Linear(in_features=256, out_features=10, bias=True)
)

In [18]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=1e-3)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)



In [19]:
len(train_encoded_X)

20000

In [20]:
class TextClassificationDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):

        text = self.X[idx]
        label = self.y[idx]
        return torch.tensor(text), torch.tensor(label)


# Create the dataset
train_set = TextClassificationDataset(train_encoded_X, train_encoded_X_y)
val_set = TextClassificationDataset(val_encoded_X, val_encoded_X_y)

# Create the data loader
batch_size = 4

train_loader = DataLoader(train_set,batch_size=4,shuffle=True,drop_last=True)
val_loader = DataLoader(val_set,batch_size=4,shuffle=True,drop_last=True)



In [21]:

import os
from sys import platform

import torch
if platform == "darwin":
    device = "mps" if torch.backends.mps.is_available() else "cpu"
else:
    device = "cuda" if torch.cuda.is_available() else "cpu"

    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.enabled = True
    torch.autograd.set_detect_anomaly(True)
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    os.environ['TORCH_USE_CUDA_DSA'] = '1'


    device = "cuda" if torch.cuda.is_available() else "cpu"


In [22]:
def accuracy(preds, labels):
    _, predictions = torch.max(preds, dim=1)
    correct = (predictions == labels).sum().item()
    return correct / len(labels)

def train(model, optimizer, train_loader, val_loader,num_epochs):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        running_loss = 0.0
        running_accuracy = 0.0
        num_batches = 0

        # Training step
        model.train()
        loop = tqdm(train_loader, total=len(train_loader))
        for data, labels in loop:

            data = data.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            acc = accuracy(outputs, labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            running_accuracy += acc
            num_batches += 1
            loop.set_description(f"Epoch [{epoch + 1}] (Training)")
            loop.set_postfix(loss=loss.item())

        epoch_loss = running_loss / num_batches
        epoch_accuracy = running_accuracy / num_batches
        print(f"Epoch [{epoch+1}] (Training) Loss: {epoch_loss:.4f} Accuracy: {epoch_accuracy:.4f}")

        # Evaluation step
        running_accuracy = 0.0
        num_batches = 0
        model.eval()
        with torch.no_grad():
            for data, labels in val_loader:
                data = data.to(device)
                labels = labels.to(device)

                outputs = model(data)
                acc = accuracy(outputs, labels)

                running_accuracy += acc
                num_batches += 1

        val_accuracy = running_accuracy / num_batches
        print(f"Epoch [{epoch+1}] (Validation) Accuracy: {val_accuracy:.4f}")

    return model


In [23]:
train(model,optimizer, train_loader,val_loader,5)

  x = nn.functional.softmax(x)
Epoch [1] (Training): 100%|██████████| 5000/5000 [01:10<00:00, 70.96it/s, loss=2.02]


Epoch [1] (Training) Loss: 2.2022 Accuracy: 0.2460
Epoch [1] (Validation) Accuracy: 0.2577


Epoch [2] (Training): 100%|██████████| 5000/5000 [01:05<00:00, 76.26it/s, loss=2.17]


Epoch [2] (Training) Loss: 2.1449 Accuracy: 0.3153
Epoch [2] (Validation) Accuracy: 0.2885


Epoch [3] (Training): 100%|██████████| 5000/5000 [01:04<00:00, 77.86it/s, loss=2.15]


Epoch [3] (Training) Loss: 2.1174 Accuracy: 0.3470
Epoch [3] (Validation) Accuracy: 0.2923


Epoch [4] (Training): 100%|██████████| 5000/5000 [01:04<00:00, 77.34it/s, loss=2.03]


Epoch [4] (Training) Loss: 2.0973 Accuracy: 0.3665
Epoch [4] (Validation) Accuracy: 0.3096


Epoch [5] (Training):  27%|██▋       | 1351/5000 [00:17<00:46, 77.99it/s, loss=2.44]

In [None]:
train(model,optimizer, train_loader,val_loader,5)

Epoch [1] (Training): 100%|██████████| 5000/5000 [01:07<00:00, 74.07it/s, loss=1.67] 


Epoch [1] (Training) Loss: 1.5666 Accuracy: 0.4386
Epoch [1] (Validation) Accuracy: 0.3692


Epoch [2] (Training):  94%|█████████▍| 4703/5000 [01:00<00:03, 78.44it/s, loss=1.47] 

In [None]:
def save_to_onnx(model, dummy_input_ids, dummy_attention_mask, output_path):
    torch.onnx.export(
        model,
        (dummy_input_ids, dummy_attention_mask),
        output_path,
        export_params=True,
        opset_version=11,  # Use the ONNX version you need
        do_constant_folding=True,
        input_names=["input_ids", "attention_mask"],
        output_names=["output"],
        dynamic_axes={
            "input_ids": {0: "batch_size", 1: "sequence_length"},
            "attention_mask": {0: "batch_size", 1: "sequence_length"},
            "output": {0: "batch_size"},
        },
    )

In [None]:
#model.load_state_dict(torch.load("your_pretrained_model.pth"))
# Create dummy input tensors of size (Batch, 512)
dummy_input_ids = torch.ones(1, 512, dtype=torch.long)
dummy_attention_mask = torch.ones(1, 512, dtype=torch.long)

In [None]:
# Save the model to ONNX
save_to_onnx(model, dummy_input_ids, dummy_attention_mask, "bert_sentence_classifier.onnx")

In [None]:
tokenizer.save_pretrained("local-pt-checkpoint")

In [None]:
model.save_pretrained("local-pt-checkpoint")

In [None]:
torch.save(model.state_dict(), "saved_model.pt")