In [32]:
import torch
from torchsummary import summary
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.functional as F
from sentence_transformers import SentenceTransformer
from transformers import BertForSequenceClassification,BertTokenizer
from transformers import BertForPreTraining

import fasttext.util

from tqdm import tqdm



import pandas as pd
import numpy as np
import os

In [33]:
os.listdir("../data/raw/starwars")

['SW_EpisodeVI.txt', 'wordcloud_masks', 'SW_EpisodeV.txt', 'SW_EpisodeIV.txt']

# StarWars



In [34]:
base_dir = "../data/raw/starwars"

In [35]:
folder_ep4 = os.path.join(base_dir,"SW_EpisodeIV.txt")
folder_ep5 = os.path.join(base_dir,"SW_EpisodeV.txt")
folder_ep6 = os.path.join(base_dir,"SW_EpisodeVI.txt")

In [36]:
df_ep4 = pd.read_csv(folder_ep4, sep =' ', header=0, escapechar='\\')
df_ep5 = pd.read_csv(folder_ep5, sep =' ', header=0, escapechar='\\')
df_ep6 = pd.read_csv(folder_ep6, sep =' ', header=0, escapechar='\\')


In [37]:
Y = pd.concat([df_ep4['character'],df_ep5['character'],df_ep6['character']]).tolist()
X = pd.concat([df_ep4['dialogue'],df_ep5['dialogue'],df_ep6['dialogue']]).tolist()

In [38]:
labels = np.unique(Y)
label_count = [sum(i == np.array(Y)) for i in labels]
for i,(a,b) in enumerate(zip(labels,label_count)):
    if b < 40:
        labels[i] = "Other"
labels = np.unique(labels)


In [39]:
char2ind = {i:j for i,j in zip(labels,range(len(labels)))}
ind2char = {j:i for i,j in zip(labels,range(len(labels)))}

In [40]:
new_x = X.copy()
new_y = []
for idx in range(len(new_x)):
    
    if Y[idx] in labels:
        label_point = char2ind[Y[idx]]
    else:
        label_point = char2ind["Other"]
    new_y.append(label_point)

In [42]:
for each in ind2char.keys():
    print("name: ",ind2char[each], "  count ",np.sum(np.array(new_y)==each))

name:  BEN   count  115
name:  EMPEROR   count  44
name:  HAN   count  459
name:  LANDO   count  101
name:  LEIA   count  227
name:  LUKE   count  494
name:  Other   count  593
name:  THREEPIO   count  301
name:  VADER   count  140
name:  YODA   count  49


In [43]:
(np.array(new_y)==23).sum()

0

In [44]:
ind2char

{0: 'BEN',
 1: 'EMPEROR',
 2: 'HAN',
 3: 'LANDO',
 4: 'LEIA',
 5: 'LUKE',
 6: 'Other',
 7: 'THREEPIO',
 8: 'VADER',
 9: 'YODA'}

In [45]:
import pickle5 as pickle
if os.path.exists("dict_of_words.pickle"):
    with open('dict_of_words.pickle', 'rb') as handle:
        dict_of_synonyms = pickle.load(handle)
else:
    fasttext.util.download_model('en', if_exists='ignore')  # English
    ft = fasttext.load_model('cc.en.300.bin') 
    words = np.unique([j for i in new_x for j in i.split()])
    dict_of_synonyms = {x:ft.get_nearest_neighbors(x) for x in words}

In [46]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
option = 1
class BertSentenceClassifier(nn.Module):
    def __init__(self, num_labels):
        super(BertSentenceClassifier, self).__init__()

        # Option 1
        if option == 1:
            self.bert_training = BertForSequenceClassification.from_pretrained('bert-base-uncased', problem_type="multi_label_classification").bert
        elif option == 2:
            self.bert_training = SentenceTransformer('bert-base-nli-mean-tokens')

        
        
        for param in self.bert_training.parameters():
            param.requires_grad = False
        self.dropout_rate = 0.1
        self.lin1 = nn.Linear(768, 256)
        self.lin_layers = nn.ModuleList([nn.Linear(256, 256) for i in range(1)])
        self.lin2 = nn.Linear(256, num_labels)

    def forward(self, input_ids, attention_mask):
        # bert_1 = self.bert_freezed(input_ids=input_ids, attention_mask=attention_mask)
        if option == 1:
            bert_output = self.bert_training(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        else:
            pass
        x = nn.functional.relu(self.lin1(bert_output))

        x = nn.functional.dropout(x, self.dropout_rate)

        for lin_layer in self.lin_layers:

            x = nn.functional.relu(lin_layer(x))
            x = nn.functional.dropout(x, self.dropout_rate)

        x = self.lin2(x)
        x = nn.functional.softmax(x)

        return x
model = BertSentenceClassifier(len(labels))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [47]:
model.train()

BertSentenceClassifier(
  (bert_training): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme

In [48]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=1e-3)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)



In [49]:
import nltk
from nltk.corpus import wordnet
import random
nltk.download('wordnet')
nltk.download('omw-1.4')
def add_noise(text, p=0.2):
    words = text.split()
    num_noise_words = int(len(words) * p)
    for i in range(num_noise_words):
        idx = random.randint(0, len(words)-1)
        word = words[idx]
        synsets = wordnet.synsets(word)
        if synsets:
            synset = random.choice(synsets)
            synonyms = synset.lemmas()
            if random.random() < p/2:
                synonym = random.choice(synonyms)
                words[idx] = synonym.name()
            # Replace word with nearest neighbor with probability p/2
            else:
                if word in dict_of_synonyms:
                    
                    nn = random.choice(dict_of_synonyms[word])
                    # print("number is nn: ",nn)
                    words[idx] = nn[1]
                else:
                    synonym = random.choice(synonyms)
                    words[idx] = synonym.name()
    # print(words)
    return ' '.join(words)

class TextClassificationDataset(Dataset):
    def __init__(self, X, y, tokenizer):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        text = add_noise(str(self.X[idx]), p = .8)
        label = self.y[idx]
        encoding = self.tokenizer(
            text, 
            add_special_tokens=True, 
            max_length=512, 
            padding='max_length', 
            truncation=True, 
            return_attention_mask=True, 
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return input_ids, attention_mask, torch.tensor(label)

# Assuming X is a list of texts and y is a list of labels
X = ["I love Pixar.", "I don't care for Pixar."]
y = [1, 0]

# Load the pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create the dataset
dataset = TextClassificationDataset(new_x, new_y, tokenizer)

# Create the data loader
batch_size = 4

train_set, val_set = torch.utils.data.random_split(dataset, [2000, 523])

train_loader = DataLoader(train_set,batch_size=4,shuffle=True,drop_last=True)
val_loader = DataLoader(val_set,batch_size=4,shuffle=True,drop_last=True)



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alenadamyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/alenadamyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [50]:

import os
from sys import platform

import torch
if platform == "darwin":
    device = "mps" if torch.backends.mps.is_available() else "cpu"
else:
    device = "cuda" if torch.cuda.is_available() else "cpu"

    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.enabled = True
    torch.autograd.set_detect_anomaly(True)
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    os.environ['TORCH_USE_CUDA_DSA'] = '1'


    device = "cuda" if torch.cuda.is_available() else "cpu"


In [51]:
def accuracy(preds, labels):
    _, predictions = torch.max(preds, dim=1)
    correct = (predictions == labels).sum().item()
    return correct / len(labels)

def train(model, optimizer, train_loader, val_loader,num_epochs):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        running_loss = 0.0
        running_accuracy = 0.0
        num_batches = 0

        # Training step
        model.train()
        loop = tqdm(train_loader, total=len(train_loader))
        for input_ids, attention_mask, labels in loop:

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            acc = accuracy(outputs, labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            running_accuracy += acc
            num_batches += 1
            loop.set_description(f"Epoch [{epoch + 1}] (Training)")
            loop.set_postfix(loss=loss.item())

        epoch_loss = running_loss / num_batches
        epoch_accuracy = running_accuracy / num_batches
        print(f"Epoch [{epoch+1}] (Training) Loss: {epoch_loss:.4f} Accuracy: {epoch_accuracy:.4f}")

        # Evaluation step
        running_accuracy = 0.0
        num_batches = 0
        model.eval()
        with torch.no_grad():
            for input_ids, attention_mask, labels in val_loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                acc = accuracy(outputs, labels)

                running_accuracy += acc
                num_batches += 1

        val_accuracy = running_accuracy / num_batches
        print(f"Epoch [{epoch+1}] (Validation) Accuracy: {val_accuracy:.4f}")

    return model


In [52]:
train(model,optimizer, train_loader,val_loader,5)

Epoch [1] (Training): 100%|██████████| 500/500 [01:52<00:00,  4.46it/s, loss=2.1] 


Epoch [1] (Training) Loss: 2.2136 Accuracy: 0.1655
Epoch [1] (Validation) Accuracy: 0.2135


Epoch [2] (Training): 100%|██████████| 500/500 [01:51<00:00,  4.47it/s, loss=2.16]


Epoch [2] (Training) Loss: 2.0661 Accuracy: 0.2205
Epoch [2] (Validation) Accuracy: 0.2346


Epoch [3] (Training):  26%|██▋       | 132/500 [00:29<01:22,  4.46it/s, loss=1.96]

In [None]:
def save_to_onnx(model, dummy_input_ids, dummy_attention_mask, output_path):
    torch.onnx.export(
        model,
        (dummy_input_ids, dummy_attention_mask),
        output_path,
        export_params=True,
        opset_version=11,  # Use the ONNX version you need
        do_constant_folding=True,
        input_names=["input_ids", "attention_mask"],
        output_names=["output"],
        dynamic_axes={
            "input_ids": {0: "batch_size", 1: "sequence_length"},
            "attention_mask": {0: "batch_size", 1: "sequence_length"},
            "output": {0: "batch_size"},
        },
    )

In [None]:
#model.load_state_dict(torch.load("your_pretrained_model.pth"))
# Create dummy input tensors of size (Batch, 512)
dummy_input_ids = torch.ones(1, 512, dtype=torch.long)
dummy_attention_mask = torch.ones(1, 512, dtype=torch.long)

In [None]:
# Save the model to ONNX
save_to_onnx(model, dummy_input_ids, dummy_attention_mask, "bert_sentence_classifier.onnx")

In [None]:
tokenizer.save_pretrained("local-pt-checkpoint")

In [None]:
model.save_pretrained("local-pt-checkpoint")

In [None]:
torch.save(model.state_dict(), "saved_model.pt")

In [None]:
# from transformers import BertForSequenceClassification, BertTokenizer

# # Load the tokenizer and the model
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# # model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# # Define your inputs and labels
# text_batch = ["I love Pixar.", "I don't care for Pixar."]
# encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
# input_ids = encoding['input_ids']
# attention_mask = encoding['attention_mask']
# labels = torch.tensor([1, 0])  # The ground truth labels for your examples

# # Forward pass through the model
# outputs = model(input_ids, attention_mask=attention_mask)
# logits = outputs.logits

# # Compute the loss
# loss_fn = torch.nn.CrossEntropyLoss()
# loss = loss_fn(logits, labels)

# # Backward pass and optimization step
# loss.backward()
# optimizer.step()


In [None]:
len(outputs)

In [None]:
new_model = BertSentenceClassifier(len(labels))

In [None]:
new_model.load_state_dict(torch.load("saved_model.pt"))


In [None]:
new_model