In [2]:
import torch
from torchsummary import summary
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.functional as F
from sentence_transformers import SentenceTransformer
from tqdm import tqdm



import pandas as pd
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


# Star Wars: predicting which character speaks a line of dialogue



In [3]:
# Directory (relative to the notebook's working directory) that holds the
# raw episode script files joined onto it in the next cell.
base_dir = "../data/raw/StarWarsEpisodes"

In [4]:
# Full paths to the three episode scripts (IV, V, VI), built in one pass
# from their file names under `base_dir`.
folder_ep4, folder_ep5, folder_ep6 = (
    os.path.join(base_dir, script_name)
    for script_name in ("SW_EpisodeIV.txt", "SW_EpisodeV.txt", "SW_EpisodeVI.txt")
)

In [5]:
# Load each script with identical parsing options: space-separated fields,
# first row as header, backslash as the escape character.
df_ep4, df_ep5, df_ep6 = (
    pd.read_csv(script_path, sep=' ', header=0, escapechar='\\')
    for script_path in (folder_ep4, folder_ep5, folder_ep6)
)


In [6]:
df_ep4


Unnamed: 0,character,dialogue
1,THREEPIO,Did you hear that? They've shut down the main...
2,THREEPIO,We're doomed!
3,THREEPIO,There'll be no escape for the Princess this time.
4,THREEPIO,What's that?
5,THREEPIO,I should have known better than to trust the l...
...,...,...
1006,LUKE,"Oh, no!"
1007,THREEPIO,"Oh, my! Artoo! Can you hear me? Say somethi..."
1008,TECHNICIAN,We'll get to work on him right away.
1009,THREEPIO,"You must repair him! Sir, if any of my circui..."


In [7]:
# Stack one column across the three episodes (IV, V, VI order) into a plain list.
_gather_column = lambda column: pd.concat(
    [episode[column] for episode in (df_ep4, df_ep5, df_ep6)]
).tolist()

# Y: speaker of every line; X: the spoken text, in the same order as Y.
Y = _gather_column('character')
X = _gather_column('dialogue')

In [8]:
# Count how many lines each character speaks in a single pass
# (the previous version rescanned all of Y once per character).
unique_characters, line_counts = np.unique(Y, return_counts=True)
label_count = line_counts.tolist()

# Characters with fewer than 10 lines are folded into one "Other" class.
# np.where builds a new array wide enough for every string, so "Other" can
# never be truncated the way an in-place write into a fixed-width string
# array could be.
labels = np.unique(np.where(line_counts < 10, "Other", unique_characters))


In [9]:
labels

array(['ACKBAR', 'BEN', 'BIGGS', 'COMMANDER', 'CREATURE', 'EMPEROR',
       'GOLD LEADER', 'HAN', 'JABBA', 'LANDO', 'LEIA', 'LUKE', 'OFFICER',
       'OWEN', 'Other', 'PIETT', 'RED LEADER', 'RIEEKAN', 'TARKIN',
       'THREEPIO', 'TROOPER', 'VADER', 'WEDGE', 'YODA'], dtype='<U30')

In [10]:
# Two-way lookup between class names and their integer ids; ids follow the
# order of `labels`.
char2ind = {name: index for index, name in enumerate(labels)}
ind2char = dict(enumerate(labels))

In [11]:
# Shallow copy of the dialogue list, and an empty container for the integer labels.
new_x = list(X)
new_y = list()

In [12]:
# Map every speaker to its class id; speakers that are not a class of their
# own (too few lines) fall back to the "Other" id.
# The list is rebuilt from scratch instead of appended to, so re-running this
# cell can never leave duplicated labels behind.
other_index = char2ind.get("Other")  # None only when no speaker was folded into "Other"
new_y = [char2ind.get(speaker, other_index) for speaker in Y]

In [13]:
# Pretrained sentence encoder used to turn each line of dialogue into a vector.
sentence_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encoding is pure inference, so gradient tracking is switched off.
with torch.no_grad():
    new_x = sentence_model.encode(X)


In [14]:
# char_names = movie_lines.iloc[:,0]
# movie_names = movie_lines.iloc[:,1]
# char_names = np.unique(list(set(char_names.values)))
# movie_names = np.unique(list(set(movie_names.values)))

In [15]:
class CustomStarWarsDataset(Dataset):
    """Map-style dataset that pairs each input item with its label.

    Args:
        X: Indexable collection of inputs.
        Y: Indexable collection of labels, the same length as ``X``.
        transform: Optional callable applied to an input when it is fetched.
        target_transform: Optional callable applied to a label when it is fetched.

    Raises:
        ValueError: If ``X`` and ``Y`` do not have the same length.
    """

    def __init__(self, X, Y,transform=None, target_transform=None):
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must have the same length, got {len(X)} and {len(Y)}"
            )
        self.X = X
        self.Y = Y

        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of (input, label) pairs."""
        return len(self.Y)

    def __getitem__(self, idx):
        """Return the ``(input, label)`` pair at ``idx``, with transforms applied if set."""
        data_point = self.X[idx]
        label_point = self.Y[idx]

        # The transforms were previously stored but never used; apply them
        # only when the caller actually supplied one.
        if self.transform is not None:
            data_point = self.transform(data_point)
        if self.target_transform is not None:
            label_point = self.target_transform(label_point)

        return data_point, label_point

In [16]:
len(Y)

2523

In [17]:
class BertSentenceClassifier(nn.Module):
    """Feed-forward classifier over 768-dimensional sentence embeddings.

    Architecture: Linear(768, 256), four Linear(256, 256) layers, then
    Linear(256, num_classes). Every hidden layer is followed by ReLU and
    dropout, and the output is a softmax over classes.

    Args:
        num_classes: Number of output classes. When ``None`` (the default) the
            size of the notebook-level ``labels`` array is used, which matches
            the previous hard-coded behaviour.
        dropout_rate: Probability of zeroing an activation during training.
    """

    def __init__(self, num_classes=None, dropout_rate=.5):
        super(BertSentenceClassifier, self).__init__()
        if num_classes is None:
            num_classes = len(labels)
        self.dropout_rate = dropout_rate
        self.lin1 = nn.Linear(768,256)
        self.lin_layers = nn.ModuleList([nn.Linear(256,256) for i in range(4)])
        self.lin2 = nn.Linear(256, num_classes)

    def forward(self, data):
        """Return class probabilities of shape ``(batch, num_classes)``.

        ``data`` must be 2-D ``(batch, 768)``, because the softmax is taken
        over ``dim=1``.
        """
        # nn.functional.dropout defaults to training=True, so without passing
        # self.training dropout would stay active after model.eval() and make
        # validation outputs random.
        x = nn.functional.relu(self.lin1(data))
        x = nn.functional.dropout(x, self.dropout_rate, training=self.training)
        for layer in self.lin_layers:
            x = nn.functional.relu(layer(x))
            x = nn.functional.dropout(x, self.dropout_rate, training=self.training)

        x = self.lin2(x)
        return nn.functional.softmax(x, dim=1)

In [18]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split

In [19]:
# x_train,x_test,y_train,y_test = train_test_split(new_x,new_y)

In [20]:
# model = RandomForestClassifier(max_leaf_nodes=25)
# model.fit(x_train,y_train)

In [21]:
# model.score(x_train,y_train),model.score(x_test,y_test)

In [22]:
train_dataset = CustomStarWarsDataset(new_x,new_y)

# Keep 2000 examples for training and validate on whatever remains, so the
# two sizes always add up to the dataset length even if the data changes.
train_size = 2000
val_size = len(train_dataset) - train_size
# A seeded generator makes the split reproducible across kernel restarts.
train_set, val_set = torch.utils.data.random_split(
    train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(0),
)

train_loader = DataLoader(train_set,batch_size=32,shuffle=True,drop_last=True)
# Validation should see every held-out example exactly once: no shuffling
# and no dropping of the final partial batch.
val_loader = DataLoader(val_set,batch_size=32,shuffle=False,drop_last=False)

In [23]:
# Fresh classifier and an Adam optimizer over all of its parameters.
model = BertSentenceClassifier()
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-4)

In [24]:
summary(model)

Layer (type:depth-idx)                   Param #
├─Linear: 1-1                            196,864
├─ModuleList: 1-2                        --
|    └─Linear: 2-1                       65,792
|    └─Linear: 2-2                       65,792
|    └─Linear: 2-3                       65,792
|    └─Linear: 2-4                       65,792
├─Linear: 1-3                            6,168
Total params: 466,200
Trainable params: 466,200
Non-trainable params: 0


Layer (type:depth-idx)                   Param #
├─Linear: 1-1                            196,864
├─ModuleList: 1-2                        --
|    └─Linear: 2-1                       65,792
|    └─Linear: 2-2                       65,792
|    └─Linear: 2-3                       65,792
|    └─Linear: 2-4                       65,792
├─Linear: 1-3                            6,168
Total params: 466,200
Trainable params: 466,200
Non-trainable params: 0

In [25]:
# Pick the best available accelerator: CUDA, then Apple MPS, then CPU.
# `torch.backends.mps` does not exist on older torch builds, so it is looked
# up defensively instead of being accessed directly.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"


In [26]:
def accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose highest-scoring column equals the label.

    ``preds`` is a 2-D tensor of per-class scores and ``labels`` a 1-D tensor
    of class indices with one entry per row of ``preds``.
    """
    predicted_classes = torch.max(preds, dim=1).indices
    num_correct = (predicted_classes == labels).sum().item()
    return num_correct / len(labels)

In [1]:

# Train the classifier and report train/validation accuracy after every epoch.
# This cell relies on `device`, `model`, `optimizer`, `train_loader`,
# `val_loader` and `accuracy` from the cells above, so run those first.
num_epochs = 1000
print(device)
model = model.to(device)

# The model's forward pass already ends in a softmax, so its output is
# probabilities, not logits. nn.CrossEntropyLoss would apply log-softmax a
# second time; NLLLoss on log-probabilities is the matching loss.
criterion = nn.NLLLoss()

for epoch in range(num_epochs):
    loop = tqdm(train_loader, total=len(train_loader))
    model.train()
    running_loss = 0.0
    running_correct = 0.0
    num_samples = 0
    num_batches = 0
    for text, author_labels in loop:
        inputs = text.to(device)
        author_labels = author_labels.to(device)
        optimizer.zero_grad()

        probs = model(inputs)
        # Clamp before the log so a probability that underflowed to 0 cannot
        # produce -inf and poison the gradients.
        loss = criterion(torch.log(probs.clamp_min(1e-12)), author_labels)

        loss.backward()
        optimizer.step()

        batch_size = len(author_labels)
        running_loss += loss.item()
        # accuracy() returns a fraction; weight it by batch size so batches
        # of different sizes contribute proportionally.
        running_correct += accuracy(probs, author_labels) * batch_size
        num_samples += batch_size
        num_batches += 1

        loop.set_description(f"Epoch [{epoch + 1}/{num_epochs}]")
        loop.set_postfix(loss=loss.item())

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    val_correct = 0.0
    val_num_samples = 0
    with torch.no_grad():  # Disable gradient calculation during validation
        for val_text, val_labels in val_loader:
            val_data = val_text.to(device)
            val_labels = val_labels.to(device)

            val_output = model(val_data)
            val_correct += accuracy(val_output, val_labels) * len(val_labels)
            val_num_samples += len(val_labels)

    # Guard the averages so an empty loader reports NaN instead of raising
    # ZeroDivisionError.
    avg_loss = running_loss / num_batches if num_batches else float("nan")
    avg_accuracy = running_correct / num_samples if num_samples else float("nan")
    val_avg_accuracy = val_correct / val_num_samples if val_num_samples else float("nan")
    print(f"Validation Accuracy: {val_avg_accuracy:.4f} Accuracy: {avg_accuracy:.4f}")

    # Report the epoch's mean loss rather than whichever batch happened to be last.
    print(f"Epoch: {epoch + 1}, Loss: {avg_loss}")


NameError: name 'device' is not defined