In [1]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.functional as F

import pandas as pd
import numpy as np
import os

# MovieDialogCorpus


In [2]:
base_dir = "../data/raw/MovieDialogCorpus"

In [3]:
movie_characters_metadata = pd.read_csv(os.path.join(base_dir,"movie_characters_metadata.tsv"),
                                        sep='\t',
                                        names = ['charID','charName','movieID','movieName','gender','score'],
                                        index_col=['charID'])
movie_lines = pd.read_csv(os.path.join(base_dir,"movie_lines.tsv"),
                            encoding='utf-8-sig', 
                            sep='\t', 
                            on_bad_lines='skip', 
                            header = None,
                            names = ['lineID', 'charID', 'movieID', 'charName', 'text'],
                            index_col=['lineID'])

In [4]:
movie_lines = movie_lines[movie_lines.notnull().all(1)]

np.unique(a).shape

In [5]:
char_names = movie_lines.iloc[:,0]
movie_names = movie_lines.iloc[:,1]
char_names = np.unique(list(set(char_names.values)))
movie_names = np.unique(list(set(movie_names.values)))

In [6]:
movie_names

array(['m0', 'm1', 'm10', 'm100', 'm101', 'm102', 'm103', 'm104', 'm105',
       'm106', 'm109', 'm11', 'm110', 'm111', 'm112', 'm113', 'm114',
       'm115', 'm117', 'm118', 'm119', 'm12', 'm120', 'm121', 'm122',
       'm123', 'm124', 'm125', 'm126', 'm127', 'm128', 'm129', 'm13',
       'm130', 'm131', 'm132', 'm133', 'm134', 'm135', 'm136', 'm137',
       'm138', 'm139', 'm14', 'm140', 'm141', 'm142', 'm143', 'm144',
       'm145', 'm146', 'm147', 'm148', 'm149', 'm15', 'm150', 'm151',
       'm152', 'm153', 'm154', 'm155', 'm156', 'm157', 'm158', 'm159',
       'm16', 'm160', 'm161', 'm162', 'm163', 'm164', 'm165', 'm166',
       'm167', 'm168', 'm169', 'm17', 'm170', 'm171', 'm172', 'm173',
       'm174', 'm175', 'm176', 'm177', 'm178', 'm179', 'm18', 'm180',
       'm181', 'm182', 'm183', 'm184', 'm185', 'm186', 'm187', 'm188',
       'm189', 'm19', 'm190', 'm191', 'm192', 'm193', 'm194', 'm195',
       'm196', 'm197', 'm198', 'm199', 'm2', 'm20', 'm200', 'm201',
       'm202', 

In [7]:
char2ind = {i:j for i,j in zip(char_names,range(len(char_names)))}
ind2char = {j:i for i,j in zip(char_names,range(len(char_names)))}
movie2ind = {i:j for i,j in zip(movie_names,range(len(movie_names)))}
ind2movie = {j:i for i,j in zip(movie_names,range(len(movie_names)))}

In [8]:
len(char2ind)

8754

In [9]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
model.encode("Hello").shape

(768,)

In [11]:
class CustomMovieDialogDataset(Dataset):
    def __init__(self, base_dir, transform=None, target_transform=None):
        self.movie_characters_metadata = pd.read_csv(os.path.join(base_dir,"movie_characters_metadata.tsv"),
                                        sep='\t',
                                        names = ['charID','charName','movieID','movieName','gender','score'],
                                        index_col=['charID'])
        self.movie_lines = pd.read_csv(os.path.join(base_dir,"movie_lines.tsv"),
                            encoding='utf-8-sig', 
                            sep='\t', 
                            on_bad_lines='skip', 
                            header = None,
                            names = ['lineID', 'charID', 'movieID', 'charName', 'text'],
                            index_col=['lineID'],skip_blank_lines=True)
        self.movie_lines = self.movie_lines[self.movie_lines.notnull().all(1)]
        self.sentence_model = SentenceTransformer('bert-base-nli-mean-tokens')
        self.base_dir = base_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.movie_lines)

    def __getitem__(self, idx):
        data_point = self.movie_lines.iloc[idx, 3]
        label_point = char2ind[self.movie_lines.iloc[idx,0]]
        # print("data_point is:",data_point)
        # print("label_point is:",label_point)
        sentence_encoded = self.sentence_model.encode(data_point)
        # 768 
        # print(sentence_encoded,label_point)
        return sentence_encoded, label_point

In [12]:
movie_lines.iloc[244310].name

'L513119'

In [13]:
class BertSentenceClassifier(nn.Module):
    def __init__(self):
        super(BertSentenceClassifier, self).__init__()

        self.lin1 = nn.Linear(768,4*768)
        self.lin2 = nn.Linear(4*768, 8758)


    def forward(self, data):
        x = nn.functional.relu(self.lin1(data))
        x = self.lin2(x)
        x = nn.functional.softmax(x,dim = 1)

        
        return x

In [14]:
train_dataset = CustomMovieDialogDataset(base_dir)
train_loader = DataLoader(train_dataset,batch_size=32,shuffle=True,drop_last=True)

In [15]:
model = BertSentenceClassifier()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

In [16]:
device = "mps" if torch.backends.mps.is_available() else "cpu"


In [17]:
print(f"PyTorch version: {torch.__version__}"),device


PyTorch version: 2.0.0


(None, 'mps')

In [18]:

num_epochs = 3
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(device)
model = model.to(device)

for epoch in range(num_epochs):
    for text, author_labels in train_loader:  # Assuming data_loader is set up to provide batches of data
        data = text.to(device)
        author_labels = author_labels.to(device)
        optimizer.zero_grad()

        data = model(data)
        # print(data.shape)
        # print("data and author labels ::::: ",data,author_labels)
        loss = nn.CrossEntropyLoss()(data,author_labels)

        loss.backward()

        optimizer.step()

    print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")


mps


KeyboardInterrupt: 