In [6]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import os

# MovieDialogCorpus


In [2]:
# Relative path of the folder that holds the raw MovieDialogCorpus TSV files.
base_dir = '../data/raw/MovieDialogCorpus'

In [3]:
# Character metadata table, indexed by charID. The file carries no header row,
# so the column names are supplied explicitly.
movie_characters_metadata = pd.read_csv(
    os.path.join(base_dir, "movie_characters_metadata.tsv"),
    sep='\t',
    names=['charID', 'charName', 'movieID', 'movieName', 'gender', 'score'],
    index_col=['charID'],
)

# Dialogue lines table, indexed by lineID. Rows that pandas cannot parse are
# skipped instead of aborting the load.
movie_lines = pd.read_csv(
    os.path.join(base_dir, "movie_lines.tsv"),
    sep='\t',
    header=None,
    names=['lineID', 'charID', 'movieID', 'charName', 'text'],
    index_col=['lineID'],
    encoding='utf-8-sig',
    on_bad_lines='skip',
)

In [43]:
# Distinct values of the first data column of `movie_lines` (charID, given the
# column names passed to read_csv). The column is selected straight from the
# frame so this cell no longer depends on `char_names`, which is only assigned
# in a later cell and is rebound there to an ndarray (which has no `.values`).
a = np.array(list(set(movie_lines.iloc[:, 0].values)))

In [52]:
# Shape of the de-duplicated array, i.e. how many distinct values `a` holds.
np.shape(np.unique(a))

(8758,)

In [57]:
# Sorted arrays of the distinct values in the first two data columns of
# `movie_lines`. With `lineID` as the index these are `charID` and `movieID`,
# so despite the variable names the arrays hold IDs.
char_names, movie_names = (
    np.unique(list(set(movie_lines[column].values)))
    for column in ("charID", "movieID")
)

In [62]:
# Forward (value -> integer position) and reverse (position -> value) lookup
# tables for the character and movie arrays built above.
char2ind = {name: position for position, name in enumerate(char_names)}
ind2char = dict(enumerate(char_names))
movie2ind = {name: position for position, name in enumerate(movie_names)}
ind2movie = dict(enumerate(movie_names))

In [5]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used for the exploratory checks below; the first
# call fetches the pretrained weights (see the download log in the output).
model = SentenceTransformer(model_name_or_path='bert-base-nli-mean-tokens')

  from .autonotebook import tqdm as notebook_tqdm
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:54<00:00, 8.06MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 18.1kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 25.0kB/s]
Downloading (…)821d1/tokenizer.json: 466kB [00:00, 2.19MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 399/399 [00:00<00:00, 143kB/s]
Downloading (…)8d01e821d1/vocab.txt: 232kB [00:00, 994kB/s]
Downloading (…)1e821d1/modules.json: 100%|██████████| 229/229 [00:00<00:00, 43.6kB/s]


In [50]:
# Sanity check: shape of the embedding produced for a single sentence.
np.shape(model.encode("Hello"))

(768,)

In [65]:
class CustomMovieDialogDataset(Dataset):
    """Map-style dataset that pairs each dialogue line with its movie label.

    Indexing returns ``(embedding, label)``: ``embedding`` is whatever
    ``self.sentence_model.encode`` returns for the line's text, and ``label``
    is the integer index assigned to the line's ``movieID``.

    Args:
        annotations_file: Not used; kept so existing call sites keep working.
        base_dir: Directory containing ``movie_characters_metadata.tsv`` and
            ``movie_lines.tsv``.
        transform: Optional callable applied to the embedding before it is
            returned.
        target_transform: Optional callable applied to the label before it is
            returned.
    """

    def __init__(self, annotations_file, base_dir, transform=None, target_transform=None):
        self.movie_characters_metadata = pd.read_csv(
            os.path.join(base_dir, "movie_characters_metadata.tsv"),
            sep='\t',
            names=['charID', 'charName', 'movieID', 'movieName', 'gender', 'score'],
            index_col=['charID'])
        self.movie_lines = pd.read_csv(
            os.path.join(base_dir, "movie_lines.tsv"),
            encoding='utf-8-sig',
            sep='\t',
            on_bad_lines='skip',
            header=None,
            names=['lineID', 'charID', 'movieID', 'charName', 'text'],
            index_col=['lineID'])
        # The label vocabulary is built from the data this instance loaded, so
        # the dataset does not rely on a notebook-level `movie2ind` existing.
        self.movie2ind = self._index_labels(self.movie_lines['movieID'])
        self.sentence_model = SentenceTransformer('bert-base-nli-mean-tokens')
        self.base_dir = base_dir
        self.transform = transform
        self.target_transform = target_transform

    @staticmethod
    def _index_labels(values):
        """Map the sorted distinct string forms of ``values`` to 0..n-1."""
        distinct = sorted(set(pd.Series(values).astype(str)))
        return {label: position for position, label in enumerate(distinct)}

    def __len__(self):
        """Return the number of dialogue lines that were loaded."""
        return len(self.movie_lines)

    def __getitem__(self, idx):
        """Return ``(embedding, label)`` for the line at integer position ``idx``.

        Columns are selected by name: with ``lineID`` as the index, positional
        offsets are shifted by one and position 2 is ``charName``, not
        ``movieID``.
        """
        text = self.movie_lines['text'].iloc[idx]
        movie_id = self.movie_lines['movieID'].iloc[idx]

        # A missing text cell is read as NaN (a float); encode an empty string
        # instead of handing a non-string to the sentence encoder.
        if pd.isna(text):
            text = ""

        # With 'bert-base-nli-mean-tokens' this is a 768-dimensional vector.
        sentence_encoded = self.sentence_model.encode(str(text))
        # Keys were built from the string form of movieID, so look up the same.
        label_point = self.movie2ind[str(movie_id)]

        if self.transform is not None:
            sentence_encoded = self.transform(sentence_encoded)
        if self.target_transform is not None:
            label_point = self.target_transform(label_point)
        return sentence_encoded, label_point