In [6]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import os

# MovieDialogCorpus


In [2]:
base_dir = "../data/raw/MovieDialogCorpus"

In [3]:
movie_characters_metadata = pd.read_csv(os.path.join(base_dir,"movie_characters_metadata.tsv"),
                                        sep='\t',
                                        names = ['charID','charName','movieID','movieName','gender','score'],
                                        index_col=['charID'])
movie_lines = pd.read_csv(os.path.join(base_dir,"movie_lines.tsv"),
                            encoding='utf-8-sig', 
                            sep='\t', 
                            on_bad_lines='skip', 
                            header = None,
                            names = ['lineID', 'charID', 'movieID', 'charName', 'text'],
                            index_col=['lineID'])

In [43]:
a = np.array(list(set(char_names.values)))

In [44]:
a

array(['nan', 'u7261', 'u2097', ..., 'u1955', 'u7809', 'u5513'],
      dtype='<U32')

In [27]:
char_names = movie_lines.iloc[:,0]
movie_names = movie_lines.iloc[:,1]
char_names = np.unique(char_names.values)
movie_names = np.unique(movie_names.values)

TypeError: '<' not supported between instances of 'float' and 'str'

In [26]:
char_names.values

array(['u0', 'u2', 'u0', ..., 'u9030', 'u9030', 'u9034'], dtype=object)

In [19]:
char_names = movie_lines.iloc[:,3]
movie_lines.iloc[0,:]

charID                u0
movieID               m0
charName          BIANCA
text        They do not!
Name: L1045, dtype: object

In [4]:
movie_characters_metadata

Unnamed: 0_level_0,charName,movieID,movieName,gender,score
charID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
u0,BIANCA,m0,10 things i hate about you,f,4
u1,BRUCE,m0,10 things i hate about you,?,?
u2,CAMERON,m0,10 things i hate about you,m,3
u3,CHASTITY,m0,10 things i hate about you,?,?
u4,JOEY,m0,10 things i hate about you,m,6
...,...,...,...,...,...
u9030,DURNFORD,m616,zulu dawn,?,?
u9031,MELVILL,m616,zulu dawn,?,?
u9032,NORRIS-NEWMAN,m616,zulu dawn,?,?
u9033,STUART SMITH,m616,zulu dawn,?,?


In [5]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

  from .autonotebook import tqdm as notebook_tqdm
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:54<00:00, 8.06MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 18.1kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 25.0kB/s]
Downloading (…)821d1/tokenizer.json: 466kB [00:00, 2.19MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 399/399 [00:00<00:00, 143kB/s]
Downloading (…)8d01e821d1/vocab.txt: 232kB [00:00, 994kB/s]
Downloading (…)1e821d1/modules.json: 100%|██████████| 229/229 [00:00<00:00, 43.6kB/s]


In [13]:
model.encode("")

array([ 6.88885152e-03, -5.33143044e-01,  2.44534397e+00,  2.51658678e-01,
        1.31864011e-01,  7.68279910e-01, -1.60852641e-01,  8.45775723e-01,
        3.01931292e-01, -1.93646364e-02, -5.08157253e-01,  1.79568857e-01,
       -2.50009708e-02,  6.39337182e-01,  1.21816564e+00, -3.75457220e-02,
       -5.85425377e-01, -1.12067133e-01,  4.66072381e-01, -7.71655321e-01,
        1.17680691e-01,  7.24948406e-01, -5.69397993e-02, -9.47605729e-01,
       -2.03660935e-01, -9.77066159e-01,  3.01732898e-01, -1.98827338e+00,
        4.14110348e-02,  3.80420685e-02,  2.71777749e-01, -2.73361355e-01,
        9.59229708e-01,  8.53208378e-02, -3.73194143e-02,  3.04376245e-01,
       -4.72824931e-01,  9.08551961e-02,  8.92774835e-02, -2.70623803e-01,
        1.25249553e+00,  3.21440309e-01,  9.68861699e-01,  3.76673043e-01,
       -1.06764726e-01,  2.64202595e-01, -2.12547511e-01,  3.04400444e-01,
       -2.32380956e-01, -1.16813540e+00, -1.03447926e+00, -9.35310006e-01,
        7.64125049e-01,  

In [None]:
class CustomMovieDialogDataset(Dataset):
    def __init__(self, annotations_file, base_dir, transform=None, target_transform=None):
        self.movie_characters_metadata = pd.read_csv(os.path.join(base_dir,"movie_characters_metadata.tsv"),
                                        sep='\t',
                                        names = ['charID','charName','movieID','movieName','gender','score'],
                                        index_col=['charID'])
        self.movie_lines = pd.read_csv(os.path.join(base_dir,"movie_lines.tsv"),
                            encoding='utf-8-sig', 
                            sep='\t', 
                            on_bad_lines='skip', 
                            header = None,
                            names = ['lineID', 'charID', 'movieID', 'charName', 'text'],
                            index_col=['lineID'])
        self.sentence_model = SentenceTransformer('bert-base-nli-mean-tokens')
        self.base_dir = base_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.movie_lines)

    def __getitem__(self, idx):
        data_point = self.movie_lines.iloc[idx, 3]
        label_point = self.movie_lines.iloc[idx,2]
        
        sentence_encoded = self.sentence_model.encode(data_point)
        
        return sentence_encoded, label_point