In [None]:
import torch
from torch import nn
from transformers import BertModel, BertTokenizer

# Scratch experiment: push BERT embeddings of a few sentences through a small
# transformer encoder/decoder pair in both directions (source -> target and
# target -> source) and check that the reconstructed source has the right shape.

# Defined here so the cell runs on a fresh kernel; falls back to CPU without a GPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name, output_hidden_states = True)

bert_model.eval()

text = ["this is a test sentence", "this is the second", "a word"]
input_seq = tokenizer(text, padding=True, return_tensors='pt')
outputs = bert_model(**input_seq)

# hidden_states[0] is the first entry of the per-layer outputs requested via
# output_hidden_states=True; last_hidden_state is the final layer's output.
Xs = outputs.hidden_states[0].to(DEVICE)
Hst = outputs.last_hidden_state.to(DEVICE)  # NOTE: overwritten by encoder(Xs) below

# The encoder/decoder consume BERT activations directly, so their width must
# equal BERT's hidden size (a hard-coded 512 does not match bert-base), and the
# random "target" sequence must share the batch size of the source batch or the
# decoder's cross-attention cannot pair targets with memories.
d_model = bert_model.config.hidden_size
batch_size = Xs.shape[0]
target_len = 20

Xt = torch.rand(batch_size, target_len, d_model).to(DEVICE)

encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=8, batch_first=True)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=3).to(DEVICE)

decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=8, batch_first=True)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=3).to(DEVICE)

# Source -> target: encode the source, decode the target against that memory.
Hst = encoder(Xs)
Xt_hat = decoder(Xt, Hst)

# Target -> source: encode the predicted target, decode the source against it.
Hts = encoder(Xt_hat)
Xs_hat = decoder(Xs, Hts)

Xs_hat.shape

In [None]:
from mmsdk import mmdatasdk

# Folder handed to the SDK for every CMU-MOSI recipe used in this cell.
MOSI_DIR = 'cmumosi/'

# Start from the "highlevel" recipe, then add the "raw" recipe to the same dataset.
cmumosi_highlevel = mmdatasdk.mmdataset(mmdatasdk.cmu_mosi.highlevel, MOSI_DIR)
cmumosi_highlevel.add_computational_sequences(mmdatasdk.cmu_mosi.raw, MOSI_DIR)

# NOTE(review): align() needs an 'Opinion Segment Labels' sequence to be loaded;
# confirm one of the two recipes above provides it, otherwise the labels recipe
# has to be added before this call.
cmumosi_highlevel.align('Opinion Segment Labels')

In [281]:
# Show the names of the computational sequences currently held by the dataset.
cmumosi_highlevel.keys()

dict_keys(['glove_vectors', 'FACET_4.1', 'FACET_4.2', 'OpenSmile-emobase2010', 'OpenSMILE', 'OpenFace_1', 'OpenFace_2', 'COVAREP', 'Opinion Segment Labels', 'words', 'phonemes'])

In [109]:
from transformers import BertTokenizer
# Load the tokenizer that matches the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [463]:
def generate_labels_df(cmumosi_highlevel):
    """Flatten the 'Opinion Segment Labels' sequence into one row per sample.

    Args:
        cmumosi_highlevel: mapping whose 'Opinion Segment Labels' entry exposes
            a ``.data`` dict keyed by sample id. Each value is expected to hold
            the interval array first and the label array second
            (assumption inherited from the column renaming below -- TODO confirm).

    Returns:
        DataFrame with columns ``sample_id``, ``label``, ``t_start``, ``t_end``;
        both time bounds are rounded to 4 decimals.
    """
    raw_labels = cmumosi_highlevel['Opinion Segment Labels'].data
    frame = pd.DataFrame(raw_labels).T
    frame.columns = ['intervals', 'label']

    def _rounded_bound(position):
        # Pull one bound out of the first interval row, then round it in a
        # second pass (kept as two steps so values are rounded as floats of
        # the resulting column, not of the stored array dtype).
        bound = frame['intervals'].apply(lambda span: span[0][position])
        return bound.apply(lambda t: round(t, 4))

    frame['t_start'] = _rounded_bound(0)
    frame['t_end'] = _rounded_bound(1)
    frame = frame.drop(columns='intervals')

    # Labels are stored nested; keep only the scalar.
    frame['label'] = frame['label'].apply(lambda value: value[0][0])

    flat = frame.reset_index()
    return flat.rename(columns={'index': 'sample_id'})


# Build the per-sample label table from the loaded dataset and display it.
labels_df = generate_labels_df(cmumosi_highlevel)
labels_df


Unnamed: 0,sample_id,label,t_start,t_end
0,03bSnISJMiM[0],2.40,51.9045,55.9454
1,03bSnISJMiM[1],-0.80,56.0451,66.7807
2,03bSnISJMiM[2],-1.00,66.7807,68.7363
3,03bSnISJMiM[3],-1.75,68.7363,70.5422
4,03bSnISJMiM[4],0.00,70.5422,71.6995
...,...,...,...,...
2178,zhpQhgha_KU[30],-1.00,130.0070,136.9213
2179,zhpQhgha_KU[31],0.80,136.9213,143.0075
2180,zhpQhgha_KU[32],-0.40,143.0075,147.7367
2181,zhpQhgha_KU[33],1.20,157.5943,159.6696


In [467]:
def generate_language_modality(cmumosi_highlevel, labels_df):
    """Build a word-level table assigning each spoken word to a labelled sample.

    Every word of the 'words' sequence (except the 'sp' marker) is
    paired with the samples of the same segment and kept only when its time
    span lies entirely inside the sample's span.

    Args:
        cmumosi_highlevel: mapping whose 'words' entry exposes a ``.data`` dict
            keyed by segment id; each value provides 'features' (rows whose
            first element is a UTF-8 encoded word) and 'intervals' (rows of
            ``[start, end]``).
        labels_df: DataFrame with at least ``sample_id``, ``t_start`` and
            ``t_end`` columns; ``sample_id`` has the form ``<segment_id>[<n>]``.

    Returns:
        DataFrame with columns ``sample_id``, ``segment_id``, ``word``,
        ``t_start``, ``t_end`` and a fresh 0..n-1 index. Words that straddle a
        sample boundary are dropped.
    """
    word_columns = ['segment_id', 'word', 't_start', 't_end']
    words_data = cmumosi_highlevel['words'].data

    # Collect one frame per segment and concatenate once at the end; growing a
    # DataFrame inside the loop is quadratic and starts from an object-dtype frame.
    per_segment = []
    for segment_id in words_data.keys():
        sequence = words_data[segment_id]
        words = [word[0].decode('utf-8') for word in sequence['features']]
        # float() first so rounding happens on a Python float whatever the
        # stored dtype is, and the values compare exactly with rounded bounds.
        intervals = [[round(float(t), 4) for t in interval] for interval in sequence['intervals']]
        words_df = pd.DataFrame({
            'segment_id': segment_id,
            'word': words,
            't_start': [interval[0] for interval in intervals],
            't_end': [interval[1] for interval in intervals],
        })
        words_df = words_df[words_df['word'] != 'sp']
        if not words_df.empty:
            per_segment.append(words_df)

    if per_segment:
        language_df = pd.concat(per_segment, ignore_index=True)
    else:
        language_df = pd.DataFrame(columns=word_columns)

    samples_intervals = (
        labels_df[['sample_id', 't_start', 't_end']]
        .rename(columns={'t_start': 'seg_t_start', 't_end': 'seg_t_end'})
        .assign(segment_id=lambda df: df['sample_id'].str.split('[').str[0])
    )

    # Merge against the table built above (not a notebook-level global) so the
    # function depends only on its arguments.
    merged = pd.merge(samples_intervals, language_df, on='segment_id')
    inside_sample = (merged['t_start'] >= merged['seg_t_start']) & (merged['t_end'] <= merged['seg_t_end'])
    merged = merged[inside_sample].drop(columns=['seg_t_start', 'seg_t_end'])

    return merged.reset_index(drop=True)

# Build the word-level table for the labelled samples and display it.
language_modality = generate_language_modality(cmumosi_highlevel, labels_df)
language_modality

Unnamed: 0,sample_id,segment_id,word,t_start,t_end
0,03bSnISJMiM[0],03bSnISJMiM,anyhow,52.7327,53.5508
1,03bSnISJMiM[0],03bSnISJMiM,it,54.5585,54.8079
2,03bSnISJMiM[0],03bSnISJMiM,was,54.8079,55.0175
3,03bSnISJMiM[0],03bSnISJMiM,really,55.0673,55.5363
4,03bSnISJMiM[0],03bSnISJMiM,good,55.5363,55.9454
...,...,...,...,...,...
27535,zhpQhgha_KU[34],zhpQhgha_KU,review,164.5485,165.0075
27536,zhpQhgha_KU[34],zhpQhgha_KU,it,165.0075,165.1073
27537,zhpQhgha_KU[34],zhpQhgha_KU,is,165.1073,165.3168
27538,zhpQhgha_KU[34],zhpQhgha_KU,very,165.3168,165.8356


In [472]:
# Tokenize every word on its own: the tokenizer is called once per row, so
# padding=True only ever sees a single string and each result is a separate
# batch of one (see the [1, n] shaped tensors in the output below).
df2 = language_modality.word.apply(lambda x: tokenizer(x, padding=True, return_tensors='pt'))

In [473]:
# Inspect the tokenizer output produced for the first word.
df2.iloc[0]

{'input_ids': tensor([[  101,  2151, 14406,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [None]:
from torch.utils.data import Dataset, DataLoader

class CMUMOSI(Dataset):
    """Map-style dataset backed by a CSV file with one row per segment.

    The CSV is read with ``segment_id`` as the index, so the segment id of a
    row comes from the index rather than from a data column.

    Args:
        csv_file: path or file-like object accepted by ``pandas.read_csv``;
            must contain a ``segment_id`` column.
        root_dir: stored on the instance as ``root_dir``; not used by the
            methods defined here.
        transform: optional callable applied to each sample dict before it is
            returned.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        self.df = pd.read_csv(csv_file, index_col='segment_id')
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Returns:
            ``{'segment_id', 'features', 'intervals'}`` dict, or whatever
            ``transform`` returns for it when a transform was given.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # segment_id is the index (see __init__), so it cannot be read as
        # column 0. The remaining fields are looked up by name so an extra
        # leading column in the CSV does not shift them.
        # NOTE(review): assumes the CSV columns are literally named 'features'
        # and 'intervals', mirroring the sample keys -- confirm against the file.
        row = self.df.iloc[idx]
        sample = {
            'segment_id': self.df.index[idx],
            'features': row['features'],
            'intervals': row['intervals'],
        }

        if self.transform:
            sample = self.transform(sample)

        return sample

In [None]:
# Re-creates the tokenizer from the same 'bert-base-uncased' checkpoint that
# earlier cells of this notebook already load.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')