In [6]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader,random_split
import contractions
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/omkarmasur/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/omkarmasur/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/omkarmasur/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [71]:
def get_device():
    if torch.cuda.is_available():
        dev = 'cuda:0'
    elif torch.backends.mps.is_available():
        dev = 'mps:0'
    else:
        dev = 'cpu'
    device = torch.device(dev)
    return device

# Dataset

## Train Data

In [8]:
df = pd.read_csv(r'./data/train.csv')

In [9]:
df_data = df.copy()
## pre-processing
# Before data cleaning length
before_data_cleaning = df_data['Text'].str.len().mean()

# Remove blank rows
df_data['Text'].dropna(inplace=True)
# Remove numbers
df_data['Text'] = df_data['Text'].apply(lambda x: re.sub(r'\d+', '', str(x)))
# Converting to lower case
# df_data['Text'] = df_data['Text'].apply(str.lower)
# Remove punctuation
df_data['Text'] = df_data['Text'].apply(lambda x: re.sub(r'[^\w\s\']', ' ', x))
# Remove white spaces
df_data['Text'] = df_data['Text'].apply(str.strip)
# Removing contractions
# def expand(text):
#   return contractions.fix(text)
# df_data['Text'] = df_data['Text'].apply(expand)

# After data cleaning length
after_data_cleaning = df_data['Text'].str.len().mean()

print("Before cleaning : " + str(before_data_cleaning) + ", After cleaning : " + str(after_data_cleaning))

## stop word removal
# stop = stopwords.words('english')
# df_data['Text'] = df_data['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

## tokenization/lemmatization
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
df_data['Text'] = df_data['Text'].apply(lemmatize_text)
df_data['Text'] = df_data['Text'].apply(lambda x: ' '.join([word for word in x]))

# After data pre-processing length
df_data['Text'].str.len().mean()

df_train = df_data.copy()

df_train

Before cleaning : 57.31430957683742, After cleaning : 56.257516703786195


Unnamed: 0.1,Unnamed: 0,ID,Timestamp,Text,Emotion,Valence,Activation,Dominance
0,52,Ses04M_script03_2_F052,[300.8200 - 302.3500],Swine,1,1.5000,4.5000,3.5000
1,18,Ses05M_script02_2_F018,[233.4900 - 234.9800],No,2,2.3333,2.3333,1.6667
2,10,Ses04F_impro03_F010,[56.4800 - 65.9800],Everybody's going to come up We're going to st...,3,4.0000,4.0000,4.0000
3,21,Ses04M_script02_2_F021,[210.0500 - 212.3855],No,2,2.0000,2.0000,3.0000
4,41,Ses05M_script01_1b_M002,[16.8300 - 24.8300],About four this morning I heard it crack and I...,2,2.5000,2.5000,3.5000
...,...,...,...,...,...,...,...,...
3587,24,Ses04F_impro06_M013,[208.2700 - 223.4500],That's right That's right I mean he would want...,2,4.0000,3.0000,3.0000
3588,67,Ses03F_script01_3_M031,[263.5792 - 266.6760],Just about all,2,2.0000,2.5000,3.0000
3589,11,Ses04F_impro06_M000,[11.4830 - 23.4953],Hey I'm uh I'm really sorry about what happene...,2,2.5000,2.0000,2.5000
3590,55,Ses03M_impro03_M012,[79.5700 - 82.0655],Sara and I,3,4.0000,3.5000,3.5000


## Test Data

In [10]:
df = pd.read_csv(r'./data/test.csv')

In [11]:
df_data = df.copy()
## pre-processing
# Before data cleaning length
before_data_cleaning = df_data['Text'].str.len().mean()

# Remove blank rows
df_data['Text'].dropna(inplace=True)
# Remove numbers
df_data['Text'] = df_data['Text'].apply(lambda x: re.sub(r'\d+', '', str(x)))
# Converting to lower case
# df_data['Text'] = df_data['Text'].apply(str.lower)
# Remove punctuation
df_data['Text'] = df_data['Text'].apply(lambda x: re.sub(r'[^\w\s\']', ' ', x))
# Remove white spaces
df_data['Text'] = df_data['Text'].apply(str.strip)
# Removing contractions
# def expand(text):
#   return contractions.fix(text)
# df_data['Text'] = df_data['Text'].apply(expand)

# After data cleaning length
after_data_cleaning = df_data['Text'].str.len().mean()

print("Before cleaning : " + str(before_data_cleaning) + ", After cleaning : " + str(after_data_cleaning))

## stop word removal
# stop = stopwords.words('english')
# df_data['Text'] = df_data['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

## tokenization/lemmatization
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
df_data['Text'] = df_data['Text'].apply(lemmatize_text)
df_data['Text'] = df_data['Text'].apply(lambda x: ' '.join([word for word in x]))

# After data pre-processing length
df_data['Text'].str.len().mean()

df_test = df_data.copy()

df_test

Before cleaning : 53.9086859688196, After cleaning : 52.88307349665924


Unnamed: 0.1,Unnamed: 0,ID,Timestamp,Text,Emotion,Valence,Activation,Dominance
0,18,Ses02F_impro04_M000,[5.1392 - 9.5052],So uh so how's it been going I mean you have a...,0,3.0,3.5000,2.5000
1,55,Ses02F_script03_1_M024,[142.8200 - 153.4900],I don't care what you do See you could paint y...,0,4.0,2.6667,3.3333
2,15,Ses04F_script01_3_F015,[138.3300 - 145.8400],I'll never forgive you Why did you wait all th...,3,4.5,4.0000,4.5000
3,30,Ses04M_script03_2_F030,[175.8400 - 178.6900],Mind your own business,1,2.0,3.0000,3.5000
4,57,Ses02F_impro07_M019,[110.3312 - 112.5500],I mean because you went to the campus before r...,3,4.0,3.0000,3.0000
...,...,...,...,...,...,...,...,...
893,36,Ses05F_script03_2_F036,[219.3900 - 221.6700],I I hate you,1,1.5,4.0000,4.0000
894,22,Ses02M_script03_1_F022,[164.1655 - 169.3167],Oh Charles that wa his name Charles It did wig...,3,4.0,3.0000,2.5000
895,41,Ses01F_script02_2_F041,[413.7500 - 419.3655],So maybe we are in the wrong spot but we are w...,0,3.0,1.5000,2.5000
896,41,Ses03F_impro04_M014,[122.2157 - 125.1925],Have you been going to audition and stuff,0,2.5,2.5000,3.0000


In [56]:
class MultiModalDataset(Dataset):
  def __init__(self, dataset:pd.DataFrame, tokenizer,input_sound):
    tokenized_dataset = tokenizer(dataset['Text'].tolist(), padding='max_length', truncation=True, return_tensors='pt',max_length=99)
    self.input_ids = tokenized_dataset['input_ids']
    self.attention_mask = tokenized_dataset['attention_mask']
    self.labels = dataset['Emotion'].to_list()
    self.input_sound = input_sound

    

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    sound = torch.from_numpy(self.input_sound[idx])
    return {
        'ids': self.input_ids[idx],
        'mask': self.attention_mask[idx],
        'target': torch.tensor(self.labels[idx], dtype=torch.long),
        "sound":sound.to(torch.float)
    }

In [49]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [14]:
final_data_dictionary = {}

In [15]:
import pickle

In [16]:
with open('final_data_dictionary.pickle', 'rb') as f:
    final_data_dictionary = pickle.load(f)

In [17]:
final_data_dictionary

{'Ses01F_impro01_F000': {'vector': array([[[-3.68428955e+01, -3.18272133e+01, -2.86984444e+01, ...,
           -8.00000000e+01, -8.00000000e+01, -8.00000000e+01],
          [-3.16944847e+01, -2.93041382e+01, -2.64917603e+01, ...,
           -8.00000000e+01, -8.00000000e+01, -8.00000000e+01],
          [-3.42897491e+01, -2.84053497e+01, -2.93596401e+01, ...,
           -8.00000000e+01, -8.00000000e+01, -8.00000000e+01],
          ...,
          [-8.69678743e-02, -9.70625840e-02, -8.33992213e-02, ...,
            6.44022655e-02,  5.10166790e-02, -5.30884415e-02],
          [ 3.04456555e-02,  2.40520948e-02,  2.70215856e-03, ...,
            1.01589728e-01,  1.19904662e-01,  6.99902211e-02],
          [ 1.56739494e-02,  1.49491690e-02, -7.71329552e-03, ...,
            8.92513199e-03, -1.26060829e-02, -1.81609271e-02]]]),
  'label': 'neu'},
 'Ses01F_impro01_F001': {'vector': array([[[-3.78399429e+01, -3.46647072e+01, -3.89421425e+01, ...,
           -8.00000000e+01, -8.00000000e+01, -8.00

In [18]:
train_ids = df_train["ID"].to_list()
train_sounds = []

for id in train_ids:
    train_sounds.append(final_data_dictionary[id]["vector"])

In [19]:
test_ids = df_test["ID"].to_list()
test_sounds = []

for id in test_ids:
    test_sounds.append(final_data_dictionary[id]["vector"])

In [57]:
train_dataset = MultiModalDataset(df_train, tokenizer,train_sounds)

In [58]:
test_dataset = MultiModalDataset(df_test, tokenizer,test_sounds)

In [60]:
train_dataset[0]["ids"].shape

torch.Size([99])

In [61]:
class SoundClassifier(nn.Module):
    def __init__(self, num_classes):
        super(SoundClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU(inplace=True)
        self.pool1 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)
        self.pool2 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.fc1 = nn.Linear(64*70*31, 512)
        self.relu3 = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.pool2(x)
        return x

In [64]:
class MultiModalModel(nn.Module):
    def __init__(self,text_model:str,audio_model:str,number_of_classes=4) -> None:
        super().__init__()
        #Load Text model
        self.text_classifier =RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)

        state_dict = torch.load(text_model,map_location="cpu")
        self.text_classifier.load_state_dict(state_dict)
        self.text_classifier.classifier = nn.Identity()

        #Load audio model
        self.sound_classifier = SoundClassifier(num_classes=4)

        state_dict = torch.load(audio_model,map_location="cpu")
        self.sound_classifier.load_state_dict(state_dict)

        self.fc1 = nn.Linear(214912,32)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.4)
        self.output = nn.Linear(32,number_of_classes)
    

    def forward(self,data):
        ids = data['ids']
        mask = data['mask']

        inputs = {
            'input_ids':ids,
                'attention_mask': mask,
        }
        text_output = self.text_classifier(input_ids=ids,attention_mask=mask)

        text_output = text_output[0]

        sound_output = self.sound_classifier(data["sound"])



        text_output = text_output.view(text_output.shape[0],-1)
        sound_output = sound_output.view(sound_output.shape[0],-1)

        total_outut = torch.cat([text_output,sound_output],dim=1)

        x = self.relu(self.fc1(total_outut))

        x = self.output(x)

        
        return x
        
            
            



In [65]:
model = MultiModalModel("./models/text/roberta_0.701559.pt","./models/sound/audio_only_model_pvxok_64.25389755011136.pt",4)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [66]:
model

MultiModalModel(
  (text_classifier): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
             

In [76]:
def calculate_accuracy(model,loader):
    device = get_device()

    model_device = model.to(device)


    with torch.no_grad():
        n_samples = 0
        n_correct  = 0
        for x in loader:

            x_device =  {
                'ids': x["ids"].to(device),
                'mask': x["mask"].to(device),
                "sound": x["sound"].to(device)
            }
            targets  = x["target"].to(device)
          

            ypred = model_device(x_device)
            _, ypred_labels = torch.max(ypred,1)
            n_correct += (ypred_labels == targets).sum().item()
            n_samples+= x["ids"].shape[0]


        accuracy = (100*n_correct)/n_samples
        return accuracy


def train_loop(model,train_dataloader,test_dataloader,epochs,loss_fn,optimizer,scheduler):
    device = get_device()
    model_device = model.to(device)
    max_accuracy = float("-inf")

    for epoch in range(epochs):
        t = tqdm.tqdm(train_dataloader)
        
        total_loss = 0
        index = 0
        for x in t:
            x_device =  {
                'ids': x["ids"].to(device),
                'mask': x["mask"].to(device),
                "sound": x["sound"].to(device)
            }
            targets  = x["target"].to(device)

            predictions= model_device(x_device)

            loss = loss_fn(predictions,targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            total_loss+=loss.item()
            
            t.set_description(f"Epoch: {epoch+1}/{epochs} Loss: {total_loss}")

            index+=1

            if index == len(train_dataloader)-1:
                accuracy = calculate_accuracy(model,test_dataloader)
                if accuracy > max_accuracy:
                    torch.save(model.state_dict(), "4_classes_model_corrected_1.pt")
                    max_accuracy = accuracy
                t.set_postfix({"accuracy":accuracy})
        if scheduler:
            scheduler.step()


            


In [68]:
train_dataloader = DataLoader(train_dataset,batch_size=32,shuffle=True)
dev_dataloader = DataLoader(test_dataset,batch_size=32,shuffle=True)

In [69]:
test_dataset[560]["ids"].shape

torch.Size([99])

In [70]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),1e-5)

In [77]:
train_loop(model,train_dataloader,dev_dataloader,20,loss_fn,optimizer,None)

Epoch: 1/20 Loss: 13.671029520221055: 100%|██████████| 113/113 [01:03<00:00,  1.79it/s, accuracy=67.1]
Epoch: 2/20 Loss: 31.670539079234004: 100%|██████████| 113/113 [01:02<00:00,  1.81it/s, accuracy=68.3]
Epoch: 3/20 Loss: 10.724540116265416: 100%|██████████| 113/113 [01:01<00:00,  1.84it/s, accuracy=69.6]
Epoch: 4/20 Loss: 7.13142604380846: 100%|██████████| 113/113 [01:02<00:00,  1.82it/s, accuracy=70.8] 
Epoch: 5/20 Loss: 5.841285600094125: 100%|██████████| 113/113 [01:00<00:00,  1.86it/s, accuracy=69.7]
Epoch: 6/20 Loss: 4.374163335538469: 100%|██████████| 113/113 [01:01<00:00,  1.84it/s, accuracy=69.7]
Epoch: 7/20 Loss: 4.434040943626314: 100%|██████████| 113/113 [01:00<00:00,  1.86it/s, accuracy=69.6]
Epoch: 8/20 Loss: 4.282685594022041: 100%|██████████| 113/113 [01:01<00:00,  1.85it/s, accuracy=70.6]
Epoch: 9/20 Loss: 3.405978550261352: 100%|██████████| 113/113 [01:01<00:00,  1.84it/s, accuracy=70.6]
Epoch: 10/20 Loss: 3.1553293872639188: 100%|██████████| 113/113 [01:01<00:00,  

In [79]:
calculate_accuracy(model,dev_dataloader)

72.271714922049

In [78]:
state_dict = torch.load("./4_classes_model_corrected.pt")
model.load_state_dict(state_dict)

<All keys matched successfully>

In [29]:
for x in test_dataset:
    if x["sound"].shape[1]!=281 or x["sound"].shape[2]!=126:
        print("Yes")

In [15]:
text_classifier =RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)

state_dict = torch.load("./roberta_full_20ep.pt",map_location="cpu")
text_classifier.load_state_dict(state_dict)
text_classifier.classifier = nn.Identity()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [18]:
sound_classifier = SoundClassifier(num_classes=4)

state_dict = torch.load("./4_classes_model_corrected.pt",map_location="cpu")
sound_classifier.load_state_dict(state_dict)

<All keys matched successfully>

In [267]:
torch.save(state_dict,"4_classes_model_corrected.pt")

In [27]:
dataloader = DataLoader(labeled_dataset,32,True)

In [160]:
for i, data in enumerate(train):

    ids = data['ids']
    mask = data['mask']
    targets = data['target']


    inputs = {'input_ids':      ids,
              'attention_mask': mask,
              }
    text_output = text_classifier(**inputs)
    text_output = text_output[0]

    sound_output = sound_classifier(data["sound"])

    text_output = text_output.view(text_output.shape[0],-1)
    sound_output = sound_output.view(sound_output.shape[0],-1)

    total_outut = torch.cat([text_output,sound_output],dim=1)

    print(total_outut.shape)
    
    if i==5:
        break

torch.Size([32, 214912])
torch.Size([32, 214912])
torch.Size([32, 214912])
torch.Size([32, 214912])
torch.Size([32, 214912])
torch.Size([32, 214912])


In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch import nn

# define the new model architecture
class RobertaForFeatureExtraction(nn.Module):
    def __init__(self):
        super().__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
        self.roberta.classifier = nn.Identity()  # remove the classifier layer

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        
        return outputs

# initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# initialize the new model
model = RobertaForFeatureExtraction()

# test the model
text = 'This is a test sentence.'
inputs = tokenizer.encode_plus(text, return_tensors='pt', padding=True, truncation=True)
outputs = model(**inputs)
outputs[0].shape


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

torch.Size([1, 8, 768])

In [55]:
print(type(outputs[0]))

<class 'torch.Tensor'>


In [265]:
torch.save(model.state_dict(), "multimodel.pt")