In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F #relu,
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from utils.vevo_dataset import create_vevo_datasets
from utils.device import get_device,use_cuda
from utils.constants import *





In [2]:
# Uncomment the next line to use CUDA.
# use_cuda(True)

# create_vevo_datasets is the helper shipped with the project; it builds the
# train / validation / test splits, which are wrapped in PyTorch DataLoaders
# further down in this notebook.
dataset_options = dict(
    dataset_root="./dataset",
    max_seq_chord=300,
    max_seq_video=300,
    vis_models="2d/clip_l14p",
    emo_model="6c_l14p",
    split_ver="v1",
    random_seq=True,
)
train_dataset, val_dataset, test_dataset = create_vevo_datasets(**dataset_options)

In [58]:
# The dataset yields dictionaries (semantic features, scene offset, motion,
# emotion, note density, loudness). batchToInput merges the video features
# into one tensor that is fed to the recurrent models (LSTM / BiLSTM / GRU /
# BiGRU) and stacks the two regression targets into another.
def batchToInput(batch):
    """Convert a collated batch dict into ``(x, y)`` tensors on the active device.

    Args:
        batch: mapping with the keys ``"semanticList"`` (iterable of 3-D
            tensors), ``"scene_offset"``, ``"motion"``, ``"note_density"``,
            ``"loudness"`` (2-D tensors, one scalar per frame) and
            ``"emotion"`` (3-D tensor).

    Returns:
        x: float tensor ``(batch, seq_len, D)`` where ``D`` is the sum of the
            last dimensions of every semantic feature, plus one for scene
            offset, one for motion, plus the emotion width. Features are laid
            out in exactly that order.
        y: float tensor ``(batch, seq_len, 2)`` holding note density at
            index 0 and loudness at index 1.
    """
    # Resolve the device once instead of on every transfer.
    device = get_device()

    pieces = [feature.to(device).float() for feature in batch["semanticList"]]
    # Scene offset and motion carry one value per frame, so they need a
    # trailing feature axis before they can be concatenated.
    pieces.append(batch["scene_offset"].to(device).float().unsqueeze(-1))
    pieces.append(batch["motion"].to(device).float().unsqueeze(-1))
    pieces.append(batch["emotion"].to(device).float())

    # A single concatenation avoids re-copying the growing tensor per feature.
    x = torch.cat(pieces, dim=2)

    note_density = batch["note_density"].to(device).float()
    loudness = batch["loudness"].to(device).float()
    y = torch.stack((note_density, loudness), dim=2)
    return x, y
    

In [71]:
# All video features are packed into a single per-frame vector by
# batchToInput; total_vf_dim is the width of that vector.

# Inspect one sample (fetched once) to read the feature widths.
first_sample = train_dataset[0]

# "semanticList" may hold several feature tensors; add up their widths.
total_vf_dim = sum(vf.shape[1] for vf in first_sample["semanticList"])

total_vf_dim += 1  # scene offset
total_vf_dim += 1  # motion

# Emotion comes in two flavours: "5c" (5 values) and "6c" (6 values). Read the
# width from the data so input_size stays correct when emo_model changes.
total_vf_dim += first_sample["emotion"].shape[-1]

# Hyperparameters. These values were picked by hand; hidden_size, num_layers,
# batch_size, epochs and dropout are the ones worth tuning.
input_size = total_vf_dim
num_layers = 2
hidden_size = 128
output_size = 2
seq_len = 300
batch_size = 32
epochs = 15
dropout = 0.1
model_name = "BiGRU"


In [80]:
# Model definitions.
# Every model takes an input of shape [batch, seq_len, input_size] and returns
# [batch, seq_len, output_size]; with output_size=2 the two outputs are the
# note density and loudness predicted for every frame.
class LSTM(nn.Module):
    """Unidirectional multi-layer LSTM with a per-frame linear output head.

    Args:
        input_size: width of each input frame vector.
        hidden_size: LSTM hidden state width.
        output_size: number of values predicted per frame.
        num_layers: number of stacked LSTM layers.
        seq_len: stored on the instance only; not used by the computation.
        dropout: probability of the dropout applied to the *input* features
            (not between recurrent layers).
    """

    def __init__(self, input_size, hidden_size=128, output_size=2, num_layers=2, seq_len=300, dropout=0.1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.output_size = output_size
        self.dropout = dropout

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to (batch, seq, output_size)."""
        # Input dropout is only active while the module is in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)

        # No explicit (h0, c0): nn.LSTM starts from zero states that match the
        # input's device and dtype, so the model does not depend on a global
        # device setting.
        out, _ = self.lstm(x)
        return self.fc(out)
    
class BiLSTM(nn.Module):
    """Bidirectional multi-layer LSTM with a per-frame linear output head.

    Args:
        input_size: width of each input frame vector.
        hidden_size: hidden state width of each direction.
        output_size: number of values predicted per frame.
        num_layers: number of stacked LSTM layers.
        seq_len: stored on the instance only; not used by the computation.
        dropout: probability of the dropout applied to the *input* features
            (not between recurrent layers).
    """

    def __init__(self, input_size, hidden_size=128, output_size=2, num_layers=2, seq_len=300, dropout=0.1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.output_size = output_size
        self.dropout = dropout

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        # Forward and backward hidden states are concatenated, hence 2 * hidden_size.
        self.fc = nn.Linear(2 * hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to (batch, seq, output_size)."""
        # Input dropout is only active while the module is in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)

        # No explicit (h0, c0): nn.LSTM starts from zero states that match the
        # input's device and dtype, so the model does not depend on a global
        # device setting.
        out, _ = self.lstm(x)
        return self.fc(out)
    
class GRU(nn.Module):
    """Unidirectional multi-layer GRU with a per-frame linear output head.

    Args:
        input_size: width of each input frame vector.
        hidden_size: GRU hidden state width.
        output_size: number of values predicted per frame.
        num_layers: number of stacked GRU layers.
        seq_len: stored on the instance only; not used by the computation.
        dropout: probability of the dropout applied to the *input* features
            (not between recurrent layers).
    """

    def __init__(self, input_size, hidden_size=128, output_size=2, num_layers=2, seq_len=300, dropout=0.1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.output_size = output_size
        self.dropout = dropout

        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to (batch, seq, output_size)."""
        # Input dropout is only active while the module is in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)

        # No explicit h0: nn.GRU starts from a zero state that matches the
        # input's device and dtype, so the model does not depend on a global
        # device setting.
        out, _ = self.gru(x)
        return self.fc(out)
    
class BiGRU(nn.Module):
    """Bidirectional multi-layer GRU with a per-frame linear output head.

    Args:
        input_size: width of each input frame vector.
        hidden_size: hidden state width of each direction.
        output_size: number of values predicted per frame.
        num_layers: number of stacked GRU layers.
        seq_len: stored on the instance only; not used by the computation.
        dropout: probability of the dropout applied to the *input* features
            (not between recurrent layers).
    """

    def __init__(self, input_size, hidden_size=128, output_size=2, num_layers=2, seq_len=300, dropout=0.1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.output_size = output_size
        self.dropout = dropout

        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        # Forward and backward hidden states are concatenated, hence 2 * hidden_size.
        self.fc = nn.Linear(2 * hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to (batch, seq, output_size)."""
        # Input dropout is only active while the module is in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)

        # No explicit h0: nn.GRU starts from a zero state that matches the
        # input's device and dtype, so the model does not depend on a global
        # device setting.
        out, _ = self.gru(x)
        return self.fc(out)



In [81]:
# DataLoaders for the three splits.
# 12 workers suited the original author's CPU; change NUM_WORKERS to match
# your machine for better throughput.
NUM_WORKERS = 12

train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=NUM_WORKERS, shuffle=True)

# Evaluation loaders are not shuffled: eval_model averages per-batch RMSE, so
# the reported number depends on which samples share a batch. A fixed order
# removes that source of run-to-run variation from the val / test metrics.
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=NUM_WORKERS, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=NUM_WORKERS, shuffle=False)

In [82]:
# Build the network selected by `model_name`.
model_classes = {
    "LSTM": LSTM,
    "BiLSTM": BiLSTM,
    "GRU": GRU,
    "BiGRU": BiGRU,
}
if model_name not in model_classes:
    # Fail here with a clear message instead of later on a placeholder value.
    raise ValueError(
        f"Unknown model_name {model_name!r}; expected one of {sorted(model_classes)}"
    )

model = model_classes[model_name](input_size, hidden_size, output_size, num_layers, seq_len, dropout)

# batchToInput places every batch on get_device(), so the parameters have to
# live on the same device. This must happen before the optimizer is created.
model = model.to(get_device())

In [83]:
# Per the original note, the dataset authors use this same loss (MSE) and the
# Adam optimizer for this model.
learning_rate = 1e-3
weight_decay = 1e-5

criterion = nn.MSELoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

In [84]:
# Runs a single training epoch over the loader and updates the model.
def train_epoch(curr_epoch, train_loader, criterion, optimizer, losses):
    """Train the notebook-level ``model`` for one pass over ``train_loader``.

    Args:
        curr_epoch: index of the current epoch (accepted for the caller's
            convenience; not used inside the function).
        train_loader: iterable of batches accepted by ``batchToInput``.
        criterion: loss module called as ``criterion(scores, targets)``
            (MSE loss in this notebook).
        optimizer: optimizer bound to the model's parameters.
        losses: list that receives one Python float per batch.

    Returns:
        None. ``losses`` and the model parameters are updated in place.
    """
    for batch in train_loader:
        # batchToInput reshapes the batch to [batch, seq_len, all features].
        data, targets = batchToInput(batch)
        scores = model(data)

        # Call the module itself rather than .forward so registered hooks run.
        loss = criterion(scores, targets)
        # Record a plain float; appending the tensor would keep its autograd
        # graph (and device memory) referenced for the whole run.
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    

In [85]:
# Takes a loader and returns the average loss, average RMSE, and the average
# RMSE of note density and loudness separately.
def eval_model(model, dataloader, loss):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: module mapping the inputs produced by ``batchToInput`` to a
            (batch, seq, 2) prediction (note density, loudness).
        dataloader: sized iterable of batches accepted by ``batchToInput``.
        loss: callable ``loss(scores, targets)`` applied to the predictions
            and targets after both are flattened to (batch * seq, 2).

    Returns:
        Tuple ``(avg_loss, avg_rmse, avg_rmse_note_density,
        avg_rmse_loudness)``. Each value is the mean of the per-batch values,
        i.e. RMSE is computed per batch and then averaged over batches. All
        four are ``-1`` when the loader has no batches.

    The model's train/eval mode is restored to what it was on entry.
    """
    n_batches = len(dataloader)
    if n_batches == 0:
        # Nothing to average; return the "not computed" sentinel values
        # instead of dividing by zero.
        return -1, -1, -1, -1

    sum_loss = 0.0
    sum_rmse = 0.0
    sum_rmse_note_density = 0.0
    sum_rmse_loudness = 0.0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in dataloader:
                data, targets = batchToInput(batch)

                # Merge the batch and sequence axes so every frame is one row.
                scores = model(data)
                scores = scores.reshape(scores.shape[0] * scores.shape[1], -1).float()
                targets = targets.reshape(targets.shape[0] * targets.shape[1], -1).float()

                # RMSE over both outputs together.
                sum_rmse += float(torch.sqrt(F.mse_loss(scores, targets)))

                # Column 0 is note density, column 1 is loudness.
                scores_note_density, scores_loudness = torch.split(scores, split_size_or_sections=1, dim=1)
                targets_note_density, targets_loudness = torch.split(targets, split_size_or_sections=1, dim=1)

                # RMSE of note density.
                mse_note_density = F.mse_loss(scores_note_density, targets_note_density)
                sum_rmse_note_density += float(torch.sqrt(mse_note_density))

                # RMSE of loudness.
                mse_loudness = F.mse_loss(scores_loudness, targets_loudness)
                sum_rmse_loudness += float(torch.sqrt(mse_loudness))

                # Use the loss function handed in by the caller.
                sum_loss += float(loss(scores, targets))
    finally:
        model.train(was_training)

    avg_loss = sum_loss / n_batches
    avg_rmse = sum_rmse / n_batches
    avg_rmse_note_density = sum_rmse_note_density / n_batches
    avg_rmse_loudness = sum_rmse_loudness / n_batches
    return avg_loss, avg_rmse, avg_rmse_note_density, avg_rmse_loudness





            
            



In [86]:
# Track the best validation scores and the epoch that produced them. Epochs
# are recorded 1-based so they match the "Epoch:" lines printed below.
best_eval_rmse = float("inf")
best_eval_rmse_epoch = -1
best_eval_loss = float("inf")
best_eval_loss_epoch = -1

# Per-batch training losses collected by train_epoch.
losses = []

# Main loop: train for one epoch, then measure average loss / RMSE on the
# training and validation splits.
for epoch in range(epochs):
    train_epoch(epoch, train_loader, criterion, optimizer, losses)

    train_loss, train_rmse, train_rmse_note_density, train_rmse_loudness = eval_model(model, train_loader, criterion)
    eval_loss, eval_rmse, eval_rmse_note_density, eval_rmse_loudness = eval_model(model, val_loader, criterion)

    print(SEPERATOR)
    print("Epoch:", epoch+1)
    print("Avg train loss:", train_loss)
    print("Avg train RMSE:", train_rmse)
    print("Avg train RMSE (Note Density):", train_rmse_note_density)
    print("Avg train RMSE (Loudness):", train_rmse_loudness)

    print("Avg val loss:", eval_loss)
    print("Avg val RMSE:", eval_rmse)
    print("Avg val RMSE (Note Density):", eval_rmse_note_density)
    print("Avg val RMSE (Loudness):", eval_rmse_loudness)

    print(SEPERATOR)
    print("")

    if eval_rmse < best_eval_rmse:
        best_eval_rmse = eval_rmse
        best_eval_rmse_epoch = epoch + 1

    if eval_loss < best_eval_loss:
        best_eval_loss = eval_loss
        best_eval_loss_epoch = epoch + 1

print(SEPERATOR)
print(SEPERATOR)
print("Best RMSE score vs epoch",best_eval_rmse_epoch,":",best_eval_rmse)
print("Best loss score vs epoch",best_eval_loss_epoch,":",best_eval_loss)
print(SEPERATOR)
print(SEPERATOR)

    

    

    
    
    

        



Epoch: 1
Avg train loss: 13.056431870711478
Avg train RMSE: 3.61013358517697
Avg train RMSE (Note Density): 5.104191980863872
Avg train RMSE (Loudness): 0.1152737936691234
Avg val loss: 11.976386070251465
Avg val RMSE: 3.4578682581583657
Avg val RMSE (Note Density): 4.88892126083374
Avg val RMSE (Loudness): 0.10983110467592876

Epoch: 2
Avg train loss: 12.029987736752158
Avg train RMSE: 3.4647333245528373
Avg train RMSE (Note Density): 4.8990703632957056
Avg train RMSE (Loudness): 0.0884090277709459
Avg val loss: 10.947323163350424
Avg val RMSE: 3.3058844407399497
Avg val RMSE (Note Density): 4.674550215403239
Avg val RMSE (Loudness): 0.07952139526605606

Epoch: 3
Avg train loss: 11.648379024706388
Avg train RMSE: 3.4115755558013916
Avg train RMSE (Note Density): 4.823939549295526
Avg train RMSE (Loudness): 0.08522150861589532
Avg val loss: 10.912580172220865
Avg val RMSE: 3.3032607237497964
Avg val RMSE (Note Density): 4.670818169911702
Avg val RMSE (Loudness): 0.08056721339623134

Ep

In [87]:
# Final evaluation on the test split.
test_loss, test_rmse, test_rmse_note_density, test_rmse_loudness = eval_model(
    model, test_loader, criterion
)

# Label / value pairs, printed in this order.
test_report = (
    ("Avg test loss:", test_loss),
    ("Avg test RMSE:", test_rmse),
    ("Avg test RMSE (Note Density):", test_rmse_note_density),
    ("Avg test RMSE (Loudness):", test_rmse_loudness),
)

print(SEPERATOR)
for label, value in test_report:
    print(label, value)

print(SEPERATOR)
print("")

Avg test loss: 11.504242579142252
Avg test RMSE: 3.382254441579183
Avg test RMSE (Note Density): 4.782306512196858
Avg test RMSE (Loudness): 0.09370045860608418

