# Two-input ordinal scoring model for vowel and answer recordings

In [1]:
import IPython.display as ipd
import librosa
import librosa.display
import pandas as pd
import os, time, warnings
import numpy as np
from torch.utils.data import DataLoader

In [3]:
def resize_spectrogram(spec, length, fact=-80):
    """Crop or right-pad a spectrogram so that it has exactly ``length`` columns.

    Args:
        spec: 2-D array; all rows are kept, columns are cropped or padded.
        length: Number of columns of the returned array.
        fact: Fill value used for the columns that ``spec`` does not cover.

    Returns:
        A new array of shape ``(len(spec), length)``.
    """
    # Canvas completely filled with the padding value.
    n_rows = len(spec)
    resized = np.ones((n_rows, length)) * fact

    # Copy as many leading columns as both the source and the canvas can hold.
    n_keep = min(spec.shape[1], length)
    resized[:, :n_keep] = spec[:, :n_keep]
    return resized

def compute_mel_spec(filename, sr=8000, hop_length=512, duration=30.0):
    """Load an audio file and convert it into a fixed-size, scaled mel spectrogram.

    Args:
        filename: Path of the audio file passed to ``librosa.load``.
        sr: Sampling rate the audio is resampled to while loading.
        hop_length: Hop length (in samples) between spectrogram frames. It is
            used both for the spectrogram itself and for the target length, so
            the two always agree.
        duration: Target duration in seconds; the spectrogram is cropped or
            padded to ``int(duration * sr / hop_length)`` frames.

    Returns:
        Tuple ``(x_mel, mel_strength)``:
            x_mel: dB-scaled mel spectrogram, resized to the target number of
                frames, normalised along axis 1 and shifted by +1.
            mel_strength: Mean dB value per mel band, computed on the full
                (not yet resized) spectrogram.
    """
    # Load (and resample) the audio file
    y, sr = librosa.load(filename, sr=sr)

    # Compute the mel spectrogram with the same hop length that is used for
    # the length estimate below (previously the library default was always used)
    x_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length)

    # Apply logarithmic dB-scale to spectrogram and set maximum to 0 dB
    x_mel = librosa.power_to_db(x_mel, ref=np.max)

    # Mean strength per mel band, taken before cropping/padding
    mel_strength = np.mean(x_mel, axis=1)

    # Number of frames that corresponds to the requested duration
    length = int(duration * sr / hop_length)

    # Crop or pad the spectrogram to that number of frames
    x_mel = resize_spectrogram(x_mel, length)

    # Scale every mel band by its largest absolute value, then shift by +1.
    # The dB values are non-positive, so the result presumably lies in [0, 1].
    x_mel = librosa.util.normalize(x_mel, axis=1)
    x_mel = np.ones(x_mel.shape) + x_mel

    return x_mel, mel_strength

In [4]:
from matplotlib import pyplot as plt

In [5]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Uncomment the next line to force CPU execution.
#device="cpu"

In [6]:
# Show which device was selected above.
print(device)

cuda


In [7]:
from torch.utils.data.dataset import Dataset

In [8]:
import torch.nn.functional as F

In [9]:
class MyVowelDataset(Dataset):
    """Dataset of vowel recordings yielding ``(mel spectrogram, ordinal label)`` pairs.

    Args:
        audio_file_list: Indexable collection of audio file names, relative
            to ``audio_dir``.
        scores: Indexable collection of integer scores aligned with
            ``audio_file_list``.
        audio_dir: Directory that contains the audio files.
        num_levels: Length of the ordinal label vector, i.e. the largest score
            that can be encoded.
    """

    # NOTE(review): the default directory is called "answer" although this is
    # the vowel dataset; it is kept because it is the path the notebook has
    # always used — confirm that the vowel files really live there.
    def __init__(self, audio_file_list, scores,
                 audio_dir='/home/ongun/challenge/answer', num_levels=39):
        self.audio_file_list = audio_file_list
        self.scores = scores
        self.audio_dir = audio_dir
        self.num_levels = num_levels

    @staticmethod
    def _ordinal_label(score, num_levels=39):
        """Encode ``score`` as ``score`` ones followed by zeros.

        Returns:
            float32 numpy array of length ``num_levels``.

        Raises:
            ValueError: If ``score`` is outside ``[0, num_levels]``; such a
                score cannot be represented with ``num_levels`` entries.
        """
        if not 0 <= score <= num_levels:
            raise ValueError(
                f'score {score} is outside the encodable range [0, {num_levels}]')
        label = np.zeros(num_levels, dtype=np.float32)
        label[:score] = 1.0
        return label

    def __getitem__(self, index):
        """Return ``(img, label)`` for the recording at ``index``.

        ``img`` is a float32 tensor with a leading channel dimension of size 1
        in front of the spectrogram; ``label`` is the ordinal encoding of the
        score as a float32 numpy array.
        """
        filename = self.audio_file_list[index]
        mel, _ = compute_mel_spec(f'{self.audio_dir}/{filename}')

        # Add the channel dimension expected by the convolutional layers.
        img = torch.tensor(mel, dtype=torch.float32).unsqueeze(0)

        label = self._ordinal_label(self.scores[index], self.num_levels)
        return img, label

    def __len__(self):
        """Number of recordings in the dataset."""
        return len(self.audio_file_list)

In [10]:
class MyAnswerDataset(Dataset):
    """Dataset of answer recordings yielding ``(mel spectrogram, ordinal label)`` pairs.

    Args:
        audio_file_list: Indexable collection of audio file names, relative
            to ``audio_dir``.
        scores: Indexable collection of integer scores aligned with
            ``audio_file_list``.
        audio_dir: Directory that contains the audio files.
        num_levels: Length of the ordinal label vector, i.e. the largest score
            that can be encoded.
    """

    def __init__(self, audio_file_list, scores,
                 audio_dir='/home/ongun/challenge/answer', num_levels=39):
        self.audio_file_list = audio_file_list
        self.scores = scores
        self.audio_dir = audio_dir
        self.num_levels = num_levels

    @staticmethod
    def _ordinal_label(score, num_levels=39):
        """Encode ``score`` as ``score`` ones followed by zeros.

        Returns:
            float32 numpy array of length ``num_levels``.

        Raises:
            ValueError: If ``score`` is outside ``[0, num_levels]``; such a
                score cannot be represented with ``num_levels`` entries.
        """
        if not 0 <= score <= num_levels:
            raise ValueError(
                f'score {score} is outside the encodable range [0, {num_levels}]')
        label = np.zeros(num_levels, dtype=np.float32)
        label[:score] = 1.0
        return label

    def __getitem__(self, index):
        """Return ``(img, label)`` for the recording at ``index``.

        ``img`` is a float32 tensor with a leading channel dimension of size 1
        in front of the spectrogram; ``label`` is the ordinal encoding of the
        score as a float32 numpy array.
        """
        filename = self.audio_file_list[index]
        mel, _ = compute_mel_spec(f'{self.audio_dir}/{filename}')

        # Add the channel dimension expected by the convolutional layers.
        img = torch.tensor(mel, dtype=torch.float32).unsqueeze(0)

        label = self._ordinal_label(self.scores[index], self.num_levels)
        return img, label

    def __len__(self):
        """Number of recordings in the dataset."""
        return len(self.audio_file_list)

In [11]:
# Load the metadata table.
df = pd.read_csv("modified_data.csv")
# Rows at even positions are used as vowel recordings, rows at odd positions
# as answer recordings.
# NOTE(review): this assumes the CSV strictly alternates vowel/answer rows —
# confirm, or select the rows via the 'excercise_type' column instead.
vowel_df=df.iloc[::2]
answer_df=df.iloc[1::2]

In [12]:
# Inspect the columns available in the metadata table.
print(df.columns)

Index(['userid', 'filename', 'date', 'score', 'excercise_type'], dtype='object')


In [13]:
# Training configuration.
# Note: num_epochs is reassigned to 150 in the training cell further down.
num_epochs = 100
# Not referenced by any other cell visible in this notebook; the label vectors
# and the network output use 39 entries.
num_classes = 40
# Batch size handed to every DataLoader below.
batch_size = 1
# Learning rate of the Adam optimizer.
learning_rate = 0.001

In [14]:
# Fixed seed so that the train/test split is identical after every kernel
# restart (otherwise a reloaded checkpoint would be evaluated on a different
# split than it was trained on).
SPLIT_SEED = 42
# Fraction of the samples that goes into the training set.
TRAIN_FRACTION = 0.7

vowel_audio_list = np.array(vowel_df['filename'].tolist())
vowel_score_list = np.array(vowel_df['score'].tolist())

# Random but reproducible ordering of the samples.
rng = np.random.default_rng(SPLIT_SEED)
vowel_indices = rng.permutation(len(vowel_audio_list))

# Position in the shuffled order at which the test part starts.
vowel_limit = int(len(vowel_indices) * TRAIN_FRACTION)
vowel_train_idx = vowel_indices[:vowel_limit]
vowel_test_idx = vowel_indices[vowel_limit:]

X_vowel_train, X_vowel_test = vowel_audio_list[vowel_train_idx], vowel_audio_list[vowel_test_idx]
y_vowel_train, y_vowel_test = vowel_score_list[vowel_train_idx], vowel_score_list[vowel_test_idx]
vowel_train_data = MyVowelDataset(X_vowel_train, y_vowel_train)
vowel_test_data = MyVowelDataset(X_vowel_test, y_vowel_test)

# No loader-level shuffling: the training loop consumes this loader in
# lockstep with the answer loader via zip().
vowel_train_loader = DataLoader(dataset=vowel_train_data, batch_size=batch_size)
vowel_test_loader = DataLoader(dataset=vowel_test_data, batch_size=batch_size)

In [15]:
answer_audio_list = np.array(answer_df['filename'].tolist())
answer_score_list = np.array(answer_df['score'].tolist())

# The training loop zips the vowel and the answer loader and feeds both
# recordings into a single forward pass, so sample i of each loader has to
# come from the same vowel/answer row pair. Shuffling the answers on their own
# would pair every vowel with an arbitrary answer, therefore the vowel
# permutation and its split point are reused here.
if len(answer_audio_list) == len(vowel_audio_list):
    answer_indices = vowel_indices.copy()
    answer_limit = vowel_limit
else:
    # An odd number of rows leaves one vowel recording without a partner.
    warnings.warn(
        'vowel and answer lists differ in length; unmatched rows are dropped '
        'from the answer split and the pairing may be off after that row')
    valid = vowel_indices < len(answer_audio_list)
    answer_indices = vowel_indices[valid]
    # Keep every answer on the same side of the split as its vowel.
    answer_limit = int(np.sum(valid[:vowel_limit]))

answer_train_idx = answer_indices[:answer_limit]
answer_test_idx = answer_indices[answer_limit:]

X_answer_train, X_answer_test = answer_audio_list[answer_train_idx], answer_audio_list[answer_test_idx]
y_answer_train, y_answer_test = answer_score_list[answer_train_idx], answer_score_list[answer_test_idx]
answer_train_data = MyAnswerDataset(X_answer_train, y_answer_train)
answer_test_data = MyAnswerDataset(X_answer_test, y_answer_test)
answer_train_loader = DataLoader(dataset=answer_train_data, batch_size=batch_size)
answer_test_loader = DataLoader(dataset=answer_test_data, batch_size=batch_size)

In [16]:
def prediction2label(pred: np.ndarray, threshold: float):
    """Convert ordinal predictions to class labels.

    For every prediction vector the number of leading entries above
    ``threshold`` is counted and reduced by one, e.g. with threshold 0.5:

    [0.9, 0.1, 0.1, 0.1] -> 0
    [0.9, 0.9, 0.1, 0.1] -> 1
    [0.9, 0.9, 0.9, 0.1] -> 2
    etc.

    A vector whose first entry is not above the threshold yields -1.

    Args:
        pred: A single prediction vector (1-D) or a batch with one prediction
            vector per row (2-D).
        threshold: Value an entry has to exceed to count as "on".

    Returns:
        One label per prediction vector (a scalar for 1-D input).
    """
    above = pred > threshold
    # The cumulative product turns everything after the first "off" entry into
    # zero, so the sum counts only the leading run of "on" entries. Working on
    # the last axis supports single vectors as well as batches.
    return above.cumprod(axis=-1).sum(axis=-1) - 1

In [17]:
# Uniform weight for each of the 39 ordinal levels, created directly on the target device.
importance_weights = torch.ones(39, dtype=torch.float32, device=device)

def loss_fn2(logits, levels, imp=importance_weights):
    """Weighted binary cross-entropy over the ordinal levels.

    Args:
        logits: Raw network outputs, one column per ordinal level.
        levels: 0/1 targets with the same shape as ``logits``.
        imp: Weight per level, broadcast against ``logits``.

    Returns:
        Scalar tensor: the loss summed over dim 1 and averaged over the
        remaining entries.
    """
    # log(sigmoid(x)) and log(1 - sigmoid(x)) = log(sigmoid(x)) - x
    log_on = F.logsigmoid(logits)
    log_off = F.logsigmoid(logits) - logits

    per_level = log_on * levels + log_off * (1 - levels)
    per_sample = -torch.sum(per_level * imp, dim=1)
    return torch.mean(per_sample)

def loss_fn(logits, levels):
    """Squared-error loss against an ordinal target encoding.

    Every integer class ``k`` in ``levels`` is expanded to a vector with ones
    at positions ``0..k`` and zeros afterwards (0 -> [1, 0, 0, ...]); the
    squared differences to ``logits`` are summed per sample.

    The predictions are used as they are. The former call to
    ``prediction2label`` was removed: it lacked the required ``threshold``
    argument (so this function always raised ``TypeError``) and would have
    replaced the 2-D predictions by discrete 1-D labels.
    NOTE(review): the targets are 0/1, so apply ``torch.sigmoid`` beforehand if
    raw logits are passed — confirm the intended usage.

    Args:
        logits: Tensor of shape (batch, num_levels) with the predictions.
        levels: Integer class index per sample, shape (batch,).

    Returns:
        Tensor of shape (batch,) with the summed squared error per sample.
    """
    targets = torch.as_tensor(levels, device=logits.device).view(-1, 1)
    positions = torch.arange(logits.size(1), device=logits.device)

    # Fill in ordinal target function, i.e. 0 -> [1,0,0,...]
    modified_target = (positions <= targets).to(logits.dtype)

    return nn.MSELoss(reduction='none')(logits, modified_target).sum(dim=1)

In [18]:
class ConvNet(nn.Module):
    """Two-input network combining a CNN branch and a transformer branch per input.

    Each input is processed by three Conv2d+ReLU+MaxPool2d stages (16, 32 and
    64 channels) and, in parallel, by a max-pool followed by a 4-layer
    transformer encoder. The four resulting feature vectors are concatenated,
    passed through dropout and mapped by one linear layer to 39 outputs.
    """
    def __init__(self):
        super(ConvNet, self).__init__()
########### First Transformer Layer
        
        # Pools only along the last input dimension (kernel and stride [1, 4]).
        self.transformer_maxpool1 = nn.MaxPool2d(kernel_size=[1,4], stride=[1,4])
        
        # define single transformer encoder layer
        # self-attention + feedforward network from "Attention is All You Need" paper
        # each encoder layer has 4 attention heads and a 128-->512-->128 feedforward network
        transformer_layer1 = nn.TransformerEncoderLayer(
            d_model=128, # feature size per sequence element; has to equal dim 2 of the 4-D input (see forward)
            nhead=4, # number of attention heads per encoder layer
            dim_feedforward=512, # hidden width of the feedforward network: 128-->512-->128
            dropout=0.4, 
            activation='relu' # ReLU activation inside the feedforward network
        )        
        # stack of 4 such encoder layers
        self.transformer_encoder1 = nn.TransformerEncoder(transformer_layer1, num_layers=4)

############ Second Transformer Layer for the second input
        # Same configuration as the first transformer branch, with its own weights.
        self.transformer_maxpool2 = nn.MaxPool2d(kernel_size=[1,4], stride=[1,4])
        
        # define single transformer encoder layer
        # self-attention + feedforward network from "Attention is All You Need" paper
        # each encoder layer has 4 attention heads and a 128-->512-->128 feedforward network
        transformer_layer2 = nn.TransformerEncoderLayer(
            d_model=128, # feature size per sequence element; has to equal dim 2 of the 4-D input (see forward)
            nhead=4, # number of attention heads per encoder layer
            dim_feedforward=512, # hidden width of the feedforward network: 128-->512-->128
            dropout=0.4, 
            activation='relu' # ReLU activation inside the feedforward network
        )        
        # stack of 4 such encoder layers
        self.transformer_encoder2 = nn.TransformerEncoder(transformer_layer2, num_layers=4)
        
######## First Conv        
        
        
        
        
        # CNN branch for the first input: 1 -> 16 -> 32 -> 64 channels,
        # each stage followed by ReLU and max-pooling (4, 4, then 2).
        self.conv1_layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=7, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4))
        self.conv1_layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4))
        self.conv1_layer3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        
######### Second Conv Block        
        # CNN branch for the second input; same layout as the first, separate weights.
        self.conv2_layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=7, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4))
        self.conv2_layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4))
        self.conv2_layer3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))        
######## Fully Connected        
        
        
        
        # nn.Dropout() with its default drop probability
        self.drop_out = nn.Dropout()
        
        #self.fc1 = nn.Linear((111360+128)*2, 4096)
        #self.fc1 = nn.Linear(111616, 512)
        #self.fc2 = nn.Linear(4096, 512)
        # 5632 input features = both CNN feature vectors + both 128-wide transformer embeddings.
        # The CNN part depends on the input spectrogram size — TODO confirm against compute_mel_spec output.
        self.fc1 = nn.Linear(5632, 39)
        #self.fc3 = nn.Linear(512, 39)
        # self.last = nn.Softmax(dim=1)
    def forward(self, x1,x2):
        """Compute the 39 output logits for a pair of inputs.

        Args:
            x1: First input; assumed to be (batch, 1, 128, time) — one channel
                is required by the first Conv2d, 128 by d_model. TODO confirm.
            x2: Second input, same layout as ``x1``.

        Returns:
            Tensor of shape (batch, 39); no activation is applied.
        """
        # CNN features of the first input, flattened per sample
        out_1 = self.conv1_layer1(x1)
        out_1 = self.conv1_layer2(out_1)
        out_1 = self.conv1_layer3(out_1)
        out_1 = out_1.reshape([out_1.size(0), -1])
        # print(out.shape)
        # CNN features of the second input, flattened per sample
        out_2= self.conv2_layer1(x2)
        out_2= self.conv2_layer2(out_2)
        out_2= self.conv2_layer3(out_2)
        out_2= out_2.reshape([out_2.size(0), -1])
######### Transformer output        
        
        # shrink the last dimension by a factor of 4
        x_maxpool1 = self.transformer_maxpool1(x1)

        # remove channel dim: (batch, 1, freq, time) --> (batch, freq, time)
        x_maxpool_reduced1 = torch.squeeze(x_maxpool1,1)
        
        # convert maxpooled feature map format: batch * freq * time ---> time * batch * freq format
        # because transformer encoder layer requires tensor in format: time * batch * embedding (freq)
        x_1 = x_maxpool_reduced1.permute(2,0,1) 
        
        # finally, pass reduced input feature map x into transformer encoder layers
        transformer_output_1 = self.transformer_encoder1(x_1)
        
        # create final feature embedding from transformer output by taking the mean
        # over dim 0 (the sequence dimension after the permute)
        transformer_embedding_1 = torch.mean(transformer_output_1, dim=0) # (time, batch, 128) --> (batch, 128)
        
########### Transformer output
        # same steps for the second input
        x_maxpool2 = self.transformer_maxpool2(x2)

        # remove channel dim: (batch, 1, freq, time) --> (batch, freq, time)
        x_maxpool_reduced2 = torch.squeeze(x_maxpool2,1)
        
        # convert maxpooled feature map format: batch * freq * time ---> time * batch * freq format
        # because transformer encoder layer requires tensor in format: time * batch * embedding (freq)
        x_2 = x_maxpool_reduced2.permute(2,0,1) 
        
        # finally, pass reduced input feature map x into transformer encoder layers
        transformer_output_2 = self.transformer_encoder2(x_2)
        
        # create final feature embedding from transformer output by taking the mean
        # over dim 0 (the sequence dimension after the permute)
        transformer_embedding_2 = torch.mean(transformer_output_2, dim=0) # (time, batch, 128) --> (batch, 128)
        
        
        # concatenate all four feature vectors per sample
        out = torch.cat([out_1,transformer_embedding_1,out_2,transformer_embedding_2], dim=1)  
         
        
        
        out = self.drop_out(out)
        out = self.fc1(out)
        #out = self.fc2(out)
        #out = self.fc3(out)
        # out = self.last(out)
        return out

# Instantiate the network.
model = ConvNet()

# Loss and optimizer
# Adam over all model parameters with the configured learning rate;
# the loss function is applied in the training loop.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [19]:
# Move the model to the selected device (the cell output shows the module tree).
model.to(device)

ConvNet(
  (transformer_maxpool1): MaxPool2d(kernel_size=[1, 4], stride=[1, 4], padding=0, dilation=1, ceil_mode=False)
  (transformer_encoder1): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.4, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.4, inplace=False)
        (dropout2): Dropout(p=0.4, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
 

In [20]:
import warnings
# Suppress every warning from here on.
# NOTE(review): a blanket 'ignore' can also hide relevant warnings — consider
# restricting the filter to the specific category/module that is noisy.
warnings.filterwarnings('ignore')

In [1]:
total_step = len(vowel_train_loader)
#print(len(vowel_train_loader))
num_epochs=150

# Make sure dropout is active; the model stays in eval mode if model.eval()
# was called earlier in the session.
model.train()

for epoch in range(num_epochs):
    loss_list = []
    acc_list = []
    # zip() stops at the shorter of the two loaders. Only the vowel labels are
    # used as training target; labels2 is ignored.
    for (images1, labels), (images2, labels2) in zip(vowel_train_loader, answer_train_loader):
        # Move the batch to the device the model lives on (works on CPU too)
        images1 = images1.to(device)
        images2 = images2.to(device)
        labels = labels.to(device)

        # Run the forward pass
        outputs = model(images1, images2)
        loss = loss_fn2(outputs, labels)
        loss_list.append(loss.item())

        # Backprop and perform Adam optimisation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track the accuracy
        with torch.no_grad():
            # A logit > 0 corresponds to a sigmoid probability > 0.5. The
            # predicted score is the number of leading levels that are "on";
            # the true score is the number of ones in the label vector.
            predicted = (outputs > 0).long().cumprod(dim=1).sum(dim=1)
            target = labels.sum(dim=1).long()
            acc_list.append((predicted == target).float().mean().item())

    epoch_acc = np.mean(acc_list) if acc_list else float('nan')
    print(f'epoch: {epoch}: acc: {epoch_acc:.4f}','loss: ',np.sum(loss_list))

NameError: ignored

In [None]:
# Persist the model's state_dict (parameters and buffers) to disk.
torch.save(model.state_dict(), "model_2inputmodel1-150epoch.pt")

In [None]:
# Switch to inference mode (disables dropout) and look at the first batch only.
# NOTE(review): this iterates the *training* loaders — confirm whether the
# test loaders were intended here.
model.eval()
with torch.no_grad():  # inference only, no gradient bookkeeping needed
    for (images1, labels1), (images2, labels2) in zip(vowel_train_loader, answer_train_loader):
        # Move the batch to the device the model lives on (works on CPU too)
        images1 = images1.to(device)
        labels1 = labels1.to(device)
        images2 = images2.to(device)
        outputs = model(images1, images2)
        #print(prediction2label(labels, 0.9))
        #print(prediction2label(outputs, 0.9))
        print(images1)
        print(images2)
        break

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]], device='cuda:0')
tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]], device='cuda:0')
