In [1]:
import torchaudio

In [2]:
import IPython.display as ipd
import librosa
import librosa.display
import pandas as pd
import os, time, warnings
import numpy as np

In [3]:
def resize_spectrogram(spec, length, fact=-80):
    """Crop or right-pad a spectrogram so that it has exactly `length` columns.

    Args:
        spec: 2-D array; its rows are kept, its columns are cropped or padded.
        length: Number of columns of the returned array.
        fact: Fill value used for columns that `spec` does not cover.

    Returns:
        A new float array of shape ``(spec.shape[0], length)``.
    """
    # Start from a canvas made entirely of the fill value ...
    resized = np.full((spec.shape[0], length), fact, dtype=np.float64)

    # ... and copy over as many leading columns as both arrays share.
    n_cols = min(spec.shape[1], length)
    resized[:, :n_cols] = spec[:, :n_cols]
    return resized

def compute_mel_spec(filename, sr=8000, hop_length=512, duration=60.0):
    """Load an audio file and return a fixed-size, rescaled log-mel spectrogram.

    Args:
        filename: Path of the audio file, handed to ``librosa.load``.
        sr: Sampling rate requested from ``librosa.load``.
        hop_length: Hop, in samples, between consecutive spectrogram frames.
        duration: Length in seconds that the spectrogram is cropped/padded to.

    Returns:
        Tuple ``(x_mel, mel_strength)``:
          * ``x_mel`` - dB-scaled mel spectrogram, cropped or padded to
            ``int(duration * sr / hop_length)`` frames, normalised along
            axis 1 and then shifted by +1.
          * ``mel_strength`` - mean dB value of every mel band, computed on
            the spectrogram before it is resized.
    """
    # Load (and resample) the audio file
    y, sr = librosa.load(filename, sr=sr)

    # Compute the mel spectrogram with the caller's hop length. Previously
    # `hop_length` only entered the target-length formula below, so any
    # non-default value made the frame count disagree with `duration`.
    x_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length)

    # Apply logarithmic dB-scale to the spectrogram and set the maximum to 0 dB
    x_mel = librosa.power_to_db(x_mel, ref=np.max)

    # Mean strength per mel band, taken before any cropping/padding
    mel_strength = np.mean(x_mel, axis=1)

    # Number of frames that corresponds to `duration` seconds
    length = int(duration * sr / hop_length)

    # Crop or pad to that length (padding uses resize_spectrogram's default fill)
    x_mel = resize_spectrogram(x_mel, length)

    # Normalise along the time axis, then shift by one
    x_mel = librosa.util.normalize(x_mel, axis=1)
    x_mel = np.ones(x_mel.shape) + x_mel

    return x_mel, mel_strength

In [4]:
from matplotlib import pyplot as plt

In [5]:
import torch
import torch.nn as nn
import torchvision

import torchvision.transforms as transforms

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [6]:
# Show which device was selected in the previous cell.
print(device)

cpu


In [7]:
from torch.utils.data.dataset import Dataset

In [8]:
import torch.nn.functional as F

In [10]:
class MyCustomDataset(Dataset):
    """Dataset of (log-mel spectrogram, ordinal label) pairs for audio files.

    Args:
        audio_file_list: Indexable collection of audio file names, relative
            to `root_dir`.
        scores: Indexable collection of integer scores, aligned with
            `audio_file_list`.
        root_dir: Directory that contains the audio files. The default keeps
            the location this notebook was originally written against; pass
            your own directory when running elsewhere.
        num_levels: Length of the ordinal label vector.
    """

    def __init__(self, audio_file_list, scores,
                 root_dir='/home/ongun/sustained-phonation-features-master/vowel',
                 num_levels=39):
        self.audio_file_list = audio_file_list
        self.scores = scores
        self.root_dir = root_dir
        self.num_levels = num_levels

    def _ordinal_label(self, score):
        """Encode `score` as `score` leading ones followed by zeros.

        Raises:
            ValueError: If `score` is outside ``0..num_levels``. Such a score
                cannot be represented in a vector of `num_levels` entries and
                would otherwise produce labels of inconsistent length.
        """
        score = int(score)
        if not 0 <= score <= self.num_levels:
            raise ValueError(
                f'score {score} is outside the supported range 0..{self.num_levels}')
        label = np.zeros(self.num_levels, dtype=np.float32)
        label[:score] = 1.0
        return label

    def __getitem__(self, index):
        """Return ``(img, label)`` for the sample at `index`.

        ``img`` is a float32 tensor holding the spectrogram with a leading
        channel axis of size 1; ``label`` is a float32 NumPy vector of
        length `num_levels`.
        """
        # Validate/encode the label first: it is cheap and fails before any
        # audio decoding is attempted.
        label = self._ordinal_label(self.scores[index])

        filename = self.audio_file_list[index]
        spec, _ = compute_mel_spec(f'{self.root_dir}/{filename}')

        # Add the channel axis expected by 2-D convolutions.
        img = torch.tensor(spec, dtype=torch.float32).unsqueeze(0)
        return img, label

    def __len__(self):
        """Number of audio files in the dataset."""
        return len(self.audio_file_list)

In [11]:
# Read the metadata table, then split it by row position:
# df1 gets positions 0, 2, 4, ... and df2 gets positions 1, 3, 5, ...
df = pd.read_csv("modified_data.csv")
df1, df2 = df.iloc[0::2], df.iloc[1::2]

In [12]:
# Display the odd-position half of the table; the cells visible in this
# notebook build their training/test arrays from df1 only.
df2

Unnamed: 0,userid,filename,date,score,excercise_type
1,1614,20220130013629.wav,2022-01-30,11,evening_task:answer_question
3,1614,20220131204837.wav,2022-01-31,12,evening_task:read_text
5,1614,20220201232615.wav,2022-02-01,8,evening_task:answer_question
7,1614,20220203191000.wav,2022-02-03,12,evening_task:read_text
9,1614,20220205194422.wav,2022-02-05,14,evening_task:read_text
...,...,...,...,...,...
305,1717,20220420222748.wav,2022-04-20,17,evening_task:read_text
307,1717,20220421230009.wav,2022-04-21,18,evening_task:read_text
309,1717,20220423230218.wav,2022-04-23,20,evening_task:read_text
311,1717,20220424233122.wav,2022-04-24,20,evening_task:read_text


In [13]:
# Pull the file names and the scores of the df1 half out as NumPy arrays
# so they can be fancy-indexed with a shuffled index array.
audio_list, score_list = (
    np.array(df1[column].tolist()) for column in ('filename', 'score')
)

In [14]:
# Shuffle with a fixed seed so the 70/30 split, and therefore every metric
# reported further down, is reproducible after Restart & Run All.
SPLIT_SEED = 42
indices = np.random.default_rng(SPLIT_SEED).permutation(len(audio_list))

# First 70 % of the shuffled rows go to training, the rest to testing.
limit = int(len(indices) * 0.7)
train_idx, test_idx = indices[:limit], indices[limit:]

# NOTE(review): rows are split individually, so recordings that share a
# `userid` can presumably end up in both sets - confirm this is intended.
X_train, X_test = audio_list[train_idx], audio_list[test_idx]
y_train, y_test = score_list[train_idx], score_list[test_idx]

In [15]:
# Wrap both halves of the split in the dataset class defined earlier.
train_data = MyCustomDataset(audio_file_list=X_train, scores=y_train)
test_data = MyCustomDataset(audio_file_list=X_test, scores=y_test)

In [16]:
# Training hyper-parameters.
num_epochs, batch_size = 50, 16   # passes over the training set / samples per batch
learning_rate = 1e-3              # step size handed to the optimizer
num_classes = 40                  # NOTE(review): not referenced by the cells visible here

In [17]:
from torch.utils.data import DataLoader

In [18]:
# Reshuffle the training samples every epoch; without `shuffle=True` each
# epoch would replay exactly the same batches in the same order.
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
# Evaluation order does not matter, so the test loader stays sequential.
test_loader = DataLoader(dataset=test_data, batch_size=batch_size)

In [19]:
def prediction2label(pred: np.ndarray, threshold: float = 0.5):
    """Convert ordinal predictions to class labels, e.g.

    [0.9, 0.1, 0.1, 0.1] -> 0
    [0.9, 0.9, 0.1, 0.1] -> 1
    [0.9, 0.9, 0.9, 0.1] -> 2
    etc.

    The label is the length of the leading run of entries that exceed
    `threshold`, minus one; a row whose first entry does not exceed the
    threshold therefore maps to -1.

    Args:
        pred: 2-D array (one row per sample) of per-threshold scores.
        threshold: Cut-off above which an entry counts as "on". Optional so
            that both ``prediction2label(p)`` and ``prediction2label(p, 0.5)``
            are valid calls.

    Returns:
        1-D array with one integer label per row.
    """
    # cumprod zeroes everything after the first "off" entry, so the row sum
    # counts only the leading run of "on" entries.
    return (pred > threshold).cumprod(axis=1).sum(axis=1) - 1

In [20]:
# One weight per ordinal threshold; all ones means every threshold
# contributes equally to the loss.
importance_weights = torch.ones(39, dtype=torch.float).to(device)

def loss_fn2(logits, levels, imp=importance_weights):
    """Weighted binary cross-entropy over ordinal thresholds, averaged over the batch.

    Args:
        logits: Raw (pre-sigmoid) scores, one column per threshold.
        levels: 0/1 targets, broadcast-compatible with `logits`.
        imp: Per-threshold weights applied before summing over columns.

    Returns:
        Scalar tensor: the mean over samples of the per-sample summed loss.
    """
    # The default weights live on the global `device`, which is not
    # necessarily where the model (and hence `logits`) lives; align them so
    # the product below cannot mix devices.
    imp = imp.to(logits.device)

    log_on = F.logsigmoid(logits)
    # log(1 - sigmoid(x)) == logsigmoid(x) - x
    log_off = log_on - logits

    val = -torch.sum((log_on * levels + log_off * (1 - levels)) * imp, dim=1)
    return torch.mean(val)

def loss_fn(logits, levels):
    """Per-sample squared-error loss against an ordinal target encoding.

    The previous version first decoded `logits` to hard labels, which made
    the tensor 1-D (so the 2-index assignment below it always raised) and
    would have cut the gradient. The loss is now computed on the sigmoid of
    the raw scores instead.

    Args:
        logits: Raw (pre-sigmoid) scores of shape ``(batch, K)``.
        levels: Integer class index per sample (tensor or sequence of length
            ``batch``). Class ``t`` is encoded as ``t + 1`` leading ones,
            i.e. 0 -> [1, 0, 0, ...].

    Returns:
        Tensor of shape ``(batch,)`` with the summed squared error per sample.
    """
    probabilities = torch.sigmoid(logits)

    # Build the ordinal targets for the whole batch at once: column j is
    # "on" for a sample exactly when j <= its class index.
    class_index = torch.as_tensor(levels, device=logits.device).reshape(-1, 1)
    columns = torch.arange(logits.shape[1], device=logits.device)
    modified_target = (columns <= class_index).to(probabilities.dtype)

    return nn.MSELoss(reduction='none')(probabilities, modified_target).sum(dim=1)

In [21]:
class ConvNet(nn.Module):
    """Three conv/ReLU/max-pool stages followed by three linear layers.

    Args:
        in_features: Size of the flattened output of the last conv stage,
            i.e. the input width of `fc1`. The default, 222720, equals
            128 channels x 15 x 116, which is what the three stages produce
            for a 1 x 128 x 937 input.
        num_outputs: Width of the final layer (one raw score per ordinal
            threshold).

    Both arguments default to the values that used to be hard-coded, so
    ``ConvNet()`` builds the same network with the same state-dict keys.
    """

    def __init__(self, in_features=111360 * 2, num_outputs=39):
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=7, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.drop_out = nn.Dropout()
        self.fc1 = nn.Linear(in_features, 4096)
        self.fc2 = nn.Linear(4096, 512)
        self.fc3 = nn.Linear(512, num_outputs)

    def forward(self, x):
        """Map a ``(batch, 1, H, W)`` input to ``(batch, num_outputs)`` raw scores."""
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)

        # Flatten everything but the batch axis before the linear layers.
        out = out.flatten(start_dim=1)
        out = self.drop_out(out)

        # NOTE(review): there is no activation between fc1, fc2 and fc3, so
        # together they form a single affine map. Left as is to stay
        # compatible with already-trained checkpoints.
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.fc3(out)
        return out

# Build the network with its default sizes, then an Adam optimizer that
# updates all of its parameters.
model = ConvNet()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [22]:
# Pin the model to the CPU (independently of the `device` chosen earlier).
model.to(torch.device("cpu"))

ConvNet(
  (layer1): Sequential(
    (0): Conv2d(1, 32, kernel_size=(7, 7), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer3): Sequential(
    (0): Conv2d(64, 128, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (drop_out): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=222720, out_features=4096, bias=True)
  (fc2): Linear(in_features=4096, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=39, bias=True)
)

In [23]:
import warnings
# Suppress every warning category from here on.
warnings.simplefilter('ignore')

In [None]:
total_step = len(train_loader)

# Run on whichever device the model's parameters live on, so inputs,
# targets, loss weights and model weights can never end up on different devices.
train_device = next(model.parameters()).device
loss_weights = importance_weights.to(train_device)

# Make sure dropout is active even if the model was switched to eval mode earlier.
model.train()

for epoch in range(num_epochs):
    loss_list = []
    num_correct = 0
    num_seen = 0
    for images, labels in train_loader:
        images = images.to(train_device)
        labels = labels.to(train_device)

        # Run the forward pass
        outputs = model(images)
        loss = loss_fn2(outputs, labels, loss_weights)
        loss_list.append(loss.item())

        # Backprop and perform Adam optimisation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track the accuracy: decode both the predicted probabilities and
        # the 0/1 target vectors to ordinal labels and compare them.
        with torch.no_grad():
            predicted = prediction2label(torch.sigmoid(outputs))
            targets = prediction2label(labels)
        num_correct += (predicted == targets).sum().item()
        num_seen += labels.size(0)

    # max(..., 1) keeps an empty loader from dividing by zero.
    epoch_acc = num_correct / max(num_seen, 1)
    print(f'epoch: {epoch}: acc: {epoch_acc:.4f} loss: {np.mean(loss_list):.4f}')

In [None]:
# Save only the weights. The loading cell feeds the file to
# `load_state_dict`, which needs a state_dict rather than a pickled module.
torch.save(model.state_dict(), 'model_scripted_7.pt')

In [None]:

# Read the checkpoint onto the CPU regardless of where it was written.
checkpoint = torch.load('model_scripted_7.pt', map_location=torch.device('cpu'))

# Accept both checkpoint flavours: a bare state_dict, or a whole pickled
# module (which is what `torch.save(model, ...)` writes).
# NOTE(review): recent PyTorch releases default `torch.load` to
# weights_only=True, which presumably rejects whole-module pickles - re-save
# such files as a state_dict rather than disabling that safety check.
state_dict = checkpoint.state_dict() if isinstance(checkpoint, nn.Module) else checkpoint

model = ConvNet()
model.load_state_dict(state_dict)

In [None]:
def check_accuracy(test_loader: DataLoader, model: nn.Module):
    """Print and return the ordinal-label accuracy of `model` on `test_loader`.

    Args:
        test_loader: Yields ``(data, labels)`` batches, where `labels` are
            0/1 ordinal target vectors.
        model: Network returning one raw (pre-sigmoid) score per threshold.

    Returns:
        Fraction of samples whose decoded prediction equals the decoded
        target, computed over all samples (not as a mean of per-batch
        accuracies, which would over-weight a short last batch). ``nan`` if
        the loader yields no samples.

    The model is switched to eval mode and left there.
    """
    # Evaluate on the device the model already lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device('cpu')

    num_correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for data, labels in test_loader:
            data = data.to(device=eval_device)
            labels = labels.to(device=eval_device)

            # The model emits logits; convert them to probabilities so the
            # 0.5 cut-off used by prediction2label is meaningful.
            probabilities = torch.sigmoid(model(data))

            predicted = prediction2label(probabilities)
            targets = prediction2label(labels)
            num_correct += (predicted == targets).sum().item()
            total += labels.size(0)

    accuracy = num_correct / total if total else float('nan')
    print(f"Test Accuracy of the model: {accuracy}")
    return accuracy
