In [13]:
import pandas as pd
import torch
import re
import numpy as np
from torch.utils.data import Dataset, DataLoader
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchinfo import summary
from torch.optim import lr_scheduler
import time
import os
from tempfile import TemporaryDirectory
import torchvision
from torchvision import datasets, models, transforms


In [14]:
# Column layout of Results.tsv, the tab-separated log of training runs.
headers = [
    "Optimizer", "Momentum/Weight Decay", "Learning Rate", "Best Accuracy", "Total Time", "Model ID",
    "Train Losses", "Train Accuracies", "Validation Losses", "Validation Accuracies",
]

# Append mode creates the file when it is missing and never truncates rows
# that earlier runs already logged.
with open('Results.tsv', 'a') as f:
    # Zero bytes on disk means the log is new (or empty) and still needs its header row.
    if os.path.getsize('Results.tsv') == 0:
        f.write("\t".join(headers) + "\n")

In [None]:

from typing import Any


original_tsv = '2000_files.tsv'
test_tsv = 'test.tsv'
train_tsv = 'train.tsv'

# Load original data into DataFrame
df = pd.read_csv(original_tsv, sep='\t', header=0)

# Stays None when test.tsv already exists on disk, so the train split below
# knows it has no in-memory sample indices to rely on.
selected_rows_test = None

# Select 500 female and 500 male rows randomly for test.tsv
if not os.path.exists(test_tsv):
    gender_lower = df['gender'].str.lower()
    female_rows_test = df[gender_lower == 'female_feminine'].sample(n=500, random_state=42)
    male_rows_test = df[gender_lower == 'male_masculine'].sample(n=500, random_state=42)

    selected_rows_test = pd.concat([female_rows_test, male_rows_test])

    # Save selected rows for testing
    selected_rows_test.to_csv(test_tsv, sep='\t', index=False)
    print(f"Created {test_tsv} with selected rows for testing.")
else:
    print(f"{test_tsv} already exists. Skipping creation of the test file.")

# Select remaining rows for train.tsv
if not os.path.exists(train_tsv):
    if selected_rows_test is not None:
        # The test split was sampled in this run, so its index labels still
        # identify the rows inside df.
        in_test = df.index.isin(selected_rows_test.index)
    else:
        # test.tsv is saved without its index, so the original row labels are
        # gone. Recover membership with an anti-join on the shared columns.
        # The file is re-read with df's dtypes so the join keys are comparable
        # even when a column happens to be entirely empty in the test subset.
        existing_test = pd.read_csv(test_tsv, sep='\t', dtype=df.dtypes.to_dict())
        shared_columns = [col for col in df.columns if col in existing_test.columns]
        # De-duplicating the right side keeps the left join at one output row
        # per df row, so the mask lines up with df positionally.
        existing_test = existing_test[shared_columns].drop_duplicates()
        membership = df.merge(existing_test, how='left', on=shared_columns, indicator=True)['_merge']
        in_test = (membership != 'left_only').to_numpy()

    # Select rows not in test.tsv for training
    df_train = df[~in_test]

    # Save remaining rows for training
    df_train.to_csv(train_tsv, sep='\t', index=False)
    print(f"Created {train_tsv} with remaining rows for training.")
else:
    print(f"{train_tsv} already exists. Skipping creation of the train file.")

# Original file remains unchanged
print(f"Original file {original_tsv} remains unchanged.")

# Check for common rows between test.tsv and train.tsv
df1 = pd.read_csv(test_tsv, sep='\t')
df2 = pd.read_csv(train_tsv, sep='\t')

# Merge DataFrames to find common rows (inner join on every shared column)
common_rows = pd.merge(df1, df2, how='inner')

# Report leakage between the two splits; this only prints, it does not raise.
if not common_rows.empty:
    num_common_rows = len(common_rows)
    print(f"Error: There are {num_common_rows} common rows between {test_tsv} and {train_tsv}.")
else:
    print(f"No common rows found between {test_tsv} and {train_tsv}.")



    

In [15]:
class CropLoudest(object):
    """Crop a waveform to its highest-energy window of ``int(sr / 2)`` samples.

    Called with an ``(audio, sr)`` tuple and returns ``(cropped_audio, sr)``.
    The output always has exactly ``int(sr / 2)`` samples: a clip that is
    shorter than the window is zero-padded at the end instead of raising.
    """

    def __call__(self, tup):
        audio, sr = tup
        audio = np.asarray(audio)  # assumes a 1-D (mono) signal -- TODO confirm for other loaders
        target_length = int(sr / 2)
        n_samples = len(audio)

        if n_samples <= target_length:
            # There is no full window to search; pad with silence so later
            # stages still receive a fixed-length signal.
            return np.pad(audio, (0, target_length - n_samples)), sr

        # energy_before[i] == sum(audio[:i] ** 2). The leading zero makes the
        # difference below the energy of audio[i:i + target_length], so the
        # window starting at sample 0 is a candidate and the argmax is not
        # shifted by one sample.
        energy_before = np.concatenate(([0.0], np.cumsum(audio.astype(np.float64) ** 2)))
        window_energy = (energy_before[target_length:]
                         - energy_before[:n_samples + 1 - target_length])
        start = int(window_energy.argmax())
        return audio[start:start + target_length], sr

class RecordingDataset(Dataset):
    """Dataset of (transformed audio, gender label) pairs described by a TSV file.

    The TSV is read with a header row, its first column is dropped, and rows
    without a ``gender`` value are discarded.

    Args:
        tsvFile: path of the tab-separated metadata file.
        soundFolder: directory that contains the audio files named in the table.
        transform: callable applied to the ``(waveform, sampling_rate)`` tuple.
            It is required when items are fetched.
    """

    # Column positions in the table *after* the first column has been dropped.
    # (Positions 2, 6 and 8 hold sentence, age and accents; they are unused.)
    _AUDIO_FILE_COL = 0
    _GENDER_COL = 7

    def __init__(self, tsvFile, soundFolder, transform=None):
        table = pd.read_csv(tsvFile, sep='\t', header=0)
        table = table.drop(table.columns[0], axis=1)
        self.table = table.dropna(subset=['gender'])
        self.soundFolder = soundFolder
        self.transform = transform

    def __len__(self):
        return len(self.table)

    def __getitem__(self, index):
        """Return ``(transformed_audio, label)`` for the row at position ``index``.

        The label is 0 when the gender cell equals "female_feminine" and 1 for
        every other (non-null) value.

        Raises:
            ValueError: if the dataset was built without a transform.
        """
        # Fail before touching the disk: without a transform there is nothing
        # usable to return (the previous behaviour was to print and return None).
        if not self.transform:
            raise ValueError(
                "RecordingDataset needs a transform to produce model inputs; "
                "pass transform=... when constructing the dataset."
            )

        audioFile = os.path.join(self.soundFolder, self.table.iloc[index, self._AUDIO_FILE_COL])
        gender = self.table.iloc[index, self._GENDER_COL]

        rawAudio, samplingRate = librosa.load(audioFile)
        label = 0 if gender == "female_feminine" else 1

        return self.transform((rawAudio, samplingRate)), label


class GetMfcc_channel1(object):
    """Turn an ``(audio, sr)`` tuple into a single-channel MFCC tensor."""

    def __call__(self, collection):
        waveform, sample_rate = collection

        # 13 cepstral coefficients per frame.
        coefficients = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=13)

        # Scale with librosa's default normalisation settings.
        coefficients = librosa.util.normalize(coefficients)

        # Prepend a channel axis: a 2-D (rows, cols) array becomes (1, rows, cols).
        single_channel = np.expand_dims(coefficients, axis=0)

        return torch.tensor(single_channel, dtype=torch.float32)




# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def train_model(model, criterion, optimizer, scheduler, dataloaders,dataset_sizes,printEpoch,num_epochs=25):
    """Train ``model`` and restore the weights of its best validation epoch.

    Every epoch runs a 'train' pass followed by a 'val' pass. Whenever the
    validation accuracy beats the best seen so far, the weights are written to
    a checkpoint in a temporary directory; that checkpoint is loaded back into
    the model before returning.

    Args:
        model: network to optimise (already on ``device``).
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimiser stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per training pass.
        dataloaders: mapping with 'train' and 'val' iterables of (inputs, labels).
        dataset_sizes: mapping with the sample count for 'train' and 'val',
            used to average the loss and accuracy.
        printEpoch: when true, print per-epoch progress.
        num_epochs: number of epochs to run.

    Returns:
        Tuple ``(model, best_acc, train_losses, train_accuracies, val_losses,
        val_accuracies, time_elapsed)``. Losses are floats; accuracies (and
        ``best_acc`` once it has been improved) are 0-dim float64 tensors.
    """
    started_at = time.time()
    history = {
        'train': {'loss': [], 'acc': []},
        'val': {'loss': [], 'acc': []},
    }

    # Checkpoints only need to live for the duration of this call.
    with TemporaryDirectory() as checkpoint_dir:
        checkpoint_path = os.path.join(checkpoint_dir, 'best_model_params.pt')

        # Seed the checkpoint so there is always something to load at the end.
        torch.save(model.state_dict(), checkpoint_path)
        best_acc = 0.0

        for epoch in range(num_epochs):
            if printEpoch:
                print(f'Epoch {epoch}/{num_epochs - 1}')

            for phase in ('train', 'val'):
                is_training = phase == 'train'
                if is_training:
                    model.train()
                else:
                    model.eval()

                loss_total = 0.0
                correct_total = 0

                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    optimizer.zero_grad()

                    # Gradients are tracked only during the training pass.
                    with torch.set_grad_enabled(is_training):
                        outputs = model(inputs)
                        preds = outputs.max(dim=1).indices
                        loss = criterion(outputs, labels)

                        if is_training:
                            loss.backward()
                            optimizer.step()

                    # The loss is a batch mean, so weight it by the batch size.
                    loss_total += loss.item() * inputs.size(0)
                    correct_total += (preds == labels).sum()

                if is_training:
                    scheduler.step()

                epoch_loss = loss_total / dataset_sizes[phase]
                epoch_acc = correct_total.double() / dataset_sizes[phase]
                history[phase]['loss'].append(epoch_loss)
                history[phase]['acc'].append(epoch_acc)

                if printEpoch:
                    print('-' * 10)
                    print(f'{phase} Running Loss: {loss_total:.4f} Dataset Size: { dataset_sizes[phase]}')
                    print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

                # Keep the weights of the best validation epoch.
                if not is_training and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    torch.save(model.state_dict(), checkpoint_path)

        time_elapsed = time.time() - started_at
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        print(f'Best val Acc: {best_acc:4f}')

        # Must happen before the temporary directory is removed.
        model.load_state_dict(torch.load(checkpoint_path))

    return (model, best_acc,
            history['train']['loss'], history['train']['acc'],
            history['val']['loss'], history['val']['acc'],
            time_elapsed)

def get_next_model_id(directory='.'):
    """Return the next free numeric id for a ``models<N>.pth`` checkpoint.

    Args:
        directory: folder to scan; defaults to the current working directory.

    Returns:
        One more than the largest ``N`` among files named exactly
        ``models<N>.pth`` in ``directory``, or 1 when there are none.
    """
    # fullmatch anchors both ends, so names such as "models3.pth.bak" do not
    # count as saved checkpoints.
    pattern = re.compile(r'models(\d+)\.pth')
    existing_ids = [
        int(found.group(1))
        for found in map(pattern.fullmatch, os.listdir(directory))
        if found
    ]
    return max(existing_ids, default=0) + 1



In [16]:

def plot_metrics(train_losses, train_accuracies, val_losses, val_accuracies,totalTime):
    """Print the total training time and plot loss and accuracy curves.

    Two figures are shown, one for losses and one for accuracies, each with
    the training series in blue and the validation series in red. The x axis
    is the 1-based epoch number, derived from the length of ``train_losses``.
    """
    epochs = range(1, len(train_losses) + 1)
    print(f'Training complete in {totalTime // 60:.0f}m {totalTime % 60:.0f}s')

    # (training series, validation series, training label, validation label, title, y label)
    panels = (
        (train_losses, val_losses,
         'Training loss', 'Validation loss',
         'Training and Validation Loss', 'Loss'),
        (train_accuracies, val_accuracies,
         'Training accuracy', 'Validation accuracy',
         'Training and Validation Accuracy', 'Accuracy'),
    )

    for train_series, val_series, train_label, val_label, title, y_label in panels:
        plt.figure(figsize=(10, 5))
        plt.plot(epochs, train_series, 'b', label=train_label)
        plt.plot(epochs, val_series, 'r', label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)
        plt.show()


In [17]:
class YourCNN(nn.Module):
    """Small two-stage CNN that maps a 1-channel feature map to 2 class logits.

    Each stage is conv(3x3, padded) -> conv(2x2) -> max-pool(3x3, stride 1).
    The final linear layer is sized for a 13x22 input, which the two stages
    reduce to a 64 x 7 x 16 volume.
    """

    def __init__(self):
        super().__init__()

        # Trailing comments give the spatial size after each layer for a 13x22 input.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)   # 13 x 22
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=2)             # 12 x 21
        self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=1)                              # 10 x 19
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1)  # 10 x 19
        self.conv4 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=2)             #  9 x 18
        self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=1)                              #  7 x 16
        self.fc1 = nn.Linear(in_features=7 * 16 * 64, out_features=2)

    def forward(self, x):
        # Stage 1
        stage1 = torch.relu(self.conv2(torch.relu(self.conv1(x))))
        stage1 = self.maxpool1(stage1)

        # Stage 2
        stage2 = torch.relu(self.conv4(torch.relu(self.conv3(stage1))))
        stage2 = self.maxpool2(stage2)

        # Flatten everything except the batch axis before the classifier.
        flat = stage2.reshape(stage2.size(0), -1)
        return self.fc1(flat)


In [19]:
# Layer-by-layer torchinfo summary for a batch of four 1 x 13 x 22 inputs,
# the input size the final linear layer of YourCNN is dimensioned for.
batch_size = 4
model = YourCNN()
summary(model, input_size=(batch_size, 1, 13, 22))

Layer (type:depth-idx)                   Output Shape              Param #
YourCNN                                  [4, 2]                    --
├─Conv2d: 1-1                            [4, 16, 13, 22]           160
├─Conv2d: 1-2                            [4, 32, 12, 21]           2,080
├─MaxPool2d: 1-3                         [4, 32, 10, 19]           --
├─Conv2d: 1-4                            [4, 32, 10, 19]           9,248
├─Conv2d: 1-5                            [4, 64, 9, 18]            8,256
├─MaxPool2d: 1-6                         [4, 64, 7, 16]            --
├─Linear: 1-7                            [4, 2]                    14,338
Total params: 34,082
Trainable params: 34,082
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 14.72
Input size (MB): 0.00
Forward/backward pass size (MB): 0.93
Params size (MB): 0.14
Estimated Total Size (MB): 1.07

In [21]:



def CompleteTrain(crit=nn.CrossEntropyLoss(),printEpoch=True,opt="SGD",numb=0.9,lr=0.001,num_epochs=25,batch_size=4):
    """Train a fresh YourCNN, log the run to Results.tsv and save its weights.

    Audio from the "2000clips" folder is cropped with CropLoudest and turned
    into MFCC tensors with GetMfcc_channel1. "train.tsv" drives the training
    pass; "test.tsv" is used as the validation set.

    Args:
        crit: loss module. The default instance is created once at definition
            time and shared between calls.
        printEpoch: forward per-epoch progress printing to ``train_model``.
        opt: "SGD" or "ADAM".
        numb: momentum when ``opt`` is "SGD", weight decay when it is "ADAM".
        lr: initial learning rate (divided by 10 every 7 epochs via StepLR).
        num_epochs: number of training epochs.
        batch_size: batch size for both data loaders.

    Returns:
        ``(model, best_acc, model_id)``. ``best_acc`` is passed through
        unchanged from ``train_model``; ``model_id`` is the number used in the
        saved ``models<id>.pth`` file name.

    Raises:
        ValueError: if ``opt`` is neither "SGD" nor "ADAM".
    """
    # Reject unknown optimisers before any data is loaded; previously this
    # surfaced later as an unbound-variable error.
    if opt not in ("SGD", "ADAM"):
        raise ValueError(f"Unsupported optimizer {opt!r}; expected 'SGD' or 'ADAM'.")

    myTransforms = transforms.Compose([CropLoudest(), GetMfcc_channel1()])
    TrainDataset = RecordingDataset(soundFolder="2000clips", tsvFile="train.tsv", transform=myTransforms)
    TestDataset = RecordingDataset(soundFolder="2000clips", tsvFile="test.tsv", transform=myTransforms)

    dataloaders = {
        "train": DataLoader(TrainDataset, batch_size=batch_size, shuffle=True, num_workers=0),
        "val": DataLoader(TestDataset, batch_size=batch_size, shuffle=False, num_workers=0),
    }
    dataset_sizes = {'train': len(TrainDataset), 'val': len(TestDataset)}

    # The pretrained ResNet-18 that used to be built here was discarded
    # immediately, so only the custom network is created.
    model_conv = YourCNN().to(device)

    if opt == "SGD":
        optimizer_conv = optim.SGD(model_conv.parameters(), lr=lr, momentum=numb)
    else:
        optimizer_conv = optim.Adam(model_conv.parameters(), lr=lr, weight_decay=numb)
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)

    (model_conv, best_acc, train_losses, train_accuracies,
     val_losses, val_accuracies, totalTime) = train_model(
        model_conv, crit, optimizer_conv, exp_lr_scheduler,
        num_epochs=num_epochs, dataloaders=dataloaders,
        dataset_sizes=dataset_sizes, printEpoch=printEpoch)

    # float() accepts both 0-dim tensors and plain numbers, so the log holds
    # bare numbers rather than tensor reprs.
    train_accuracies = [float(acc) for acc in train_accuracies]
    val_accuracies = [float(acc) for acc in val_accuracies]

    #plot_metrics(train_losses, train_accuracies, val_losses, val_accuracies,totalTime)

    model_id = get_next_model_id()
    row = [
        opt, numb, lr, float(best_acc), totalTime, model_id,
        ",".join(map(str, train_losses)),
        ",".join(map(str, train_accuracies)),
        ",".join(map(str, val_losses)),
        ",".join(map(str, val_accuracies)),
    ]
    with open('Results.tsv', 'a') as f:
        f.write("\t".join(map(str, row)) + "\n")

    # Save the model
    torch.save(model_conv.state_dict(), f"models{model_id}.pth")

    return model_conv,best_acc,model_id



In [22]:
# One Adam run with weight decay 0.001 at the default learning rate; per-epoch logging is off.
CompleteTrain(opt="ADAM", numb=0.001, printEpoch=False)

KeyboardInterrupt: 

In [23]:
# Hyper-parameter grid: every learning rate is combined with each SGD momentum
# and then with each Adam weight decay.
SGDnumb=[0.9,0.5,0.09]
ADAMnumb=[0.001,0.005]
lr=[0.001,0.005,0.01]
bestAc=0
bestId=0
for learning_rate in lr:
    # Same run order as before: all SGD settings first, then all Adam settings.
    runs = [("SGD", value) for value in SGDnumb] + [("ADAM", value) for value in ADAMnumb]
    for optimizer_name, hyper_value in runs:
        # Slice off the returned model so it is not kept alive between runs,
        # and avoid binding the builtin name `id` at notebook scope.
        accuracy, model_id = CompleteTrain(opt=optimizer_name, numb=hyper_value,
                                           lr=learning_rate, printEpoch=False)[1:]
        accuracy = float(accuracy)  # plain number; works for tensors and floats
        print(f"Model{model_id} trained")
        if accuracy > bestAc:
            bestAc = accuracy
            bestId = model_id
print(f"Best ID: {bestId} Best Ac: {bestAc}")

Training complete in 8m 17s
Best val Acc: 0.781000
Model1 trained
Training complete in 8m 54s
Best val Acc: 0.668000
Model2 trained
Training complete in 9m 14s
Best val Acc: 0.668000
Model3 trained
Training complete in 9m 14s
Best val Acc: 0.840000
Model4 trained
Training complete in 8m 45s
Best val Acc: 0.500000
Model5 trained
Training complete in 8m 41s
Best val Acc: 0.704000
Model6 trained
Training complete in 9m 48s
Best val Acc: 0.795000
Model7 trained
Training complete in 9m 18s
Best val Acc: 0.778000
Model8 trained
Training complete in 9m 15s
Best val Acc: 0.786000
Model9 trained


KeyboardInterrupt: 

In [23]:
# Baseline run with every default argument: SGD, momentum 0.9, learning rate 0.001, per-epoch logging on.
CompleteTrain()

Epoch 0/24
----------
train Running Loss: 671.5389 Dataset Size: 1000
train Loss: 0.6715 Acc: 0.5650
----------
val Running Loss: 603.7663 Dataset Size: 1000
val Loss: 0.6038 Acc: 0.6850
Epoch 1/24
----------
train Running Loss: 567.2319 Dataset Size: 1000
train Loss: 0.5672 Acc: 0.7210
----------
val Running Loss: 589.3770 Dataset Size: 1000
val Loss: 0.5894 Acc: 0.6620
Epoch 2/24
----------
train Running Loss: 515.2654 Dataset Size: 1000
train Loss: 0.5153 Acc: 0.7410
----------
val Running Loss: 645.2785 Dataset Size: 1000
val Loss: 0.6453 Acc: 0.6720
Epoch 3/24
----------
train Running Loss: 516.5610 Dataset Size: 1000
train Loss: 0.5166 Acc: 0.7460
----------
val Running Loss: 476.5856 Dataset Size: 1000
val Loss: 0.4766 Acc: 0.7860
Epoch 4/24
----------
train Running Loss: 475.8352 Dataset Size: 1000
train Loss: 0.4758 Acc: 0.7660
----------
val Running Loss: 458.9945 Dataset Size: 1000
val Loss: 0.4590 Acc: 0.7910
Epoch 5/24
----------
train Running Loss: 440.5096 Dataset Size: 

(YourCNN(
   (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (conv2): Conv2d(16, 32, kernel_size=(2, 2), stride=(1, 1))
   (maxpool1): MaxPool2d(kernel_size=3, stride=1, padding=0, dilation=1, ceil_mode=False)
   (conv3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (conv4): Conv2d(32, 64, kernel_size=(2, 2), stride=(1, 1))
   (maxpool2): MaxPool2d(kernel_size=3, stride=1, padding=0, dilation=1, ceil_mode=False)
   (fc1): Linear(in_features=7168, out_features=2, bias=True)
 ),
 tensor(0.8370, device='cuda:0', dtype=torch.float64),
 1)