# Training an image classifier

In [1]:
import numpy as np
import pandas as pd
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt

In [6]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

# Define a sequence of transformations to be applied to the images
transform = transforms.Compose([
    # Convert the image to a tensor, scaling pixel values from [0, 255] to [0, 1]
    transforms.ToTensor(),
    # Normalize each channel (R, G, B) with mean 0.5 and standard deviation 0.5
    transforms.Normalize(0.5),(0.5)  # Single-channel normalization
])

class MelSpecDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image = self.data[idx]
        label = self.labels[idx]
        
        # Convert image to float32 and transpose to (C, H, W)        
        if self.transform:
            image = self.transform(image)
        
        # Convert label to tensor
        label = torch.tensor(label, dtype=torch.long)
        
        return image, label

# Load your data
data = np.load('/home/mendu/Thesis/data/magnatagatune/saved_df_data/data_array.npy')
data = data.reshape((11826, 1, 128, 1255))

df = pd.read_pickle('/home/mendu/Thesis/data/magnatagatune/saved_df_data/df_w_embeddings.pkl')
labels = df['tags'].tolist()

# Create a mapping from label strings to integers
label_to_int = {label: idx for idx, label in enumerate(set(labels))}

# Convert labels to integers
int_labels = [label_to_int[label] for label in labels]

# Split the data into train and val sets
train_size = int(data.shape[0] * 0.8)
train_data, val_data = data[:train_size], data[train_size:]
train_labels, val_labels = int_labels[:train_size], int_labels[train_size:]

# Create datasets
train_dataset = MelSpecDataset(train_data, train_labels, transform=transform)
val_dataset = MelSpecDataset(val_data, val_labels, transform=transform)

# Create data loaders
# batch_size = 32  # Set your desired batch size
train_loader = DataLoader(train_dataset, shuffle=False)
val_loader = DataLoader(val_dataset, shuffle=False)


TypeError: Normalize.__init__() missing 1 required positional argument: 'std'

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 3)
        self.conv3 = nn.Conv2d(64, 128, 3)

        self.fc1 = nn.Linear(128 * 14 * 155, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 8)

    def _get_flattened_size(self, input_shape):
        # Forward pass through conv layers to get the final size
        with torch.no_grad():
            x = torch.zeros(1, *input_shape)
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = self.pool(F.relu(self.conv3(x)))
            flattened_size = x.numel()
        return flattened_size

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Instantiate the network
net = Net()

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Train the network
for epoch in range(2):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')


TypeError: Cannot handle this data type: (1, 1, 1255), <f4

In [5]:
(data.reshape((11826, 1, 128, 1255))).shape

(11826, 1, 128, 1255)

In [2]:
# Used to chain together multiple transformations
# first ToTensor() to change the i/p from a PIL image or numpy array to a pytorch tensor, also scales the values from [0,255] to [0,1]
# Normaize; normalises each channel with the given mean and sd
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

In [3]:
batch_size = 4

In [4]:
# trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
#                                         download=True, transform=transform)
# trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
#                                           shuffle=True, num_workers=2)

# testset = torchvision.datasets.CIFAR10(root='./data', train=False,
#                                        download=True, transform=transform)
# testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
#                                          shuffle=False, num_workers=2)

# classes = ('plane', 'car', 'bird', 'cat',
#            'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

In [5]:
# import the ndarray with the mel-specs
data = np.load('/home/mendu/Thesis/data/magnatagatune/saved_df_data/data_array.npy')

# import the metadata
df = pd.read_pickle('/home/mendu/Thesis/data/magnatagatune/saved_df_data/df_w_embeddings.pkl')

# converting the tags column into a list of labels
labels = df['tags'].tolist()

# split the data into train and val sets
train_size = int(data.shape[0]*.8)

# creating a np array of the train data with labels
train_data = []
for i in range(train_size):
    train_data.append([data[i], labels[i]])
    
# creating a np array of the val data
val_data = []
for i in range(train_size, len(data)):
    val_data.append([data[i], labels[i]])

# converting them into tensors
# train_tensor = torch.from_numpy(data_train)
# val_tensor = torch.from_numpy(data_val)

# creating dataloaders
trainloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
testloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size)

In [6]:
# train_data = []
# for i in range(len(data_train)):
    # train_data.append([data_train[i], y_train[i]])

In [15]:
data = np.load('/home/mendu/Thesis/data/magnatagatune/saved_df_data/data_array.npy')
data[]

array([[[0.11372549],
        [0.25882354],
        [0.40392157],
        ...,
        [0.56078434],
        [0.47058824],
        [0.15294118]],

       [[0.15686275],
        [0.25490198],
        [0.46666667],
        ...,
        [0.57254905],
        [0.47058824],
        [0.13333334]],

       [[0.15686275],
        [0.18039216],
        [0.54509807],
        ...,
        [0.6666667 ],
        [0.54509807],
        [0.15294118]],

       ...,

       [[0.16078432],
        [0.2       ],
        [0.15294118],
        ...,
        [0.19215687],
        [0.10196079],
        [0.14901961]],

       [[0.13333334],
        [0.06666667],
        [0.10980392],
        ...,
        [0.14509805],
        [0.1764706 ],
        [0.10980392]],

       [[0.10980392],
        [0.16078432],
        [0.10196079],
        ...,
        [0.17254902],
        [0.12941177],
        [0.08627451]]], dtype=float32)

In [19]:
data.shape

(11826, 128, 1255, 1)

In [9]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 3)
        # self.pool2 = nn.MaxPool2d(3, 3)
        self.conv3 = nn.Conv2d(64, 128, 3)
        # self.pool2 = nn.MaxPool2d(3, 3)
        
        self.fc1_input_size = self.calculate_fc1_input_size((1, 128, 1255))
        self.fc1 = nn.Linear(self.fc1_input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 8)

    def calculate_fc1_input_size(self, input_size):
        # Forward pass through the conv layers to get the final size before the fully connected layers
        x = torch.zeros(1, *input_size)
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        return x.numel()

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()

In [10]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [11]:
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')

RuntimeError: Given groups=1, weight of size [32, 1, 3, 3], expected input[4, 128, 1255, 1] to have 1 channels, but got 128 channels instead

In [None]:
4 * 46 * 128

23552