In [None]:
# Mount Google Drive at /content/drive (this import only exists inside Google Colab).
# NOTE(review): the cells visible here read their data from a URL rather than
# from Drive -- confirm that a later cell actually needs this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split, DataLoader

import numpy as np
import pandas as pd
# import utils

import matplotlib.pyplot as plt
%matplotlib inline

First, we create a class that represents the dataset and holds both the feature data and the labels.

In [None]:
class Dataset:
    '''
    In-memory tabular dataset read from a CSV file.

    Attributes:
        df: the DataFrame exactly as returned by ``pd.read_csv``.
        data: float32 array of shape (n_rows, n_columns - 1) holding the features.
        labels: float32 array of shape (n_rows, 1) holding the numeric labels.
        label_names: list mapping an encoded label index to the original label
            value, or None when the label column was already numeric.
    '''

    def __init__(self, path, header = 'infer'):
        '''
        Reads a csv dataset with the assumption that the last column is a categorical label column.

        Args:
            path: anything ``pd.read_csv`` accepts (file path, URL, file-like object).
            header: forwarded to ``pd.read_csv``; pass None for files without a header row.
        '''
        self.df = pd.read_csv(path, header = header)

        values = self.df.values
        self.data = values[:, :-1].astype('float32')

        raw_labels = values[:, -1]
        self.label_names = None

        # If the label is not a number, replace every distinct value by an integer index.
        # The distinct values are sorted first so that the value -> index mapping is the
        # same on every run (iterating a bare set of strings is not order-stable across
        # interpreter sessions).
        if not np.issubdtype(raw_labels.dtype, np.number):
            self.label_names = sorted(set(raw_labels), key = str)
            index_of = {name: idx for idx, name in enumerate(self.label_names)}
            raw_labels = [index_of[name] for name in raw_labels]

        # Labels are always float32 column vectors, matching the dtype of `data`.
        self.labels = np.asarray(raw_labels, dtype = 'float32').reshape(-1, 1)

    def __len__(self):
        '''Number of rows (samples) in the dataset.'''
        return len(self.data)

    def __getitem__(self, idx):
        '''Returns the (features, label) pair stored at position `idx`.'''
        return (self.data[idx], self.labels[idx])

    def __repr__(self):
        '''Shows the underlying DataFrame.'''
        return repr(self.df)

    def split_data(self, test_ratio = 0.3, seed = None):
        '''
        Splits data into training and test sets.

        Args:
            test_ratio: fraction of the rows (between 0 and 1) assigned to the test set.
            seed: optional integer; when given, the split is drawn from a dedicated
                generator seeded with it, so the same seed yields the same split.
                When None, the global PyTorch random state is used.

        Returns:
            The (train, test) subsets produced by ``random_split``.

        Raises:
            ValueError: if `test_ratio` lies outside [0, 1].
        '''
        if not 0 <= test_ratio <= 1:
            raise ValueError("test_ratio must be between 0 and 1, got {}".format(test_ratio))

        test_len = round(test_ratio * len(self.data))
        train_len = len(self.data) - test_len

        if seed is None:
            return random_split(self, [train_len, test_len])

        generator = torch.Generator().manual_seed(seed)
        return random_split(self, [train_len, test_len], generator = generator)

In [None]:
# Read the CSV straight from the URL. header = None tells pandas not to treat the
# first row as column names, so the columns are numbered 0, 1, 2, ...
dataset = Dataset("https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv", header = None)

In [None]:
# Dataset.__repr__ delegates to the underlying DataFrame, so this prints the
# frame's text representation.
print(dataset)

         0       1       2       3       4       5       6       7       8   \
0    0.0200  0.0371  0.0428  0.0207  0.0954  0.0986  0.1539  0.1601  0.3109   
1    0.0453  0.0523  0.0843  0.0689  0.1183  0.2583  0.2156  0.3481  0.3337   
2    0.0262  0.0582  0.1099  0.1083  0.0974  0.2280  0.2431  0.3771  0.5598   
3    0.0100  0.0171  0.0623  0.0205  0.0205  0.0368  0.1098  0.1276  0.0598   
4    0.0762  0.0666  0.0481  0.0394  0.0590  0.0649  0.1209  0.2467  0.3564   
..      ...     ...     ...     ...     ...     ...     ...     ...     ...   
203  0.0187  0.0346  0.0168  0.0177  0.0393  0.1630  0.2028  0.1694  0.2328   
204  0.0323  0.0101  0.0298  0.0564  0.0760  0.0958  0.0990  0.1018  0.1030   
205  0.0522  0.0437  0.0180  0.0292  0.0351  0.1171  0.1257  0.1178  0.1258   
206  0.0303  0.0353  0.0490  0.0608  0.0167  0.1354  0.1465  0.1123  0.1945   
207  0.0260  0.0363  0.0136  0.0272  0.0214  0.0338  0.0655  0.1400  0.1843   

         9   ...      51      52      53      54   

We have successfully read the dataset into NumPy arrays. We now need to convert the data into tensors and create batches to feed into our neural network.

Since the data is already normalized to be between 0 and 1, we do not need to perform any more preprocessing steps.

## Logistic Regression

Before training a full-fledged neural network, let's first try logistic regression to see how it performs on this dataset. We can build a logistic regression model in PyTorch by defining a single neuron with a sigmoid activation.

We split the data into 70% training set and 30% test set.

In [None]:
# Seed PyTorch's global random state so that the random train/test split below
# (and everything drawn from that state afterwards) is identical on every
# Restart & Run All.
torch.manual_seed(42)

train_data, test_data = dataset.split_data(test_ratio = 0.3)

# Data Loaders for Logistic Regression
# batch_size = 1 means one sample per optimisation step. The training loader
# reshuffles the samples at the start of every epoch; the test loader keeps a
# fixed order because order does not matter for evaluation.
train_loader_lr = DataLoader(train_data, batch_size = 1, shuffle = True)
test_loader_lr = DataLoader(test_data, batch_size = 1)

In [None]:
# Number of input features per sample: the column count of the feature matrix
# (every CSV column except the last one, which Dataset treats as the label).
N_FEATURES = dataset.data.shape[1]

In [None]:
class LogisticRegression(nn.Module):
    '''
    Logistic regression written as one linear unit followed by a sigmoid.

    Args:
        in_features: number of input features per sample. When omitted, the
            module-level N_FEATURES is used, so ``LogisticRegression()`` keeps
            working exactly as before.
    '''
    def __init__(self, in_features = None):
        super().__init__()

        # Only fall back to the global when the caller did not give a width,
        # which lets the class be reused on data with a different feature count.
        if in_features is None:
            in_features = N_FEATURES

        # A single neuron
        self.neuron = nn.Linear(in_features = in_features, out_features = 1)

    def forward(self, x):
        '''
        Maps a batch of feature vectors to values in (0, 1).

        Args:
            x: tensor whose last dimension has `in_features` entries.

        Returns:
            Tensor with the same leading dimensions as `x` and a last dimension of 1.
        '''
        # Sigmoid activation on a single neuron, basically makes it logistic regression
        return torch.sigmoid(self.neuron(x))

We select the L2 loss (mean squared error) as our loss function and stochastic gradient descent (SGD) as the optimizer.

To train the model, we run a maximum of 100 epochs. If the change in loss between two consecutive epochs falls below some tolerance level, we break out of the loop.

In [None]:
# Loss function: mean squared error between the sigmoid output and the label
loss_fn_lr = nn.MSELoss()

# Model and the optimizer that updates its parameters (SGD with momentum)
lr = LogisticRegression()
optimizer_lr = optim.SGD(
    params = lr.parameters(),
    lr = 0.001,
    momentum = 0.9,
)

In [None]:
# Training our model
loss_hist_lr = []    # average training loss per epoch (0-dim tensors without autograd history)
acc_hist_lr = []     # training accuracy per epoch (floats)
# Number of training samples (not batches), so the averages below stay correct
# for any batch size.
total_train_size = len(train_loader_lr.dataset)
TOLERENCE = 0.0001   # stop once the epoch-to-epoch change in loss is smaller than this
# Start from infinity so the very first epoch can never satisfy the stopping test.
prev_loss = float('inf')

for epoch in range(100):
    curr_loss = 0
    correct = 0
    for input_data, label in train_loader_lr:
        batch_size = label.size(0)

        # Forward prop
        output = lr(input_data)
        loss = loss_fn_lr(output, label)
        # Accumulate a detached copy: adding `loss` itself would keep the autograd
        # graph of every iteration alive until the end of the epoch. The loss is a
        # per-batch mean, so weight it by the batch size to get a per-sample sum.
        curr_loss += loss.detach() * batch_size

        # Backprop
        optimizer_lr.zero_grad()
        loss.backward()
        optimizer_lr.step()

        # Accuracy measure: count the predictions in this batch that round to the label
        correct += (torch.round(output) == label).sum().item()

    avg_loss = curr_loss / total_train_size
    accuracy = correct / total_train_size
    loss_hist_lr.append(avg_loss)
    acc_hist_lr.append(accuracy)

    # Early stopping: the loss has stopped changing between consecutive epochs
    if abs(avg_loss - prev_loss) < TOLERENCE:
        break
    else:
        prev_loss = avg_loss

In [None]:
# Report the metrics of the last completed epoch, one per line
print(
    "Train accuracy: {:.4f}".format(round(acc_hist_lr[-1], 4)),
    "Train loss: {:.4f}".format(round(loss_hist_lr[-1].item(), 4)),
    sep = "\n",
)

Train accuracy: 0.7877
Train loss: 0.1622
