Inspired by this Vanilla [model](https://towardsdatascience.com/lstm-text-classification-using-pytorch-2c6c657f8fc0)


In [1]:
import math
import csv
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Data
Load the features we previously prepared for our patients.

In [2]:
# Machine-specific absolute path to the prepared feature table.
data_path = 'C:/Users/KHOKHLOVAM/Documents/projects/kotelnikov/data_lstm_test1.csv'
# The file has no header row, so the columns are named 0..1563 explicitly.
# Trailing cells of shorter rows show up as NaN (see the df.head() output).
df =pd.read_csv(data_path, header=None, names=range(1564))

In [3]:
df.head()
# todo - add a column with name of the feature. Feature order is: 
# freq
# flash_d
# bandw
# time_lstm (time since last wavetrain)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1554,1555,1556,1557,1558,1559,1560,1561,1562,1563
0,control,0,23.7,13.1,38.4,9.2,25.4,30.4,38.1,13.4,...,,,,,,,,,,
1,control,0,1.0902,1.1266,1.152,2.4472,0.9652,1.1552,1.2954,1.0184,...,,,,,,,,,,
2,control,0,8.8,4.0,12.8,3.1,13.2,11.3,14.1,6.2,...,,,,,,,,,,
3,control,0,0.148,0.082,0.096,0.082,0.064,0.082,0.128,0.17,...,,,,,,,,,,
4,control,1,39.2,40.5,14.8,36.8,24.6,5.6,29.5,28.1,...,,,,,,,,,,


First simple idea on how to use our data:
1) Take fixed-length sub-sequences from the data, and not very long ones. Do not use any embedding, but use the initial 4 features instead of an embedding; the input into the LSTM will then be:
*batch_size,seq_length (N),embedding_dimension(4)*
**NB:** *embedding dimension is composed of: freq, flash_d, bandw,time_lstm (time since last wavetrain)*

2) Do not care about the separation of the patients for train and validation set for the moment.
3) Use only PD and Control for the moment.


In [124]:
# helper script to cut the data into sub-parts:
def cut_and_store(csv_file, N, save_folder, max_len=1564):
    '''
    Cut each patient's wavetrain features into windows of N samples and store them.

    The input table has no header and holds four consecutive rows per patient,
    in the order freq, flash_d, bandw, time_lstm. Column 0 is the label,
    column 1 the patient id, the remaining columns the NaN-padded values.
    Rows labelled 'ET' are dropped.

    For every window a file `<first_row_index>_<window_index>.csv` with four
    rows (one per feature) is written to `save_folder`, and an index file
    `all_data_200N.csv` (columns: file_name, label, patient_id) is written
    next to them for the data loader.

    csv_file (str): path of the feature table to cut
    N (int): length of every exported window
    save_folder (str): output directory (created if it does not exist)
    max_len (int): number of columns of the table, i.e. 2 + the longest sequence

    Raises ValueError if N is not positive or the rows do not come in groups of 4.
    '''
    if N <= 0:
        raise ValueError('N must be a positive integer')

    # Read the file that was actually passed in (not a notebook-level global).
    df = pd.read_csv(csv_file, header=None, names=range(max_len))
    # drop ET data
    df = df[df[0] != 'ET']
    if len(df) % 4 != 0:
        raise ValueError('expected 4 feature rows per patient')

    os.makedirs(save_folder, exist_ok=True)

    def _row_values(row_idx):
        # Whole row as a flat Python list: [label, patient_id, v0, v1, ...].
        return df.iloc[[row_idx]].values.flatten().tolist()

    def _without_nan(values):
        # NaN cells are only padding up to max_len.
        return [x for x in values if pd.notna(x)]

    records = []
    for i in range(0, len(df), 4):
        # for each patient, make csv files
        head_row = _row_values(i)
        patient_label = head_row[0]
        patient_id = head_row[1]
        features = [_without_nan(_row_values(i + k)[2:]) for k in range(4)]
        num_wavetrains = len(features[0])

        # The window count rule is unchanged from the first version (the last
        # complete window is not exported); the windows are now consecutive,
        # non-overlapping chunks instead of slices shifted by a single sample.
        for j in range(math.floor(num_wavetrains / N) - 1):
            print(f'Patient {i}, subset {j}, label {patient_label}')
            start = j * N
            stop = start + N
            file_name = f'{i}_{j}.csv'
            with open(os.path.join(save_folder, file_name), 'w', newline='') as myfile:
                wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
                for feature in features:
                    wr.writerow(feature[start:stop])
            records.append([file_name, patient_label, patient_id])

    # save for dataloader
    new_df = pd.DataFrame(records, columns=['file_name', 'label', 'patient_id'])
    new_df.to_csv(os.path.join(save_folder, 'all_data_200N.csv'))


In [125]:
# Actual cut: windows of N = 200 wavetrains. The window files and the
# `all_data_200N.csv` index are written into `folder_p`.
folder_p = 'C:/Users/KHOKHLOVAM/Documents/projects/kotelnikov/data/lstm_trials/n200_4_features/'              
cut_and_store('C:/Users/KHOKHLOVAM/Documents/projects/kotelnikov/data_lstm_test1.csv', 200, folder_p)

Patient 0, subset 0, label control
Patient 0, subset 1, label control
Patient 0, subset 2, label control
Patient 0, subset 3, label control
Patient 4, subset 0, label control
Patient 4, subset 1, label control
Patient 4, subset 2, label control
Patient 4, subset 3, label control
Patient 4, subset 4, label control
Patient 4, subset 5, label control
Patient 8, subset 0, label control
Patient 8, subset 1, label control
Patient 8, subset 2, label control
Patient 8, subset 3, label control
Patient 12, subset 0, label control
Patient 12, subset 1, label control
Patient 12, subset 2, label control
Patient 12, subset 3, label control
Patient 12, subset 4, label control
Patient 16, subset 0, label control
Patient 16, subset 1, label control
Patient 16, subset 2, label control
Patient 16, subset 3, label control
Patient 16, subset 4, label control
Patient 20, subset 0, label control
Patient 20, subset 1, label control
Patient 20, subset 2, label control
Patient 20, subset 3, label control
Patien

### Create first training set


In [2]:
from torch.utils.data import Dataset, DataLoader
class PDControlDataset(Dataset):
    """Neurodegenerative feature windows labelled as control (0) or Parkinson (1).

    Every item is a dict ``{'data': ndarray, 'label': int}`` where ``data`` is
    the content of one per-window CSV file read without a header.
    """

    # Both Parkinson variants (PDL and PDR) map to the positive class.
    LABEL_TO_CLASS = {'control': 0, 'PDL': 1, 'PDR': 1}

    def __init__(self, csv_file, root_dir):
        """
        Args:
            csv_file (string): Path to the index csv; column 1 holds the
                window file name and column 2 the textual label.
            root_dir (string): Directory that contains the window files.
        """
        self.df = pd.read_csv(csv_file)
        self.root_dir = root_dir

    def __len__(self):
        """Number of windows listed in the index file."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load window ``idx`` from disk and return it with its numeric label."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        file_path = os.path.join(self.root_dir, self.df.iloc[idx, 1])
        label = self.convert_label(self.df.iloc[idx, 2])
        features = pd.read_csv(file_path, header=None).values
        return {'data': features, 'label': label}

    def convert_label(self, label):
        """Map a textual label to its class index.

        Raises:
            ValueError: if the label is not one of control, PDL, PDR.
        """
        try:
            return self.LABEL_TO_CLASS[label]
        except KeyError:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError('ONLY control, PDL, PDR are currently supported') from None

In [3]:
# The index file and the window files live in the same folder, so that folder
# is passed both as the index location and as root_dir.
folder_p = 'C:/Users/KHOKHLOVAM/Documents/projects/kotelnikov/data/lstm_trials/n200_4_features/'              
dataset = PDControlDataset(folder_p +'/all_data_200N.csv', root_dir=folder_p)

The dataset is very unbalanced... There can be two reasons:
There are more Parkinson patients than the control patients.
There are more wavetrains in Parkinson patients.


In [4]:
# Class balance of the whole dataset: gather every label, then count per class.
labels = [dataset[position]['label'] for position in range(len(dataset))]
values, counts = np.unique(labels, return_counts=True)
print(values, counts)

[0 1] [ 39 104]


In [5]:
# Show only the first sample; nothing is printed for an empty dataset.
if len(dataset) > 0:
    i = 0
    sample = dataset[i]
    print(i, sample)

0 {'data': array([[2.3700e+01, 1.3100e+01, 3.8400e+01, 9.2000e+00, 2.5400e+01,
        3.0400e+01, 3.8100e+01, 1.3400e+01, 3.5200e+01, 3.4800e+01,
        2.1800e+01, 2.5400e+01, 9.9000e+00, 2.6600e+01, 1.3400e+01,
        2.6000e+01, 4.1000e+00, 3.0900e+01, 1.2800e+01, 2.5600e+01,
        2.8200e+01, 1.0600e+01, 4.0500e+01, 1.9600e+01, 3.6800e+01,
        8.8000e+00, 2.5600e+01, 4.5000e+00, 2.1300e+01, 3.4700e+01,
        2.5900e+01, 9.1000e+00, 1.4900e+01, 3.7300e+01, 1.0800e+01,
        1.0000e+00, 2.8000e+01, 3.7200e+01, 1.9200e+01, 3.5200e+01,
        2.2100e+01, 1.0600e+01, 2.5000e+00, 3.3700e+01, 3.7300e+01,
        2.0500e+01, 1.0300e+01, 3.6800e+01, 1.7800e+01, 2.9500e+01,
        2.1300e+01, 2.0200e+01, 7.0000e+00, 1.4100e+01, 2.6000e+00,
        3.5000e+01, 2.9500e+01, 1.0900e+01, 6.9000e+00, 3.0900e+01,
        3.2900e+01, 1.0900e+01, 2.7700e+01, 3.0800e+01, 1.0600e+01,
        1.0600e+01, 3.0900e+01, 9.1000e+00, 2.3400e+01, 2.9500e+01,
        9.9000e+00, 3.8100e+01, 8.000

In [7]:
# Shuffled mini-batches of 8 over the whole dataset (no train/validation split yet).
data_train = DataLoader(dataset, batch_size=8,shuffle=True) # TODO to fix the dataloader ... 

# Model

First LSTM is a bi-directional model

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class LSTM(nn.Module):
    """Single-layer bidirectional LSTM followed by a sigmoid binary head.

    The classifier input is the concatenation of the forward direction's
    output at the last time step and the backward direction's output at the
    first time step, i.e. the two states that have seen the whole sequence.

    Args:
        dimension (int): hidden size of each LSTM direction.
        input_size (int): number of features per time step.
    """

    def __init__(self, dimension=128, input_size = 4):
        super().__init__()

        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=0.5)

        # Two directions are concatenated, hence 2 * dimension inputs.
        self.fc = nn.Linear(2*dimension, 1)

    def forward(self, X, N=None):
        '''
        Return one probability per sequence, shape (batch,).

        X: tensor of shape (batch, seq_len, input_size).
        N: length of a feature vector, i.e. the forward output is read at time
           step N - 1. When omitted, the actual sequence length of X is used
           (previously fixed to 200, which only worked for 200-step inputs).
        '''
        lstm_output, _ = self.lstm(X)
        if N is None:
            N = lstm_output.size(1)

        # The first `dimension` channels are the forward pass, the rest the backward pass.
        out_forward = lstm_output[:, N - 1, :self.dimension]
        out_reverse = lstm_output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)

        logits = self.fc(self.drop(out_reduced))
        return torch.sigmoid(torch.squeeze(logits, 1))

# Training

In [18]:
# Training Function

def _evaluate_binary(model, loader, criterion, run_device):
    """Return (mean batch loss, accuracy) of `model` over `loader`, without gradients."""
    total_loss = 0.0
    total_correct = 0
    total_predicted = 0
    num_batches = 0
    model.eval()
    with torch.no_grad():
        for data in loader:
            labels = data['label'].double().to(run_device)
            X = torch.transpose(data['data'], 2, 1).to(run_device)
            output = model(X)
            total_loss += criterion(output, labels).item()
            # binary accuracy: outputs are rounded as probabilities
            total_correct += (output.round() == labels).sum().item()
            total_predicted += len(labels)
            num_batches += 1
    model.train()
    mean_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = total_correct / total_predicted if total_predicted else float('nan')
    return mean_loss, accuracy


def train(model,
          optimizer,
          train_iter,
          criterion = nn.BCELoss(),
          num_epochs = 5,
          best_valid_loss = float("Inf")):
    """Train `model` and report loss/accuracy once per epoch.

    There is no separate validation set yet: the "validation" figures are
    computed on `train_iter` itself, in eval mode (dropout off).

    Args:
        model: module mapping a (batch, seq, features) tensor to one
            probability per sequence.
        optimizer: optimizer holding the model parameters.
        train_iter: sized iterable of batches, each a dict with 'data'
            (batch, features, seq) and 'label' (batch,).
        criterion: loss applied to (output, labels).
        num_epochs: number of passes over `train_iter`.
        best_valid_loss: reserved for checkpointing, currently unused.

    Returns:
        dict with the per-evaluation lists 'train_loss', 'valid_loss',
        'valid_accuracy' and 'global_steps'.
    """
    # Use the device the model lives on rather than a notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    eval_every = len(train_iter)
    running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    valid_accuracy_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for data in train_iter:
            labels = data['label'].double().to(run_device)
            # stored windows are (features, seq); the model expects (seq, features)
            X = torch.transpose(data['data'], 2, 1).to(run_device)
            output = model(X)

            loss = criterion(output, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            if global_step % eval_every != 0:
                continue

            # evaluation step (once per epoch)
            average_valid_loss, accuracy = _evaluate_binary(
                model, train_iter, criterion, run_device)
            average_train_loss = running_loss / eval_every
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            valid_accuracy_list.append(accuracy)
            global_steps_list.append(global_step)

            # resetting running values
            running_loss = 0.0

            # print progress
            print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f} Accuracy: {:.4f}'
                  .format(epoch+1, num_epochs, global_step, num_epochs*len(train_iter),
                          average_train_loss, average_valid_loss, accuracy))

    # TODO: checkpoint the model whenever the validation loss improves.
    return {'train_loss': train_loss_list,
            'valid_loss': valid_loss_list,
            'valid_accuracy': valid_accuracy_list,
            'global_steps': global_steps_list}


# Fresh bidirectional model with default sizes; Adam with lr=0.01 for the first run.
model = LSTM().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)



In [10]:
# LR 0.01
train(model=model.to(device).double(), optimizer=optimizer,train_iter=data_train, num_epochs=15)

Epoch [1/15], Step [18/270], Train Loss: 0.6426, Valid Loss: 0.6216 Accuracy: 0.7273
Epoch [2/15], Step [36/270], Train Loss: 0.6014, Valid Loss: 0.5831 Accuracy: 0.7273
Epoch [3/15], Step [54/270], Train Loss: 0.6017, Valid Loss: 0.5848 Accuracy: 0.7273
Epoch [4/15], Step [72/270], Train Loss: 0.5832, Valid Loss: 0.5822 Accuracy: 0.7273
Epoch [5/15], Step [90/270], Train Loss: 0.6195, Valid Loss: 0.5814 Accuracy: 0.7273
Epoch [6/15], Step [108/270], Train Loss: 0.6237, Valid Loss: 0.6287 Accuracy: 0.7273
Epoch [7/15], Step [126/270], Train Loss: 0.6427, Valid Loss: 0.5960 Accuracy: 0.7273
Epoch [8/15], Step [144/270], Train Loss: 0.5869, Valid Loss: 0.5784 Accuracy: 0.7273
Epoch [9/15], Step [162/270], Train Loss: 0.6142, Valid Loss: 0.5779 Accuracy: 0.7273
Epoch [10/15], Step [180/270], Train Loss: 0.6340, Valid Loss: 0.5819 Accuracy: 0.7273


KeyboardInterrupt: 

In [99]:
# LR 0.001
# NOTE(review): the optimizer cell visible in this notebook uses lr=0.01; the
# 0.001 presumably comes from an edit of that cell that was not kept - confirm.
train(model=model.to(device).double(), optimizer=optimizer,train_iter=data_train, num_epochs=1)

tensor([0.6457, 0.6342, 0.6442], dtype=torch.float64) tensor([1., 1., 1.], dtype=torch.float64)
tensor([0.6390, 0.6542, 0.6567], dtype=torch.float64) tensor([1., 0., 1.], dtype=torch.float64)
tensor([0.6437, 0.6311, 0.6260], dtype=torch.float64) tensor([1., 1., 1.], dtype=torch.float64)
tensor([0.6587, 0.6576, 0.6467], dtype=torch.float64) tensor([1., 0., 0.], dtype=torch.float64)
tensor([0.6503, 0.6423, 0.6562], dtype=torch.float64) tensor([1., 0., 1.], dtype=torch.float64)
tensor([0.6430, 0.6152, 0.6373], dtype=torch.float64) tensor([1., 0., 0.], dtype=torch.float64)
tensor([0.6408, 0.6537, 0.6535], dtype=torch.float64) tensor([1., 1., 1.], dtype=torch.float64)
tensor([0.6405, 0.6306, 0.6690], dtype=torch.float64) tensor([1., 0., 0.], dtype=torch.float64)
tensor([0.6342, 0.6542, 0.6483], dtype=torch.float64) tensor([1., 0., 0.], dtype=torch.float64)
tensor([0.6440, 0.6490, 0.6519], dtype=torch.float64) tensor([0., 1., 0.], dtype=torch.float64)
tensor([0.6433, 0.6478, 0.6569], dtype=t

## Conclusion:
    The simplistic initial model doesn't work on our data. The cause can be the model itself, the fact that the data are not normalized, or the fact that the dataset is not normally distributed. The next step is to try a model without the bi-directional approach, i.e. a simple one-directional LSTM for classification. It is nevertheless very close to the current model. 
    

Here is an attempt to run the same model with data sampled more uniformly, the  train_iter compensates for under represented class. This way it seems to work, but since it is using same training and evaluation data, the results are non-significant. 

Let's try to add oversampling for the minority class now

In [100]:
from torch.utils.data import WeightedRandomSampler
batch_size = 8
# WeightedRandomSampler expects ONE weight PER SAMPLE (it draws indices in
# range(len(weights))), not one weight per class. Every sample gets the inverse
# frequency of its class so that both classes are drawn equally often.
# Labels are taken from the index table to avoid re-reading every window file.
sample_labels = dataset.df.iloc[:, 2].map(dataset.convert_label).to_numpy()
class_counts = np.bincount(sample_labels, minlength=2)
sample_weights = (1.0 / class_counts[sample_labels]).tolist()
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(dataset), replacement=True)
train_iter = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

NameError: name 'train_sampler' is not defined

In [99]:
# Restart from a freshly initialised model (Adam, lr=0.001) on the re-sampled loader.
# The model is cast to float64 because train() feeds the labels as double tensors.
# NOTE(review): the TypeError recorded below suggests that the later, two-loader
# version of train() was the one defined in the kernel when this cell ran - confirm.
model = LSTM().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
train(model=model.to(device).double(), optimizer=optimizer,train_iter=train_iter, num_epochs=15)

TypeError: train() missing 1 required positional argument: 'val_iter'

When we do the balancing, we manage to overtrain the model. It does not mean too much, however, it is already a better result than previously.

## Vanilla 2
The first simple model didn't work without dataset balancing, so I am trying an even simpler model from [here](https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/)


In [5]:
class VanillaParkinsonNet(nn.Module):
    """Unidirectional multi-layer LSTM followed by dropout, a linear layer and a sigmoid.

    Args:
        output_size (int): number of output units of the linear head.
        feat_dim (int): number of features per time step.
        hidden_dim (int): LSTM hidden size.
        n_layers (int): number of stacked LSTM layers.
        drop_prob (float): dropout between LSTM layers and before the head.
    """

    def __init__(self, output_size, feat_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.feat_dim = feat_dim

        self.lstm = nn.LSTM(self.feat_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """Return (probability per sequence of shape (batch,), new hidden state).

        x: (batch, seq_len, feat_dim); hidden: (h0, c0) as built by init_hidden.
        """
        lstm_out, hidden = self.lstm(x, hidden)

        # Only the last time step is ever returned, so the head runs on that step
        # alone instead of on every step. In eval mode the result is identical to
        # the previous all-steps computation; in train mode only the dropout mask
        # that is drawn differs.
        last_step = lstm_out[:, -1, :]
        out = self.sigmoid(self.fc(self.dropout(last_step)))

        # As before, only the last output unit is kept (the only one for output_size=1).
        out = out[:, -1]
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled (h0, c0) with the dtype and device of the model weights."""
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        # new_zeros follows the model's own device, so no global `device` is needed.
        return (weight.new_zeros(shape), weight.new_zeros(shape))


In [6]:
# Prefer the first GPU when one is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
feat_size = 4      # freq, flash_d, bandw, time_lstm
output_size = 1    # a single probability per window
hidden_dim = 128
n_layers = 2
batch_size = 8
# Still the full, unbalanced dataset: training and evaluation share the same data here.
train_iter = DataLoader(dataset, batch_size=batch_size,shuffle=True) # TODO to fix the dataloader ... 

model = VanillaParkinsonNet(output_size,feat_size, hidden_dim, n_layers)
# float64 weights, since the training loop below casts the labels with .double()
model.to(device).double()

lr=0.005
# The model already ends with a sigmoid, so plain BCELoss on probabilities is used.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [26]:
epochs = 10
counter = 0
print_every = 100
clip = 5
# `np.Inf` was removed in NumPy 2.0; `np.inf` works on every NumPy version.
valid_loss_min = np.inf

model.train()
for i in range(epochs):
    for data in train_iter:
        counter += 1
        labels = data['label'].double().to(device)
        inputs = torch.transpose(data['data'], 2, 1).to(device)
        # Windows are independent, so every batch starts from a fresh zero state.
        h = model.init_hidden(len(labels))
        model.zero_grad()
        output, h = model(inputs, h)
        # view(-1) keeps a batch of one as shape (1,); squeeze() would turn it into
        # a 0-d tensor that no longer matches the labels.
        loss = criterion(output.view(-1), labels)
        loss.backward()
        # Gradient clipping against exploding LSTM gradients.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if counter % print_every == 0:
            val_losses = []
            model.eval()
            # No gradients are needed for evaluation: saves memory and time.
            with torch.no_grad():
                for data in train_iter:
                    lab = data['label'].double().to(device)
                    inp = torch.transpose(data['data'], 2, 1).to(device)
                    # Attention - I reinit this hidden for each batch
                    val_h = model.init_hidden(len(lab))
                    out, val_h = model(inp, val_h)
                    val_loss = criterion(out.view(-1), lab)
                    val_losses.append(val_loss.item())

            model.train()
            mean_val_loss = np.mean(val_losses)
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(mean_val_loss))
            if mean_val_loss <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,mean_val_loss))
                valid_loss_min = mean_val_loss

Epoch: 6/10... Step: 100... Loss: 0.000038... Val Loss: 0.000038
Validation loss decreased (inf --> 0.000038).  Saving model ...


As previously, we can overfit the model to the class. The experiments have only shown that it is possible to overfit an LSTM on our data, and that class balancing is very important. The next set of experiments is an attempt to train the model in a correct manner, that is, to separate the data into training and validation sets. To start, let's just use a random split which does not take the patient IDs into account. 

## Correct train and validation data

In [9]:
from torch.utils.data import  random_split
# 120 training / 23 held-out windows; the two sizes are hard-coded for the
# 143 windows counted earlier (39 control + 104 PD).
# NOTE(review): the split is made per window, not per patient, and no generator
# seed is passed, so every run presumably yields a different split - confirm.
train_data, test_data = random_split(dataset, [120,23])

In [10]:
# Class balance of the training subset only.
labels = [train_data[position]['label'] for position in range(len(train_data))]
values, counts = np.unique(labels, return_counts=True)
print(values, counts) 

[0 1] [37 83]


In [12]:
# let's try to train the first model again! We augment the train_data, but not the test_data
# NOTE: the sampler line below is commented out, so `sample_weights` is not used
# and `train_iter` iterates train_data in its stored order, without re-sampling
# or shuffling; no augmentation actually happens in this cell.
batch_size = 8
sample_weights = torch.from_numpy(np.array([33/(33+87),])).double()
#sampler = WeightedRandomSampler(weights=sample_weights,num_samples=len(train_data), replacement = True)
train_iter = DataLoader(train_data, batch_size=batch_size) 
val_iter = DataLoader(test_data, batch_size=batch_size) 

In [82]:
sample_weights

tensor([0.2750], dtype=torch.float64)

In [85]:
# check balanced dataset: print the labels of every training batch, then the
# overall number of samples per class seen in one pass over train_iter
labels_ = []
for  i, data in enumerate(train_iter):      
    labels = data['label'].tolist()
    print(labels)
    labels_.extend(labels)
values, counts = np.unique(labels_, return_counts=True)
print(values, counts) 
#### !!!!!!!!!!!! Random sampling doesn't work for some reason
# NOTE(review): in the loader cell shown above the sampler is commented out, so
# the unchanged class ratio is presumably expected rather than a failure - confirm.

[1, 1, 0, 1, 1, 1, 0, 1]
[1, 1, 1, 0, 1, 1, 1, 1]
[1, 1, 1, 0, 1, 1, 0, 1]
[1, 1, 0, 1, 0, 1, 1, 1]
[0, 0, 1, 1, 1, 1, 1, 0]
[1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 0, 0, 0, 1, 1, 1]
[1, 0, 1, 0, 0, 1, 0, 1]
[1, 1, 0, 1, 1, 0, 1, 0]
[1, 0, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 0, 0, 0, 0]
[1, 1, 0, 1, 1, 0, 0, 1]
[1, 0, 1, 1, 0, 1, 1, 1]
[1, 0, 1, 1, 1, 1, 1, 1]
[1, 0, 1, 1, 1, 1, 1, 0]
[0 1] [33 87]


In [41]:
len(train_iter), len(val_iter)

(15, 3)

### Redefine train function, so that it uses val dataloader
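Also a different loss function is used here: [BCEWithLogits](https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html). Note that this loss applies the sigmoid internally, so it expects raw logits (not probabilities) as input.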

In [17]:
# Training Function
# Module-level histories, filled by every call of train() below.
loss_train = []       # loss of every training batch
accuracy_train = []   # training accuracy of every epoch


def _evaluate_binary(model, loader, criterion, run_device):
    """Return (mean batch loss, accuracy) of `model` over `loader`, without gradients."""
    total_loss = 0.0
    total_correct = 0
    total_predicted = 0
    num_batches = 0
    model.eval()
    with torch.no_grad():
        for data in loader:
            labels = data['label'].double().to(run_device)
            X = torch.transpose(data['data'], 2, 1).to(run_device)
            output = model(X)
            total_loss += criterion(output, labels).item()
            # binary accuracy: outputs are rounded as probabilities
            total_correct += (output.round() == labels).sum().item()
            total_predicted += len(labels)
            num_batches += 1
    model.train()
    mean_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = total_correct / total_predicted if total_predicted else float('nan')
    return mean_loss, accuracy


def train(model,
          optimizer,
          train_iter,
          val_iter,
          criterion = nn.BCEWithLogitsLoss(),
          num_epochs = 10,
          best_valid_loss = float("Inf")):
    """Train `model` on `train_iter` and evaluate it on `val_iter` once per epoch.

    Side effects: appends every batch loss to the module-level `loss_train` and
    every epoch's training accuracy to `accuracy_train`.

    NOTE: the default criterion (BCEWithLogitsLoss) expects raw logits, whereas
    the accuracy is computed by rounding the outputs as probabilities. The
    `LSTM` class of this notebook already applies a sigmoid in forward(); for
    such a model pass `criterion=nn.BCELoss()`.

    Args:
        model: module mapping a (batch, seq, features) tensor to one value per sequence.
        optimizer: optimizer holding the model parameters.
        train_iter: sized iterable of training batches, each a dict with 'data'
            (batch, features, seq) and 'label' (batch,).
        val_iter: sized iterable of validation batches, same format.
        criterion: loss applied to (output, labels).
        num_epochs: number of passes over `train_iter`.
        best_valid_loss: reserved for checkpointing, currently unused.

    Returns:
        dict with the per-evaluation lists 'train_loss', 'valid_loss',
        'valid_accuracy' and 'global_steps'.
    """
    # Use the device the model lives on rather than a notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    eval_every = len(train_iter)
    running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    valid_accuracy_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        total_correct_t = 0
        total_predicted_t = 0
        for data in train_iter:
            labels = data['label'].double().to(run_device)
            # stored windows are (features, seq); the model expects (seq, features)
            X = torch.transpose(data['data'], 2, 1).to(run_device)
            output = model(X)

            loss = criterion(output, labels)
            loss_train.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # accuracy train metric (binary accuracy on the pre-step outputs)
            total_correct_t += (output.detach().round() == labels).sum().item()
            total_predicted_t += len(labels)

            # update running values
            running_loss += loss.item()
            global_step += 1

            if global_step % eval_every != 0:
                continue

            # evaluation step (once per epoch). The validation loss is averaged
            # over the validation batches, not over len(train_iter) as before.
            average_valid_loss, accuracy = _evaluate_binary(
                model, val_iter, criterion, run_device)
            average_train_loss = running_loss / eval_every
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            valid_accuracy_list.append(accuracy)
            global_steps_list.append(global_step)

            # resetting running values
            running_loss = 0.0

            # print progress
            print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f} Accuracy: {:.4f}'
                  .format(epoch+1, num_epochs, global_step, num_epochs*len(train_iter),
                          average_train_loss, average_valid_loss, accuracy))

        train_accuracy = (total_correct_t / total_predicted_t
                          if total_predicted_t else float('nan'))
        accuracy_train.append(train_accuracy)
        print(f'Train accuracy {train_accuracy}')

    # TODO: checkpoint the model whenever the validation loss improves.
    return {'train_loss': train_loss_list,
            'valid_loss': valid_loss_list,
            'valid_accuracy': valid_accuracy_list,
            'global_steps': global_steps_list}

# Fresh bidirectional model; a much smaller learning rate (1e-4) than in the earlier runs.
model = LSTM().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)



In [None]:
train(model=model.to(device).double(), optimizer=optimizer,train_iter=train_iter, val_iter=val_iter, num_epochs=10)

tensor([0., 1., 0., 1., 0., 1., 0., 0.], dtype=torch.float64,
       grad_fn=<RoundBackward0>) tensor([1., 0., 1., 1., 1., 1., 1., 1.], dtype=torch.float64)
tensor([1., 1., 1., 0., 1., 0., 1., 0.], dtype=torch.float64,
       grad_fn=<RoundBackward0>) tensor([1., 1., 1., 0., 1., 0., 0., 1.], dtype=torch.float64)
tensor([1., 1., 0., 1., 1., 1., 0., 1.], dtype=torch.float64,
       grad_fn=<RoundBackward0>) tensor([1., 1., 0., 0., 1., 1., 0., 1.], dtype=torch.float64)
tensor([1., 1., 1., 0., 1., 0., 1., 0.], dtype=torch.float64,
       grad_fn=<RoundBackward0>) tensor([1., 1., 1., 0., 1., 1., 0., 1.], dtype=torch.float64)
tensor([1., 0., 1., 1., 1., 1., 1., 1.], dtype=torch.float64,
       grad_fn=<RoundBackward0>) tensor([1., 1., 1., 0., 1., 1., 1., 1.], dtype=torch.float64)
tensor([1., 1., 0., 1., 1., 1., 0., 1.], dtype=torch.float64,
       grad_fn=<RoundBackward0>) tensor([1., 0., 0., 1., 1., 1., 1., 1.], dtype=torch.float64)
tensor([1., 1., 1., 1., 0., 1., 0., 0.], dtype=torch.float

As before, we overfit on the training data: validation accuracy is low, and validation loss only increases during training. Moreover, a single label is predicted for validation, no matter the input data. This is weird behavior, because different labels are present in the training data, and a special loss is used here that is supposed to take unbalanced data into account (note, however, that `BCEWithLogitsLoss` only compensates for class imbalance when its `pos_weight` argument is set).

Tips and Tricks for LSTM
* It is possible to use a TimeDistributed LSTM [Keras implementation](https://stackoverflow.com/questions/47410239/how-to-feed-into-lstm-with-4-dimensional-input)
* Explanation about inputs and outputs of LSTM [here](https://stackoverflow.com/questions/49466894/how-to-correctly-give-inputs-to-embedding-lstm-and-linear-layers-in-pytorch)
* Batching and collation. torch.utils.data.DataLoader is an iterator which provides all these features. Parameters used below should be clear. One parameter of interest is collate_fn. You can specify how exactly the samples need to be batched using collate_fn. [Tutorial](https://pytorch.org/tutorials/recipes/recipes/custom_dataset_transforms_loader.html).
* Unbalanced data. Dataloader with sampling [torch](https://towardsdatascience.com/demystifying-pytorchs-weightedrandomsampler-by-example-a68aceccb452).

In [None]:
# LSTM tests: a stand-alone layer with the same configuration as the one inside
# the LSTM classifier (4 input features, 128 hidden units per direction,
# batch-first, bidirectional), for inspecting input/output shapes.
lstm = nn.LSTM(input_size=4,
                            hidden_size=128,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)

In [None]:
# Build a single-sample batch from the first dataset item.
# input = torch.randn(5, 3, 10)
# NOTE(review): `input` shadows the Python builtin of the same name.
input =  dataset.__getitem__(0)['data']
input_torch = torch.from_numpy(input)
# presumably (features, seq) -> (seq, features) - TODO confirm the stored layout
input_torch = torch.transpose(input_torch, 0, 1)
# add a leading batch axis of size 1 and cast to float32, the default dtype of nn.LSTM weights
input_torch = torch.unsqueeze(input_torch, dim=0).to(torch.float32)
input_torch.shape