# Training on Google Colab

This notebook trains the model in Google Colab, which lets us take advantage of the free GPU support available in Colab notebooks.

This notebook should be used as follows:
1. Upload the notebook to Google Drive
1. Open the notebook in Google Colab
1. Mount the Google Drive and set the path to the data stored in the Google Drive
    - if the drive cannot be mounted, then the notebook will need to be run locally (train_network_local.ipynb)

In [2]:
import copy
import glob
import os
import random # to set the python random seed
import time

import numpy as np # to set the numpy random seed
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor
import torch.nn.functional as TF
import torchvision.transforms.functional as TVF
from torchmetrics.functional import accuracy
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, transforms
# Ignore excessive warnings: raise the root logger's threshold to ERROR.
import logging
# NOTE(review): `propagate` is an attribute of Logger objects; assigning it on
# the `logging` module itself only creates an unused module attribute and does
# not silence anything. The `setLevel` call below is what has an effect.
logging.propagate = False 
logging.getLogger().setLevel(logging.ERROR)

# WandB – Import the wandb library
# import wandb
from pytorch_lightning import LightningModule, seed_everything, Trainer
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger

from tqdm.notebook import tqdm

In [None]:
# Mount Google Drive at /content/gdrive (only works inside Google Colab).
# NOTE(review): the cells below use the relative prefix './gdrive/...', which
# resolves to this mount point only when the working directory is /content —
# presumably the Colab default; verify if the notebook is run elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')


In [4]:
ls ./gdrive/MyDrive/aos_c205_final_project_data/data/

[0m[01;34msta_chunks[0m/              sta_missing_data.txt  sta_validation_set.txt
sta_dataset_cleaned.txt  sta_test_set.txt
sta_icme_list.txt        sta_train_set.txt


In [5]:
# Folder on the mounted drive that holds the label tables (`sta_*_set.txt`)
# and the `sta_chunks/` directory, as shown by the listing in the previous cell.
data_path = './gdrive/MyDrive/aos_c205_final_project_data/data/'

In [10]:
# Seed the random number generators through Lightning's helper so runs are repeatable.
seed_everything(42)
# Use at most one GPU, and none when no CUDA device is visible.
AVAIL_GPUS = min(1, torch.cuda.device_count())
USE_CUDA = torch.cuda.is_available()
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to a single worker instead of crashing on int(None).
NUM_WORKERS = os.cpu_count() or 1

#TODO: CHANGE PATH TO WHERE YOU ARE RUNNING THE CODE
DATA_DIR = data_path
# TODO:BONUS
BATCH_SIZE = 32
EPOCHS = 20
LR = 0.001

Global seed set to 42


In [11]:
# Run settings collected in a single mapping so later cells can look them up by name.
configs = dict(
    seed=42,
    lr=LR,
    bs=BATCH_SIZE,
    gpus=AVAIL_GPUS,
    num_workers=NUM_WORKERS,
    data_dir=DATA_DIR,
    epochs=EPOCHS,
)

In [12]:
class ICMEDataset(Dataset):
    """PyTorch dataset that serves ``.npy`` image chunks together with their labels.

    The label table is a CSV file read from ``{rootdir}/data/{icme_labels}``.
    The code relies on the table having these columns:

    * ``fname_img``  -- file name of a ``.npy`` array located in ``datadir``
    * ``label``      -- the target returned alongside the array
    * ``start_time`` / ``stop_time`` -- parsed as dates when the table is read

    Parameters
    ----------
    icme_labels : str
        File name of the label table inside ``{rootdir}/data``.
    rootdir : str
        Project directory that contains the ``data`` folder.
    datadir : str
        Directory that contains the ``.npy`` files named in ``fname_img``.
    transform : callable, optional
        When given, it is called on every loaded array and its return value
        is what ``__getitem__`` hands back. ``None`` (default) returns the
        array exactly as loaded.
    """

    def __init__(self, icme_labels, rootdir, datadir, transform=None):
        self.rootdir = rootdir
        self.datadir = datadir
        self.df = pd.read_csv(
            f'{rootdir}/data/{icme_labels}',
            header=0,
            parse_dates=['start_time', 'stop_time']
        )
        self.transform = transform

    def __len__(self):
        """Number of rows in the label table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        # Samplers may hand over a tensor index; pandas needs a plain Python value.
        if torch.is_tensor(idx):
            idx = idx.tolist()
        fname = self.df.fname_img.iloc[idx]
        label = self.df.label.iloc[idx]
        img = np.load(f'{self.datadir}/{fname}')
        # The transform used to be stored but silently ignored; apply it here.
        if self.transform is not None:
            img = self.transform(img)
        return img, label

In [13]:
class IcmeNet(LightningModule):
    """CNN classifier: two conv + max-pool stages followed by five linear layers.

    Parameters
    ----------
    num_classes : int
        Number of outputs of the last linear layer.
    train_loader, test_loader, val_loader : DataLoader
        Returned unchanged by ``train_dataloader`` / ``test_dataloader`` /
        ``val_dataloader``. The first sample of ``train_loader.dataset`` is
        read at construction time to size the first linear layer.
    kernel_size : int or (int, int), default 3
        Kernel size shared by both convolutions.

    Raises
    ------
    ValueError
        If the sample is too small to survive both conv + pool stages.

    Notes
    -----
    The first convolution has 8 input channels, so samples are assumed to be
    ``(8, height, width)`` arrays -- TODO confirm against the data files.
    """

    def __init__(
        self,
        num_classes,
        train_loader,
        test_loader,
        val_loader,
        kernel_size=3
    ):
        super().__init__()

        self.num_classes = num_classes
        self._train_dataloader = train_loader
        self._test_dataloader = test_loader
        self._val_dataloader = val_loader
        self.kernel_size = kernel_size
        # Window (and therefore stride) of both max-pool layers in `forward`.
        self._pool_size = 4

        sample = train_loader.dataset[0][0]
        self._input_dim = sample.shape[1]

        #  8 channels input, 16 channel output
        self.conv1 = nn.Conv2d(8, 16, kernel_size=self.kernel_size, stride=1)

        # 16 channels input, 32 channel output
        self.conv2 = nn.Conv2d(16, 32, kernel_size=self.kernel_size, stride=1)

        # Size the first linear layer from the real input shape instead of a
        # hard-coded 32 * 24 * 24, which only matched one kernel size.
        if isinstance(kernel_size, (tuple, list)):
            kernel_h, kernel_w = kernel_size
        else:
            kernel_h = kernel_w = kernel_size
        out_h = self._feature_map_size(sample.shape[1], kernel_h)
        out_w = self._feature_map_size(sample.shape[2], kernel_w)
        if out_h < 1 or out_w < 1:
            raise ValueError(
                f'Input of spatial size {tuple(sample.shape[1:3])} is too small for '
                f'two conv(kernel_size={kernel_size}) + max_pool({self._pool_size}) stages'
            )
        self._flat_features = 32 * out_h * out_w

        self.fc1 = nn.Linear(self._flat_features, 8192)
        self.dropout = nn.Dropout(0.10)
        self.fc2 = nn.Linear(8192, 1024)
        self.fc3 = nn.Linear(1024, 256)
        self.fc4 = nn.Linear(256, 64)
        self.fc5 = nn.Linear(64, num_classes)

    def compute_shape(self, dim, kernel_size, stride=1, padding=0):
        """Output length along one axis of a conv/pool layer (floor rounding)."""
        out_dim = (dim + 2*padding - kernel_size)//stride + 1
        return out_dim

    def _feature_map_size(self, dim, kernel_size):
        """Length along one axis after both conv + max-pool stages of `forward`."""
        for _ in range(2):
            # Convolution: stride 1, no padding.
            dim = self.compute_shape(dim, kernel_size)
            # max_pool2d: the stride defaults to the window size.
            dim = self.compute_shape(dim, self._pool_size, stride=self._pool_size)
        return dim

    def forward(self, x):
        """Map a batch of images to unnormalised class scores (logits)."""
        # Match the weights' dtype so arrays loaded as float64 also work when
        # the caller does not cast them first.
        x = x.to(self.conv1.weight.dtype)
        x = TF.relu(TF.max_pool2d(self.conv1(x), self._pool_size))
        x = TF.relu(TF.max_pool2d(self.conv2(x), self._pool_size))

        # Flatten the convolution output so it can feed the fully connected layers
        x = x.view(-1, self._flat_features)

        # ReLU after every hidden linear layer; dropout only after the first one
        x = TF.relu(self.fc1(x))
        x = self.dropout(x)
        x = TF.relu(self.fc2(x))
        x = TF.relu(self.fc3(x))
        x = TF.relu(self.fc4(x))
        x = self.fc5(x)
        return x

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        x, y = batch
        y_hat = self(x)
        loss = TF.cross_entropy(y_hat, y)
        self.log("Train_loss", loss)
        return loss

    def evaluate(self, batch, stage=None):
        """Compute loss and accuracy for a batch; log them as ``{stage}_loss`` / ``{stage}_acc`` when ``stage`` is set."""
        x, y = batch
        y_hat = self(x)
        loss = TF.cross_entropy(y_hat, y)
        _, preds = torch.max(y_hat, dim=1)
        acc = accuracy(preds, y)

        if stage:
            self.log(f'{stage}_loss', loss, prog_bar=True)
            self.log(f'{stage}_acc', acc, prog_bar=True)

    def validation_step(self, batch, batch_idx):
        self.evaluate(batch, 'val')

    def test_step(self, batch, batch_idx):
        self.evaluate(batch, 'test')

    def configure_optimizers(self, LR=0.001):
        """Return an SGD optimizer (weight decay 0.01) using learning rate ``LR``."""
        #TODO: BONUS
        optimizer = optim.SGD(self.parameters(), lr=LR, weight_decay=0.01)
        return {"optimizer": optimizer}

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._val_dataloader

    def test_dataloader(self):
        return self._test_dataloader

In [14]:
def _snapshot_state_dict(model):
    """Return a copy of ``model.state_dict()`` whose tensors live on the CPU."""
    state = model.state_dict()
    # A shallow copy keeps the mapping type and any attributes attached to it;
    # the values are then replaced by independent CPU copies.
    snapshot = copy.copy(state)
    for name, value in state.items():
        if torch.is_tensor(value):
            snapshot[name] = value.detach().to('cpu', copy=True)
        else:
            snapshot[name] = copy.deepcopy(value)
    return snapshot


def train_model(
    model,
    criterion,
    optimizer,
    scheduler,
    num_epochs=25,
    dataloaders=None,
    device=None
):
    """Train ``model`` and evaluate it on the validation set after every epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; it is called as ``model(inputs.float())``.
    criterion : callable
        ``criterion(outputs, labels)`` returning a scalar loss tensor.
    optimizer : torch.optim.Optimizer
        Stepped after every training batch.
    scheduler : learning-rate scheduler
        Stepped once per epoch, after the training phase.
    num_epochs : int, default 25
        Number of passes over the training data.
    dataloaders : dict
        Must provide the keys ``'train'`` and ``'val'``.
    device : torch.device, optional
        Device every batch is moved to before the forward pass.

    Returns
    -------
    model
        The input model with the weights of the best validation epoch loaded.
    training_data : dict
        Per-epoch lists under ``train_loss``, ``val_loss`` (floats) and
        ``train_acc``, ``val_acc`` (0-dim CPU tensors).
    model_checkpoints : dict
        ``'epoch'`` lists the epoch numbers and ``'model'`` the matching state
        dicts, stored on the CPU.

    Raises
    ------
    ValueError
        If ``dataloaders`` lacks a required phase or a phase yields no samples.
    """
    if dataloaders is None or any(phase not in dataloaders for phase in ('train', 'val')):
        raise ValueError("dataloaders must be a dict with 'train' and 'val' entries")

    since = time.time()

    # Snapshots are kept on the CPU: copies left on the model's device would
    # add a full set of weights to accelerator memory every epoch.
    best_model_wts = _snapshot_state_dict(model)
    best_acc = 0.0

    model_checkpoints = {'epoch':[],'model':[]}
    training_data = {'train_loss':[], 'val_loss':[], 'train_acc': [], 'val_acc': []}

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0

            # Iterate over data.
            for inputs, labels in tqdm(dataloaders[phase], desc=phase, total=len(dataloaders[phase])):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs.float())
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics, accumulated per sample
                batch_size = inputs.size(0)
                running_loss += loss.item() * batch_size
                running_corrects += torch.sum(preds == labels.data)
                samples_seen += batch_size
            if phase == 'train':
                scheduler.step()

            if samples_seen == 0:
                raise ValueError(f"The '{phase}' dataloader did not yield any samples")

            # Normalise by the number of samples actually seen. len(dataloader)
            # counts batches and is only correct when batch_size == 1.
            epoch_loss = running_loss / samples_seen
            epoch_acc = (running_corrects.double() / samples_seen).cpu()

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            training_data[f'{phase}_loss'].append(epoch_loss)
            training_data[f'{phase}_acc'].append(epoch_acc)

            if phase == 'val':
                # One snapshot per epoch, shared by the checkpoint list and,
                # when this epoch is the best so far, by `best_model_wts`.
                snapshot = _snapshot_state_dict(model)
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = snapshot
                model_checkpoints['epoch'].append(epoch)
                model_checkpoints['model'].append(snapshot)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, training_data, model_checkpoints

In [15]:
# Locations and loader options shared by the three splits.
_PROJECT_ROOT = './gdrive/MyDrive/aos_c205_final_project_data/'
_CHUNK_DIR = './gdrive/MyDrive/aos_c205_final_project_data/data/sta_chunks'
_LOADER_OPTIONS = dict(batch_size=1, shuffle=True, num_workers=0)

# Training split
icme_train_dataset = ICMEDataset('sta_train_set.txt', rootdir=_PROJECT_ROOT, datadir=_CHUNK_DIR)
train_loader = DataLoader(icme_train_dataset, **_LOADER_OPTIONS)

# Test split
icme_test_dataset = ICMEDataset('sta_test_set.txt', rootdir=_PROJECT_ROOT, datadir=_CHUNK_DIR)
test_loader = DataLoader(icme_test_dataset, **_LOADER_OPTIONS)

# Validation split
icme_val_dataset = ICMEDataset('sta_validation_set.txt', rootdir=_PROJECT_ROOT, datadir=_CHUNK_DIR)
val_loader = DataLoader(icme_val_dataset, **_LOADER_OPTIONS)

In [16]:
# Phase name -> loader mapping consumed by `train_model`.
dataloaders = dict(train=train_loader, val=val_loader)

In [17]:
# Two-class network with 9x9 convolution kernels.
model = IcmeNet(
    num_classes=2,
    train_loader=train_loader,
    test_loader=test_loader,
    val_loader=val_loader,
    kernel_size=9,
)


In [18]:
# Display the layer summary of the network built above.
model

IcmeNet(
  (conv1): Conv2d(8, 16, kernel_size=(9, 9), stride=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(9, 9), stride=(1, 1))
  (fc1): Linear(in_features=18432, out_features=8192, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=8192, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=256, bias=True)
  (fc4): Linear(in_features=256, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=2, bias=True)
)

In [19]:
# Use the first CUDA device when one is available, otherwise the CPU,
# and move the network's parameters there.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [20]:
# We use the Adam optimizer with the learning rate from `configs`
optimizer = optim.Adam(model.parameters(), lr=configs['lr'])

# We will also use a scheduler for the learning rate.
# This allows us to optimize the learning rate by having large steps
# at first and then getting gradually smaller (by a factor of 1/2) every 5 epochs
scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

In [21]:
# Cross-entropy on the raw network outputs; passed to `train_model` as `criterion`.
loss_fn = TF.cross_entropy

In [22]:
# Run the manual training loop. The epoch count is read from `configs` so it
# stays in sync with the settings cell instead of being repeated here.
model, training_data, model_checkpoints = train_model(
    model,
    criterion=loss_fn,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=configs['epochs'],
    dataloaders=dataloaders,
    device=device,
)

Epoch 0/19
----------


train:   0%|          | 0/500 [00:00<?, ?it/s]

train Loss: 2.6496 Acc: 0.5580


val:   0%|          | 0/143 [00:00<?, ?it/s]

val Loss: 0.6764 Acc: 0.6573

Epoch 1/19
----------


train:   0%|          | 0/500 [00:00<?, ?it/s]

train Loss: 0.7083 Acc: 0.6420


val:   0%|          | 0/143 [00:00<?, ?it/s]

val Loss: 1.4943 Acc: 0.5944

Epoch 2/19
----------


train:   0%|          | 0/500 [00:00<?, ?it/s]

train Loss: 0.6796 Acc: 0.6880


val:   0%|          | 0/143 [00:00<?, ?it/s]

val Loss: 0.6096 Acc: 0.6573

Epoch 3/19
----------


train:   0%|          | 0/500 [00:00<?, ?it/s]

train Loss: 0.5349 Acc: 0.7700


val:   0%|          | 0/143 [00:00<?, ?it/s]

val Loss: 1.6352 Acc: 0.5874

Epoch 4/19
----------


train:   0%|          | 0/500 [00:00<?, ?it/s]

train Loss: 0.4571 Acc: 0.8520


val:   0%|          | 0/143 [00:00<?, ?it/s]

val Loss: 0.6071 Acc: 0.7273

Epoch 5/19
----------


train:   0%|          | 0/500 [00:00<?, ?it/s]

train Loss: 0.2910 Acc: 0.8860


val:   0%|          | 0/143 [00:00<?, ?it/s]

val Loss: 1.1258 Acc: 0.7622

Epoch 6/19
----------


train:   0%|          | 0/500 [00:00<?, ?it/s]

train Loss: 0.1462 Acc: 0.9560


val:   0%|          | 0/143 [00:00<?, ?it/s]

val Loss: 1.7723 Acc: 0.6573

Epoch 7/19
----------


train:   0%|          | 0/500 [00:00<?, ?it/s]

train Loss: 0.1013 Acc: 0.9700


val:   0%|          | 0/143 [00:00<?, ?it/s]

val Loss: 1.3288 Acc: 0.6993

Epoch 8/19
----------


train:   0%|          | 0/500 [00:00<?, ?it/s]

train Loss: 0.0074 Acc: 0.9960


val:   0%|          | 0/143 [00:00<?, ?it/s]

val Loss: 2.3376 Acc: 0.6853

Epoch 9/19
----------


train:   0%|          | 0/500 [00:00<?, ?it/s]

train Loss: 0.0006 Acc: 1.0000


val:   0%|          | 0/143 [00:00<?, ?it/s]

val Loss: 2.7093 Acc: 0.6853

Epoch 10/19
----------


train:   0%|          | 0/500 [00:00<?, ?it/s]

train Loss: 0.0003 Acc: 1.0000


val:   0%|          | 0/143 [00:00<?, ?it/s]

val Loss: 2.8689 Acc: 0.7063

Epoch 11/19
----------


train:   0%|          | 0/500 [00:00<?, ?it/s]

RuntimeError: ignored

In [43]:
# Persist the current model weights to the mounted drive.
torch.save(model.state_dict(), './gdrive/MyDrive/aos_c205_final_project_data/adam_optim_neural_network.pth')

In [44]:
# Tabulate the per-epoch history returned by `train_model`
# (columns: train_loss, val_loss, train_acc, val_acc).
training_data_df = pd.DataFrame(training_data)

In [48]:
# `train_model` records accuracies as 0-dim tensors. Convert BOTH accuracy
# columns to plain floats so they are written to disk as numbers; previously
# only `train_acc` was converted. Values that are already numeric pass
# through `float`, so re-running this cell is harmless.
for _acc_col in ('train_acc', 'val_acc'):
    training_data_df[_acc_col] = [
        val.item() if torch.is_tensor(val) else float(val)
        for val in training_data_df[_acc_col]
    ]

In [51]:
# Write the training history to the mounted drive as CSV (header and row index included).
training_data_df.to_csv('./gdrive/MyDrive/aos_c205_final_project_data/training_results.txt', header=True, index=True)