# **Homework 1: COVID-19 Cases Prediction (Regression)**

Objectives:
* Solve a regression problem with deep neural networks (DNN).
* Understand basic DNN training tips.
* Familiarize yourself with PyTorch.

If you have any questions, please contact the TAs via TA hours, NTU COOL, or email to mlta-2022-spring@googlegroups.com

# Download data
If the Google Drive links below do not work, you can download data from [Kaggle](https://www.kaggle.com/c/ml2022spring-hw1/data), and upload data manually to the workspace.

In [54]:
# !gdown --id '1kLSW_-cW2Huj7bh84YTdimGBOJaODiOS' --output covid.train.csv
# !gdown --id '1iiI5qROrAhZn-o4FPqsE97bMzDEFvIdg' --output covid.test.csv

# Import packages

In [1]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
# from tqdm import tqdm

# Pytorch
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# For plotting learning curve
# from torch.utils.tensorboard import SummaryWriter

# Some Utility Functions

You do not need to modify this part.

In [2]:
def same_seed(seed): 
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    valid_set_size = int(valid_ratio * len(data_set)) 
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    return np.array(train_set), np.array(valid_set)

def predict(test_loader, model, device):
    model.eval() # Set your model to evaluation mode.
    preds = []
    for x in test_loader:
        x = x.to(device)                        
        with torch.no_grad():                   
            pred = model(x)                     
            preds.append(pred.detach().cpu())   
    preds = torch.cat(preds, dim=0).numpy()  
    return preds

# Dataset

In [3]:
class COVID19Dataset(Dataset):
    '''
    x: Features.
    y: Targets, if none, do prediction.
    '''
    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)

# Neural Network Model
Try out different model architectures by modifying the class below.

In [4]:
class My_Model(nn.Module):
    def __init__(self, input_dim):
        super(My_Model, self).__init__()
        # TODO: modify model's structure, be aware of dimensions. 
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 1),
#             nn.ReLU(),
#             nn.Linear(16, 1)
        )

    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1) # (B, 1) -> (B)
        return x

# Feature Selection
Choose features you deem useful by modifying the function below.

In [5]:
def select_feat(train_data, valid_data, test_data, select_all=True):
    '''Selects useful features to perform regression'''
    y_train, y_valid = train_data[:,-1], valid_data[:,-1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:,:-1], valid_data[:,:-1], test_data

    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        feat_idx = [38, 39, 40, 41, 53, 54, 55, 56, 57, 69, 70, 71, 72, 73, 85, 86, 87, 88, 89, 101, 102, 103, 104, 105]
#         feat_idx = [38, 39, 40, 41, 50, 53, 54, 55, 56, 57, 66, 69, 70, 71, 72, 73, 82, 85, 86, 87, 88, 89, 98, 101, 102, 103, 104, 105, 114] # TODO: Select suitable feature columns.
        
    return raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], raw_x_test[:,feat_idx], y_train, y_valid

# Training Loop

In [28]:
def trainer(train_loader, valid_loader, model, config, device):

    criterion = nn.MSELoss(reduction='mean') # Define your loss function, do not modify this.

    # Define your optimization algorithm. 
    # TODO: Please check https://pytorch.org/docs/stable/optim.html to get more available algorithms.
    # TODO: L2 regularization (optimizer(weight decay...) or implement by your self).
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'],weight_decay=0.02) 
    #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2000, gamma=0.1)
    # writer = SummaryWriter() # Writer of tensoboard.

    if not os.path.isdir('./models'):
        os.mkdir('./models') # Create directory of saving models.

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    for epoch in range(n_epochs):
        model.train() # Set your model to train mode.
        loss_record = []

        # tqdm is a package to visualize your training progress.
        # train_pbar = tqdm(train_loader, position=0, leave=True)

        for x, y in train_loader:
            optimizer.zero_grad()               # Set gradient to zero.
            x, y = x.to(device), y.to(device)   # Move your data to device. 
            pred = model(x)             
            loss = criterion(pred, y)
            loss.backward()                     # Compute gradient(backpropagation).
            optimizer.step()                    # Update parameters.
            step += 1
            loss_record.append(loss.detach().item())
            
            # Display current epoch number and loss on tqdm progress bar.
            # train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            # train_pbar.set_postfix({'loss': loss.detach().item()})
        #scheduler.step()
        mean_train_loss = sum(loss_record)/len(loss_record)
#         print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}')
        # writer.add_scalar('Loss/train', mean_train_loss, step)
#         if mean_train_loss >1.05:
#             continue
        model.eval() # Set your model to evaluation mode.
        loss_record = []
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)

            loss_record.append(loss.item())
            
        mean_valid_loss = sum(loss_record)/len(loss_record)
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
        # writer.add_scalar('Loss/valid', mean_valid_loss, step)

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path']) # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else: 
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            print(best_loss)
            return

# Configurations
`config` contains hyper-parameters for training and the path to save your model.

In [29]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed':6264,      # Your seed number, you can pick your lucky number. :)
    'select_all': False,   # Whether to use all features.
    'valid_ratio': 0.2,   # validation_size = train_size * valid_ratio
    'n_epochs': 3000,     # Number of epochs.            
    'batch_size': 256,
    'learning_rate': 5e-3,              
    'early_stop': 400,    # If model has not improved for this many consecutive epochs, stop training.     
    'save_path': './models/model5.ckpt'  # Your model will be saved here.
}

# Dataloader
Read data from files and set up training, validation, and testing sets. You do not need to modify this part.

In [30]:
# Set seed for reproducibility
same_seed(config['seed'])


# train_data size: 2699 x 118 (id + 37 states + 16 features x 5 days) 
# test_data size: 1078 x 117 (without last day's positive rate)
train_data, test_data = pd.read_csv('./covid.train.csv').values, pd.read_csv('./covid.test.csv').values
train_data, valid_data = train_valid_split(train_data, config['valid_ratio'], config['seed'])

# Print out the data size.
print(f"""train_data size: {train_data.shape} 
valid_data size: {valid_data.shape} 
test_data size: {test_data.shape}""")

# Select features
x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])

# Print out the number of features.
print(f'number of features: {x_train.shape[1]}')

train_dataset, valid_dataset, test_dataset = COVID19Dataset(x_train, y_train), \
                                            COVID19Dataset(x_valid, y_valid), \
                                            COVID19Dataset(x_test)
# m = train_dataset.x.mean(0, keepdim=True)
# s = train_dataset.x.std(0, keepdim=True)
# train_dataset.x = (train_dataset.x-m) / s
# valid_dataset.x = (valid_dataset.x-m) / s
# test_dataset.x = (test_dataset.x-m) / s

# Pytorch data loader loads pytorch dataset into batches.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

train_data size: (2160, 118) 
valid_data size: (539, 118) 
test_data size: (1078, 117)
number of features: 24


# Start training!

In [31]:
model = My_Model(input_dim=x_train.shape[1]).to(device) # put your model and data on the same computation device.
# model.load_state_dict(torch.load('./models/model.ckpt'))
trainer(train_loader, valid_loader, model, config, device)

Epoch [1/3000]: Train loss: 119.5395, Valid loss: 25.5677
Saving model with loss 25.568...
Epoch [2/3000]: Train loss: 12.3156, Valid loss: 5.9678
Saving model with loss 5.968...
Epoch [3/3000]: Train loss: 10.9523, Valid loss: 12.2027
Epoch [4/3000]: Train loss: 9.8916, Valid loss: 5.9436
Saving model with loss 5.944...
Epoch [5/3000]: Train loss: 5.1843, Valid loss: 3.8064
Saving model with loss 3.806...
Epoch [6/3000]: Train loss: 5.2468, Valid loss: 4.1419
Epoch [7/3000]: Train loss: 4.7075, Valid loss: 3.9567
Epoch [8/3000]: Train loss: 4.5340, Valid loss: 4.1196
Epoch [9/3000]: Train loss: 4.4088, Valid loss: 3.7098
Saving model with loss 3.710...
Epoch [10/3000]: Train loss: 4.2775, Valid loss: 3.6976
Saving model with loss 3.698...
Epoch [11/3000]: Train loss: 4.1793, Valid loss: 3.7964
Epoch [12/3000]: Train loss: 4.0470, Valid loss: 4.2531
Epoch [13/3000]: Train loss: 3.9582, Valid loss: 3.5768
Saving model with loss 3.577...
Epoch [14/3000]: Train loss: 3.8812, Valid loss: 3

Epoch [134/3000]: Train loss: 1.4117, Valid loss: 1.4533
Epoch [135/3000]: Train loss: 1.4118, Valid loss: 1.2596
Epoch [136/3000]: Train loss: 1.3928, Valid loss: 1.5458
Epoch [137/3000]: Train loss: 1.4448, Valid loss: 1.4328
Epoch [138/3000]: Train loss: 1.4106, Valid loss: 1.3463
Epoch [139/3000]: Train loss: 1.3999, Valid loss: 1.3703
Epoch [140/3000]: Train loss: 1.4073, Valid loss: 1.1214
Epoch [141/3000]: Train loss: 1.3991, Valid loss: 1.2866
Epoch [142/3000]: Train loss: 1.3888, Valid loss: 1.2038
Epoch [143/3000]: Train loss: 1.3836, Valid loss: 1.4542
Epoch [144/3000]: Train loss: 1.4033, Valid loss: 1.0391
Saving model with loss 1.039...
Epoch [145/3000]: Train loss: 1.3789, Valid loss: 1.1501
Epoch [146/3000]: Train loss: 1.3583, Valid loss: 1.1686
Epoch [147/3000]: Train loss: 1.3777, Valid loss: 1.4162
Epoch [148/3000]: Train loss: 1.3773, Valid loss: 1.5228
Epoch [149/3000]: Train loss: 1.3827, Valid loss: 1.5285
Epoch [150/3000]: Train loss: 1.3704, Valid loss: 1.2028

Epoch [280/3000]: Train loss: 1.1956, Valid loss: 0.9335
Epoch [281/3000]: Train loss: 1.1927, Valid loss: 1.0639
Epoch [282/3000]: Train loss: 1.1920, Valid loss: 0.8762
Epoch [283/3000]: Train loss: 1.2200, Valid loss: 1.0591
Epoch [284/3000]: Train loss: 1.2135, Valid loss: 1.0399
Epoch [285/3000]: Train loss: 1.2068, Valid loss: 0.9297
Epoch [286/3000]: Train loss: 1.2380, Valid loss: 0.9757
Epoch [287/3000]: Train loss: 1.1975, Valid loss: 1.1237
Epoch [288/3000]: Train loss: 1.1810, Valid loss: 0.8998
Epoch [289/3000]: Train loss: 1.1785, Valid loss: 1.1110
Epoch [290/3000]: Train loss: 1.1965, Valid loss: 0.9685
Epoch [291/3000]: Train loss: 1.1713, Valid loss: 0.8917
Epoch [292/3000]: Train loss: 1.1720, Valid loss: 0.9155
Epoch [293/3000]: Train loss: 1.1616, Valid loss: 0.8545
Epoch [294/3000]: Train loss: 1.1730, Valid loss: 1.1040
Epoch [295/3000]: Train loss: 1.1730, Valid loss: 1.0026
Epoch [296/3000]: Train loss: 1.1719, Valid loss: 0.9357
Epoch [297/3000]: Train loss: 1

Epoch [428/3000]: Train loss: 1.1639, Valid loss: 0.8716
Epoch [429/3000]: Train loss: 1.1570, Valid loss: 1.1343
Epoch [430/3000]: Train loss: 1.1425, Valid loss: 1.1223
Epoch [431/3000]: Train loss: 1.1936, Valid loss: 0.8479
Epoch [432/3000]: Train loss: 1.1355, Valid loss: 0.9587
Epoch [433/3000]: Train loss: 1.1730, Valid loss: 0.7816
Saving model with loss 0.782...
Epoch [434/3000]: Train loss: 1.1447, Valid loss: 0.9482
Epoch [435/3000]: Train loss: 1.1397, Valid loss: 0.8683
Epoch [436/3000]: Train loss: 1.1424, Valid loss: 0.8746
Epoch [437/3000]: Train loss: 1.1539, Valid loss: 1.0466
Epoch [438/3000]: Train loss: 1.1605, Valid loss: 1.0052
Epoch [439/3000]: Train loss: 1.2058, Valid loss: 0.8418
Epoch [440/3000]: Train loss: 1.1666, Valid loss: 0.8824
Epoch [441/3000]: Train loss: 1.1726, Valid loss: 0.8475
Epoch [442/3000]: Train loss: 1.1476, Valid loss: 0.8234
Epoch [443/3000]: Train loss: 1.1859, Valid loss: 0.9570
Epoch [444/3000]: Train loss: 1.1999, Valid loss: 0.8759

Epoch [570/3000]: Train loss: 1.1739, Valid loss: 0.9579
Epoch [571/3000]: Train loss: 1.1474, Valid loss: 0.7899
Epoch [572/3000]: Train loss: 1.1573, Valid loss: 0.8586
Epoch [573/3000]: Train loss: 1.1379, Valid loss: 0.8011
Epoch [574/3000]: Train loss: 1.1588, Valid loss: 0.8610
Epoch [575/3000]: Train loss: 1.2160, Valid loss: 1.1477
Epoch [576/3000]: Train loss: 1.1451, Valid loss: 1.2080
Epoch [577/3000]: Train loss: 1.1316, Valid loss: 0.9133
Epoch [578/3000]: Train loss: 1.1765, Valid loss: 0.8113
Epoch [579/3000]: Train loss: 1.1435, Valid loss: 0.8451
Epoch [580/3000]: Train loss: 1.1493, Valid loss: 0.9350
Epoch [581/3000]: Train loss: 1.1360, Valid loss: 0.9970
Epoch [582/3000]: Train loss: 1.1375, Valid loss: 1.1525
Epoch [583/3000]: Train loss: 1.1513, Valid loss: 1.1626
Epoch [584/3000]: Train loss: 1.1742, Valid loss: 0.9645
Epoch [585/3000]: Train loss: 1.1287, Valid loss: 0.8672
Epoch [586/3000]: Train loss: 1.1355, Valid loss: 0.8308
Epoch [587/3000]: Train loss: 1

Epoch [716/3000]: Train loss: 1.1225, Valid loss: 1.1460
Epoch [717/3000]: Train loss: 1.1621, Valid loss: 0.9682
Epoch [718/3000]: Train loss: 1.1542, Valid loss: 0.9846
Epoch [719/3000]: Train loss: 1.1451, Valid loss: 0.8285
Epoch [720/3000]: Train loss: 1.1774, Valid loss: 0.9648
Epoch [721/3000]: Train loss: 1.1334, Valid loss: 0.9975
Epoch [722/3000]: Train loss: 1.1264, Valid loss: 0.8577
Epoch [723/3000]: Train loss: 1.1527, Valid loss: 0.8252
Epoch [724/3000]: Train loss: 1.1334, Valid loss: 0.9237
Epoch [725/3000]: Train loss: 1.1529, Valid loss: 0.8799
Epoch [726/3000]: Train loss: 1.1949, Valid loss: 0.9106
Epoch [727/3000]: Train loss: 1.1473, Valid loss: 1.0626
Epoch [728/3000]: Train loss: 1.1618, Valid loss: 0.9096
Epoch [729/3000]: Train loss: 1.1758, Valid loss: 0.9746
Epoch [730/3000]: Train loss: 1.1992, Valid loss: 0.9720
Epoch [731/3000]: Train loss: 1.1768, Valid loss: 0.8592
Epoch [732/3000]: Train loss: 1.1674, Valid loss: 1.0682
Epoch [733/3000]: Train loss: 1

Epoch [860/3000]: Train loss: 1.1846, Valid loss: 0.8775
Epoch [861/3000]: Train loss: 1.1746, Valid loss: 1.0298
Epoch [862/3000]: Train loss: 1.1945, Valid loss: 0.9688
Epoch [863/3000]: Train loss: 1.1502, Valid loss: 1.0631
Epoch [864/3000]: Train loss: 1.1686, Valid loss: 1.1063
Epoch [865/3000]: Train loss: 1.1348, Valid loss: 0.8842
Epoch [866/3000]: Train loss: 1.1548, Valid loss: 0.9645
Epoch [867/3000]: Train loss: 1.1816, Valid loss: 1.0518
Epoch [868/3000]: Train loss: 1.1682, Valid loss: 0.8770
Epoch [869/3000]: Train loss: 1.1694, Valid loss: 0.8872
Epoch [870/3000]: Train loss: 1.1268, Valid loss: 1.0197
Epoch [871/3000]: Train loss: 1.1529, Valid loss: 0.9163
Epoch [872/3000]: Train loss: 1.1634, Valid loss: 0.9485
Epoch [873/3000]: Train loss: 1.1142, Valid loss: 0.8870
Epoch [874/3000]: Train loss: 1.1252, Valid loss: 0.9077
Epoch [875/3000]: Train loss: 1.1392, Valid loss: 1.0487
Epoch [876/3000]: Train loss: 1.1298, Valid loss: 0.8517
Epoch [877/3000]: Train loss: 1

# Plot learning curves with `tensorboard` (optional)

`tensorboard` is a tool that allows you to visualize your training progress.

If this block does not display your learning curve, please wait for few minutes, and re-run this block. It might take some time to load your logging information. 

In [32]:
# %reload_ext tensorboard
# %tensorboard --logdir=./runs/
# torch.save(model.state_dict(), config['save_path'])

# Testing
The predictions of your model on testing set will be stored at `pred.csv`.

In [33]:
def save_pred(preds, file):
    ''' Save predictions to specified file '''
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])

model = My_Model(input_dim=x_train.shape[1]).to(device)
model.load_state_dict(torch.load(config['save_path']))
preds = predict(test_loader, model, device) 
save_pred(preds, 'model5.csv')         

# Reference
This notebook uses code written by Heng-Jui Chang @ NTUEE (https://github.com/ga642381/ML2021-Spring/blob/main/HW01/HW01.ipynb)