In [1]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

from tqdm import tqdm

# Some Utility Function

In [2]:
def set_seed(seed): 
    '''Fixes random number generator seeds for reproducibility.'''
    # if determinsitic is True, cause cuDNN to  
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [3]:
def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    valid_set_size = int(valid_ratio * len(data_set)) 
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    return np.array(train_set), np.array(valid_set)

# Dataset

In [4]:
class MyDataset(Dataset):
    '''
    x: Features.
    y: Targets, if none, do prediction.
    '''
    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)

# Model

In [5]:
class FNN(nn.Module):
    def __init__(self, input_dim):
        super(FNN, self).__init__()
        # TODO: modify model's structure, be aware of dimensions. 
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.LeakyReLU(),
            
            nn.Linear(16, 16),
            nn.LeakyReLU(),
            
            nn.Linear(16, 1)
        )

    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1) # (B, 1) -> (B)
        return x

# Feature Selecting

In [6]:
def select_feat(train_data, valid_data, test_data, select_all=True):
    '''Selects useful features to perform regression'''
    y_train, y_valid = train_data[:,-1], valid_data[:,-1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:,:-1], valid_data[:,:-1], test_data

    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        feat_idx = [0,1,2,3,4] # TODO: Select suitable feature columns.
        
    return raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], raw_x_test[:,feat_idx], y_train, y_valid

# Training

In [7]:
from statistics import mode


def trainer(train_loader, valid_loader, model, config):
    criterion = nn.MSELoss(reduction='mean')
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
    
    writer = SummaryWriter()
    
    if not os.path.isdir('./models'):
        os.mkdir('./models')
    
    device = torch.device(config['device'])
    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], 2**64, 0, 0
    
    epoch_pbar = tqdm(range(n_epochs), position=0)
    for epoch in epoch_pbar:
        model.train()
        loss_record = []
        
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            
            optimizer.zero_grad()
            pred = model(x)
            loss = criterion(pred, y)
            loss.backward()
            optimizer.step()
            step += 1
            
            loss_record.append(loss.detach().item())

        mean_train_loss = sum(loss_record)/len(loss_record)
        writer.add_scalar('Loss/train', mean_train_loss, step)

        # validation
        model.eval()
        loss_record = []
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)

            loss_record.append(loss.item())
        mean_valid_loss = sum(loss_record)/len(loss_record)
        writer.add_scalar('Loss/valid', mean_valid_loss, step)
        
        epoch_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
        epoch_pbar.set_postfix({'train_loss': mean_train_loss, 'valid_loss': mean_valid_loss})
        
        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path'])
            early_stop_count = 0
        else:
            early_stop_count += 1
        
        if early_stop_count >= config['early_stop']:
            print('Trining end.')
            return

In [8]:
config = {
	'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'seed': 233333,
    'select_all': True,
    'valid_ratio': 0.2,
    'n_epochs': 3000,
    'batch_size':256,
    'weight_decay':0.1,
    'lr': 1e-3,
    'early_stop': 400,
    'save_path': './models/model.ckpt'
}

In [9]:
set_seed(config['seed'])

train_data, test_data = pd.read_csv('./covid.train.csv').values, pd.read_csv('./covid.test.csv').values
train_data, valid_data = train_valid_split(train_data, config['valid_ratio'], config['seed'])
print(f'train_data size: {train_data.shape}')
print(f'valid_data size: {valid_data.shape}')
print(f'test_data size: {test_data.shape}')

train_data size: (2160, 118)
valid_data size: (539, 118)
test_data size: (1078, 117)


In [10]:
x_train, x_valid, x_text, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])
print(f'number of features: {x_train.shape[1]}')

number of features: 117


In [11]:
train_dataset, valid_dataset, test_dataset = MyDataset(x_valid, y_valid), MyDataset(x_train, y_train), MyDataset(x_text)

In [12]:
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

In [13]:
model = FNN(input_dim=x_train.shape[1]).to(torch.device(config['device']))
trainer(train_loader, valid_loader, model, config)

Epoch [2365/3000]:  79%|███████████   | 2364/3000 [01:12<00:19, 32.47it/s, train_loss=1.24, valid_loss=1.4]

Trining end.





In [14]:
%reload_ext tensorboard
%tensorboard --logdir=./runs/

Reusing TensorBoard on port 6006 (pid 11280), started 1:22:49 ago. (Use '!kill 11280' to kill it.)

In [15]:
model = FNN(input_dim=x_train.shape[1]).to(config['device'])
model.load_state_dict(torch.load(config['save_path']))

<All keys matched successfully>

In [16]:
def predict(data_loader, model, device):
    model.eval()
    preds = []
    for x in tqdm(test_loader):
        x = x.to(device)
        with torch.no_grad():
            pred = model(x)
            preds.append(pred.detach().cpu())
    preds = torch.cat(preds, dim=0).numpy()
    return preds

In [17]:
preds = predict(test_loader, model, config['device'])

100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 556.94it/s]


In [18]:
pd.DataFrame(preds, columns=['tested_positive']).to_csv('./predict/predict2.csv', index_label='id')