# Import packages

In [3]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
from tqdm import tqdm

# Pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# Matplotlib
import matplotlib.pyplot as plt

# Optuna
import optuna

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# Some Utility Functions

You do not need to modify this part.

In [4]:
def same_seed(seed):
    '''Seed the NumPy and PyTorch generators and make cuDNN deterministic.

    seed: integer used for NumPy, the PyTorch CPU generator and, when CUDA
          is available, every CUDA device generator.
    '''
    # Seed every random number generator this notebook relies on.
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Turn off cuDNN auto-tuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def train_valid_split(data_set, valid_ratio, seed):
    '''Randomly partition `data_set` into a training and a validation array.

    data_set:    indexable collection of samples (must support len()).
    valid_ratio: fraction of samples assigned to the validation part; the
                 validation size is int(valid_ratio * len(data_set)).
    seed:        seed of the torch.Generator that drives the random split.

    Returns a (train, valid) pair of NumPy arrays.
    '''
    n_total = len(data_set)
    n_valid = int(valid_ratio * n_total)
    lengths = [n_total - n_valid, n_valid]

    # A dedicated generator keeps the split independent of the global RNG state.
    split_rng = torch.Generator().manual_seed(seed)
    subsets = random_split(data_set, lengths, generator=split_rng)

    train_arr, valid_arr = (np.array(subset) for subset in subsets)
    return train_arr, valid_arr

def predict(test_loader, model, device):
    '''Run `model` over every batch of `test_loader` and stack the outputs.

    test_loader: iterable yielding input batches (no targets).
    model:       module to evaluate; it is switched to evaluation mode.
    device:      device each batch is moved to before the forward pass.

    Returns the per-batch predictions concatenated along dim 0, as a NumPy array.
    '''
    model.eval()
    batch_outputs = []
    # Gradients are never needed for inference.
    with torch.no_grad():
        for batch in tqdm(test_loader):
            output = model(batch.to(device))
            batch_outputs.append(output.detach().cpu())
    return torch.cat(batch_outputs, dim=0).numpy()

# Dataset

In [5]:
class CovidDataset(Dataset):
    '''Dataset holding features, and optionally targets, as float tensors.

    x: feature data, converted with torch.FloatTensor.
    y: targets, converted with torch.FloatTensor, or None for unlabeled
       data (items are then the features alone).
    '''
    def __init__(self, x, y=None):
        self.y = None if y is None else torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        # Labeled data yields (features, target); unlabeled data yields features only.
        features = self.x[idx]
        if self.y is not None:
            return features, self.y[idx]
        return features

    def __len__(self):
        return len(self.x)

# Neural Network Model

Try out different model architectures by modifying the class below. (You can also change the hidden layer sizes through `config['layer']`.)

In [6]:
class My_Model(nn.Module):
    '''Fully connected regressor: two ReLU hidden layers and one scalar output.'''

    def __init__(self, input_dim, hidden_dims=None):
        '''
        input_dim:   number of input features per sample.
        hidden_dims: optional sequence whose first two entries are the sizes of
                     the two hidden layers. When omitted, the sizes are read
                     from the notebook-level config['layer'], which keeps
                     existing My_Model(input_dim) calls working unchanged.
        '''
        super(My_Model, self).__init__()
        if hidden_dims is None:
            # Fall back to the global configuration used by the rest of the notebook.
            hidden_dims = config['layer']
        first_hidden, second_hidden = hidden_dims[0], hidden_dims[1]

        # TODO: modify model's structure, be aware of dimensions.
        self.layers = nn.Sequential(
            nn.Linear(input_dim, first_hidden),
            nn.ReLU(),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, 1)
        )

    def forward(self, x):
        '''Map a (B, input_dim) batch to a (B,) vector of predictions.'''
        x = self.layers(x)
        x = x.squeeze(1) # (B, 1) -> (B)
        return x

# Feature Selection

Choose features you deem useful by modifying the function below.

In [7]:
from sklearn.feature_selection import SelectKBest, f_regression

def select_feat(train_data, valid_data, test_data, no_select_all=True):
    '''Split targets from features and keep a subset of feature columns.

    train_data, valid_data: 2-D arrays whose last column is the target.
    test_data:              2-D array of features only.
    no_select_all:          when True, keep the config['k'] columns with the
                            highest f_regression score on the training data;
                            when False, keep every column.

    Returns (x_train, x_valid, x_test, y_train, y_valid), with the same
    column subset applied to all three feature matrices.
    '''
    global config
    # The last column holds the target; everything before it is a feature.
    x_train_all, y_train = train_data[:, :-1], train_data[:, -1]
    x_valid_all, y_valid = valid_data[:, :-1], valid_data[:, -1]
    x_test_all = test_data

    if no_select_all:
        # Score each feature on the training split only, then keep the top k.
        n_keep = config['k']
        fitted = SelectKBest(score_func=f_regression, k=n_keep).fit(x_train_all, y_train)
        ranked = np.argsort(fitted.scores_)[::-1]
        feat_idx = list(np.sort(ranked[:n_keep]))
    else:
        feat_idx = list(range(x_train_all.shape[1]))

    return (x_train_all[:, feat_idx],
            x_valid_all[:, feat_idx],
            x_test_all[:, feat_idx],
            y_train,
            y_valid)

# Training Loop

In [8]:
def trainer(train_loader, valid_loader, model, config, device):
    '''Train `model` with MSE loss, checkpoint on improvement, and stop early.

    Args:
        train_loader: iterable of (x, y) training batches.
        valid_loader: iterable of (x, y) validation batches.
        model: module to optimize. It is not moved here; the caller is
            expected to have placed it on `device`.
        config: dict read for 'optim', 'learning_rate', 'weight_decay',
            'no_momentum', 'momentum' (SGD with momentum only), 'n_epochs',
            'early_stop', 'save_path' and 'no_tensorboard'.
        device: device each batch is moved to.

    Returns:
        The lowest mean validation loss observed over all epochs.

    Raises:
        ValueError: if config['optim'] is neither 'SGD' nor 'Adam'.
    '''
    criterion = nn.MSELoss(reduction='mean') # Define your loss function, do not modify this.

    # Define your optimization algorithm.
    if config['optim'] == 'SGD':
        if config['no_momentum']:
            optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
        else:
            optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=config['momentum'], weight_decay=config['weight_decay'])
    elif config['optim'] == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
    else:
        # Fail with a clear message instead of an unbound `optimizer` later on.
        raise ValueError(
            "Unsupported optimizer {!r}; expected 'SGD' or 'Adam'.".format(config['optim']))

    # Writer of tensorboard. Created only when logging is enabled, so a
    # disabled run does not open a writer that is never used.
    writer = None if config['no_tensorboard'] else SummaryWriter()

    # Create the directory the checkpoint is written to (parent of save_path).
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # During K-fold cross validation the notebook keeps the best loss of every
    # finished fold in the module-level `valid_scores` list. The checkpoint is
    # only overwritten when this fold beats all of them. When that list does
    # not exist (standalone use) every improvement is saved.
    earlier_fold_scores = globals().get('valid_scores', [])
    loss_to_beat = min(earlier_fold_scores) if len(earlier_fold_scores) else math.inf

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    try:
        for epoch in range(n_epochs):
            model.train() # Set your model to train mode.
            loss_record = []

            # No tqdm progress bar here: printing too much output on Kaggle
            # may cause errors, so batches are iterated silently.
            for x, y in train_loader:
                optimizer.zero_grad()               # Set gradient to zero.
                x, y = x.to(device), y.to(device)   # Move your data to device.
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()                     # Compute gradient(backpropagation).
                optimizer.step()                    # Update parameters.
                step += 1
                loss_record.append(loss.detach().item())

            mean_train_loss = sum(loss_record)/len(loss_record)

            model.eval() # Set your model to evaluation mode.
            loss_record = []
            for x, y in valid_loader:
                x, y = x.to(device), y.to(device)
                with torch.no_grad():
                    pred = model(x)
                    loss = criterion(pred, y)

                loss_record.append(loss.item())

            mean_valid_loss = sum(loss_record)/len(loss_record)

            if writer is not None:
                writer.add_scalar('Loss/train', mean_train_loss, step)
                writer.add_scalar('Loss/valid', mean_valid_loss, step)

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss

                # Within one experiment, keep only the model of the
                # best-performing fold of the K-fold cross validation.
                if best_loss < loss_to_beat:
                    torch.save(model.state_dict(), config['save_path']) # Save your best model
                    print('Saving model with loss {:.3f}...'.format(best_loss))

                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= config['early_stop']:
                print('Best loss {:.3f}...'.format(best_loss))
                print('\nModel is not improving, so we halt the training session.')
                break
    finally:
        # Flush and release the event file even if training is interrupted.
        if writer is not None:
            writer.close()
    return best_loss

# Save predictions

In [9]:
def save_pred(preds, file):
    '''Save predictions to the specified CSV file.

    preds: iterable of predicted values; row i holds the i-th prediction.
    file:  path of the CSV file to (over)write.

    The file gets a header row `id,tested_positive` followed by one
    `index,prediction` row per element of `preds`.
    '''
    # newline='' lets the csv module control line endings itself; without it
    # an extra blank line appears after every row on Windows.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        writer.writerows(enumerate(preds))

In [10]:

# Download the training and test CSV files from Dropbox into the current
# working directory as covid_train.csv and covid_test.csv.
# NOTE(review): the training cell below reads its CSVs from
# /kaggle/input/ml2023spring-hw1/ rather than from these downloaded files;
# confirm which copy is intended.
# dropbox link
!wget -O covid_train.csv https://www.dropbox.com/s/lmy1riadzoy0ahw/covid.train.csv?dl=0
!wget -O covid_test.csv https://www.dropbox.com/s/zalbw42lu4nmhr2/covid.test.csv?dl=0

--2023-05-31 06:02:35--  https://www.dropbox.com/s/lmy1riadzoy0ahw/covid.train.csv?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.18, 2620:100:6016:18::a27d:112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/lmy1riadzoy0ahw/covid.train.csv [following]
--2023-05-31 06:02:36--  https://www.dropbox.com/s/raw/lmy1riadzoy0ahw/covid.train.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc3bc52db62bdf683bca083f89c3.dl.dropboxusercontent.com/cd/0/inline/B9HQRetIW3Qmui02z0D9aDBxfFRehcWiDOo5foWKZ1BMj8Yl_pip1wJdK6bd2nHPCbhB0pLU5FE6aAx_U11i6KSKnZDSpR_7k07TqEDa2AXOgM7LFgnRtlyKIjpv4sCczWwKc5Y_sFvEUT7wD4-SjyNcA021ozTsrtyGpUf9EA_MHg/file# [following]
--2023-05-31 06:02:36--  https://uc3bc52db62bdf683bca083f89c3.dl.dropboxusercontent.com/cd/0/inline/B9HQRetIW3Qmui02z0D9aDBxfFRehcWiDOo5foWKZ1BMj8Yl_pip1wJdK6bd2nHPCb

# Start training!

config contains hyper-parameters for training and the path to save your model.

`objective()` is used for automatic hyper-parameter tuning; set `AUTO_TUNE_PARAM` to `False` to skip the tuning and run a single training session instead.

In [18]:
# Pick the compute device: GPU when CUDA is available, otherwise CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if torch.cuda.is_available():
    print("OKGPU")

# Hyper-parameters and switches shared by the whole notebook.
# Every `no_*` flag is a negative switch: True disables the named behaviour.
config = {
    'seed': 5201314,      # Your seed number, you can pick your lucky number. :)
    'k': 16,    # Number of features kept by SelectKBest in select_feat().
    'layer': [16, 8],     # Sizes of the two hidden layers read by My_Model.
    'optim': 'Adam',      # Optimizer name handled by trainer(): 'SGD' or 'Adam'.
    'momentum': 0.7,      # SGD momentum; only read when optim is 'SGD' and no_momentum is False.
    'valid_ratio': 0.2,   # validation_size = train_size * valid_ratio
    'n_epochs': 10000,     # Number of epochs.
    'batch_size': 128,
    'learning_rate': 1e-03,
    'weight_decay': 1e-4,
    'early_stop': 600,    # If model has not improved for this many consecutive epochs, stop training.
    'save_path': './models/model.ckpt',  # Your model will be saved here.
    'no_select_all': True,   # True: keep only the k best features; False: use all features.
    'no_momentum': False,   # True: build SGD without momentum; False: use config['momentum'].
    'no_normal': False,  # True: skip normalization; False: normalize columns 35 onward.
    'no_k_cross': False,     # False: objective() stops after the first fold; True: it continues to later folds.
    'no_save': False,   # Not read anywhere in the visible code.
    'no_tensorboard': True,  # True: trainer() does not log scalars to tensorboard.
} 

# Number of folds for k-fold cross validation, derived from valid_ratio.
k = int(1 / config['valid_ratio'])

# Set seed for reproducibility
same_seed(config['seed'])

# Load the training and test sets as NumPy arrays.
training_data, test_data = pd.read_csv('/kaggle/input/ml2023spring-hw1/covid_train.csv').values, pd.read_csv('/kaggle/input/ml2023spring-hw1/covid_test.csv').values

# Size of one validation fold; rows are shuffled in place once before folding.
num_valid_samples = len(training_data) // k
np.random.shuffle(training_data)
valid_scores = []  # Best validation loss of each finished fold.

def objective(trial):
    '''Run one (cross-)validation experiment with the current `config`.

    Reads the notebook-level `config`, `k`, `num_valid_samples`,
    `training_data`, `test_data` and `device`, and resets the notebook-level
    `valid_scores` list at the start of every call.

    trial: an Optuna trial used to sample hyper-parameters, or None to run
           once with the values already stored in `config`.

    Returns the average of the collected fold losses when `trial` is given;
    otherwise the (x_test, test_loader) pair built in the last fold that ran.
    '''
    global valid_scores

    if trial is not None:
        print('\nNew trial here')
        # Search space of the hyper-parameters to tune.
        config['seed'] = trial.suggest_int('seed', 1, 520131455)
        # config['learning_rate'] = trial.suggest_float('lr', 1e-5, 1e-4)
        # config['batch_size'] = trial.suggest_categorical('batch_size', [128])
        # config['k'] = trial.suggest_int('k_feats', 32, 40)
        config['layer'][0] = config['k']
        # Apply the sampled seed; otherwise the tuned value is never used.
        same_seed(config['seed'])

    # Print the hyper-parameters of this run.
    print(f'''hyper-parameter: 
        seed: {config['seed']}
        optimizer: {config['optim']},
        lr: {config['learning_rate']}, 
        batch_size: {config['batch_size']}, 
        k: {config['k']}, 
        layer: {config['layer']}''')

    # Reset valid_scores for every trial. It could instead be kept across
    # trials (indexing the current trial's scores with trial * k + fold), so
    # that trainer() saves the best model over all trials, but that would not
    # mean those parameters have the lowest k-fold validation loss.
    valid_scores = []

    for fold in range(k):
        # Data split
        fold_start = num_valid_samples * fold
        fold_end = num_valid_samples * (fold + 1)
        # Copy the validation slice and the test set: a plain slice is a view
        # of `training_data`, and `test_data` is the shared global array, so
        # normalizing them in place would corrupt the data for later folds,
        # trials and re-runs of this function.
        valid_data = training_data[fold_start:fold_end].copy()
        train_data = np.concatenate((
            training_data[:fold_start],
            training_data[fold_end:]))
        fold_test_data = test_data.copy()

        # Normalization
        if not config['no_normal']:
            # The first 35 columns are one-hot vectors and are left
            # un-normalized here; adjust as needed. Statistics come from the
            # training part only.
            train_mean = np.mean(train_data[:, 35:-1], axis=0)
            train_std = np.std(train_data[:, 35:-1], axis=0)
            train_data[:, 35:-1] -= train_mean
            train_data[:, 35:-1] /= train_std
            valid_data[:, 35:-1] -= train_mean
            valid_data[:, 35:-1] /= train_std
            fold_test_data[:, 35:] -= train_mean
            fold_test_data[:, 35:] /= train_std

        x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, fold_test_data, config['no_select_all'])

        train_dataset, valid_dataset, test_dataset = CovidDataset(x_train, y_train), \
                                                CovidDataset(x_valid, y_valid), \
                                                CovidDataset(x_test)

        # Pytorch data loader loads pytorch dataset into batches.
        train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
        valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
        test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

        model = My_Model(input_dim=x_train.shape[1]).to(device) # put your model and data on the same computation device.
        valid_score = trainer(train_loader, valid_loader, model, config, device)
        valid_scores.append(valid_score)

        # NOTE(review): with no_k_cross == False this stops after the first
        # fold, which looks inverted relative to the other `no_*` flags.
        # Behaviour is kept as-is; confirm the intended meaning.
        if not config['no_k_cross']:
            break

        if valid_score > 2:
            # Under-fitting on this fold: stop early to save compute.
            print(f'在第{fold+1}折上欠拟合')
            break

    print(f'valid_scores: {valid_scores}')

    if trial is not None:
        return np.average(valid_scores)
    else:
        return x_test, test_loader


AUTO_TUNE_PARAM = False  # Whether to tune parameters automatically

if not AUTO_TUNE_PARAM:
    # Single run: train once, reload the saved checkpoint and write the
    # submission. Prediction is only done on this path, to save compute.
    print(f'You could set AUTO_TUNE_PARAM True to tune parameters automatically.\nAUTO_TUNE_PARAM: {AUTO_TUNE_PARAM}')
    x_test, test_loader = objective(None)
    model = My_Model(input_dim=x_test.shape[1]).to(device)
    model.load_state_dict(torch.load(config['save_path']))
    preds = predict(test_loader, model, device)
    save_pred(preds, 'submission.csv')
else:
    # Hyper-parameter search with Optuna.
    n_trials = 50  # Number of trials to run.
    print(f'AUTO_TUNE_PARAM: {AUTO_TUNE_PARAM}\nn_trials: {n_trials}')
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    # Report the best hyper-parameter combination and its score.
    print('Best hyperparameters: {}'.format(study.best_params))
    print('Best performance: {:.4f}'.format(study.best_value))

OKGPU
You could set AUTO_TUNE_PARAM True to tune parameters automatically.
AUTO_TUNE_PARAM: False
hyper-parameter: 
        seed: 5201314
        optimizer: Adam,
        lr: 0.001, 
        batch_size: 128, 
        k: 16, 
        layer: [16, 8]
Saving model with loss 384.245...
Saving model with loss 378.793...
Saving model with loss 376.269...
Saving model with loss 375.082...
Saving model with loss 372.826...
Saving model with loss 362.742...
Saving model with loss 349.744...
Saving model with loss 329.850...
Saving model with loss 305.995...
Saving model with loss 280.810...
Saving model with loss 244.356...
Saving model with loss 210.668...
Saving model with loss 174.213...
Saving model with loss 144.797...
Saving model with loss 119.949...
Saving model with loss 104.996...
Saving model with loss 96.262...
Saving model with loss 90.725...
Saving model with loss 88.071...
Saving model with loss 85.631...
Saving model with loss 84.356...
Saving model with loss 81.427...
Saving mod

100%|██████████| 8/8 [00:00<00:00, 1045.02it/s]


# Plot learning curves with `tensorboard` (optional)

`tensorboard` is a tool that allows you to visualize your training progress.

If this block does not display your learning curve, please wait a few minutes and re-run it. It may take some time to load your logging information.

In [12]:
# (Re)load the TensorBoard notebook extension and open the dashboard on the
# ./runs/ directory (presumably where SummaryWriter() in trainer() writes by
# default; confirm). trainer() only logs scalars when
# config['no_tensorboard'] is False.
%reload_ext tensorboard
%tensorboard --logdir=./runs/

In [13]:
    # Manual re-run of the non-tuning path: train once with the current
    # `config`, reload the checkpoint saved at config['save_path'], predict
    # on the test set and overwrite submission.csv.
    # NOTE(review): this repeats the `else` branch of the AUTO_TUNE_PARAM
    # cell line for line; consider wrapping it in a function.
    x_test, test_loader = objective(None)
    model = My_Model(input_dim=x_test.shape[1]).to(device)
    model.load_state_dict(torch.load(config['save_path']))
    preds = predict(test_loader, model, device)
    save_pred(preds, 'submission.csv')

hyper-parameter: 
        seed: 5201314
        optimizer: Adam,
        lr: 0.0001, 
        batch_size: 128, 
        k: 16, 
        layer: [16, 8]
Saving model with loss 378.212...
Saving model with loss 374.679...
Saving model with loss 372.607...
Saving model with loss 368.441...
Saving model with loss 365.675...
Saving model with loss 364.324...
Saving model with loss 363.158...
Saving model with loss 358.136...
Saving model with loss 356.697...
Saving model with loss 354.515...
Saving model with loss 350.502...
Saving model with loss 347.380...
Saving model with loss 347.309...
Saving model with loss 344.518...
Saving model with loss 338.820...
Saving model with loss 330.098...
Saving model with loss 325.419...
Saving model with loss 321.391...
Saving model with loss 317.663...
Saving model with loss 313.248...
Saving model with loss 307.808...
Saving model with loss 303.925...
Saving model with loss 298.641...
Saving model with loss 293.779...
Saving model with loss 285.515...

100%|██████████| 8/8 [00:00<00:00, 1153.43it/s]
