In [1]:
import os
import gc
import numpy as np
import pandas as pd
import math
import random
from time import time
from tqdm import tqdm
from pathlib import Path

import torch
from torch import nn, cuda
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset

from utils import CosineAnnealingWithRestartsLR

from torch.optim import Adam, SGD, Optimizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable, seeds Python's
    ``random`` module, NumPy's global RNG and PyTorch (CPU and the current
    CUDA device), and switches cuDNN to deterministic algorithms.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python / NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators and cuDNN behaviour.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

class Semi_dataset(Dataset):
    """Dataset of feature rows with optional targets.

    Every row of ``X`` is converted to a ``torch.FloatTensor`` once, at
    construction time. When labels are supplied each label row is converted
    with ``torch.tensor``; indexing then yields ``(data, target)``. When
    ``Y`` is ``None`` the dataset is unlabeled and indexing yields ``data``
    only.

    Args:
        X: Iterable of feature rows (e.g. a 2-D NumPy array).
        Y: Labels aligned with ``X`` -- a pandas DataFrame/Series, or any
            iterable of label rows -- or ``None`` for unlabeled data.

    Raises:
        ValueError: If labels are given but their count differs from the
            number of feature rows.
    """

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        self.X_dataset = [torch.FloatTensor(x) for x in X]
        self.Y_dataset = []

        if Y is None:
            # Explicit check instead of a bare ``except``: only a missing
            # label set is treated as "unlabeled"; real errors propagate.
            print("no label")
            return

        # pandas objects expose their rows through ``.values``; other
        # iterables (NumPy arrays, lists) are iterated directly.
        label_rows = Y.values if isinstance(Y, (pd.DataFrame, pd.Series)) else Y
        self.Y_dataset = [torch.tensor(y) for y in label_rows]

        if len(self.Y_dataset) != len(self.X_dataset):
            raise ValueError(
                "X and Y must have the same number of rows, got {} and {}".format(
                    len(self.X_dataset), len(self.Y_dataset)))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        data = self.X_dataset[index]
        if self.Y_dataset:
            return data, self.Y_dataset[index]
        return data


def build_dataloader(X, Y, batch_size, shuffle=False, num_workers=8):
    """Wrap ``X`` (and optional ``Y``) in a ``Semi_dataset`` and a ``DataLoader``.

    Args:
        X: Feature rows passed to ``Semi_dataset``.
        Y: Labels passed to ``Semi_dataset``; ``None`` for unlabeled data.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the data every epoch.
        num_workers: Number of worker processes used for loading. Defaults
            to 8; pass 0 to load in the main process.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    dataset = Semi_dataset(X, Y)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

def mean_absolute_error(y_true, y_pred,
                        sample_weight=None,
                        multioutput='uniform_average'):
    """Mean absolute error between ``y_true`` and ``y_pred``.

    Args:
        y_true: Array of reference values.
        y_pred: Array of predictions, same shape as ``y_true``.
        sample_weight: Optional weights used when averaging over axis 0.
        multioutput: ``'raw_values'`` returns the per-output errors,
            ``'uniform_average'`` averages them with equal weight, and any
            other value is passed to ``np.average`` as the output weights.

    Returns:
        The per-output errors for ``'raw_values'``, otherwise their
        (weighted) average.
    """
    abs_diff = np.abs(y_pred - y_true)
    per_output = np.average(abs_diff, axis=0, weights=sample_weight)

    if isinstance(multioutput, str) and multioutput == 'raw_values':
        return per_output

    # The string 'uniform_average' means "no weights"; anything else is
    # handed to np.average unchanged.
    wants_uniform = isinstance(multioutput, str) and multioutput == 'uniform_average'
    output_weights = None if wants_uniform else multioutput
    return np.average(per_output, weights=output_weights)

class AdamW(Optimizer):
    """Adam optimizer with weight decay applied directly to the weights.

    The Adam update is computed from the raw gradients. Afterwards, when
    ``weight_decay`` is non-zero, ``weight_decay * p`` is subtracted from
    every parameter ``p``. The decay term is not multiplied by the learning
    rate.

    Args:
        params: Iterable of parameters or parameter-group dicts.
        lr: Learning rate.
        betas: Coefficients of the running averages of the gradient and of
            its square.
        eps: Term added to the denominator for numerical stability.
        weight_decay: Decay factor applied to the weights after each update.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay)
        super(AdamW, self).__init__(params, defaults)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` without a closure.

        Raises:
            RuntimeError: If a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('AdamW does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient.
                # Scalars are passed as keywords (alpha= / value=); the
                # positional-scalar overloads are deprecated in PyTorch.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(exp_avg, denom, value=-step_size)

                # Weight decay is applied after, and separately from, the
                # gradient-based update.
                if group['weight_decay'] != 0:
                    p.data.add_(p.data, alpha=-group['weight_decay'])

        return loss

class MLP_only_flatfeatures(nn.Module):
    """Deep fully connected network over a flat feature vector.

    Layout of ``fc_layers`` (a single ``nn.Sequential``):

    * stem: ``Linear(in_features, 256) -> BatchNorm1d -> ReLU``
    * eight bottleneck blocks, each made of three
      ``Linear -> BatchNorm1d -> ReLU`` units (input width -> bottleneck,
      bottleneck -> bottleneck, bottleneck -> output width) followed by
      ``Dropout(0.1)``
    * head: ``Linear(1024, 512) -> BatchNorm1d -> ReLU -> Dropout(0.1)
      -> Linear(512, num_classes)``

    Modules are created in a fixed order inside one ``nn.Sequential``, so
    ``state_dict`` keys are ``fc_layers.<index>.<param>``.

    Args:
        num_classes: Width of the final linear layer.
        in_features: Width of the input feature vector (default 226).
    """

    # (bottleneck width, output width) of the eight blocks, in order.
    _BLOCK_WIDTHS = (
        (64, 256), (64, 256),
        (128, 512), (128, 512), (128, 512),
        (256, 1024), (256, 1024), (256, 1024),
    )

    def __init__(self, num_classes=1, in_features=226):
        super(MLP_only_flatfeatures, self).__init__()
        self.num_classes = num_classes
        self.in_features = in_features

        layers = self._dense_unit(in_features, 256)
        width = 256
        for bottleneck, out_width in self._BLOCK_WIDTHS:
            layers += self._dense_unit(width, bottleneck)
            layers += self._dense_unit(bottleneck, bottleneck)
            layers += self._dense_unit(bottleneck, out_width)
            layers.append(nn.Dropout(0.1))
            width = out_width

        ######### LAST ##########
        layers += self._dense_unit(width, 512)
        layers.append(nn.Dropout(0.1))
        layers.append(nn.Linear(512, self.num_classes))

        self.fc_layers = nn.Sequential(*layers)
        self._initialize_weights()

    @staticmethod
    def _dense_unit(in_width, out_width):
        """Return one ``[Linear, BatchNorm1d, ReLU]`` unit as a list."""
        return [
            nn.Linear(in_width, out_width),
            nn.BatchNorm1d(out_width),
            nn.ReLU(),
        ]

    def forward(self, x):
        """Run ``x`` of shape ``(batch, in_features)`` through ``fc_layers``."""
        out = self.fc_layers(x)
        return out

    def _initialize_weights(self):
        """Set every linear layer to N(0, 0.01) weights and zero bias."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

def build_model(device, model_name='mlp', weight_path=None):
    """Create a model by name and optionally load saved weights.

    The returned model is NOT moved to ``device``; callers do that
    themselves. ``device`` is only used as ``map_location`` when
    ``weight_path`` is given.

    Args:
        device: Device used as ``map_location`` when loading weights.
        model_name: Architecture name; only ``'mlp'`` is supported.
        weight_path: Optional path to a ``state_dict`` saved with
            ``torch.save``. When ``None`` the model keeps its fresh
            initialization.

    Returns:
        The constructed ``nn.Module``.

    Raises:
        ValueError: If ``model_name`` is not a supported architecture.
    """
    if model_name == 'mlp':
        model = MLP_only_flatfeatures(4)
    else:
        # Fail with a clear message instead of an UnboundLocalError on return.
        raise ValueError("unknown model_name: {!r}".format(model_name))

    if weight_path is not None:
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        model.load_state_dict(torch.load(weight_path, map_location=device))

    return model

def validation(model, criterion, valid_loader, device):
    """Evaluate ``model`` on ``valid_loader``.

    Predictions and targets are collected batch by batch using each batch's
    actual length, so the result does not depend on any global batch size
    or on a fixed number of output columns.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        criterion: Loss called as ``criterion(output, target)``.
        valid_loader: Iterable of ``(data, target)`` batches supporting
            ``len()``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(val_loss, val_score)`` where ``val_loss`` is the mean of the
        per-batch losses (each batch weighted equally) and ``val_score`` is
        ``mean_absolute_error`` over all collected samples. For an empty
        loader this is ``(0.0, nan)``.
    """
    model.eval()
    pred_chunks = []
    target_chunks = []
    val_loss = 0.
    num_batches = len(valid_loader)

    with torch.no_grad():
        for data, target in valid_loader:
            target = target.float()
            target_chunks.append(target.numpy().copy())

            data = data.to(device)
            target = target.to(device)

            output = model(data)
            loss = criterion(output, target)

            pred_chunks.append(output.detach().cpu().numpy())
            val_loss += loss.item() / num_batches

    if not pred_chunks:
        # Nothing to score; there is no meaningful error for zero samples.
        return val_loss, float('nan')

    # Score in float64 so the metric is not accumulated in float32.
    valid_preds = np.concatenate(pred_chunks).astype(np.float64)
    valid_targets = np.concatenate(target_chunks).astype(np.float64)

    val_score = mean_absolute_error(valid_targets, valid_preds)

    return val_loss, val_score

# Seed the Python, NumPy and PyTorch RNGs (via seed_everything) with a fixed value.
seed = 42
seed_everything(seed)

# is_available must be *called*: the bare function object is always truthy,
# which would select CUDA even on a machine without a GPU.
if cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
%%time
DATASET_PATH = '../wafer'

# Read the three CSV files from the dataset folder.
train_df, test_df, submission_df = (
    pd.read_csv(os.path.join(DATASET_PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# The first four columns of train.csv are used as targets and the remaining
# columns as features; 10% of the rows are held out for validation. Each
# split gets a fresh 0..n-1 index.
X_train, X_val, y_train, y_val = (
    split.reset_index(drop=True)
    for split in train_test_split(
        train_df.iloc[:, 4:],
        train_df.iloc[:, :4],
        test_size=0.1,
        random_state=42,
        shuffle=True,
    )
)

# Fit the scaler on the training features only, then reuse it for the
# validation and test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

batch_size = 2048
train_loader = build_dataloader(X_train, y_train, batch_size, shuffle=True)
valid_loader = build_dataloader(X_val, y_val, batch_size, shuffle=False)

# Overwrite every test column except the first with its scaled values.
test_df.iloc[:, 1:] = scaler.transform(test_df.iloc[:, 1:])

CPU times: user 35.5 s, sys: 4.86 s, total: 40.4 s
Wall time: 23.3 s


In [4]:
# Training run: optimise the MLP with L1 loss and checkpoint the weights
# whenever the validation score improves.
num_epochs = 20000
criterion = nn.L1Loss()

# Running "best so far" values, one entry appended per finished epoch.
best_epoch_list = []
best_valid_score_list = []

# build model
model = build_model(device, model_name='mlp')
model.to(device)

lr = 0.001
optimizer = AdamW(model.parameters(), lr)
###################  Scheduler ################
#eta_min = 0.000001
#T_max = 10
#T_mult = 1
#restart_decay = 0.97
#scheduler = CosineAnnealingWithRestartsLR(optimizer, T_max=T_max, eta_min=eta_min, T_mult=T_mult, restart_decay=restart_decay)
###############################################

# Checkpoint path, fixed for the whole run; the inference cell reloads the
# best weights from here. Create the folder up front so the first
# torch.save cannot fail on a missing directory.
model_path = '../wafer/mlp_weights/mlp_v2.pt'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

start_time = time()

best_epoch = 0
best_train_loss = 1000
best_valid_score = 1000

for epoch in range(num_epochs):

    model.train()
    train_loss = 0.0
    num_train_batches = len(train_loader)

    for batch_idx, (data, target) in enumerate(train_loader):

        data = data.to(device)
        target = target.float().to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # Mean of the per-batch losses over the epoch.
        train_loss += loss.item() / num_train_batches

    val_loss, val_score = validation(model, criterion, valid_loader, device)

    elapsed = time() - start_time

    # Read the live learning rate without overwriting the scalar `lr` above.
    current_lr = optimizer.param_groups[0]['lr']

    #scheduler.step(val_score)

    print('Epoch {} / {}  train Loss: {:.4f}  val_loss: {:.4f}  val_score: {:.4f}  lr: {:.5f}  elapsed: {:.0f}m {:.0f}s' \
          .format(epoch,  num_epochs - 1, train_loss, val_loss, val_score, current_lr, elapsed // 60, elapsed % 60))

    if val_score < best_valid_score:
        best_valid_score = val_score
        best_epoch = epoch
        torch.save(model.state_dict(), model_path)
        print('----------------------------------------------------------------------->> loss improved to {:.5f}'.format(best_valid_score))

    best_epoch_list.append(best_epoch)
    best_valid_score_list.append(best_valid_score)
print("==================== mlp - Best val_loss - {:.5f} =================".format(best_valid_score))

532  val_loss: 3.0597  val_score: 3.0602  lr: 0.00100  elapsed: 49m 40s
Epoch 320 / 19999  train Loss: 6.2484  val_loss: 3.1612  val_score: 3.1610  lr: 0.00100  elapsed: 49m 49s
Epoch 321 / 19999  train Loss: 6.2549  val_loss: 2.9586  val_score: 2.9595  lr: 0.00100  elapsed: 49m 59s
Epoch 322 / 19999  train Loss: 6.2093  val_loss: 3.0348  val_score: 3.0352  lr: 0.00100  elapsed: 50m 8s
Epoch 323 / 19999  train Loss: 6.2340  val_loss: 3.0129  val_score: 3.0144  lr: 0.00100  elapsed: 50m 17s
Epoch 324 / 19999  train Loss: 6.2639  val_loss: 3.2287  val_score: 3.2285  lr: 0.00100  elapsed: 50m 27s
Epoch 325 / 19999  train Loss: 6.2270  val_loss: 3.0607  val_score: 3.0608  lr: 0.00100  elapsed: 50m 36s
Epoch 326 / 19999  train Loss: 6.1415  val_loss: 2.8638  val_score: 2.8646  lr: 0.00100  elapsed: 50m 45s
Epoch 327 / 19999  train Loss: 6.1642  val_loss: 3.1133  val_score: 3.1144  lr: 0.00100  elapsed: 50m 54s
Epoch 328 / 19999  train Loss: 6.2314  val_loss: 3.0220  val_score: 3.0232  lr: 0

KeyboardInterrupt: 

In [8]:
# One row per finished epoch: the best epoch and best validation score so far.
epoch_df = pd.DataFrame({
    'epoch': best_epoch_list,
    'val_score': best_valid_score_list,
})

In [9]:
# Show the five rows with the lowest validation score.
epoch_df.sort_values(by='val_score').head(5)

Unnamed: 0,epoch,val_score
3610,3545,1.213821
3574,3545,1.213821
3573,3545,1.213821
3572,3545,1.213821
3571,3545,1.213821


In [10]:
# Lowest validation score of the run, rounded to six decimals; it is
# embedded in the submission file name.
score_to = round(min(best_valid_score_list), ndigits=6)
score_to

1.213821

In [11]:
%%time
# Inference: rebuild the network, restore the checkpointed weights and
# predict the (already scaled) test features.
batch_size = 2048
test_loader = build_dataloader(test_df.iloc[:, 1:].values, Y=None, batch_size=batch_size, shuffle=False)

model = build_model(device, model_name='mlp')
# map_location lets a checkpoint saved on one device load onto `device`.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

test_preds = np.zeros((len(test_loader.dataset), 4))

with torch.no_grad():
    # Write each batch at a running offset based on its real length rather
    # than assuming every batch holds exactly `batch_size` rows.
    offset = 0
    for data in test_loader:
        outputs = model(data.to(device)).detach().cpu().numpy()
        test_preds[offset:offset + len(outputs)] = outputs
        offset += len(outputs)

no label
CPU times: user 125 ms, sys: 364 ms, total: 489 ms
Wall time: 539 ms


In [12]:
# Assemble the submission: ids from the sample file plus one prediction
# column per layer (row k of the transposed predictions is column k).
submission = pd.DataFrame({
    'id': submission_df['id'],
    **dict(zip(('layer_1', 'layer_2', 'layer_3', 'layer_4'), test_preds.transpose())),
})
submission.to_csv('../wafer/mlp_submission/mlp_v2_3610e_{}_submission.csv'.format(score_to), index=False)

submission.head()

Unnamed: 0,id,layer_1,layer_2,layer_3,layer_4
0,0,255.138824,227.89328,133.118896,85.314796
1,1,159.078415,126.125671,235.97789,99.262184
2,2,147.385742,179.830536,271.033752,156.604706
3,3,92.139008,228.100906,189.203613,83.059128
4,4,272.181763,299.291992,245.998352,269.601746
