In [1]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [2]:
%cd /content/drive/MyDrive/topcoder

/content/drive/MyDrive/topcoder


# Новый раздел

In [3]:
import pickle
import yaml
import numpy as np

import os
import random
import time

import torch
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [4]:
import math


__all__ = ['mobilenetv3_large', 'mobilenetv3_small']


def _make_divisible(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    :param v:
    :param divisor:
    :param min_value:
    :return:
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class h_sigmoid(nn.Module):
    def __init__(self, inplace=True):
        super(h_sigmoid, self).__init__()
        self.relu = nn.ReLU6(inplace=inplace)

    def forward(self, x):
        return self.relu(x + 3) / 6


class h_swish(nn.Module):
    def __init__(self, inplace=True):
        super(h_swish, self).__init__()
        self.sigmoid = h_sigmoid(inplace=inplace)

    def forward(self, x):
        return x * self.sigmoid(x)


class SELayer(nn.Module):
    def __init__(self, channel, reduction=4):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
                nn.Linear(channel, _make_divisible(channel // reduction, 8)),
                nn.ReLU(inplace=True),
                nn.Linear(_make_divisible(channel // reduction, 8), channel),
                h_sigmoid()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y


def conv_3x3_bn(inp, oup, stride):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
        nn.BatchNorm2d(oup),
        h_swish()
    )


def conv_1x1_bn(inp, oup):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        h_swish()
    )


class InvertedResidual(nn.Module):
    def __init__(self, inp, hidden_dim, oup, kernel_size, stride, use_se, use_hs):
        super(InvertedResidual, self).__init__()
        assert stride in [1, 2]

        self.identity = stride == 1 and inp == oup

        if inp == hidden_dim:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, (kernel_size - 1) // 2, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                h_swish() if use_hs else nn.ReLU(inplace=True),
                # Squeeze-and-Excite
                SELayer(hidden_dim) if use_se else nn.Identity(),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                h_swish() if use_hs else nn.ReLU(inplace=True),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, (kernel_size - 1) // 2, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                # Squeeze-and-Excite
                SELayer(hidden_dim) if use_se else nn.Identity(),
                h_swish() if use_hs else nn.ReLU(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )

    def forward(self, x):
        if self.identity:
            return x + self.conv(x)
        else:
            return self.conv(x)


class MobileNetV3(nn.Module):
    def __init__(self, cfgs, mode, num_classes=1000, width_mult=1.):
        super(MobileNetV3, self).__init__()
        # setting of inverted residual blocks
        self.cfgs = cfgs
        assert mode in ['large', 'small']

        # building first layer
        input_channel = _make_divisible(16 * width_mult, 8)
        layers = [conv_3x3_bn(3, input_channel, 2)]
        # building inverted residual blocks
        block = InvertedResidual
        for k, t, c, use_se, use_hs, s in self.cfgs:
            output_channel = _make_divisible(c * width_mult, 8)
            exp_size = _make_divisible(input_channel * t, 8)
            layers.append(block(input_channel, exp_size, output_channel, k, s, use_se, use_hs))
            input_channel = output_channel
        self.features = nn.Sequential(*layers)
        # building last several layers
        self.conv = conv_1x1_bn(input_channel, exp_size)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        output_channel = {'large': 1280, 'small': 1024}
        output_channel = _make_divisible(output_channel[mode] * width_mult, 8) if width_mult > 1.0 else output_channel[mode]
        self.classifier = nn.Sequential(
            nn.Linear(exp_size, output_channel),
            h_swish(),
            nn.Dropout(0.2),
            nn.Linear(output_channel, num_classes),
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = self.conv(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()


def mobilenetv3_large(**kwargs):
    """
    Constructs a MobileNetV3-Large model
    """
    cfgs = [
        # k, t, c, SE, HS, s 
        [3,   1,  16, 0, 0, 1],
        [3,   4,  24, 0, 0, 2],
        [3,   3,  24, 0, 0, 1],
        [5,   3,  40, 1, 0, 2],
        [5,   3,  40, 1, 0, 1],
        [5,   3,  40, 1, 0, 1],
        [3,   6,  80, 0, 1, 2],
        [3, 2.5,  80, 0, 1, 1],
        [3, 2.3,  80, 0, 1, 1],
        [3, 2.3,  80, 0, 1, 1],
        [3,   6, 112, 1, 1, 1],
        [3,   6, 112, 1, 1, 1],
        [5,   6, 160, 1, 1, 2],
        [5,   6, 160, 1, 1, 1],
        [5,   6, 160, 1, 1, 1]
    ]
    return MobileNetV3(cfgs, mode='large', **kwargs)


def mobilenetv3_small(**kwargs):
    """
    Constructs a MobileNetV3-Small model
    """
    cfgs = [
        # k, t, c, SE, HS, s 
        [3,    1,  16, 1, 0, 2],
        [3,  4.5,  24, 0, 0, 2],
        [3, 3.67,  24, 0, 0, 1],
        [5,    4,  40, 1, 1, 2],
        [5,    6,  40, 1, 1, 1],
        [5,    6,  40, 1, 1, 1],
        [5,    3,  48, 1, 1, 1],
        [5,    3,  48, 1, 1, 1],
        [5,    6,  96, 1, 1, 2],
        [5,    6,  96, 1, 1, 1],
        [5,    6,  96, 1, 1, 1],
    ]

    return MobileNetV3(cfgs, mode='small', **kwargs)


In [5]:
import math
import torch
from torch.optim.lr_scheduler import _LRScheduler

class CosineAnnealingWarmupRestarts(_LRScheduler):
    """
        optimizer (Optimizer): Wrapped optimizer.
        first_cycle_steps (int): First cycle step size.
        cycle_mult(float): Cycle steps magnification. Default: -1.
        max_lr(float): First cycle's max learning rate. Default: 0.1.
        min_lr(float): Min learning rate. Default: 0.001.
        warmup_steps(int): Linear warmup step size. Default: 0.
        gamma(float): Decrease rate of max learning rate by cycle. Default: 1.
        last_epoch (int): The index of last epoch. Default: -1.
    """
    
    def __init__(self,
                 optimizer : torch.optim.Optimizer,
                 first_cycle_steps : int,
                 cycle_mult : float = 1.,
                 max_lr : float = 0.1,
                 min_lr : float = 0.001,
                 warmup_steps : int = 0,
                 gamma : float = 1.,
                 last_epoch : int = -1
        ):
        assert warmup_steps < first_cycle_steps
        
        self.first_cycle_steps = first_cycle_steps # first cycle step size
        self.cycle_mult = cycle_mult # cycle steps magnification
        self.base_max_lr = max_lr # first max learning rate
        self.max_lr = max_lr # max learning rate in the current cycle
        self.min_lr = min_lr # min learning rate
        self.warmup_steps = warmup_steps # warmup step size
        self.gamma = gamma # decrease rate of max learning rate by cycle
        
        self.cur_cycle_steps = first_cycle_steps # first cycle step size
        self.cycle = 0 # cycle count
        self.step_in_cycle = last_epoch # step size of the current cycle
        
        super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch)
        
        # set learning rate min_lr
        self.init_lr()
    
    def init_lr(self):
        self.base_lrs = []
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.min_lr
            self.base_lrs.append(self.min_lr)
    
    def get_lr(self):
        if self.step_in_cycle == -1:
            return self.base_lrs
        elif self.step_in_cycle < self.warmup_steps:
            return [(self.max_lr - base_lr)*self.step_in_cycle / self.warmup_steps + base_lr for base_lr in self.base_lrs]
        else:
            return [base_lr + (self.max_lr - base_lr) \
                    * (1 + math.cos(math.pi * (self.step_in_cycle-self.warmup_steps) \
                                    / (self.cur_cycle_steps - self.warmup_steps))) / 2
                    for base_lr in self.base_lrs]

    def step(self, epoch=None):
        if epoch is None:
            epoch = self.last_epoch + 1
            self.step_in_cycle = self.step_in_cycle + 1
            if self.step_in_cycle >= self.cur_cycle_steps:
                self.cycle += 1
                self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps
                self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps
        else:
            if epoch >= self.first_cycle_steps:
                if self.cycle_mult == 1.:
                    self.step_in_cycle = epoch % self.first_cycle_steps
                    self.cycle = epoch // self.first_cycle_steps
                else:
                    n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult))
                    self.cycle = n
                    self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1))
                    self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (n)
            else:
                self.cur_cycle_steps = self.first_cycle_steps
                self.step_in_cycle = epoch
                
        self.max_lr = self.base_max_lr * (self.gamma**self.cycle)
        self.last_epoch = math.floor(epoch)
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

In [6]:
CONFIG_PATH = "proj_config.yaml"
with open(CONFIG_PATH, 'r') as stream:
    CONFIG = yaml.safe_load(stream)

In [6]:
def spec_augment(spec: np.ndarray, num_mask=2, freq_masking_max_percentage=0.05, time_masking_max_percentage=0.1):
    spec = spec.copy()
    for i in range(num_mask):
        num_freqs, num_frames = spec.shape
        freq_percentage = random.uniform(0.0, freq_masking_max_percentage)
        time_percentage = random.uniform(0.0, time_masking_max_percentage)
        
        num_freqs_to_mask = int(freq_percentage * num_freqs)
        num_frames_to_mask = int(time_percentage * num_frames)
        
        t0 = int(np.random.uniform(low=0.0, high=num_frames - num_frames_to_mask))
        f0 = int(np.random.uniform(low=0.0, high=num_freqs - num_freqs_to_mask))
        
        spec[:, t0:t0 + num_frames_to_mask] = 0      
        spec[f0:f0 + num_freqs_to_mask, :] = 0 
        
    return spec

In [7]:
def uniform_len(mel, input_len):
    mel_len = mel.shape[-1]
    if mel_len > input_len:
        diff = mel_len - input_len
        start = np.random.randint(diff)
        end = start + input_len
        mel = mel[:, start: end]
    elif mel_len < input_len:
        diff = input_len - mel_len
        offset = np.random.randint(diff)
        offset_right = diff - offset
        mel = np.pad(
            mel,
            ((0, 0), (offset, offset_right)),
            "symmetric",  # constant
        )
    return mel


class TorqueDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, data, mel_logs, labels=None, transform=None):
        """Init Dataset"""
        self.mel_logs = mel_logs
        self.data = data
        self.labels = labels
        self.transform = transform
        self.input_len = CONFIG['mel']['mel_len']
        self.mode = 'test' if self.labels is None else 'train'

    def __len__(self):
        """Length"""
        return len(self.mel_logs)

    def __getitem__(self, index):
        """Generates one sample of data"""
        table_data = self.data[[index]]

        label = None
        if self.mode == 'train':
            label = self.labels[[index]]

        mel_data = uniform_len(self.mel_logs[index], self.input_len)
        if self.transform and self.mode == 'train':
            mel_data = self.transform(mel_data)

        mel_data = np.expand_dims(mel_data, axis=0)
        return mel_data, label

In [8]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def seed_everything(seed=1234):
    """Fix random seeds"""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [9]:
def get_model(pretrained_mn3_path="", pretrained_path=""):
    """Load MobilenetV3 model with specified in and out channels"""
    # model = mobilenetv3_small().to(DEVICE)
    model = mobilenetv3_large().to(DEVICE)
    if pretrained_mn3_path and not pretrained_path:
        model.load_state_dict(torch.load(pretrained_mn3_path))

    model.features[0][0].weight.data = torch.sum(
        model.features[0][0].weight.data, dim=1, keepdim=True
    )
    model.features[0][0].in_channels = 1

    model.classifier[-1].weight.data = torch.sum(
        model.classifier[-1].weight.data, dim=0, keepdim=True
    )

    model.classifier[-1].bias.data = torch.sum(
        model.classifier[-1].bias.data, dim=0, keepdim=True
    )
    model.classifier[-1].out_features = 1

    if pretrained_path:
        model.load_state_dict(torch.load(pretrained_path))
    return model


def process_epoch(model, criterion, optimizer, loader):
    """Calc one epoch"""
    losses = []
    y_true = []
    y_pred = []
    with torch.set_grad_enabled(model.training):
        for local_batch, local_labels in loader:
            local_batch, local_labels = \
                local_batch.to(DEVICE), local_labels.to(DEVICE)

            optimizer.zero_grad()
            outputs = model(local_batch)

            loss = criterion(outputs, local_labels)
            if model.training:
                loss.backward()
                optimizer.step()

            losses.append(loss)
            y_true.append(local_labels.detach().cpu().numpy())
            y_pred.append(outputs.data.detach().cpu().numpy())
    loss_train = np.array(losses).astype(np.float32).mean()
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    rmse_train = mean_squared_error(y_true, y_pred, squared=False)
    return loss_train, rmse_train, y_true, y_pred

In [10]:
def train_model(model, criterion, optimizer, scheduler, train_loader, test_loader, n_fold):
    """Training loop"""
    logs = {'loss_train': [], 'loss_val': [], 'mse_train': [], 'mse_val': []}
    best_true = None
    best_pred = None
    for epoch in range(CONFIG['num_epochs']):
        start_time = time.time()
        scheduler.step()

        # Training
        model.train()
        loss_train, mse_train, _, _ = \
            process_epoch(model, criterion, optimizer, train_loader)
        logs['loss_train'].append(loss_train)
        logs['mse_train'].append(mse_train)

        # Validation
        model.eval()
        loss_val, mse_val, y_true, y_pred = \
            process_epoch(model, criterion, optimizer, test_loader)
        logs['loss_val'].append(loss_val)
        logs['mse_val'].append(mse_val)
        print(
            f"Epoch #{epoch + 1}. "
            f"Time: {(time.time() - start_time):.1f}s. "
            f"Train loss: {loss_train:.3f}, train mse: {mse_train:.5f}. "
            f"Val loss: {loss_val:.3f}, val mse: {mse_val:.5f}"
        )
        if mse_val <= np.min(logs['mse_val']):
            if CONFIG['save_model']:
                torch.save(
                    model.state_dict(),
                    os.path.join(
                        CONFIG['model_dir'],
                        f"work_{CONFIG['experiment_name']}_fold{n_fold}.pt"
                    )
                )
            best_true = y_true
            best_pred = y_pred
    return best_true, best_pred


def run_training():
    with open(CONFIG['data_path'], 'rb') as f:
        (data, mel_logs, target) = pickle.load(f)

    folds = KFold(
        n_splits=CONFIG['n_folds'],
        shuffle=True,
        random_state=CONFIG['fold_seed']
    )
    splits = list(folds.split(mel_logs))

    total_rmse = list()

    for n_fold, (train_idx, val_idx) in enumerate(splits):
        print(f"Start #{n_fold + 1} fold")
        train_dataset = TorqueDataset(
            data[train_idx],
            [mel_logs[i] for i in train_idx],
            target[train_idx],
            transform=spec_augment
        )
        val_dataset = TorqueDataset(
            data[val_idx],
            [mel_logs[i] for i in val_idx],
            target[val_idx]
        )
        train_loader = DataLoader(train_dataset, **CONFIG['loader_params'])
        val_loader = DataLoader(val_dataset, **CONFIG['loader_params'])

        model = get_model(CONFIG['pretrained_path'])
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), CONFIG['lr'])
        scheduler = CosineAnnealingWarmupRestarts(optimizer, **CONFIG['scheduler_params'])

        best_true, best_pred = \
            train_model(model, criterion, optimizer, scheduler, train_loader, val_loader, n_fold)

        rmse = mean_squared_error(best_true, best_pred, squared=False)
        print(f"Training done. Best rmse: {rmse}")
        total_rmse.append(rmse)
    print(f"Total rmse: {np.mean(total_rmse)}")
    print(total_rmse)

In [11]:
seed_everything()

In [12]:
CONFIG['loader_params'] = {'batch_size': 16, 'shuffle': True, 'num_workers': 4}
CONFIG['lr'] = 0.0001
CONFIG['num_epochs'] = 40

CONFIG['pretrained_path'] = './pretrained/mobilenetv3-large-1cd25616.pth'

CONFIG['scheduler_params'] = {'first_cycle_steps':20,
                            'cycle_mult':1.0,
                            'max_lr':CONFIG['lr'] * 6,
                            'min_lr':CONFIG['lr'] / 8,
                            'warmup_steps':5,
                            'gamma':0.9}

CONFIG['experiment_name'] = 'one_cosine'

CONFIG['freq_masking_max_percentage'] = 0.15
CONFIG['time_masking_max_percentage'] = 0

In [16]:
run_training() 

# Total rmse: 22.120615005493164
# [22.944857, 20.443523, 19.873909, 23.853697, 20.806335, 21.964138, 20.037605, 22.54679, 24.713093, 24.0222]

Start #1 fold
Epoch #1. Time: 6.4s. Train loss: 5768.606, train mse: 76.07491. Val loss: 4435.967, val mse: 66.72330
Epoch #2. Time: 6.4s. Train loss: 1252.637, train mse: 35.36698. Val loss: 847.327, val mse: 29.37916
Epoch #3. Time: 6.5s. Train loss: 874.189, train mse: 29.50181. Val loss: 833.632, val mse: 28.86547
Epoch #4. Time: 6.4s. Train loss: 870.914, train mse: 29.52761. Val loss: 816.574, val mse: 28.10573
Epoch #5. Time: 6.4s. Train loss: 823.081, train mse: 28.69410. Val loss: 800.026, val mse: 28.13097
Epoch #6. Time: 6.4s. Train loss: 772.562, train mse: 27.78125. Val loss: 810.655, val mse: 28.92309
Epoch #7. Time: 6.4s. Train loss: 754.283, train mse: 27.51840. Val loss: 761.099, val mse: 27.84222
Epoch #8. Time: 6.4s. Train loss: 743.080, train mse: 27.24125. Val loss: 733.774, val mse: 26.69363
Epoch #9. Time: 6.4s. Train loss: 695.480, train mse: 26.33726. Val loss: 697.167, val mse: 26.49322
Epoch #10. Time: 6.3s. Train loss: 637.381, train mse: 25.25855. Val loss: