### 基本の設定

### Seed の固定

再現性を確保するため、各ライブラリの乱数 seed を固定します。

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
import os
import random

import numpy as np
import torch


def seed_torch(seed=330):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch (CPU and all
    CUDA devices), and asks cuDNN for deterministic kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # NOTE: hash randomization of the already-running interpreter is decided at
    # start-up; this variable only influences processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    print("*** os.environ['PYTHONHASHSEED']  = ", os.environ['PYTHONHASHSEED'])
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also cover multi-GPU runtimes; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels between runs, so keep it off.
    torch.backends.cudnn.benchmark = False


# Actually apply the seeding (a bare `seed_torch` only evaluates the function object).
seed_torch()

import matplotlib.pyplot as plt

import os

import pandas as pd
import numpy as np
from glob import  glob

### 各種ディレクトリの定義

In [4]:
# Root folder of the competition workspace; every other path hangs off it.
dataset_root = '/content/drive/MyDrive/atmacup11/'
assert dataset_root is not None

# Input tables live in "datasets", the images in its "photos" sub-folder.
input_dir = os.path.join(dataset_root, "datasets")
photo_dir = os.path.join(input_dir, "photos")

# Everything this notebook writes goes below the output folder (created on demand).
output_dir = os.path.join(dataset_root, "outputs_nb010")
os.makedirs(output_dir, exist_ok=True)

# Load the four CSV tables in one pass, in the same order as they are named.
train_df, test_df, material_df, technique_df = (
    pd.read_csv(os.path.join(input_dir, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'materials.csv', 'techniques.csv')
)

RANDOM_SEED = 330

In [5]:
class Config:
    """Namespace for run-level constants of this notebook."""
    # Presumably the number of cross-validation folds — not referenced by the
    # visible cells; confirm against the training loop.
    N_FOLDS = 5
    # Presumably the number of training epochs per fold — not referenced by the
    # visible cells; confirm against the training loop.
    N_EPOCHS = 100

### StratifiedGroupKFoldの実装

In [6]:
import random
from sklearn.model_selection import GroupKFold
from collections import Counter, defaultdict


def Count_y(y, groups):
    """Tally how often each label occurs inside each group.

    Args:
        y: Sequence of non-negative integer labels.
        groups: Sequence of group identifiers, aligned element-wise with ``y``.

    Returns:
        A ``defaultdict`` mapping a group identifier to a float array of
        length ``max(y) + 1`` whose i-th entry is the number of samples with
        label ``i`` in that group. Unknown groups yield an all-zero array.
    """
    n_labels = 1 + np.max(y)

    def _empty_counts():
        # Fresh zero vector for a group that has not been seen yet.
        return np.zeros(n_labels)

    counts_by_group = defaultdict(_empty_counts)
    for group_id, label in zip(groups, y):
        counts_by_group[group_id][label] += 1

    return counts_by_group


def StratifiedGroupKFold(X, y, groups, features, k, seed = None):
    """Yield group-disjoint folds whose validation part mimics the train label ratio.

    The data is first split with ``GroupKFold``. For every fold the validation
    candidates are then re-sampled per label so that the label proportions
    follow those of the training part.

    Args:
        X: Samples (only used to drive ``GroupKFold.split``).
        y: Non-negative integer labels aligned with ``X`` (Series or array).
        groups: Group identifier of every sample, aligned with ``X``.
        features: Unused; kept so existing callers keep working.
        k: Number of folds.
        seed: Seed passed to ``np.random.seed`` before sampling each fold.

    Yields:
        ``(train_idx, val_idx)`` — both arrays of *positional* indices, so they
        can be used with ``.iloc`` / NumPy indexing regardless of X's index.
    """
    # Preparation
    max_y = np.max(y)
    n_classes = max_y + 1
    y_counts_per_group = Count_y(y, groups)
    y_values = np.asarray(y)
    group_values = np.asarray(groups)
    kf = GroupKFold(n_splits=k)

    for train_idx, fold_val_idx in kf.split(X, y, groups):
        y_val = y_values[fold_val_idx]

        # Label counts of the training part and of the validation candidates,
        # accumulated from the per-group tallies (keyed by the values in `groups`).
        y_counts_train = np.zeros(n_classes)
        y_counts_val = np.zeros(n_classes)
        for id_ in np.unique(group_values[train_idx]):
            y_counts_train += y_counts_per_group[id_]
        for id_ in np.unique(group_values[fold_val_idx]):
            y_counts_val += y_counts_per_group[id_]

        # Scale the train label ratio so that the most frequent train label
        # keeps all of its validation candidates.
        numratio_train = y_counts_train / np.max(y_counts_train)
        stratified_count = np.ceil(y_counts_val[np.argmax(y_counts_train)] * numratio_train)
        stratified_count = stratified_count.astype(int)

        # Select validation samples randomly, label by label.
        # The global NumPy RNG is re-seeded per fold, as before.
        np.random.seed(seed)
        picked = []
        for num in range(n_classes):
            # Positional indices (not index labels) of the candidates with this label.
            candidates = fold_val_idx[y_val == num]
            if candidates.size == 0:
                # Label absent from this fold: nothing to draw from.
                continue
            # NOTE(review): sampling is with replacement, so a validation row
            # can appear more than once — confirm this is intended.
            picked.append(np.random.choice(candidates, stratified_count[num]))
        val_idx = np.concatenate(picked) if picked else np.array([])
        val_idx = val_idx.astype(int)

        yield train_idx, val_idx

### 画像データの読み込み

In [7]:
from PIL import Image

def to_img_path(object_id):
    """Return the path of the JPEG file for ``object_id`` inside ``photo_dir``."""
    file_name = f'{object_id}.jpg'
    return os.path.join(photo_dir, file_name)

def read_image(object_id):
    """Open the photo belonging to ``object_id`` and return the PIL image."""
    img_path = to_img_path(object_id)
    return Image.open(img_path)
import torch
from torch import nn
from torch.optim import Adam
from torch.optim.optimizer import Optimizer
from torch.utils import data

# torchvision
from torchvision import transforms as T
from torchvision.models import resnet34

# scikit-learn
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [8]:
# Per-channel mean / std handed to T.Normalize below.
# NOTE(review): these look like the usual ImageNet statistics — confirm.
IMG_MEAN = [0.485, 0.456, 0.406]
IMG_STD = [0.229, 0.224, 0.225]

class AtmaDataset(data.Dataset):
    """Dataset that loads atmaCup images and their (optional) labels."""
    # Column holding the image file path.
    object_path_key = "object_path"
    # Column holding the regression target.
    label_key = "target"

    @property
    def meta_keys(self):
        """Column names this dataset reads; the label column only in train mode."""
        retval = [self.object_path_key]

        if self.is_train:
            retval += [self.label_key]

        return retval

    def __init__(self, meta_df: pd.DataFrame, is_train=True):
        """
        args:
            meta_df:
                Dataframe with one row per image. The image path must be in
                ``object_path``. ``target`` is optional; rows without it get
                the placeholder label -1 in ``__getitem__``.

            is_train:
                When True, the random training augmentations are applied.
                When False, the image is only resized to 224x224.
        """

        self.is_train = is_train
        # Re-index so that positions 0..len-1 are valid lookup keys.
        self.meta_df = meta_df.reset_index(drop=True)
        self.index_to_data = self.meta_df.to_dict(orient="index")

        size = (224, 224)

        additional_items = (
            [T.Resize(size)]
            if not is_train
            else [
                T.RandomGrayscale(p=0.2),
                T.RandomVerticalFlip(),
                T.RandomHorizontalFlip(),
                T.RandomResizedCrop(size),
            ]
        )

        self.transformer = T.Compose(
            [*additional_items, T.ToTensor(), T.Normalize(mean=IMG_MEAN, std=IMG_STD)]
        )

    def __getitem__(self, index):
        """Return ``(image_tensor, label)`` for the row at position ``index``."""
        record = self.index_to_data[index]

        obj_path = record.get(self.object_path_key)
        label = record.get(self.label_key, -1)

        # Force three channels: Normalize uses 3-element mean/std, so grayscale,
        # palette or RGBA files would otherwise fail. The context manager
        # releases the file handle as soon as the pixels are decoded.
        with Image.open(obj_path) as raw_img:
            img = raw_img.convert("RGB")
        img = self.transformer(img)
        return img, label

    def __len__(self):
        """Number of rows in the metadata frame."""
        return len(self.meta_df)

In [9]:
# CUDA is required, so fail early if it is missing. On Google Colab, turn the GPU accelerator on beforehand.
assert torch.cuda.is_available()

# Device that models and batches are moved to throughout the notebook.
DEVICE = torch.device("cuda")

## Train / Validation Phase

In [10]:
def train(
    model: nn.Module,
    optimizer: Optimizer,
    train_loader: data.DataLoader
) -> pd.Series:
    """Run one training epoch and return the averaged metrics.

    Args:
        model: Network to optimise; switched to train mode here.
        optimizer: Optimizer stepping the model parameters.
        train_loader: Loader yielding ``(images, targets)`` batches.

    Returns:
        Series with the mean MSE loss over all batches under ``train_loss``.
    """
    # Train mode enables layers that behave differently while learning
    # (Dropout, BatchNorm, ...).
    model.train()

    loss_fn = nn.MSELoss()

    # Running sums per metric name.
    totals = defaultdict(float)
    n_batches = len(train_loader)

    for images, targets in train_loader:
        images = images.to(DEVICE)
        # Targets become a float column vector to match the model output.
        targets = targets.to(DEVICE).reshape(-1, 1).float()

        batch_loss = loss_fn(model(images), targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # .item() turns the loss tensor into a plain Python float.
        totals["loss"] += batch_loss.item()

    for name in totals:
        totals[name] = totals[name] / n_batches

    # Prefix every entry with "train_" so it can sit next to validation scores.
    return pd.Series(totals).add_prefix("train_")

def predict(model: nn.Module, loader: data.DataLoader) -> np.ndarray:
    """Run the model over ``loader`` and return all outputs as a flat array.

    Args:
        model: Network used for inference; switched to eval mode here.
        loader: Loader yielding ``(images, labels)`` batches; labels are ignored.

    Returns:
        1-D array with the model outputs in loader order.
    """
    # Eval mode switches off training-only behaviour (Dropout, BatchNorm, ...).
    model.eval()

    collected = []

    for images, _ in loader:
        # Inference only: gradients are not needed, so skip tracking them.
        with torch.no_grad():
            batch_out = model(images.to(DEVICE))

        collected.extend(batch_out.data.cpu().numpy())

    return np.array(collected).reshape(-1)


def calculate_metrics(y_true, y_pred) -> dict:
    """Compute evaluation metrics from true and predicted targets.

    Returns:
        Dict with a single entry ``'rmse'``: the square root of the mean
        squared error between ``y_true`` and ``y_pred``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {'rmse': mse ** .5}


def valid(
    model: nn.Module, 
    y_valid: np.ndarray, 
    valid_loader: data.DataLoader
) -> "tuple[pd.Series, np.ndarray]":
    """Validation phase.

    Predicts on ``valid_loader`` with ``model`` and scores the result
    against ``y_valid``.

    Args:
        model: Network to evaluate.
        y_valid: Ground-truth targets, in the same order as ``valid_loader``.
        valid_loader: Loader over the validation samples.

    Returns:
        A pair ``(scores, pred)``: ``scores`` is a Series whose entries are the
        metrics prefixed with ``valid_`` (e.g. ``valid_rmse``); ``pred`` is the
        flat array of predictions.
    """
    pred = predict(model, valid_loader)
    scores = pd.Series(calculate_metrics(y_valid, pred)).add_prefix("valid_")
    return scores, pred

## Run Fold

1. train / valid の loader 作成
2. 以下を epoch 数だけ繰り返す
    1. 学習用データで学習 
    2. 検証用データで検証スコアの算出

In [11]:
def get_output_dir(n_cv: int):
    """Return the artefact folder of fold ``n_cv`` below ``output_dir``."""
    fold_name = f'cv={n_cv}'
    return os.path.join(output_dir, 'simple_resnet', fold_name)

In [12]:
def run_fold(
    model: nn.Module, 
    train_df: pd.DataFrame, 
    valid_df: pd.DataFrame, 
    y_valid: np.ndarray, 
    output_dir: str, 
    n_epochs=30) -> np.ndarray:
    """Train on one train/valid split while validating after every epoch.

    Args:
        model: Network to train (already on the target device).
        train_df: Metadata of the training samples.
        valid_df: Metadata of the validation samples.
        y_valid: Ground-truth targets of ``valid_df``, in the same order.
        output_dir: Folder for this fold's artefacts; created if missing.
        n_epochs: Number of epochs to run.

    Returns:
        Validation predictions of the epoch with the lowest ``valid_rmse``
        (``None`` when no epoch was run).

    Side effects:
        Writes ``model_best.pth`` (weights of the best epoch) and ``score.csv``
        (per-epoch train loss / validation score) into ``output_dir``.
    """

    # 0: preparation — build the data loaders from the dataframes.
    train_dataset = AtmaDataset(meta_df=train_df)
    train_loader = data.DataLoader(
        train_dataset, batch_size=128, shuffle=True, drop_last=True, num_workers=2
    )

    #   : validation uses is_train=False so that augmentation is off.
    valid_dataset = AtmaDataset(meta_df=valid_df, is_train=False)
    valid_loader = data.DataLoader(valid_dataset, batch_size=256, num_workers=2)

    # Create the folder for this fold from the argument (not from a global).
    os.makedirs(output_dir, exist_ok=True)

    # Optimizer definition.
    optimizer = Adam(model.parameters(), lr=1e-3)

    # --- bookkeeping for the best epoch and the score history
    score_rows = []
    valid_score = np.inf
    valid_score_key = "valid_rmse"
    valid_best_pred = None

    for epoch in range(1, n_epochs + 1):
        print(f'|||. start epoch = {epoch}.  |||')

        # 1: train on the training data and collect the training loss.
        score_train = train(model, optimizer, train_loader)

        # 2: compute the score on the validation data.
        score_valid, y_valid_pred = valid(model=model, valid_loader=valid_loader, y_valid=y_valid)

        # --- record training loss and validation score of this epoch.
        # Rows are collected in a list and turned into a frame once at the end,
        # instead of re-concatenating a growing dataframe every epoch.
        row = pd.concat([score_train, score_valid])
        row["epoch"] = epoch
        score_rows.append(row)

        # Compare the current validation score with the best one so far.
        current_score = score_valid[valid_score_key]
        if current_score < valid_score:
            # Save the model whenever the score improved.
            print(f'validation score is improved!! {valid_score:.4f} -> {current_score:.4f}')
            torch.save(
                model.state_dict(), os.path.join(output_dir, 'model_best.pth')
            )
            valid_score = current_score
            valid_best_pred = y_valid_pred

    score_df = pd.DataFrame(score_rows)
    score_df.to_csv(os.path.join(output_dir, 'score.csv'), index=False)
    return valid_best_pred


### その他

モデル作成などの関数定義

In [13]:
def create_model():
    """Build a randomly initialised ResNet-34 with a single-output head.

    The stock classifier is swapped for a 512 -> 1 linear layer so the
    network emits one regression value per image.
    """
    net = resnet34(pretrained=False)
    regression_head = nn.Linear(in_features=512, out_features=1, bias=True)
    net.fc = regression_head
    return net
    
def create_metadata(input_df):
    """Derive the metadata frame consumed by ``AtmaDataset``.

    Returns a new frame with ``object_id``, the image path in ``object_path``
    and, when ``input_df`` has one, the ``target`` column.
    """
    meta_df = input_df[['object_id']].copy()
    meta_df['object_path'] = input_df['object_id'].map(to_img_path)

    has_target = "target" in input_df
    if has_target:
        meta_df["target"] = input_df["target"]

    return meta_df

def run_test_predict(model):
    """Predict the test set once with ``model``, without augmentation."""
    # is_train=False keeps the training-time augmentation switched off.
    test_dataset = AtmaDataset(meta_df=create_metadata(test_df), is_train=False)
    test_loader = data.DataLoader(
        dataset=test_dataset,
        batch_size=128,
        drop_last=False,
        num_workers=2,
    )

    return predict(model, loader=test_loader)

## おさらい

前回の学習のコード

In [14]:
# Metadata of the training set: object_id, image path and (when present) target.
train_meta_df = create_metadata(train_df)

In [15]:
# skf = StratifiedGroupKFold(X, y, groups, features, 5, RANDOM_SEED)
# oof = np.zeros((len(train_df), ), dtype=np.float32)

# for i, (idx_tr, idx_valid) in enumerate(skf):
#     print("******************************")
#     print(f"********  cv = {i}   ********")
#     print("******************************")
#     output_i = get_output_dir(i)
#     model = create_model()
#     model.to(DEVICE)
    
#     # 1. Fold の学習
#     oof_i = run_fold(
#                 model=model, 
#                 train_df=train_meta_df.iloc[idx_tr], 
#                 valid_df=train_meta_df.iloc[idx_valid], 
#                 y_valid=train_meta_df['target'].values[idx_valid],
#                 output_dir = output_i,
#                 n_epochs=100
#             )
    
#     oof[idx_valid] = oof_i


# # 学習が終了したら各foldの検証予測値を使って、train_df['target']とのrmseを計算する
# calculate_metrics(train_df['target'], oof)

### TTA

test time augmentation の略です。推論を行なう際にも Augmentation を行ってその平均を取る方法のことです。同じ画像をちょっとずらしたものを何回も推論して平均することで性能が上がる場合があります。

今回は簡易のため学習時に使うのと同様の augmentation を使っています。(これが一番いいというわけではないです)

In [16]:
from tqdm import tqdm

def run_test_predict_TTA(model, n_tta=0):
    """Predict the test set, optionally averaging over augmented passes.

    Args:
        model: Network used for inference.
        n_tta: Number of augmented passes. With a value <= 0 augmentation is
            off and a single plain pass is made.

    Returns:
        Element-wise mean of the predictions over all passes.
    """
    # Augmentation (is_train=True) is only enabled when n_tta > 0.
    is_tta_mode = n_tta > 0
    n_times = n_tta if is_tta_mode else 1

    test_dataset = AtmaDataset(meta_df=create_metadata(test_df), is_train=is_tta_mode)
    test_loader = data.DataLoader(
        dataset=test_dataset,
        batch_size=128,
        drop_last=False,
        num_workers=2,
    )

    print(f"run #{n_times} times / tta={is_tta_mode}")
    predictions = [predict(model, loader=test_loader) for _ in tqdm(range(n_times))]

    # axis=0 averages across passes, keeping one value per test sample.
    return np.array(predictions).mean(axis=0)

## TTA 5回 (各 fold のモデルごと)

In [18]:
# Predict the test data with TTA, using the best checkpoint of every fold.
N_TTA = 5  # augmented passes per fold model
test_predictions = []

for i in range(Config.N_FOLDS):
    output_i = get_output_dir(i)

    # Same architecture as used for training, built by the shared helper.
    model = create_model()

    model_path = os.path.join(output_i, 'model_best.pth')
    # map_location loads the checkpoint straight onto the device in use.
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    y_pred_i = run_test_predict_TTA(model, N_TTA)
    test_predictions.append(y_pred_i)

# Build the submission file from the mean over all fold predictions.
pred_mean = np.array(test_predictions).mean(axis=0)

pd.DataFrame({
    "target": pred_mean
}).to_csv(os.path.join(output_dir, "013__submission.csv"), index=False)

  0%|          | 0/5 [00:00<?, ?it/s]

run #5 times / tta=True


100%|██████████| 5/5 [01:32<00:00, 18.51s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

run #5 times / tta=True


100%|██████████| 5/5 [01:31<00:00, 18.30s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

run #5 times / tta=True


100%|██████████| 5/5 [01:33<00:00, 18.72s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

run #5 times / tta=True


100%|██████████| 5/5 [01:31<00:00, 18.23s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

run #5 times / tta=True


100%|██████████| 5/5 [01:32<00:00, 18.47s/it]
