# 価格予測モデルのBaseline  
- CNNを用いたモデルを作成する．  
- 価格予測とクラス分類でタスクが大きく異なるので，imagenetで学習したモデルを用いないものを最初に作成する．  
- サイトに掲載される画像を教師データとしており，画像を大きく回転させるなどのデータ拡張は不要と考えられるため，そのような前処理は行わない．  
- 損失関数にはmaeもしくはrmseを用いる．  

## モデルの構築  
- EfficientNetB0（未学習）を用いて特徴量を抽出．  
- num_sales, コレクション名のone-hotベクトルを抽出した特徴量に結合．  
- 全結合層を重ねて出力．  

## 結果
タスクA  
使用データ: タスクA用のデータすべて+タスクB用のデータ10%のランダムサンプリング  
- RMSE: 0.843  
- MAE: 0.472  

タスクB  
使用データ: タスクB用のデータの中でイーサリアムが10以下のもの（10を超えるものは全体の1%程度だったため今回は外れ値として外している）  
-> 次回以降はRMSLEを用いるためこれらも含めて学習予定  
- RMSE: 0.499  
- MAE: 0.208  

タスクAに関してはデータ不足の可能性が考えられるため，特徴量抽出とともにデータを追加で収集する．  

In [1]:
import os
from typing import List, Optional, Tuple
import math

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import cv2
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models
import tensorflow.keras.losses as losses
import tensorflow.keras.optimizers as optim
import tensorflow.keras.activations as activations
from tensorflow.keras.utils import Sequence
import tensorflow.keras.callbacks as callbacks
from tensorflow.keras.applications import EfficientNetB0 as efn
import cloudpickle

2021-11-02 21:02:10.989431: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/lib/python3.9/site-packages/cv2/../../lib64:
2021-11-02 21:02:10.989550: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Locations of the image folders and of the metadata tables of both tasks.
A_IMGPATH = "data/taskA/img"
A_DFPATH = "data/taskA/table"
B_IMGPATH = "data/taskB/img"
B_DFPATH = "data/taskB/table"
asset_df_A = pd.read_csv(os.path.join(A_DFPATH, "asset_data.csv"))
asset_df_B = pd.read_csv(os.path.join(B_DFPATH, "asset_data.csv"))

# One-hot encode the collection name of every task-A asset.  The dtype is
# pinned because pandas >= 2.0 returns bool columns by default, and a bool
# column mixed with the float 'num_sales' column yields an object array.
collection_dummies = pd.get_dummies(asset_df_A['asset_contract.name'],
                                    dtype=np.uint8)
asset_df_A = pd.concat((asset_df_A, collection_dummies), axis=1)

# Task-B assets receive the same collection columns, all set to zero.  The
# columns are taken from the dummy frame itself instead of a positional slice
# of asset_df_A, so a change in the CSV layout cannot select wrong columns.
asset_df_B[list(collection_dummies.columns)] = 0

# Align the column names used by both tasks.
asset_df_B = asset_df_B.rename(columns={"asset.num_sales": "num_sales"})
asset_df_A = asset_df_A.rename(columns={"last_sale.total_price": "current_price"})
asset_df_A['current_price'] = asset_df_A['current_price'].astype(float)

# Path of every image, built from the image folder and the file name.
asset_df_A["full_path"] =\
    asset_df_A["image_id"].apply(lambda name: A_IMGPATH + "/" + name)
asset_df_B["full_path"] =\
    asset_df_B["image_id"].apply(lambda name: B_IMGPATH + "/" + name)

print(f"data shape: {asset_df_B.shape}")
asset_df_B.head(10)

data shape: (37306, 23)


Unnamed: 0,id,asset.image_url,base_price,current_price,payment_token,quantity,num_sales,asset.id,asset.token_id,asset.asset_contract.address,...,Axie,BoredApeYachtClub,CryptoPunks,CyberKongz,Doodles,GalaxyEggs,Jungle Freaks,KaijuKingz,Sneaky Vampire Syndicate,full_path
0,1314403002,https://lh3.googleusercontent.com/7X7Jh0Qj98Qq...,945400000000000000,9.454e+17,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,1,2.0,50000279.0,4301,0x219b8ab790decc32444a6600971c7c3718252539,...,0,0,0,0,0,0,0,0,0,data/taskB/img/0.png
1,1314402997,https://lh3.googleusercontent.com/xryDC3BXvKyE...,115999999999999990,1.16e+17,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,1,0.0,76984382.0,2324,0x99654fd49c0e51b8029d2ba7de5b99734ab7afec,...,0,0,0,0,0,0,0,0,0,data/taskB/img/1.png
2,1314402998,https://lh3.googleusercontent.com/8xqBxjjAyDDi...,3257200000000000000,3.2572e+18,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,1,1.0,42197025.0,3132,0x60e4d786628fea6478f785a6d7e704777c86a7c6,...,0,0,0,0,0,0,0,0,0,data/taskB/img/2.png
3,1314402996,https://lh3.googleusercontent.com/76phNq05wbNE...,762700000000000000,7.627e+17,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,1,4.0,38106506.0,4261,0x3bf2922f4520a8ba0c2efc3d2a1539678dad5e9d,...,0,0,0,0,0,0,0,0,0,data/taskB/img/3.png
4,1314402994,https://lh3.googleusercontent.com/P5uwk09Mrazk...,100900000000000000,1.009e+17,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,1,3.0,62867754.0,3018,0x454cbc099079dc38b145e37e982e524af3279c44,...,0,0,0,0,0,0,0,0,0,data/taskB/img/4.png
5,1314402995,https://lh3.googleusercontent.com/nAgT63jOzUJw...,1747000000000000000,1.747e+18,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,1,2.0,42171574.0,9734,0xc92ceddfb8dd984a89fb494c376f9a48b999aafc,...,0,0,0,0,0,0,0,0,0,data/taskB/img/5.png
6,1314402993,https://lh3.googleusercontent.com/YHxhR8vnrAU0...,275100000000000000,2.751e+17,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,1,2.0,55557220.0,3592,0x8cd8155e1af6ad31dd9eec2ced37e04145acfcb3,...,0,0,0,0,0,0,0,0,0,data/taskB/img/6.png
7,1314402990,https://lh3.googleusercontent.com/F8QuEmWdc_5k...,2415000000000000000,2.415e+18,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,1,0.0,23301696.0,3042,0x7bd29408f11d2bfc23c34f18275bbf23bb716bc7,...,0,0,0,0,0,0,0,0,0,data/taskB/img/7.png
8,1314402991,https://lh3.googleusercontent.com/c1rbXbpDbGf0...,350100000000000000,3.501e+17,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,1,0.0,59778734.0,14856,0x4b3406a41399c7fd2ba65cbc93697ad9e7ea61e5,...,0,0,0,0,0,0,0,0,0,data/taskB/img/8.png
9,1314402992,https://lh3.googleusercontent.com/1GmOtUhjkFLk...,635100000000000000,6.351e+17,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,1,1.0,36793170.0,8168,0x2acab3dea77832c09420663b0e1cb386031ba17b,...,0,0,0,0,0,0,0,0,0,data/taskB/img/9.png


In [3]:
# Tabular columns that are concatenated with the image features:
# one indicator column per known collection, followed by the sales count.
meta_features = [
    'Axie',
    'BoredApeYachtClub',
    'CryptoPunks',
    'CyberKongz',
    'Doodles',
    'GalaxyEggs',
    'Jungle Freaks',
    'KaijuKingz',
    'Sneaky Vampire Syndicate',
    'num_sales',
]

## Helper functions  

### Data Loader  

In [22]:
class FullPathDataLoader(Sequence):
    """
    Keras ``Sequence`` that serves batches of images, meta data and targets.

    Every entry of 'path_list' is handed to ``cv2.imread`` unchanged, so the
    paths must already be complete (absolute, or relative to the working
    directory).
    """

    def __init__(self, path_list: np.ndarray, meta_data: np.ndarray,
                 target: Optional[np.ndarray], batch_size: int, width: int = 256,
                 height: int = 256, resize: bool = True,
                 shuffle: bool = True, is_train: bool = True):
        """
        Store the data and the batching options.

        Parameters
        ----------
        path_list : np.ndarray[str]
            Complete paths of the images.
        meta_data : np.ndarray
            Meta data rows, one per image, in the same order as 'path_list'.
        target : np.ndarray or None
            Target values, one per image. Only used when 'is_train' is True.
        batch_size : int
            Number of samples per batch.
        width : int
            Width of the resized image.
        height : int
            Height of the resized image.
        resize : bool
            Whether images are resized to ('width', 'height').
        shuffle : bool
            Whether the samples are shuffled at the end of every epoch.
            Only takes effect when 'is_train' is True.
        is_train : bool
            True if batches must contain targets (training / validation).
            Set it to False for prediction.

        Raises
        ------
        ValueError
            If 'is_train' is True but 'target' is None.
        """
        # Newer Keras versions expect Sequence subclasses to call the base
        # initializer; on older versions this is a no-op.
        super().__init__()
        if is_train and target is None:
            raise ValueError("'target' is required when 'is_train' is True.")

        # _shuffle relies on NumPy fancy indexing, so sequences are converted.
        self.path_list = np.asarray(path_list)
        self.meta_data = np.asarray(meta_data)
        self.target = np.asarray(target) if is_train else None
        self.batch_size = batch_size
        self.width = width
        self.height = height
        self.resize = resize
        self.shuffle = shuffle
        self.is_train = is_train
        # Number of batches per epoch; the last batch may be smaller.
        self.length = math.ceil(len(self.path_list) / self.batch_size)

    def __len__(self):
        """
        Returns
        -------
        self.length : number of batches per epoch
        """
        return self.length

    def get_img(self, path_list: np.ndarray):
        """
        Load the images of one batch and scale the pixel values by 1/255.

        Parameters
        ----------
        path_list : np.ndarray
            Complete paths of the images of the batch.

        Returns
        -------
        img_list : np.ndarray
            The stacked image data. Each image has the shape
            (height, width, 3) if 'resize'=True.

        Raises
        ------
        OSError
            If an image cannot be read.
        """
        img_list = []
        for path in path_list:
            img = cv2.imread(path)
            if img is None:
                # cv2.imread signals failure by returning None, not by raising.
                raise OSError(f"Could not read image: {path}")
            if self.resize:
                img = cv2.resize(img, (self.width, self.height))
            img_list.append(img / 255.)

        return np.array(img_list)

    def _shuffle(self):
        """
        Shuffle path_list and meta_data with one shared permutation.
        If 'is_train' is True, target is shuffled with the same permutation.
        """
        idx = np.random.permutation(len(self.path_list))
        self.path_list = self.path_list[idx]
        self.meta_data = self.meta_data[idx]
        if self.is_train:
            self.target = self.target[idx]

    def __getitem__(self, idx):
        """
        Return batch number 'idx'.

        Returns
        -------
        ((images, meta), target) if 'is_train' is True,
        otherwise the 1-tuple ((images, meta),).
        """
        batch = slice(self.batch_size * idx, self.batch_size * (idx + 1))
        img_list = self.get_img(self.path_list[batch])
        meta = self.meta_data[batch]
        if self.is_train:
            return (img_list, meta), self.target[batch]
        return ((img_list, meta),)

    def on_epoch_end(self):
        """
        Reshuffle the samples after an epoch.

        Prediction loaders ('is_train'=False) are never shuffled, and
        'shuffle'=False switches the shuffling off for training loaders too.
        """
        if self.shuffle and self.is_train:
            self._shuffle()

In [4]:
class DataLoader(Sequence):
    """
    Keras ``Sequence`` that serves batches of images, meta data and targets.

    Image paths are given relative to the directory 'dir_name'.
    """

    def __init__(self, dir_name: str, path_list: np.ndarray, meta_data: np.ndarray,
                 target: Optional[np.ndarray], batch_size: int, width: int = 256,
                 height: int = 256, resize: bool = True,
                 shuffle: bool = True, is_train: bool = True):
        """
        Store the data and the batching options.

        Parameters
        ----------
        dir_name : str
            Name of the directory that contains the image data.
        path_list : np.ndarray[str]
            Paths of the images relative to 'dir_name'.
        meta_data : np.ndarray
            Meta data rows, one per image, in the same order as 'path_list'.
        target : np.ndarray or None
            Target values, one per image. Only used when 'is_train' is True.
        batch_size : int
            Number of samples per batch.
        width : int
            Width of the resized image.
        height : int
            Height of the resized image.
        resize : bool
            Whether images are resized to ('width', 'height').
        shuffle : bool
            Whether the samples are shuffled at the end of every epoch.
        is_train : bool
            True if batches must contain targets (training / validation).
            Set it to False for prediction.

        Raises
        ------
        ValueError
            If 'is_train' is True but 'target' is None.
        """
        # Newer Keras versions expect Sequence subclasses to call the base
        # initializer; on older versions this is a no-op.
        super().__init__()
        if is_train and target is None:
            raise ValueError("'target' is required when 'is_train' is True.")

        self.dir_name = dir_name
        # _shuffle relies on NumPy fancy indexing, so sequences are converted.
        self.path_list = np.asarray(path_list)
        self.meta_data = np.asarray(meta_data)
        self.target = np.asarray(target) if is_train else None
        self.batch_size = batch_size
        self.width = width
        self.height = height
        self.resize = resize
        self.shuffle = shuffle
        self.is_train = is_train
        # Number of batches per epoch; the last batch may be smaller.
        self.length = math.ceil(len(self.path_list) / self.batch_size)

    def __len__(self):
        """
        Returns
        -------
        self.length : number of batches per epoch
        """
        return self.length

    def get_img(self, path_list: np.ndarray):
        """
        Load the images of one batch and scale the pixel values by 1/255.

        Parameters
        ----------
        path_list : np.ndarray
            Image paths of the batch, relative to the directory 'dir_name'.

        Returns
        -------
        img_list : np.ndarray
            The stacked image data. Each image has the shape
            (height, width, 3) if 'resize'=True.

        Raises
        ------
        OSError
            If an image cannot be read.
        """
        img_list = []
        for path in path_list:
            full_path = os.path.join(self.dir_name, path)
            img = cv2.imread(full_path)
            if img is None:
                # cv2.imread signals failure by returning None, not by raising.
                raise OSError(f"Could not read image: {full_path}")
            if self.resize:
                img = cv2.resize(img, (self.width, self.height))
            img_list.append(img / 255.)

        return np.array(img_list)

    def _shuffle(self):
        """
        Shuffle path_list and meta_data with one shared permutation.
        If 'is_train' is True, target is shuffled with the same permutation.
        """
        idx = np.random.permutation(len(self.path_list))
        self.path_list = self.path_list[idx]
        self.meta_data = self.meta_data[idx]
        if self.is_train:
            self.target = self.target[idx]

    def __getitem__(self, idx):
        """
        Return batch number 'idx'.

        Returns
        -------
        ((images, meta), target) if 'is_train' is True,
        otherwise the 1-tuple ((images, meta),).
        """
        batch = slice(self.batch_size * idx, self.batch_size * (idx + 1))
        img_list = self.get_img(self.path_list[batch])
        meta = self.meta_data[batch]
        if self.is_train:
            return (img_list, meta), self.target[batch]
        return ((img_list, meta),)

    def on_epoch_end(self):
        """Reshuffle the samples after an epoch if 'shuffle' is True."""
        if self.shuffle:
            self._shuffle()

### create model function  

In [13]:
def create_model(input_shape: Tuple[int, ...], meta_shape: int,
                 output_shape: int, activation,
                 learning_rate: float = 0.001,
                 loss=None) -> models.Model:
    """
    Build and compile the two-input price regression model.

    The image goes through an EfficientNetB0 without pretrained weights and
    global average pooling. The pooled features are concatenated with the
    meta data and passed through two Dense -> BatchNormalization ->
    Activation stages (128 and 64 units) and a final linear Dense layer.

    Parameters
    ----------
    input_shape : tuple of int
        Shape of one image, e.g. (256, 256, 3).
    meta_shape : int
        Number of meta data features. A shape tuple is accepted as well.
    output_shape : int
        Number of units of the output layer.
    activation
        Activation applied after each batch normalization.
    learning_rate : float
        Learning rate of the SGD optimizer (momentum 0.9).
    loss
        Loss passed to ``model.compile``. Defaults to mean absolute error.

    Returns
    -------
    model : models.Model
        Compiled model taking [image, meta] inputs.
    """
    # Input() expects a shape tuple; turn a bare feature count into one so the
    # call does not depend on Keras accepting an int.
    if isinstance(meta_shape, (int, np.integer)):
        meta_shape = (int(meta_shape),)
    if loss is None:
        loss = losses.mean_absolute_error

    # Image branch.
    # NOTE(review): the Keras EfficientNet docs describe built-in input
    # rescaling for [0, 255] pixels, while the data loaders already divide by
    # 255 -- confirm whether the double scaling is intended.
    inputs = layers.Input(shape=input_shape)
    efn_model = efn(include_top=False, input_shape=input_shape,
                    weights=None)(inputs)
    ga = layers.GlobalAveragePooling2D()(efn_model)

    # Meta data branch, merged with the pooled image features.
    meta_inputs = layers.Input(shape=meta_shape)
    concate = layers.Concatenate()([ga, meta_inputs])

    # Fully connected head.
    dense1 = layers.Dense(units=128)(concate)
    bn1 = layers.BatchNormalization()(dense1)
    av1 = layers.Activation(activation)(bn1)
    dense2 = layers.Dense(units=64)(av1)
    bn2 = layers.BatchNormalization()(dense2)
    av2 = layers.Activation(activation)(bn2)
    outputs = layers.Dense(output_shape)(av2)

    model = models.Model(inputs=[inputs, meta_inputs], outputs=[outputs])
    model.compile(loss=loss,
                  optimizer=optim.SGD(learning_rate=learning_rate, momentum=0.9),
                  metrics=['mae', 'mse'])
    return model

## training model (task B)  

In [7]:
# Task B is trained on listings priced at 10 ETH (1e+19 wei) or less.
# The filter is evaluated once and reused for all three arrays.
task_b_df = asset_df_B.query("current_price <= 1e+19")
path_list = task_b_df['image_id'].values
meta_data = task_b_df[meta_features].values
# Prices are stored in wei; the model is trained on ETH.
target = task_b_df['current_price'].values * 1e-18

train_path, val_path, train_meta, val_meta, train_y, val_y =\
    train_test_split(path_list, meta_data, target, test_size=0.1, random_state=6174)

train_gen = DataLoader(dir_name=B_IMGPATH, path_list=train_path,
                       meta_data=train_meta, target=train_y,
                       batch_size=16)
# Validation data does not need to be reshuffled between epochs.
val_gen = DataLoader(dir_name=B_IMGPATH, path_list=val_path,
                     meta_data=val_meta, target=val_y,
                     batch_size=1, shuffle=False)
model = create_model(input_shape=(256, 256, 3), meta_shape=len(meta_features),
                     output_shape=1, activation=activations.gelu,
                     learning_rate=0.001)

ES = callbacks.EarlyStopping(monitor='val_loss', patience=5,
                             restore_best_weights=True)
MC = callbacks.ModelCheckpoint(filepath="./baseline.h5", monitor='val_loss',
                               save_best_only=True)

print("starting training")
print('*' + '-' * 30 + '*')

# The batch size is defined by the Sequence itself, so it is not passed to
# fit() (the Keras docs say not to specify it for Sequence inputs).
model.fit(train_gen, epochs=200,
          validation_data=val_gen, callbacks=[ES, MC])

print("ending training")
print('*' + '-' * 30 + '*' + '\n')

# Unshuffled, target-free loader so the predictions line up with val_y.
val_gen = DataLoader(dir_name=B_IMGPATH, path_list=val_path,
                     meta_data=val_meta, target=None,
                     batch_size=1, shuffle=False, is_train=False)
pred = model.predict(val_gen).reshape(-1)
mae = mean_absolute_error(val_y, pred)
rmse = np.sqrt(mean_squared_error(val_y, pred))

print(f"RMSE Score: {rmse}")
print(f"MAE Score: {mae}")

starting training
*------------------------------*
Epoch 1/200




Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
ending training
*------------------------------*

RMSE Score: 0.49922009596614036
MAE Score: 0.20840772779662736


## training model (task A)  

In [36]:
# Task A uses every task-A asset plus a random 10% sample of the task-B
# assets. The sample is seeded so that the experiment can be reproduced.
sample_df = asset_df_B.sample(frac=0.1, random_state=6174)

path_list = np.concatenate(
    (asset_df_A['full_path'].values, sample_df['full_path'].values)
)
# Cast to float32 so the matrix is numeric even if the one-hot columns and
# 'num_sales' have different dtypes.
meta_data = np.concatenate(
    (asset_df_A[meta_features].values, sample_df[meta_features].values)
).astype(np.float32)
# Prices are stored in wei; the model is trained on ETH.
target = np.concatenate(
    (asset_df_A['current_price'].values, sample_df['current_price'].values)
) * 1e-18

train_path, val_path, train_meta, val_meta, train_y, val_y =\
    train_test_split(path_list, meta_data, target, test_size=0.1, random_state=6174)

train_gen = FullPathDataLoader(path_list=train_path,
                               meta_data=train_meta, target=train_y,
                               batch_size=16)
# Validation data does not need to be reshuffled between epochs.
val_gen = FullPathDataLoader(path_list=val_path,
                             meta_data=val_meta, target=val_y,
                             batch_size=1, shuffle=False)
model = create_model(input_shape=(256, 256, 3), meta_shape=len(meta_features),
                     output_shape=1, activation=activations.gelu,
                     learning_rate=0.001)

ES = callbacks.EarlyStopping(monitor='val_loss', patience=5,
                             restore_best_weights=True)
MC = callbacks.ModelCheckpoint(filepath="./baselineA.h5", monitor='val_loss',
                               save_best_only=True)

print("starting training")
print('*' + '-' * 30 + '*')

# The batch size is defined by the Sequence itself, so it is not passed to
# fit() (the Keras docs say not to specify it for Sequence inputs).
model.fit(train_gen, epochs=200,
          validation_data=val_gen, callbacks=[ES, MC])

print("ending training")
print('*' + '-' * 30 + '*' + '\n')

# Unshuffled, target-free loader so the predictions line up with val_y.
val_gen = FullPathDataLoader(path_list=val_path,
                             meta_data=val_meta, target=None,
                             batch_size=1, shuffle=False, is_train=False)
pred = model.predict(val_gen).reshape(-1)
mae = mean_absolute_error(val_y, pred)
rmse = np.sqrt(mean_squared_error(val_y, pred))

print(f"RMSE Score: {rmse}")
print(f"MAE Score: {mae}")

starting training
*------------------------------*
Epoch 1/200




Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
ending training
*------------------------------*

RMSE Score: 0.8425242752012974
MAE Score: 0.4717933197680187


In [150]:
class NFTModel:
    """
    Inference wrapper around a saved price prediction model.

    The Keras model is loaded from 'model_path' on the first call of
    ``predict`` and reused afterwards. Only the path is kept when the object
    is pickled, so the wrapper stays cheap to serialize.
    """

    def __init__(self, model_path: str):
        """
        Parameters
        ----------
        model_path : str
            Path of the saved Keras model.
        """
        self.model_path = model_path
        # Filled lazily by _load_model.
        self._model = None
        # Position of each known collection in the one-hot part of the meta
        # vector. Extend as new collections are added.
        self.collection_dict = {
             'Axie': 0,
             'BoredApeYachtClub': 1,
             'CryptoPunks': 2,
             'CyberKongz': 3,
             'Doodles': 4,
             'GalaxyEggs': 5,
             'Jungle Freaks': 6,
             'KaijuKingz': 7,
             'Sneaky Vampire Syndicate': 8
        }

    def __getstate__(self):
        """Return the instance state without the cached Keras model."""
        state = self.__dict__.copy()
        state["_model"] = None
        return state

    def _load_model(self):
        """Return the Keras model, loading it from disk on first use."""
        # getattr keeps objects created before the cache existed working.
        model = getattr(self, "_model", None)
        if model is None:
            model = models.load_model(self.model_path)
            self._model = model
        return model

    def _encode_meta(self, collection_name: str, num_sales: int) -> np.ndarray:
        """Build the (1, n_collections + 1) meta vector of one asset."""
        meta_data = np.zeros(shape=(1, len(self.collection_dict) + 1))
        index = self.collection_dict.get(collection_name)
        # Unknown collections keep an all-zero one-hot part.
        if index is not None:
            meta_data[0, index] = 1
        meta_data[0, -1] = num_sales
        return meta_data

    def predict(self, img_path: str, collection_name: str, num_sales: int):
        """
        Predict the price of a single asset.

        Parameters
        ----------
        img_path : str
            Path of the asset image.
        collection_name : str
            Name of the collection. Unknown names are encoded as all zeros.
        num_sales : int
            Number of past sales of the asset.

        Returns
        -------
        The first element of the first row of the model output.

        Raises
        ------
        OSError
            If the image cannot be read.
        """
        meta_data = self._encode_meta(collection_name, num_sales)

        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise OSError(f"Could not read image: {img_path}")
        # Same preprocessing as the training data loaders: resize first,
        # then scale the pixel values by 1/255.
        img = cv2.resize(img, (256, 256)) / 255.
        img = img.reshape(1, 256, 256, 3)

        pred = self._load_model().predict([img, meta_data])
        return pred[0][0]