# Task Aモデルの検討，評価  

In [1]:
import os
import sys
import gc
import warnings
import math
from typing import Optional

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.linear_model import LinearRegression
from tqdm.notebook import tqdm
import lightgbm as lgb
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models
import tensorflow.keras.losses as losses
import tensorflow.keras.optimizers as optim
import tensorflow.keras.activations as activations
from tensorflow.keras.utils import Sequence
import matplotlib.pyplot as plt

sys.path.append('../src')
sys.path.append('../Swin-Transformer-TF')
from ml_utils import *
from utils import *
from swintransformer import SwinTransformer

warnings.filterwarnings('ignore')
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20


%matplotlib inline

## データの読み込み  

In [23]:
DIR = "../data/taskA/table"
IMG_DIR = "../data/taskA/img"

# データの読み込み
asset_df = pd.read_csv(os.path.join(DIR, "asset_data.csv"))
# 画像のパスを相対パスにする
asset_df['full_path'] = asset_df['image_id'].apply(lambda x: IMG_DIR+'/'+x)

# targetにrename．ETHに変換
asset_df = asset_df.rename(columns={"last_sale.total_price": "target"})
asset_df['target'] = np.log1p(asset_df['target'].astype(float) * 1e-18)
asset_df = asset_df.query("target > 0").reset_index(drop=True)

# 最後に取引された日時をtimestampに変換
asset_df['last_sale.event_timestamp'] = pd.to_datetime(asset_df['last_sale.event_timestamp']).apply(lambda x: x.timestamp())

asset_df.head()

Unnamed: 0,id,token_id,num_sales,background_color,image_url,image_preview_url,image_thumbnail_url,image_original_url,animation_url,animation_original_url,...,last_sale.created_date,last_sale.quantity,last_sale.transaction.from_account.user.username,owner.user.username,last_sale.transaction.to_account.user.username,creator.user.username,creator,collection.display_data.images,image_id,full_path
0,527189,3604,3,,https://lh3.googleusercontent.com/F29AWc3Qgx3Q...,https://lh3.googleusercontent.com/F29AWc3Qgx3Q...,https://lh3.googleusercontent.com/F29AWc3Qgx3Q...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,2021-11-12T15:19:23.231230,1,,,,,,,0.png,../data/taskA/img/0.png
1,528119,5108,1,,https://lh3.googleusercontent.com/CLeVgaNXAR3y...,https://lh3.googleusercontent.com/CLeVgaNXAR3y...,https://lh3.googleusercontent.com/CLeVgaNXAR3y...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,2021-11-12T07:46:09.630431,1,Cool-Punks,,,,,,1.png,../data/taskA/img/1.png
2,527733,4503,1,,https://lh3.googleusercontent.com/uXSp3edvlFtO...,https://lh3.googleusercontent.com/uXSp3edvlFtO...,https://lh3.googleusercontent.com/uXSp3edvlFtO...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,2021-11-12T01:01:22.486461,1,,,,,,,2.png,../data/taskA/img/2.png
3,176535,6729,1,,https://lh3.googleusercontent.com/hQ0qO_Kzf94W...,https://lh3.googleusercontent.com/hQ0qO_Kzf94W...,https://lh3.googleusercontent.com/hQ0qO_Kzf94W...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,2021-11-12T00:31:25.335608,1,Cool-Punks,,,,,,3.png,../data/taskA/img/3.png
4,179178,4313,4,,https://lh3.googleusercontent.com/xJeddtlpPc1k...,https://lh3.googleusercontent.com/xJeddtlpPc1k...,https://lh3.googleusercontent.com/xJeddtlpPc1k...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,2021-11-11T18:00:22.265147,1,,,,,,,5.png,../data/taskA/img/5.png


## 過去の取引履歴のパスをそれぞれ追加する．  

In [24]:
asset_df['event_path'] = '../data/taskA/table/' + asset_df['collection.name'] + '/' +\
    asset_df['asset_contract.address'] + '_' + asset_df['token_id'] + '.csv'
asset_df['event_path']

0        ../data/taskA/table/CryptoPunks/0xb47e3cd837dd...
1        ../data/taskA/table/CryptoPunks/0xb47e3cd837dd...
2        ../data/taskA/table/CryptoPunks/0xb47e3cd837dd...
3        ../data/taskA/table/CryptoPunks/0xb47e3cd837dd...
4        ../data/taskA/table/CryptoPunks/0xb47e3cd837dd...
                               ...                        
21742    ../data/taskA/table/Eponym by ART AI/0xaa20f90...
21743    ../data/taskA/table/Eponym by ART AI/0xaa20f90...
21744    ../data/taskA/table/Eponym by ART AI/0xaa20f90...
21745    ../data/taskA/table/Eponym by ART AI/0xaa20f90...
21746    ../data/taskA/table/Eponym by ART AI/0xaa20f90...
Name: event_path, Length: 21747, dtype: object

## データを分割  

In [25]:
train_df, test_df = train_test_split(asset_df, test_size=0.1, random_state=6174)

## helper function  

In [4]:
# SwinTransformerをベースとしたモデルを作成する関数
def create_model(input_shape: Tuple[int], output_shape: int,
                 activation, loss, learning_rate: float = 0.001,
                 pretrain: bool = False) -> models.Model:
    """
    The function for creating model.

    Parameters
    ----------
    input_shape : int
        Shape of input image data.
    output_shape : int
        Shape of model output.
    activation : function
        The activation function used hidden layers.
    loss : function
        The loss function of model.
    meta_shape : int
        Shape of input meta data of image.
    task : str
        Please determine this model will be used for task A or B(default=A).
    learning_rate : float
        The learning rate of model.
    pretrain : bool
        Flag that deterimine whether use pretrain model(default=False).

    Returns
    -------
    model : keras.models.Model
        Model instance.
    """
    if pretrain:
        weights = 'imagenet'
    else:
        weights = None

    inputs = layers.Input(shape=input_shape)
    base_model = SwinTransformer('swin_tiny_224', include_top=False, pretrained=True, use_tpu=False)(inputs)

    dense1 = layers.Dense(units=128)(base_model)
    av1 = layers.Activation(activation)(dense1)
    dr1 = layers.Dropout(0.3)(av1)
    dense2 = layers.Dense(units=64)(dr1)
    av2 = layers.Activation(activation)(dense2)
    dr2 = layers.Dropout(0.3)(av2)
    outputs = layers.Dense(output_shape)(dr2)

    model = models.Model(inputs=[inputs], outputs=[outputs])

    model.compile(loss=loss,
                  optimizer=optim.Adam(learning_rate=learning_rate),
                  metrics=['mae', 'mse'])
    return model

In [5]:
# LSTMモデルを作成する関数
def create_lstm(len_seq: int = 49, input_dim: int = 1, output_dim: int = 1):
    inputs = layers.Input(shape=(len_seq, input_dim))
    lstm = layers.LSTM(64)(inputs)
    outputs = layers.Dense(output_dim)(lstm)

    model = models.Model(inputs=[inputs], outputs=[outputs])
    model.compile(loss=losses.mean_squared_error, optimizer=optim.Adam())
    return model

In [6]:
# モデルの保存，予測，評価などするためのラッパー
# 現状SwinTransformerのみを保存することになるため，学習全体を保存できるように修正予定
class NFTModel(KerasRegressor):
    """
    Model class.
    This class is inherited KerasRegressor class of keras.
    """

    def __init__(self, model_func, input_shape, output_shape,
                 activation, loss, learning_rate, pretrain):
        """
        Constructor.

        Prameters
        ---------
        model_func : function
            The function for creating model.
        """
        self.model_func = model_func
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.activation = activation
        self.loss = loss
        self.learning_rate = learning_rate
        self.pretrain = pretrain
        super().__init__(
            build_fn=model_func(input_shape, output_shape,
                                activation=activation, loss=loss,
                                learning_rate=learning_rate, pretrain=pretrain)
        )
        self.model = self.build_fn

    def __getstate__(self):
        result = {'sk_params': self.sk_params,
                  'model_func': self.model_func,
                  'input_shape': self.input_shape,
                  'output_shape': self.output_shape,
                  'activation': self.activation,
                  'loss': self.loss,
                  'learning_rate': self.learning_rate,
                  'pretrain': self.pretrain}
        with tempfile.TemporaryDirectory() as dir:
            if hasattr(self, 'model'):
                self.model.save_weights(dir + '/output.h5')
                with open(dir + '/output.h5', 'rb') as f:
                    result['weights'] = f.read()
        return result

    def __setstate__(self, serialized):
        self.sk_params = serialized['sk_params']
        self.model_func = serialized['model_func']
        self.input_shape = serialized['input_shape']
        self.output_shape = serialized['output_shape']
        self.activation = serialized['activation']
        self.loss = serialized['loss']
        self.learning_rate = serialized['learning_rate']
        self.pretrain = serialized['pretrain']
        self.model = self.model_func(
                    self.input_shape, self.output_shape,
                    activation=self.activation, loss=self.loss,
                    learning_rate=self.learning_rate, pretrain=self.pretrain
                )

        with tempfile.TemporaryDirectory() as dir:
            weight_data = serialized.get('weights')
            if weight_data:
                with open(dir + '/input.h5', 'wb') as f:
                    f.write(weight_data)
                self.model.load_weights(dir + '/input.h5')

    def fit(self, train_gen, val_gen, epochs, batch_size, callbacks=None):
        """
        Training model.

        Parameters
        ----------
        train_gen : iterator
            The generator of train data.
        val_gen : iterator
            The generator of validation data.
        epochs : int
            Number of epochs for training model.
        batch_size : int
            Size of batch for training model.
        callbacks : list
            The list of callbacks.
            For example [EarlyStopping instance, ModelCheckpoint instance]
        """
        self.model.fit(train_gen, epochs=epochs, batch_size=batch_size,
                       validation_data=val_gen, callbacks=callbacks)

    def evaluate(self, test_X, test_y):
        """
        Evaluate model.

        Parameters
        ----------
        test_X : iterator
            The generator of test data.
        test_y : np.ndarray
            The array of targets of test data.
        """
        pred = self.model.predict(test_X)
        pred = np.where(pred < 0, 0, pred)
        rmse = np.sqrt(mean_squared_error(test_y, pred))
        mae = np.sqrt(mean_absolute_error(test_y, pred))

        print(f"RMSE Score: {rmse}")
        print(f"MAE Score: {mae}")

    def predict(self, img: np.ndarray):
        """
        Predict data using trained model.

        Parameters
        ----------
        img : np.array
            Numpy array of image data.

        Returns
        -------
        pred : float
            The value of predict.
        """
        img = cv2.resize(img/255., (224, 224))
        img = img.reshape(1, 224, 224, 3)

        pred = self.model.predict(img)

        return pred[0][0]

    def predict_gen(self, gen: Sequence):
        """
        Predict data using trained model(use generator).

        Parameters
        ----------
        gen : sub class of keras.Sequence
            Data generator.
        Returns
        -------
        preds : float
            The value of predict.
        """
        preds = self.model.predict(gen)
        preds = np.where(preds < 0, 0, preds)

        return preds

In [7]:
# Generatorのクラス定義
class DataLoader(Sequence):
    """
    Data loader that load images, meta data and targets.
    This class is inherited Sequence class of Keras.
    """

    def __init__(self, path_list: np.ndarray, target: Optional[np.ndarray] = None,
                 batch_size: int = 16,width: int = 224, height: int = 224,
                 resize: bool = True, shuffle: bool = True, is_train: bool = True):
        """
        Constructor. This method determines class variables.

        Parameters
        ----------
        path_list : np.ndarray[str]
            The array of absolute paths of images.
        meta_data : np.ndarray[int]
            One-hot vector of collections.
        target : np.ndarray
            Array of target variavles.
        batch_size : int
            Batch size used when model training.
        task : str
            Please determine this data loader will be used for task A or B(default=A).
        width : int
            Width of resized image.
        height : int
            Height of resize image.
        resize : bool
            Flag determine whether to resize.
        shuffle : bool
            Flag determine whether to shuffle on epoch end.
        is_train : bool
            Determine whether this data loader will be used training model.
            if you won't this data loader, you have set 'is_train'=False.
        """
        self.path_list = path_list
        self.batch_size = batch_size
        self.width = width
        self.height = height
        self.resize = resize
        self.shuffle = shuffle
        self.is_train = is_train
        self.length = math.ceil(len(self.path_list) / self.batch_size)

        if self.is_train:
            self.target = target

    def __len__(self):
        """
        Returns
        -------
        self.length : data length
        """
        return self.length

    def get_img(self, path_list: np.ndarray):
        """
        Load image data and resize image if 'resize'=True.

        Parameters
        ----------
        path_liist : np.ndarray
            The array of relative image paths from directory 'dir_name'.
            Size of this array is 'batch_size'.

        Returns
        -------
        img_list : np.ndarray
            The array of image data.
            Size of an image is (width, height, 3) if 'resize'=True.
        '"""
        img_list = []
        for path in path_list:
            img = cv2.imread(path)
            img = cv2.resize(img, (self.width, self.height))
            img = img / 255.
            img_list.append(img)

        img_list = np.array(img_list)
        return img_list

    def _shuffle(self):
        """
        Shuffle path_list, meta model.
        If 'is_train' is True, target is shuffled in association path_list.
        """
        idx = np.random.permutation(len(self.path_list))
        self.path_list = self.path_list[idx]
        if self.is_train:
            self.target = self.target[idx]

    def __getitem__(self, idx):
        path_list = self.path_list[self.batch_size*idx:self.batch_size*(idx+1)]
        img_list = self.get_img(path_list)
        if self.is_train:
            target_list = self.target[self.batch_size*idx:self.batch_size*(idx+1)]
            return img_list, target_list
        else:
            return img_list

    def on_epoch_end(self):
        if self.is_train:
            self._shuffle()

In [191]:
# LSTMの学習用のデータローダー
class SequenceDataLoader(Sequence):
    """
    Data loader that transaction history data.
    This class i inherited Sequence class of Keras.
    """

    def __init__(self, path_list: np.ndarray, target: Optional[np.ndarray] = None,
                 target_time: Optional[np.ndarray] = None, batch_size: int = 32,
                 shuffle: bool = True, is_train: bool = True):
        """
        Constructor. This method dtermines class variables.

        Parameters
        ----------
        dir_name : str
            The directory name 
        path_list : np.ndarray[str]
            The array of file names of events data(csv).
        target : np.ndarray
            Array of target variables.
        target_time : np.ndarray
            Array of timedate at having got target value.
        batch_size : int
            Batch size used when model training.
        shuffle : bool
            The flag determine whether to shuffle on epoch end.
        is_train : bool
            Determine whether this data loader will be used training model.
            if you won't this data loader, you have set 'is_train'=False.
        """
        # クラス変数
        self.path_list = path_list
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.is_train = is_train
        self.length = math.ceil(len(self.path_list) / self.batch_size)

        # 数値の特徴量，ラベルエンコーディングした際の特徴量名，datetime型の特徴量名
        self.origin_columns = ['bid_amount', 'starting_price', 'ending_price',
                               'total_price', 'event_type', 'auction_type', 'created_date']
        self.num_columns = ['bid_amount', 'starting_price', 'ending_price', 'total_price']
        
        self.event_list = ['successful', 'cancelled', 'created',
                           'bid_entered', 'bid_withdrawn',
                           'offer_entered', 'custom', 'transfer']
        self.auction_list = ['dutch', 'english', 'min_price']
        self.datetime_column = 'created_date'

        # LSTMの学習に用いるすべての特徴量リスト
        self.use_columns = self.num_columns + self.event_list + self.auction_list  # timestampを特徴量として使用しない（扱いが難しい，これまでの取引の履歴に関する間隔を考慮しない）

        if self.is_train:
            self.target = target
            self.target_time = target_time

    def __len__(self):
        """
        Returns
        -------
        self.length : data length
        """
        return self.length

    def one_hot_encoding(self, df):
        """
        Apply label encoding to the data
        """
        events = pd.get_dummies(df['event_type'])
        auctions = pd.get_dummies(df['auction_type'])
        df = pd.concat((df, events), axis=1)
        df = pd.concat((df, auctions), axis=1)
        for column in (self.event_list + self.auction_list):
            if column not in df.columns:
                df[column] = 0

        return df

    def get_csv(self, path_list: np.ndarray, time_list: Optional[np.ndarray] = None):
        """
        Load events data(csv) and preprocessing(padding, resize etc...).

        Parameters
        ----------
        path_list : np.ndarray[str]
            The array of file names of events data(csv).

        Returns
        -------
        events_df : pd.DataFrame
            The DataFrame of events data.
        """
        events_df = pd.DataFrame(columns=self.origin_columns)
        if self.is_train:
            for path, timestamp in zip(path_list, time_list):
                tmp_df = pd.read_csv(path, usecols=self.origin_columns)
                tmp_df[self.datetime_column] = pd.to_datetime(tmp_df[self.datetime_column]).apply(lambda x: x.timestamp())  # timestampに変換
                # if time_list is not None:
                    # tmp_df = tmp_df.query('created_date < @timestamp')  # ターゲットが決まった時間以前のデータのみを取得
                nan_array = np.empty((100-tmp_df.shape[0], len(self.origin_columns)))  # 不足分をnanでパディング
                nan_array[:, :] = np.nan
                tmp_df = pd.concat((tmp_df, pd.DataFrame(nan_array, columns=tmp_df.columns)), axis=0).iloc[::-1, :].reset_index(drop=True)
                tmp_df = tmp_df[self.origin_columns]

                events_df = events_df.append(tmp_df)

        else:
            for path in path_list:
                tmp_df = pd.read_csv(path, usecols=self.origin_columns)
                tmp_df[self.datetime_column] = pd.to_datetime(tmp_df[self.datetime_column]).apply(lambda x: x.timestamp())
                nan_array = np.empty((100-tmp_df.shape[0], len(self.origin_columns)))  # 不足分をnanでパディング
                nan_array[:, :] = np.nan
                tmp_df = pd.concat((tmp_df, pd.DataFrame(nan_array, columns=tmp_df.columns)), axis=0).iloc[::-1, :].reset_index(drop=True)

                events_df = events_df.append(tmp_df)

        # DataFrameの空白をnanで置換
        events_df = events_df.replace({None: np.nan, r'^\s*$': np.nan}, regex=True)

        # 数値型の特徴量の変換 欠損値をゼロ埋め，対数変換
        events_df[self.num_columns] = events_df[self.num_columns].fillna(0)
        events_df[self.num_columns] = np.log1p(events_df[self.num_columns].astype(float) * 1e-18)

        # カテゴリ型の特徴量の変換 one-hot-encoding
        events_df = self.one_hot_encoding(events_df)

        # datetime型の特徴量の変換 ゼロ埋め，unix timestampに変換
        events_df[self.datetime_column] = events_df[self.datetime_column].fillna(0)
        events_df[self.datetime_column] = pd.to_datetime(events_df[self.datetime_column]).apply(lambda x: x.timestamp())
        # timestampをスケーリングする手法が必要

        # 使用する特徴量のみを抽出
        events_df = events_df[self.use_columns]

        return events_df.values.reshape(self.batch_size, 100, len(self.use_columns))

    def __getitem__(self, idx):
        path_list = self.path_list[idx*self.batch_size:(idx+1)*self.batch_size]
        if self.is_train:
            target = self.target[idx*self.batch_size:(idx+1)*self.batch_size]
            time_list = self.target_time[idx*self.batch_size:(idx+1)*self.batch_size]
            input_data = self.get_csv(path_list, time_list)
            return input_data, target

        else:
            input_data = get_csv(path_list)
            return input_data

    def _shuffle(self):
        idx = np.random.permutation(len(self.path_list))
        self.path_list = self.path_list[idx]
        if self.is_train:
            self.target = self.target[idx]

    def on_epoch_end(self):
        if self.shuffle:
            self._shuffle()

In [10]:
# SwinTransformerモデルの定義
swin_model = NFTModel(
    model_func=create_model, input_shape=(224, 224, 3),
    output_shape=1,activation=activations.relu, loss=losses.mean_squared_error,
    learning_rate=0.0001, pretrain=True
)
swin_model.model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
swin_tiny_224 (SwinTransform (None, 768)               27769058  
_________________________________________________________________
dense_4 (Dense)              (None, 128)               98432     
_________________________________________________________________
activation_2 (Activation)    (None, 128)               0         
_________________________________________________________________
dropout_76 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
_________________________________________________________________
activation_3 (Activation)    (None, 64)                0   

In [11]:
# LSTMモデルの定義
lstm_model = create_lstm(100, 15)
lstm_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 100, 15)]         0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                20480     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 20,545
Trainable params: 20,545
Non-trainable params: 0
_________________________________________________________________


## SwinTransformerを学習させて画像特徴量を抽出する  

### SwinTransformerの学習，単一モデルとしての評価  

In [14]:
# 学習に使うデータ（画像のパス，target）をDataFrameから抽出
path_list = train_df['full_path'].values
target = train_df['target'].values

# 学習データをホールドアウトで分割
train_path, val_path, train_target, val_target =\
    train_test_split(path_list, target, test_size=0.1, random_state=6174)

# データローダーの定義
train_dataloader = DataLoader(train_path, train_target)
val_dataloader = DataLoader(val_path, val_target, shuffle=False)

# モデルの学習
print("Start fit SwinTransformer model")
print("-" * 30)
set_seed()  # 再現性を高めるためのシード設定
swin_model.fit(train_dataloader, val_dataloader, epochs=50, batch_size=16)
print("-" * 30)
print("End fit SwinTransformer model\n")

# モデルの評価 事前に分割したテストデータを用いる
print("Start evaluate SwinTransformer model")
print("-" * 30)
test_path = test_df['full_path']
test_target = test_df['target']
test_gen = DataLoader(test_path, is_train=False, shuffle=False, batch_size=1)
swin_model.evaluate(test_gen, test_target)
print("-" * 30)
print("End evaluate SwinTransformer model")

Start fit SwinTransformer model
------------------------------
Epoch 1/50


2022-01-03 00:12:17.043837: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
------------------------------
End fit SwinTransformer model

Start evaluate SwinTransformer model
------------------------------
RMSE Score: 0.8650370523841816
MAE Score: 0.6134740951001023
------------------------------
End evaluate SwinTransformer model


### 画像特徴量の抽出  

In [15]:
# SwinTransforemrモデルの出力層を外す
swin_base = tf.keras.models.Sequential(swin_model.model.layers[:-1])
train_dataloader = DataLoader(path_list, shuffle=False, is_train=False, batch_size=1)
test_dataloader = DataLoader(test_path, shuffle=False, is_train=False, batch_size=1)

# 64次元の特徴量として抽出
img_feature_name = [f"image_feature{i}" for i in range(64)]
train_img_features = swin_base.predict(train_dataloader)
test_img_features = swin_base.predict(test_dataloader)

# DataFrameに結合する
train_df[img_feature_name] = train_img_features
test_df[img_feature_name] = test_img_features

display(train_df.head())
display(test_df.head())

Unnamed: 0,id,token_id,num_sales,background_color,image_url,image_preview_url,image_thumbnail_url,image_original_url,animation_url,animation_original_url,...,image_feature54,image_feature55,image_feature56,image_feature57,image_feature58,image_feature59,image_feature60,image_feature61,image_feature62,image_feature63
8175,31269688,994,3,,https://lh3.googleusercontent.com/1HPMcPtfRNAN...,https://lh3.googleusercontent.com/1HPMcPtfRNAN...,https://lh3.googleusercontent.com/1HPMcPtfRNAN...,https://arweave.net/5Tbgrq3sEI7E2_I73Zsne0CnWi...,,,...,0.0,0.16396,0.148411,0.040216,0.027406,0.216996,0.283691,0.157687,0.062804,0.056169
1289,95375526,204000036,1,,https://lh3.googleusercontent.com/6KEzm3kdYUSD...,https://lh3.googleusercontent.com/6KEzm3kdYUSD...,https://lh3.googleusercontent.com/6KEzm3kdYUSD...,https://media.artblocks.io/204000036.png,https://generator.artblocks.io/204000036,https://generator.artblocks.io/204000036,...,0.0,0.147121,0.242575,0.019034,0.0091,0.225406,0.326011,0.239911,0.0,0.043345
10571,73377130,3428,2,,https://lh3.googleusercontent.com/3C3DxHglMXcg...,https://lh3.googleusercontent.com/3C3DxHglMXcg...,https://lh3.googleusercontent.com/3C3DxHglMXcg...,https://ipfs.billionaireclubnft.com/ipfs/QmTcT...,,,...,1.131514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15592,40808885,6338981479643114027188253887998465510674414714...,3,,https://lh3.googleusercontent.com/xXY7LRPOCi6z...,https://lh3.googleusercontent.com/xXY7LRPOCi6z...,https://lh3.googleusercontent.com/xXY7LRPOCi6z...,,,,...,0.0,0.167353,0.194405,0.0,0.0,0.20524,0.26246,0.066199,0.0,0.0
18226,95536911,6543,1,,https://storage.opensea.io/files/fb2b7976333dd...,https://storage.opensea.io/files/fb2b7976333dd...,https://storage.opensea.io/files/fb2b7976333dd...,,,,...,4.807098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,id,token_id,num_sales,background_color,image_url,image_preview_url,image_thumbnail_url,image_original_url,animation_url,animation_original_url,...,image_feature54,image_feature55,image_feature56,image_feature57,image_feature58,image_feature59,image_feature60,image_feature61,image_feature62,image_feature63
3889,65555782,1793,3,,https://storage.opensea.io/files/d203d3452bbc3...,https://storage.opensea.io/files/d203d3452bbc3...,https://storage.opensea.io/files/d203d3452bbc3...,,,,...,0.0,0.25949,1.38303,0.034041,0.0,0.653891,0.850281,0.84313,0.028475,0.0
2927,92750404,1700,2,,https://lh3.googleusercontent.com/-eX4pQ1tImfX...,https://lh3.googleusercontent.com/-eX4pQ1tImfX...,https://lh3.googleusercontent.com/-eX4pQ1tImfX...,https://cosmiclabs.mypinata.cloud/ipfs/QmVHZ3A...,,,...,2.373254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5979,91162491,3055,2,,https://lh3.googleusercontent.com/B_-3JJ6XFAWG...,https://lh3.googleusercontent.com/B_-3JJ6XFAWG...,https://lh3.googleusercontent.com/B_-3JJ6XFAWG...,https://gateway.pinata.cloud/ipfs/QmY7ZxeYkZ9b...,,,...,1.061933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19211,67774513,1224,1,,https://lh3.googleusercontent.com/qTt4UWQ_AlAy...,https://lh3.googleusercontent.com/qTt4UWQ_AlAy...,https://lh3.googleusercontent.com/qTt4UWQ_AlAy...,https://ipfs.io/ipfs/QmPuWZwJBcrkZmZJnGWwPY9n1...,,,...,0.156162,0.0,0.129772,0.0,0.0,0.038459,0.181828,0.0,0.0,0.0
6019,90847168,148,2,,https://lh3.googleusercontent.com/EYRZYOCOyNKr...,https://lh3.googleusercontent.com/EYRZYOCOyNKr...,https://lh3.googleusercontent.com/EYRZYOCOyNKr...,https://gateway.pinata.cloud/ipfs/QmY7ZxeYkZ9b...,,,...,1.446893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## LSTMを学習させて価格推移の特徴量を抽出する  
- 現在はtotal_priceの推移のみを扱っているが，bid_amoutなどのofferの価格を取り扱うことも検討中．  

### LSTMを学習する  

In [20]:
# 学習データを抽出
price_features = [f"price_{i}" for i in range(50)]
train_df[price_features] = np.log1p(train_df[price_features].astype(float) * 1e-18)
test_df[price_features] = np.log1p(test_df[price_features].astype(float) * 1e-18)
price_features = [f"price_{i}" for i in range(1, 50)] # 最新の価格を外す
price_trans = train_df[price_features].values.reshape(-1, 49, 1)

# 学習データをホールドアウトで分割
train_trans, val_trans, train_target, val_target =\
    train_test_split(price_trans, target, test_size=0.1, random_state=6174)

# LSTMを学習させる
print("Start fit LSTM model")
print("-" * 30)
lstm_model.fit(train_trans, train_target, epochs=30, batch_size=64,
               validation_data=(val_trans, val_target))
print("-" * 30)
print("End fit LSTM model")

Start fit LSTM model
------------------------------
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
------------------------------
End fit LSTM model


### LSTMで特徴量抽出する  

In [22]:
test_price_trans = test_df[price_features].values.reshape(-1, 49, 1)

# 64次元の特徴量として抽出（今後変える可能性あり）
base_lstm = tf.keras.models.Sequential(lstm_model.layers[:-1])
train_price_features = base_lstm.predict(price_trans)
test_price_features = base_lstm.predict(test_price_trans)

# DataFrameに結合する
trans_feature_name = [f"trans_feature{i}" for i in range(64)]
train_df[trans_feature_name] = train_price_features
test_df[trans_feature_name] = test_price_features

display(train_df.head())
display(test_df.head())

Unnamed: 0,id,token_id,num_sales,background_color,image_url,image_preview_url,image_thumbnail_url,image_original_url,animation_url,animation_original_url,...,trans_feature54,trans_feature55,trans_feature56,trans_feature57,trans_feature58,trans_feature59,trans_feature60,trans_feature61,trans_feature62,trans_feature63
8175,31269688,994,3,,https://lh3.googleusercontent.com/1HPMcPtfRNAN...,https://lh3.googleusercontent.com/1HPMcPtfRNAN...,https://lh3.googleusercontent.com/1HPMcPtfRNAN...,https://arweave.net/5Tbgrq3sEI7E2_I73Zsne0CnWi...,,,...,0.117737,-0.004397,-0.026743,0.035133,-0.030444,-0.00529,-0.072883,-0.132559,-0.006156,-0.265413
1289,95375526,204000036,1,,https://lh3.googleusercontent.com/6KEzm3kdYUSD...,https://lh3.googleusercontent.com/6KEzm3kdYUSD...,https://lh3.googleusercontent.com/6KEzm3kdYUSD...,https://media.artblocks.io/204000036.png,https://generator.artblocks.io/204000036,https://generator.artblocks.io/204000036,...,0.117737,-0.004397,-0.026743,0.035133,-0.030444,-0.00529,-0.072883,-0.132559,-0.006156,-0.265413
10571,73377130,3428,2,,https://lh3.googleusercontent.com/3C3DxHglMXcg...,https://lh3.googleusercontent.com/3C3DxHglMXcg...,https://lh3.googleusercontent.com/3C3DxHglMXcg...,https://ipfs.billionaireclubnft.com/ipfs/QmTcT...,,,...,0.117737,-0.004397,-0.026743,0.035133,-0.030444,-0.00529,-0.072883,-0.132559,-0.006156,-0.265413
15592,40808885,6338981479643114027188253887998465510674414714...,3,,https://lh3.googleusercontent.com/xXY7LRPOCi6z...,https://lh3.googleusercontent.com/xXY7LRPOCi6z...,https://lh3.googleusercontent.com/xXY7LRPOCi6z...,,,,...,0.117737,-0.004397,-0.026743,0.035133,-0.030444,-0.00529,-0.072883,-0.132559,-0.006156,-0.265413
18226,95536911,6543,1,,https://storage.opensea.io/files/fb2b7976333dd...,https://storage.opensea.io/files/fb2b7976333dd...,https://storage.opensea.io/files/fb2b7976333dd...,,,,...,0.117737,-0.004397,-0.026743,0.035133,-0.030444,-0.00529,-0.072883,-0.132559,-0.006156,-0.265413


Unnamed: 0,id,token_id,num_sales,background_color,image_url,image_preview_url,image_thumbnail_url,image_original_url,animation_url,animation_original_url,...,trans_feature54,trans_feature55,trans_feature56,trans_feature57,trans_feature58,trans_feature59,trans_feature60,trans_feature61,trans_feature62,trans_feature63
3889,65555782,1793,3,,https://storage.opensea.io/files/d203d3452bbc3...,https://storage.opensea.io/files/d203d3452bbc3...,https://storage.opensea.io/files/d203d3452bbc3...,,,,...,0.117737,-0.004397,-0.026743,0.035133,-0.030444,-0.00529,-0.072883,-0.132559,-0.006156,-0.265413
2927,92750404,1700,2,,https://lh3.googleusercontent.com/-eX4pQ1tImfX...,https://lh3.googleusercontent.com/-eX4pQ1tImfX...,https://lh3.googleusercontent.com/-eX4pQ1tImfX...,https://cosmiclabs.mypinata.cloud/ipfs/QmVHZ3A...,,,...,0.117737,-0.004397,-0.026743,0.035133,-0.030444,-0.00529,-0.072883,-0.132559,-0.006156,-0.265413
5979,91162491,3055,2,,https://lh3.googleusercontent.com/B_-3JJ6XFAWG...,https://lh3.googleusercontent.com/B_-3JJ6XFAWG...,https://lh3.googleusercontent.com/B_-3JJ6XFAWG...,https://gateway.pinata.cloud/ipfs/QmY7ZxeYkZ9b...,,,...,0.117737,-0.004397,-0.026743,0.035133,-0.030444,-0.00529,-0.072883,-0.132559,-0.006156,-0.265413
19211,67774513,1224,1,,https://lh3.googleusercontent.com/qTt4UWQ_AlAy...,https://lh3.googleusercontent.com/qTt4UWQ_AlAy...,https://lh3.googleusercontent.com/qTt4UWQ_AlAy...,https://ipfs.io/ipfs/QmPuWZwJBcrkZmZJnGWwPY9n1...,,,...,0.117737,-0.004397,-0.026743,0.035133,-0.030444,-0.00529,-0.072883,-0.132559,-0.006156,-0.265413
6019,90847168,148,2,,https://lh3.googleusercontent.com/EYRZYOCOyNKr...,https://lh3.googleusercontent.com/EYRZYOCOyNKr...,https://lh3.googleusercontent.com/EYRZYOCOyNKr...,https://gateway.pinata.cloud/ipfs/QmY7ZxeYkZ9b...,,,...,0.117737,-0.004397,-0.026743,0.035133,-0.030444,-0.00529,-0.072883,-0.132559,-0.006156,-0.265413


## 各コレクションの価格のターゲットエンコーディング，num_salesを特徴量として用いる．  
**メモ**  
<font color='red'>※ ターゲットエンコーディングでリークしていたため注意．</font>  
最新価格よりも前の中で最新のコレクションの値段の平均を割り出す必要がある．  