# 画像データの寄与が大きいのかを確認する．  
- TargetEncodingした特徴量のみで線型回帰，lgbmで予測を行う．  
- その結果と画像のみの結果，標準偏差と比較を行う．  

In [1]:
import os
import gc
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

In [2]:
DATAPATH = "../data/taskA/table"
df = pd.read_csv(os.path.join(DATAPATH, "asset_data.csv"))

df = df.rename(columns={"last_sale.total_price": "target"})
df['target'] = df['target'].astype(float) * 1e-18
df = df.query('target > 0').reset_index(drop=True)
df['target'] = df['target'].apply(lambda x: np.log1p(x))
display(df.head())
print(f"data shape: {df.shape}")

Unnamed: 0,id,token_id,num_sales,background_color,image_url,image_preview_url,image_thumbnail_url,image_original_url,animation_url,animation_original_url,...,last_sale.transaction.transaction_index,last_sale.created_date,last_sale.quantity,last_sale.transaction.from_account.user.username,owner.user.username,last_sale.transaction.to_account.user.username,creator.user.username,creator,collection.display_data.images,image_id
0,527189,3604,3,,https://lh3.googleusercontent.com/F29AWc3Qgx3Q...,https://lh3.googleusercontent.com/F29AWc3Qgx3Q...,https://lh3.googleusercontent.com/F29AWc3Qgx3Q...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,164,2021-11-12T15:19:23.231230,1,,,,,,,0.png
1,528119,5108,1,,https://lh3.googleusercontent.com/CLeVgaNXAR3y...,https://lh3.googleusercontent.com/CLeVgaNXAR3y...,https://lh3.googleusercontent.com/CLeVgaNXAR3y...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,36,2021-11-12T07:46:09.630431,1,Cool-Punks,,,,,,1.png
2,527733,4503,1,,https://lh3.googleusercontent.com/uXSp3edvlFtO...,https://lh3.googleusercontent.com/uXSp3edvlFtO...,https://lh3.googleusercontent.com/uXSp3edvlFtO...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,251,2021-11-12T01:01:22.486461,1,,,,,,,2.png
3,176535,6729,1,,https://lh3.googleusercontent.com/hQ0qO_Kzf94W...,https://lh3.googleusercontent.com/hQ0qO_Kzf94W...,https://lh3.googleusercontent.com/hQ0qO_Kzf94W...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,30,2021-11-12T00:31:25.335608,1,Cool-Punks,,,,,,3.png
4,179178,4313,4,,https://lh3.googleusercontent.com/xJeddtlpPc1k...,https://lh3.googleusercontent.com/xJeddtlpPc1k...,https://lh3.googleusercontent.com/xJeddtlpPc1k...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,34,2021-11-11T18:00:22.265147,1,,,,,,,5.png


data shape: (21747, 121)


## 比較用の標準偏差を算出  

In [3]:
std = df.groupby(['collection.name'])['target'].std()
df['target_std'] = df['collection.name'].map(std)
df

Unnamed: 0,id,token_id,num_sales,background_color,image_url,image_preview_url,image_thumbnail_url,image_original_url,animation_url,animation_original_url,...,last_sale.created_date,last_sale.quantity,last_sale.transaction.from_account.user.username,owner.user.username,last_sale.transaction.to_account.user.username,creator.user.username,creator,collection.display_data.images,image_id,target_std
0,527189,3604,3,,https://lh3.googleusercontent.com/F29AWc3Qgx3Q...,https://lh3.googleusercontent.com/F29AWc3Qgx3Q...,https://lh3.googleusercontent.com/F29AWc3Qgx3Q...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,2021-11-12T15:19:23.231230,1,,,,,,,0.png,0.503242
1,528119,5108,1,,https://lh3.googleusercontent.com/CLeVgaNXAR3y...,https://lh3.googleusercontent.com/CLeVgaNXAR3y...,https://lh3.googleusercontent.com/CLeVgaNXAR3y...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,2021-11-12T07:46:09.630431,1,Cool-Punks,,,,,,1.png,0.503242
2,527733,4503,1,,https://lh3.googleusercontent.com/uXSp3edvlFtO...,https://lh3.googleusercontent.com/uXSp3edvlFtO...,https://lh3.googleusercontent.com/uXSp3edvlFtO...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,2021-11-12T01:01:22.486461,1,,,,,,,2.png,0.503242
3,176535,6729,1,,https://lh3.googleusercontent.com/hQ0qO_Kzf94W...,https://lh3.googleusercontent.com/hQ0qO_Kzf94W...,https://lh3.googleusercontent.com/hQ0qO_Kzf94W...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,2021-11-12T00:31:25.335608,1,Cool-Punks,,,,,,3.png,0.503242
4,179178,4313,4,,https://lh3.googleusercontent.com/xJeddtlpPc1k...,https://lh3.googleusercontent.com/xJeddtlpPc1k...,https://lh3.googleusercontent.com/xJeddtlpPc1k...,https://www.larvalabs.com/cryptopunks/cryptopu...,,,...,2021-11-11T18:00:22.265147,1,,,,,,,5.png,0.503242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21742,69833705,5899,1,,https://lh3.googleusercontent.com/7HBOUJgyf65i...,https://lh3.googleusercontent.com/7HBOUJgyf65i...,https://lh3.googleusercontent.com/7HBOUJgyf65i...,https://ipfs.io/ipfs/QmdTuijJHYpkZvpJHPJex1bQN...,,,...,2021-11-06T16:57:48.917500,1,iamgub,iamgub,OpenSea-Orders,Eponym,,,21754.png,0.069797
21743,69778852,4550,1,,https://lh3.googleusercontent.com/lNQrfDhiXlls...,https://lh3.googleusercontent.com/lNQrfDhiXlls...,https://lh3.googleusercontent.com/lNQrfDhiXlls...,https://ipfs.io/ipfs/QmTMi8MBEFC61x7dpeBgMdkMn...,,,...,2021-11-06T16:40:42.496175,1,el_collectooor,BurnAddress,OpenSea-Orders,Eponym,,,21755.png,0.069797
21744,69747647,2311,2,,https://lh3.googleusercontent.com/Zp9xXj8yfxDr...,https://lh3.googleusercontent.com/Zp9xXj8yfxDr...,https://lh3.googleusercontent.com/Zp9xXj8yfxDr...,https://ipfs.io/ipfs/QmdA48nHNXo84Rzaf96w9iCDx...,,,...,2021-11-06T16:31:24.084151,1,,BurnAddress,OpenSea-Orders,Eponym,,,21756.png,0.069797
21745,69864384,7391,2,,https://lh3.googleusercontent.com/5ROEuPyJS0AB...,https://lh3.googleusercontent.com/5ROEuPyJS0AB...,https://lh3.googleusercontent.com/5ROEuPyJS0AB...,https://ipfs.io/ipfs/Qmc1v7vnXpPsuxphNufFDXK8d...,,,...,2021-11-06T16:17:43.655307,1,,BurnAddress,OpenSea-Orders,Eponym,,,21757.png,0.069797


In [4]:
std

collection.name
Angry Apes United               0.006908
Art Wars | AW                   0.244090
BearXLabs                       0.115839
Bears Deluxe                    0.142489
Bored Ape Yacht Club            0.450446
Chain Runners                   0.149483
Chromie Squiggle by Snowfro     0.495147
Cool Cats NFT                   0.221487
Cosmic Labs                     0.111132
CrypToadz by GREMPLIN           0.261591
CryptoPunks                     0.503242
CyberKongz                      0.830541
Decentraland                    3.788774
Desperate ApeWives              0.221327
Divine Anarchy                  0.201517
Doodles                         0.225762
Edifice by Ben Kovach           0.288578
Emblem Vault [Ethereum]         0.755248
Eponym by ART AI                0.069797
Fat Ape Club                    0.166158
Furballs.com Official           0.085911
Hor1zon Troopers                0.206734
JUNGLE FREAKS BY TROSLEY        0.191087
Kaiju Kingz                     0.355589


## CVを用いてスコアを算出する．  

In [5]:
def train_model(df, model='linear', n_splits=4):
    kf = KFold(n_splits=n_splits, random_state=6174, shuffle=True)
    rmse_scores = np.array([])
    mae_scores = np.array([])
    for train_idx, val_idx in kf.split(df):
        train_X, val_X = df.iloc[train_idx], df.iloc[val_idx]
        train_y, val_y = df.loc[train_idx, 'target'].values, df.loc[val_idx, 'target'].values

        enc = train_X.groupby(["collection.name"])['target'].mean()
        train_X['target_encoding'] = train_X['collection.name'].map(enc)
        val_X['target_encoding'] = val_X['collection.name'].map(enc)

        train_X = train_X['target_encoding'].values.reshape(-1, 1)
        val_X = val_X['target_encoding'].values.reshape(-1, 1)

        if model == 'linear':
            model = LinearRegression()
        elif model == 'lgb':
            model = lgb.LGBMRegressor()

        model.fit(train_X, train_y)
        pred = model.predict(val_X)
        rmse = np.sqrt(mean_squared_error(val_y, pred))
        mae = mean_absolute_error(val_y, pred)
        rmse_scores = np.append(rmse_scores, rmse)
        mae_scores = np.append(mae_scores, mae)

    print(f"RMSE score: {rmse_scores.mean()}")
    print(f"MAE score: {mae_scores.mean()}")

In [6]:
train_model(df)

RMSE score: 0.8102209382076688
MAE score: 0.293336579754202


In [7]:
train_model(df, model='lgb')

RMSE score: 0.8102208478575466
MAE score: 0.29333934042065035


In [8]:
print(f"Price std: {std.std()}")

Price std: 0.7214826744355846
