In [262]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import xgboost as xgb
import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error


In [308]:
"""
eth_usd_rates ETH to USD price

6924 -- unique tokens were sold
10000 -- unique tokens in total
"""

# --- ETH/USD rates: parse the day-first date strings and keep only the calendar date ---
eth_usd_rates = pd.read_csv('data/eth_usd_fx_rates.csv')
parsed_rate_dates = pd.to_datetime(eth_usd_rates["date"], dayfirst=True)
eth_usd_rates['date'] = parsed_rate_dates.dt.date

# --- Token metadata: add the natural log of the rarity score ---
token_metadata = pd.read_csv('data/token_metadata.csv')
token_metadata['log_rarity_score'] = np.log(token_metadata["rarity_score"])

# --- Token sales: keep only the columns used downstream ---
sales_columns = ['token_index', 'timestamp', 'eth', 'usd']
token_sales = pd.read_csv('data/token_sales.csv')[sales_columns]
# 'timestamp' is interpreted as seconds since the epoch.
token_sales['datetime'] = pd.to_datetime(token_sales['timestamp'], unit='s')
token_sales['date'] = token_sales['datetime'].dt.date
# Recency rank within each token: 1 is the latest sale; equal datetimes are
# ranked in row order (method='first').
recency_rank = token_sales.groupby('token_index')['datetime'].rank(method='first', ascending=False)
token_sales['transaction_recency'] = recency_rank.astype(int)



In [309]:
# Process dataset.
# Feature groups listed in the original notes:
#   - exchange rate movements
#   - market volume
#   - token sales

# Exchange-rate movements: simple moving averages of the 'open' price over the
# last 30 and 7 rows, followed by the row-to-row change of each average.
# NOTE(review): windows count rows, so this presumably expects one row per day
# in ascending date order -- confirm against the CSV.
sma_windows = {'sma30': 30, 'sma7': 7}
for sma_col, n_rows in sma_windows.items():
    eth_usd_rates[sma_col] = eth_usd_rates['open'].rolling(n_rows).mean()

# Diffs are added in a second pass so the column order stays sma30, sma7, diff30, diff7.
sma_to_diff = {'sma30': 'diff30', 'sma7': 'diff7'}
for sma_col, diff_col in sma_to_diff.items():
    eth_usd_rates[diff_col] = eth_usd_rates[sma_col].diff(periods=1)

In [310]:
# Attach each sale's same-day exchange-rate features (left join on 'date'),
# then left-join the sales onto the metadata so tokens without any sale are kept.
# NOTE: token_sales is rebound here, so re-running this cell merges the rate
# columns a second time.
token_sales = pd.merge(token_sales, eth_usd_rates, how='left', on='date')
token_data = pd.merge(token_metadata, token_sales, how='left', on='token_index')

In [311]:
# Rolling statistics of the sale price ('eth') within each group, computed over
# the group's rows in ascending 'datetime' order. Each window spans up to the
# 30 most recent rows of the group, current row included (min_periods=1).
# NOTE(review): the "_1m" suffix suggests one month, but the window is 30 rows,
# not 30 calendar days, and it includes the current sale's own price -- confirm
# this is intended before using these columns as model features.
sales_by_time = token_data.sort_values(by=['datetime'], ascending=True)

per_token_eth = sales_by_time.groupby(['token_index'])['eth'].rolling(30, min_periods=1)
per_category_eth = sales_by_time.groupby(['Skin Tone'])['eth'].rolling(30, min_periods=1)

# Dropping index level 0 (the group key) leaves the original row index, so the
# assignments below align row-for-row with token_data.
token_data['token_count_1m'] = per_token_eth.count().reset_index(drop=True, level=0)
token_data['token_mean_1m'] = per_token_eth.mean().reset_index(drop=True, level=0)
token_data['category_mean_1m'] = per_category_eth.mean().reset_index(drop=True, level=0)

# TODO: 'volume_1m' -- the same per-'Skin Tone' rolling window with .sum()
# instead of .mean(); not enabled yet.

In [312]:
# Daily totals and averages of 'eth', once per (date, token) and once per
# (date, skin tone); group keys are turned back into ordinary columns.
daily_eth_stats = ['sum', 'mean']

eth_by_date_and_token = token_data.groupby(['date', 'token_index'])['eth']
token_by_day = eth_by_date_and_token.agg(daily_eth_stats).reset_index()

eth_by_date_and_category = token_data.groupby(['date', 'Skin Tone'])['eth']
category_mean_by_day = eth_by_date_and_category.agg(daily_eth_stats).reset_index()

In [313]:
# Spot-check: show every row of the joined frame for token 4911.
token_data.loc[token_data['token_index'].eq(4911)]

Unnamed: 0,Skin Tone,Type,Hair,Eyewear,Mouth,Headwear,Facial Hair,Smoking Device,Other:Earring,Neckwear,...,date,transaction_recency,open,sma30,sma7,diff30,diff7,token_count_1m,token_mean_1m,category_mean_1m
11370,Darker,Female,Frumpy Hair,Purple Eye Shadow,Purple Lipstick,,,,,,...,2023-03-05,19.0,1566.92,1621.391333,1618.502857,-2.527333,-3.977143,1.0,64.25,75.660833
11371,Darker,Female,Frumpy Hair,Purple Eye Shadow,Purple Lipstick,,,,,,...,2023-03-05,18.0,1566.92,1621.391333,1618.502857,-2.527333,-3.977143,2.0,64.38,75.6895
11372,Darker,Female,Frumpy Hair,Purple Eye Shadow,Purple Lipstick,,,,,,...,2023-03-12,17.0,1483.07,1592.517667,1511.014286,-2.090333,-11.978571,3.0,64.966667,65.483333
11373,Darker,Female,Frumpy Hair,Purple Eye Shadow,Purple Lipstick,,,,,,...,2023-03-13,16.0,1591.33,1595.102333,1514.811429,2.584667,3.797143,4.0,65.425,67.748333
11374,Darker,Female,Frumpy Hair,Purple Eye Shadow,Purple Lipstick,,,,,,...,2023-03-14,15.0,1681.01,1599.843,1531.254286,4.740667,16.442857,5.0,65.784,68.183
11375,Darker,Female,Frumpy Hair,Purple Eye Shadow,Purple Lipstick,,,,,,...,2023-03-14,14.0,1681.01,1599.843,1531.254286,4.740667,16.442857,6.0,65.971667,68.368667
11376,Darker,Female,Frumpy Hair,Purple Eye Shadow,Purple Lipstick,,,,,,...,2023-03-21,13.0,1738.45,1627.940667,1731.192857,1.549,8.205714,7.0,65.862857,70.2907
11377,Darker,Female,Frumpy Hair,Purple Eye Shadow,Purple Lipstick,,,,,,...,2023-03-22,12.0,1807.45,1632.147333,1745.804286,4.206667,14.611429,8.0,65.27625,62.865666
11378,Darker,Female,Frumpy Hair,Purple Eye Shadow,Purple Lipstick,,,,,,...,2023-03-24,11.0,1817.28,1638.579667,1777.651429,5.254667,20.04,9.0,64.716667,60.344667
11379,Darker,Female,Frumpy Hair,Purple Eye Shadow,Purple Lipstick,,,,,,...,2023-03-25,10.0,1751.74,1642.195,1771.711429,3.615333,-5.94,10.0,64.072,58.879299


In [315]:
# Number of distinct 'Facial Hair' and 'Smoking Device' values per skin tone.
# The columns are selected with a list: selecting several GroupBy columns with
# a bare tuple of keys is deprecated and is an error in pandas 2.x.
token_data.groupby('Skin Tone')[['Facial Hair', 'Smoking Device']].nunique()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,Facial Hair,Smoking Device
Skin Tone,Unnamed: 1_level_1,Unnamed: 2_level_1
Albino,12,3
Alien,0,1
Ape,0,2
Darker,12,3
Lighter,12,3
Medium,12,3
Zombie,10,1


In [316]:
# Frequency of each value for the two traits examined above.
for trait in ('Facial Hair', 'Smoking Device'):
    print(token_data[trait].value_counts())

Shadow Beard          1221
Muttonchops            834
Goat                   813
Chinstrap              765
Front Beard            752
Mustache               751
Front Beard Dark       747
Handlebars             711
Normal Beard           711
Normal Beard Black     640
Luxurious Beard        567
Big Beard              312
Name: Facial Hair, dtype: int64
Cigarette    2346
Pipe          721
Vape          587
Name: Smoking Device, dtype: int64


In [413]:
# Feature definitions used to build the model matrix.
categorical_features_list = [
    'Skin Tone', 'Type', 'Hair', 'Eyewear', 'Mouth', 'Headwear',
    'Facial Hair', 'Smoking Device', 'Other:Earring', 'Neckwear',
    'Skin Feature', 'Other:Medical Mask', 'Other:Clown Nose', 'Trait Count',
    'rarest_property_name',
]
continuous_features_list = [
    'rarity_score', 'log_rarity_score',
    'open', 'sma30', 'sma7', 'diff30', 'diff7',
    'token_count_1m', 'category_mean_1m',
]

# Training rows: the most recent sale of each token (transaction_recency == 1).
# Missing values are replaced by 0 in every column, then the categorical
# features are cast to pandas 'category' dtype.
is_latest_sale = token_data['transaction_recency'].eq(1)
train_token_data = (
    token_data[is_latest_sale]
    .fillna(0)
    .astype({col: "category" for col in categorical_features_list})
)

# Model inputs (categorical columns first, then continuous) and target price.
X = train_token_data[[*categorical_features_list, *continuous_features_list]]
y = train_token_data['eth']


In [414]:
# TODO: Train by periods
# Number of training rows dated on or after 1 April 2023.
period_start = datetime.date(2023, 4, 1)
recent_rows = train_token_data.loc[train_token_data['date'] >= period_start]
len(recent_rows)

257

In [415]:
# Random 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [416]:
# Wrap both splits as XGBoost DMatrix objects; enable_categorical=True lets the
# pandas 'category' columns be passed through as categorical features.
dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [417]:
# Watchlist of (DMatrix, label) pairs evaluated after every boosting round.
eval_set = [(dtest, 'eval'), (dtrain, 'train')]

# Filled in by xgb.train with the per-round metric history.
evals_result = {}

# Booster settings: tree booster, Tweedie regression objective, RMSE reported
# as the evaluation metric.
params = dict(
    booster='gbtree',
    objective='reg:tweedie',
    learning_rate=0.1,
    eval_metric='rmse',
    verbosity=1,
)

# Number of boosting rounds.
num_rounds = 500

In [418]:
# Fit the booster; per-round metrics for both watchlist entries are logged and
# also collected into evals_result.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_rounds,
    evals=eval_set,
    evals_result=evals_result,
)
print(evals_result)

# Score the hold-out split: predictions are passed through abs() and compared
# with the true prices using mean absolute percentage error.
test_predictions = model.predict(dtest)
y_pred = np.abs(test_predictions)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)
print("mape: %f" % mape)

[0]	eval-rmse:157.97780	train-rmse:177.24796
[1]	eval-rmse:157.92477	train-rmse:177.20007
[2]	eval-rmse:157.86051	train-rmse:177.14203
[3]	eval-rmse:157.78278	train-rmse:177.07185
[4]	eval-rmse:157.68894	train-rmse:176.98710
[5]	eval-rmse:157.57593	train-rmse:176.88501
[6]	eval-rmse:157.44008	train-rmse:176.76235
[7]	eval-rmse:157.27726	train-rmse:176.61539
[8]	eval-rmse:157.08296	train-rmse:176.43995
[9]	eval-rmse:156.85197	train-rmse:176.23134
[10]	eval-rmse:156.57840	train-rmse:175.98442
[11]	eval-rmse:156.25613	train-rmse:175.69361
[12]	eval-rmse:155.87858	train-rmse:175.35309
[13]	eval-rmse:155.43895	train-rmse:174.95694
[14]	eval-rmse:154.92998	train-rmse:174.49920
[15]	eval-rmse:154.34542	train-rmse:173.97423
[16]	eval-rmse:153.67987	train-rmse:173.37694
[17]	eval-rmse:152.92780	train-rmse:172.70307
[18]	eval-rmse:152.08290	train-rmse:171.94881
[19]	eval-rmse:151.14378	train-rmse:171.11122
[20]	eval-rmse:150.10793	train-rmse:170.18862
[21]	eval-rmse:148.97288	train-rmse:169.1797

In [419]:
# Overlay actual and predicted hold-out prices, each plotted against its
# positional index within the test split.
fig = go.Figure()
for series, label in ((y_test, 'real'), (y_pred, 'predicted')):
    positions = list(range(len(series)))
    fig.add_trace(go.Scatter(x=positions, y=series, mode='lines+markers', name=label))
fig.update_layout(title='predicted vs actual')
fig.show()

In [422]:
# Build one row per token to value: the token's most recent sale
# (transaction_recency == 1), or its metadata-only row when it has no sale at
# all (transaction_recency is NaN after the left join with the sales table).
needs_valuation = token_data['transaction_recency'].isna() | (token_data['transaction_recency'] == 1)
# Missing values are replaced by 0 in every column, as for the training frame.
prediction_token_data = token_data.loc[needs_valuation].fillna(0)

categorical_features_list = ['Skin Tone', 'Type', 'Hair', 'Eyewear', 'Mouth', 'Headwear', 'Facial Hair', 'Smoking Device', 'Other:Earring', 'Neckwear', 'Skin Feature', 'Other:Medical Mask', 'Other:Clown Nose', 'Trait Count',
       'rarest_property_name']
continuous_features_list = ['rarity_score', 'log_rarity_score', 'open', 'sma30', 'sma7', 'diff30', 'diff7', 'token_count_1m', 'category_mean_1m']

# Encode categorical features with the SAME category levels as the training
# frame. A fresh .astype("category") would derive the levels from this frame
# alone, so the integer codes handed to the booster could differ from the ones
# it was trained on whenever the two frames contain different sets of values.
# Values never seen in training become NaN (missing) in the categorical column.
for col in categorical_features_list:
    prediction_token_data[col] = pd.Categorical(
        prediction_token_data[col],
        categories=train_token_data[col].cat.categories,
    )

X = prediction_token_data[categorical_features_list+continuous_features_list]

# Predict with the trained booster; abs() mirrors the post-processing applied
# when the hold-out split was scored.
dpredict = xgb.DMatrix(X, enable_categorical=True)
y_pred = np.abs(model.predict(dpredict))
prediction_token_data['valuation'] = y_pred

In [426]:
# Write the token index and its predicted valuation to CSV, without the row index.
valuation_export = prediction_token_data[['token_index', 'valuation']]
valuation_export.to_csv('valuations.csv', index=False)