# Model description
**Outcome variable**
- price in ETH (`trade_price`)  

**Explanatory variables**
- collection features (daily): volume, total trades, max price, min price, 5th-percentile price  
- last sale: price of the previous trade (`last_trade_price`), time since the previous trade (`last_trade_timediff`)  
- traits/rarity: (1 categorical variable for each trait), rarity rank  

**Models used**
- OLS
- random forest

In [4]:
import numpy as np
import pandas as pd

In [44]:
# Load the cleaned trades file; the first CSV column becomes the row index.
file = "bayc_cleaned.csv"
df = pd.read_csv("data/" + file, index_col=0)

In [45]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,project,version,block_date,block_month,block_time,token_id,collection,amount_usd,token_standard,trade_type,...,Mouth_value,Mouth_count,Eyes_value,Eyes_count,Fur_value,Fur_count,Clothes_value,Clothes_count,Earring_value,Earring_count
0,x2y2,v1,2023-01-22,2023-01-01,2023-01-22 00:30:23+00:00,5,Bored Ape Yacht Club,317355.814,erc721,secondary,...,Dumbfounded,505,X Eyes,243,Brown,1370,Bayc T Red,140.0,,
1,opensea,v1,2021-06-21,2021-06-01,2021-06-21 15:56:28+00:00,6,Bored Ape Yacht Club,19761.7,erc721,Single Item Trade: Fixed price,...,Bored,2272,Crazy,407,Cream,636,Tweed Suit,141.0,,
2,opensea,v1,2021-12-22,2021-12-01,2021-12-22 19:16:09+00:00,6,Bored Ape Yacht Club,251182.89,erc721,Single Item Trade: Fixed price,...,Bored,2272,Crazy,407,Cream,636,Tweed Suit,141.0,,
3,opensea,v3,2022-07-03,2022-07-01,2022-07-03 16:40:36+00:00,6,Bored Ape Yacht Club,93910.13,erc721,single item trade,...,Bored,2272,Crazy,407,Cream,636,Tweed Suit,141.0,,
4,opensea,v3,2022-10-21,2022-10-01,2022-10-21 00:56:23+00:00,8,Bored Ape Yacht Club,141705.3,erc721,single item trade,...,Bored,2272,Robot,350,Zombie,302,,,Gold Stud,439.0


In [46]:
# Number of missing entries in each column
df.isna().sum(axis=0)

project                    0
version                    0
block_date                 0
block_month                0
block_time                 0
token_id                   0
collection                 0
amount_usd                 0
token_standard             0
trade_type                 0
number_of_items            0
trade_category             0
evt_type                   0
trade_price                0
currency_symbol            0
tx_hash                    0
volume_eth                 0
price_p5_eth               0
price_min_eth              0
price_max_eth              0
last_trade_price        9040
last_trade_time         9040
last_trade_timediff     9040
rarity_rank                0
Background_value           0
Background_count           0
Hat_value               9944
Hat_count               9944
Mouth_value                0
Mouth_count                0
Eyes_value                 0
Eyes_count                 0
Fur_value                  0
Fur_count                  0
Clothes_value 

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [57]:
def pred_models(X_cols, y_col, df, models=None):
    """Fit the requested regressors on a hold-out split and report test MSE.

    Rows with a missing value in any of `X_cols` or `y_col` are dropped. The
    remaining rows are split 80/20 with a fixed seed (42); each requested model
    is fit on the training part and scored on the test part. The MSE of every
    model is printed and also returned so that feature sets can be compared
    programmatically.

    Parameters
    ----------
    X_cols : sequence of str
        Names of the feature columns in `df` (list, tuple, Index, ...).
    y_col : str
        Name of the target column in `df`.
    df : pandas.DataFrame
        Source data. It is not modified.
    models : str or iterable of str, optional
        Models to run: 'ols' (LinearRegression) and/or 'rf'
        (RandomForestRegressor). Defaults to both. Unrecognised names are
        reported and skipped.

    Returns
    -------
    dict
        Test-set MSE keyed by model name ('ols', 'rf'), containing only the
        models that were run.

    Raises
    ------
    ValueError
        If no row has complete values for `X_cols` and `y_col`.
    """
    if models is None:
        models = ['ols', 'rf']
    elif isinstance(models, str):
        # A bare string would otherwise be tested by substring ('rf' in 'olsrf').
        models = [models]
    else:
        models = list(models)

    unknown = [name for name in models if name not in ('ols', 'rf')]
    if unknown:
        print(f"Ignoring unknown models: {unknown}")

    # fetch variables of interest, drop missing values
    # (dropna returns a new frame, so no defensive copy of df is needed)
    X_cols = list(X_cols)
    n = len(df)
    data = df.dropna(subset=X_cols + [y_col])
    print(f"Number of observations: {len(data)}/{n}")
    if data.empty:
        raise ValueError(
            "No rows left after dropping missing values in "
            f"{X_cols + [y_col]}; cannot fit any model."
        )
    X = data[X_cols]
    y = data[y_col]

    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    mse_by_model = {}

    # OLS regression
    if 'ols' in models:
        ols = LinearRegression()
        ols.fit(X_train, y_train)
        ols_pred = ols.predict(X_test)
        ols_mse = mean_squared_error(y_test, ols_pred)
        print(f"OLS MSE: {ols_mse}")
        mse_by_model['ols'] = ols_mse

    # Random Forest regression
    if 'rf' in models:
        rf = RandomForestRegressor(random_state=42)
        rf.fit(X_train, y_train)
        rf_predictions = rf.predict(X_test)
        rf_mse = mean_squared_error(y_test, rf_predictions)
        print(f"Random Forest MSE: {rf_mse}")
        mse_by_model['rf'] = rf_mse

    return mse_by_model

In [55]:
# Named groups of feature columns; model specifications are built by combining groups.
feature_columns = dict(
    base_cols=[
        'volume_eth',
        'price_p5_eth',
        'price_max_eth',
        'price_min_eth',
        'rarity_rank',
    ],
    base_traits=['Background_count', 'Mouth_count', 'Eyes_count', 'Fur_count'],
    extra_traits=['Hat_count', 'Clothes_count', 'Earring_count'],
    last_trade=['last_trade_timediff', 'last_trade_price'],
)

# model 1: collection-level features plus the base trait counts
X_cols_1 = [*feature_columns['base_cols'], *feature_columns['base_traits']]
y_col_1 = "trade_price"

In [58]:
# Fit and score model 1 (base collection features + base trait counts)
pred_models(X_cols=X_cols_1, y_col=y_col_1, df=df)

Number of observations: 43402/43402
OLS MSE: 566.6908410152627
Random Forest MSE: 383.99172049777667
