In [4]:
%matplotlib inline
%pylab inline
%config InlineBackend.figure_formats = ['retina']

# Basics + EDA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

# Modeling
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

Populating the interactive namespace from numpy and matplotlib


We read in the file and do a sanity check everything is working. Then we go on with our modeling.

Unfortunately, the model at most gives us an R2 score of 0.11 so we won't use it downstream in our process.

### 1. Read Data in

In [5]:
sales_df = pd.read_csv('sales.csv')

In [6]:
sales_df.head(3)

Unnamed: 0,_id,created_date,id,is_private,quantity,total_price,payment_token_symbol,payment_token_usd_price,asset_token_id,asset_num_sales,asset_id,asset_image_url,transaction_from_account_username,seller_username,winner_account_username
0,61b41aebd6ab32dbd9a84ea0,2021-12-11T03:27:23.942099,2427455930,False,1,365000000000000000,ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,nikolas17,0xBub,nikolas17
1,61b41aebd6ab32dbd9a84ea1,2021-12-11T03:20:03.826890,2427391259,unknown,1,500000000000000000,WETH,3972.7,5594,2,59286920,https://lh3.googleusercontent.com/ko_eY806byoe...,adamludwin,adamludwin,858
2,61b41aebd6ab32dbd9a84ea2,2021-12-11T02:47:15.343322,2427093401,False,1,1000000000000000000,ETH,3983.77,8839,1,59336121,https://lh3.googleusercontent.com/yNssfCMygvRC...,Iamchef,DirtySderty,Iamchef


In [7]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10050 entries, 0 to 10049
Data columns (total 15 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   _id                                10050 non-null  object 
 1   created_date                       10050 non-null  object 
 2   id                                 10050 non-null  int64  
 3   is_private                         10050 non-null  object 
 4   quantity                           10050 non-null  int64  
 5   total_price                        10050 non-null  object 
 6   payment_token_symbol               10050 non-null  object 
 7   payment_token_usd_price            10050 non-null  float64
 8   asset_token_id                     10050 non-null  int64  
 9   asset_num_sales                    10050 non-null  int64  
 10  asset_id                           10050 non-null  int64  
 11  asset_image_url                    10050 non-null  obj

In [8]:
sales_df.columns

Index(['_id', 'created_date', 'id', 'is_private', 'quantity', 'total_price',
       'payment_token_symbol', 'payment_token_usd_price', 'asset_token_id',
       'asset_num_sales', 'asset_id', 'asset_image_url',
       'transaction_from_account_username', 'seller_username',
       'winner_account_username'],
      dtype='object')

### 2. Target 

In [9]:
sales_df.total_price = sales_df.total_price.astype(float) 

In [10]:
# convert prices from wei to eth
sales_df.total_price = sales_df.total_price/10.**18

In [11]:
sales_df.total_price = sales_df.total_price * sales_df.payment_token_usd_price

In [12]:
sales_df.total_price.mean

<bound method NDFrame._add_numeric_operations.<locals>.mean of 0         1454.07605
1         1986.35000
2         3983.77000
3         1454.07605
4         3943.93230
            ...     
10045    11154.55600
10046     4776.54023
10047     7967.54000
10048     3983.77000
10049     5577.27800
Name: total_price, Length: 10050, dtype: float64>

### 3. Select Features

In [13]:
df = sales_df.loc[:,['created_date', 'is_private', 'quantity', 
       'payment_token_symbol', 'asset_token_id',
       'asset_num_sales','seller_username',
       'winner_account_username', 'total_price']]

In [14]:
df.describe()

Unnamed: 0,quantity,asset_token_id,asset_num_sales,total_price
count,10050.0,10050.0,10050.0,10050.0
mean,1.139801,6831.110846,7551.479005,4296.236805
std,1.095617,8427.32609,7401.743466,5392.35498
min,1.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,2788.639
50%,1.0,1.0,14807.0,3943.9323
75%,1.0,14455.0,14807.0,4461.8224
max,50.0,28160.0,14807.0,167318.34


In [15]:
df.shape

(10050, 9)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10050 entries, 0 to 10049
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   created_date             10050 non-null  object 
 1   is_private               10050 non-null  object 
 2   quantity                 10050 non-null  int64  
 3   payment_token_symbol     10050 non-null  object 
 4   asset_token_id           10050 non-null  int64  
 5   asset_num_sales          10050 non-null  int64  
 6   seller_username          10050 non-null  object 
 7   winner_account_username  10050 non-null  object 
 8   total_price              10050 non-null  float64
dtypes: float64(1), int64(3), object(5)
memory usage: 706.8+ KB


In [17]:
df.corr()

Unnamed: 0,quantity,asset_token_id,asset_num_sales,total_price
quantity,1.0,-0.103427,0.121821,0.589215
asset_token_id,-0.103427,1.0,-0.826807,0.013486
asset_num_sales,0.121821,-0.826807,1.0,-0.084546
total_price,0.589215,0.013486,-0.084546,1.0


### 3.Baseline Model

In [19]:
X = df.loc[:,['quantity', 'asset_token_id', 'asset_num_sales']]

y = df['total_price']

In [20]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

val_score = lr_model.score(X_val, y_val)

print('Validation R^2 score:', val_score)
print('\nFeature coefficient results: ')
for feature, coef in zip(X.columns, lr_model.coef_):
    print(feature, ':', f'{coef:.2f}') 

Validation R^2 score: 0.10681497136676577

Feature coefficient results: 
quantity : 3417.49
asset_token_id : -0.09
asset_num_sales : -0.21


### 4. Get data ready

#### a. Changing data types

In [21]:
df.asset_token_id = df.asset_token_id.astype(float) 

In [22]:
df['created_date'] = pd.to_datetime(df['created_date'])

In [23]:
df.describe()

Unnamed: 0,quantity,asset_token_id,asset_num_sales,total_price
count,10050.0,10050.0,10050.0,10050.0
mean,1.139801,6831.110846,7551.479005,4296.236805
std,1.095617,8427.32609,7401.743466,5392.35498
min,1.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,2788.639
50%,1.0,1.0,14807.0,3943.9323
75%,1.0,14455.0,14807.0,4461.8224
max,50.0,28160.0,14807.0,167318.34


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10050 entries, 0 to 10049
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   created_date             10050 non-null  datetime64[ns]
 1   is_private               10050 non-null  object        
 2   quantity                 10050 non-null  int64         
 3   payment_token_symbol     10050 non-null  object        
 4   asset_token_id           10050 non-null  float64       
 5   asset_num_sales          10050 non-null  int64         
 6   seller_username          10050 non-null  object        
 7   winner_account_username  10050 non-null  object        
 8   total_price              10050 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 706.8+ KB


#### b. One hot encoding

*is_private*

In [25]:
df.is_private.value_counts()

False      8396
unknown    1627
True         27
Name: is_private, dtype: int64

In [26]:
X2 = X.copy()
X2['is_private'] = df.is_private

X2 = pd.get_dummies(X2['is_private'] )

In [27]:
X2_train, X2_val, y_train, y_val = train_test_split(X2, y, test_size=0.2, random_state=42)

lr_model = LinearRegression()
lr_model.fit(X2_train, y_train)

val_score = lr_model.score(X2_val, y_val)

print('Validation R^2 score:', val_score)
print('\nFeature coefficient results: ')
for feature, coef in zip(X.columns, lr_model.coef_):
    print(feature, ':', f'{coef:.2f}') 

Validation R^2 score: -0.0015218928913569574

Feature coefficient results: 
quantity : -3271216709141014.00
asset_token_id : -3271216709141479.00
asset_num_sales : -3271216709141004.50


*seller_username*

In [28]:
seller_counts = df.seller_username.value_counts()
seller_counts

unknown         1296
avarice          227
k_               105
770UR410N         79
VirtualBacon      73
                ... 
NFTcultureRS       1
SubtlyFuego        1
hellowelcome       1
adamsmoot          1
chiggun            1
Name: seller_username, Length: 3031, dtype: int64

In [29]:
seller_counts = list(seller_counts[seller_counts <= 5].index)
X3 = X.copy()

X3['seller_username'] = df['seller_username'].replace(seller_counts, 'other')

In [30]:
X3 = pd.get_dummies(X3['seller_username'] )

In [31]:
X3_train, X3_val, y_train, y_val = train_test_split(X3, y, test_size=0.2, random_state=42)

lr_model = LinearRegression()
lr_model.fit(X3_train, y_train)

val_score = lr_model.score(X3_val, y_val)

print('Validation R^2 score:', val_score)
print('\nFeature coefficient results: ')
for feature, coef in zip(X.columns, lr_model.coef_):
    print(feature, ':', f'{coef:.2f}') 

Validation R^2 score: -0.01114716980204844

Feature coefficient results: 
quantity : 1064037311085858.50
asset_token_id : 1064037311085874.00
asset_num_sales : 1064037311085885.00


*winner_account_username*

In [32]:
winner_counts = df.winner_account_username.value_counts()
winner_counts

unknown             2047
oxgbed               269
randaartcollect1     217
mexpex               210
HalfLifeXxVault      144
                    ... 
mcnutsmcnuts           1
HindsightCapita1       1
X404                   1
Edenarc                1
youaintmyfry           1
Name: winner_account_username, Length: 2453, dtype: int64

In [33]:
winner_counts = list(winner_counts[winner_counts <= 20].index)
X4 = X.copy()

X4['winner_account_username'] = df['winner_account_username'].replace(winner_counts, 'other')

In [34]:
X4 = pd.get_dummies(X4['winner_account_username'])

In [35]:
X4_train, X4_val, y_train, y_val = train_test_split(X4, y, test_size=0.2, random_state=42)

lr_model = LinearRegression()
lr_model.fit(X4_train, y_train)

val_score = lr_model.score(X4_val, y_val)

print('Validation R^2 score:', val_score)
print('\nFeature coefficient results: ')
for feature, coef in zip(X.columns, lr_model.coef_):
    print(feature, ':', f'{coef:.2f}') 

Validation R^2 score: 0.016378569993678482

Feature coefficient results: 
quantity : 19107782053627700.00
asset_token_id : 19107782053628024.00
asset_num_sales : 19107782053627372.00


None of that is bringing our score up.

One last thing before we go. 

In [45]:
X5 = X.copy()
X5['num_sales2']= df['asset_num_sales']**2 

#As a reminder:
#X = df.loc[:,['quantity', 'payment_token_usd_price', 'asset_token_id', 'asset_num_sales']]

In [46]:
X5_train, X5_val, y_train, y_val = train_test_split(X5, y, test_size=0.2, random_state=42)

lr_model = LinearRegression()
lr_model.fit(X5_train, y_train)

val_score = lr_model.score(X5_val, y_val)

print('Validation R^2 score:', val_score)
print('\nFeature coefficient results: ')
for feature, coef in zip(X.columns, lr_model.coef_):
    print(feature, ':', f'{coef:.2f}') 

Validation R^2 score: 0.10980127344055857

Feature coefficient results: 
quantity : 3416.97
asset_token_id : -0.10
asset_num_sales : -466.93
