# Cleaning and Feature Engineering

---

### Imports

In [191]:
import pandas as pd
import numpy as np
from datetime import datetime

### Data Read in


Price of Ethereum since being tracked by Yahoo finance, I beleive this might also be able to decrease irregularities of the start of the cryptocurrency.

In [192]:
price_data = pd.read_csv('../data/eth_price_per_day.csv')
track_start = int(datetime.strptime(price_data['Date'][0], "%Y-%m-%d").timestamp())
price_data['Date'] = pd.to_datetime(price_data['Date'])
price_data.set_index('Date', inplace=True)
price_data.sort_index(ascending=False, inplace=True)
price_data.head(3)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2024-01-31,2282.544434
2024-01-30,2344.493652
2024-01-29,2317.064209


Looking at only data after Yahoo began tracking so that we can compare volatility.

In [193]:
eth_data = pd.read_csv('../data/eth_trans_data.csv', index_col=None)
eth_data.drop(columns='Unnamed: 0', inplace=True)
eth_data['value'] = eth_data['value'].astype(float)
# eth_data_track = eth_data[eth_data['timeStamp'] > track_start]
# eth_data_track.head(3)

## Feature Engineering Price of Ethereum Data
----
Calculated volatility with recommendation from Corporate Finance Institute
https://corporatefinanceinstitute.com/resources/career-map/sell-side/capital-markets/volatility-vol/


In [194]:
display(price_data.head())
price_data.info()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2024-01-31,2282.544434
2024-01-30,2344.493652
2024-01-29,2317.064209
2024-01-28,2257.20874
2024-01-27,2267.885986


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2275 entries, 2024-01-31 to 2017-11-09
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   2275 non-null   float64
dtypes: float64(1)
memory usage: 35.5 KB


In [195]:
price_data['Close_1']=price_data['Close'].shift(1)
price_data['Close_2']=price_data['Close'].shift(2)
price_data['Close_3']=price_data['Close'].shift(3)
price_data['Close_4']=price_data['Close'].shift(4)
price_data.head()


Unnamed: 0_level_0,Close,Close_1,Close_2,Close_3,Close_4
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-01-31,2282.544434,,,,
2024-01-30,2344.493652,2282.544434,,,
2024-01-29,2317.064209,2344.493652,2282.544434,,
2024-01-28,2257.20874,2317.064209,2344.493652,2282.544434,
2024-01-27,2267.885986,2257.20874,2317.064209,2344.493652,2282.544434


In [196]:
avg_last4 = (price_data['Close']+price_data['Close_1']+price_data['Close_2']+price_data['Close_3']+price_data['Close_4'])/5

diff = price_data['Close'] - avg_last4
diff1 = price_data['Close_1'] - avg_last4
diff2 = price_data['Close_2'] - avg_last4
diff3 = price_data['Close_3'] - avg_last4
diff4 = price_data['Close_4'] - avg_last4

price_data['volatility']= ((diff**2)+(diff1**2)+(diff2**2)+(diff3**2)+(diff4**2)/5)**0.5

price_data['dayChange'] = price_data['Close'] - price_data['Close_1']
price_data.head(10)

Unnamed: 0_level_0,Close,Close_1,Close_2,Close_3,Close_4,volatility,dayChange
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-01-31,2282.544434,,,,,,
2024-01-30,2344.493652,2282.544434,,,,,61.949219
2024-01-29,2317.064209,2344.493652,2282.544434,,,,-27.429443
2024-01-28,2257.20874,2317.064209,2344.493652,2282.544434,,,-59.855469
2024-01-27,2267.885986,2257.20874,2317.064209,2344.493652,2282.544434,71.736637,10.677246
2024-01-26,2267.199707,2267.885986,2257.20874,2317.064209,2344.493652,58.942905,-0.686279
2024-01-25,2217.710205,2267.199707,2267.885986,2257.20874,2317.064209,53.719713,-49.489502
2024-01-24,2233.561768,2217.710205,2267.199707,2267.885986,2257.20874,43.755391,15.851562
2024-01-23,2240.686035,2233.561768,2217.710205,2267.199707,2267.885986,38.80406,7.124268
2024-01-22,2310.826416,2240.686035,2233.561768,2217.710205,2267.199707,71.944097,70.140381


In [197]:
price_data.dropna(inplace=True)
price_data['perc75_Neg'] = ((price_data['volatility'] > 109) & (price_data['dayChange'] < 0))
price_data['perc75_Neg'] = price_data['perc75_Neg'].astype(int)
price_data.head(10)


Unnamed: 0_level_0,Close,Close_1,Close_2,Close_3,Close_4,volatility,dayChange,perc75_Neg
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-01-27,2267.885986,2257.20874,2317.064209,2344.493652,2282.544434,71.736637,10.677246,0
2024-01-26,2267.199707,2267.885986,2257.20874,2317.064209,2344.493652,58.942905,-0.686279,0
2024-01-25,2217.710205,2267.199707,2267.885986,2257.20874,2317.064209,53.719713,-49.489502,0
2024-01-24,2233.561768,2217.710205,2267.199707,2267.885986,2257.20874,43.755391,15.851562,0
2024-01-23,2240.686035,2233.561768,2217.710205,2267.199707,2267.885986,38.80406,7.124268,0
2024-01-22,2310.826416,2240.686035,2233.561768,2217.710205,2267.199707,71.944097,70.140381,0
2024-01-21,2453.913086,2310.826416,2240.686035,2233.561768,2217.710205,183.842881,143.08667,0
2024-01-20,2469.589111,2453.913086,2310.826416,2240.686035,2233.561768,206.010996,15.676025,0
2024-01-19,2489.498535,2469.589111,2453.913086,2310.826416,2240.686035,174.081425,19.909424,0
2024-01-18,2467.018799,2489.498535,2469.589111,2453.913086,2310.826416,89.136524,-22.479736,0


### Feature Engineering of Transaction Data
---

In [198]:
eth_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16024 entries, 0 to 16023
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   blockNumber        16024 non-null  int64  
 1   timeStamp          16024 non-null  int64  
 2   hash               16024 non-null  object 
 3   nonce              16024 non-null  int64  
 4   blockHash          16024 non-null  object 
 5   transactionIndex   16024 non-null  int64  
 6   from               16024 non-null  object 
 7   to                 16024 non-null  object 
 8   value              16024 non-null  float64
 9   gas                16024 non-null  int64  
 10  gasPrice           16024 non-null  int64  
 11  isError            16024 non-null  int64  
 12  txreceipt_status   14658 non-null  float64
 13  input              16024 non-null  object 
 14  contractAddress    0 non-null      float64
 15  cumulativeGasUsed  16024 non-null  int64  
 16  gasUsed            160

In [199]:
eth_data['dateTime'] = pd.to_datetime(eth_data['timeStamp'], unit='s')
eth_data['timeOnly'] = eth_data['dateTime'].dt.time
eth_data['dateOnly'] = eth_data['dateTime'].dt.date
eth_data['hoursOftheday'] = eth_data['timeOnly'].apply(lambda x: x.hour)

eth_data.head(3)

Unnamed: 0,blockNumber,timeStamp,hash,nonce,blockHash,transactionIndex,from,to,value,gas,...,contractAddress,cumulativeGasUsed,gasUsed,confirmations,methodId,functionName,dateTime,timeOnly,dateOnly,hoursOftheday
0,17136393,1682584823,0x8366585ce739c108d93741d776596f93c16983d560b0...,36,0x7bcdb8aec9a817b3cde1cd7892608a7cb089e682a82a...,33,0x1833f626c2fbb5c51b7feeca189eb2ca95fb5aa6,0xe35e9842fceaca96570b734083f4a58e8f7c5f2a,7.88177e+17,23332,...,,2949253,23332,2044002,0x1186ec33,"deposit(address recipient,address originToken,...",2023-04-27 08:40:23,08:40:23,2023-04-27,8
1,17149378,1682742503,0xfa448c24afe28231f1e9d7ffa02197212e3eaaa7e7a3...,386,0xd81a9c262b025c1d9a55a4508a396094bc52ea973581...,44,0x980a9864331af230d08287a0e5f67fdb9bde4ce7,0xe35e9842fceaca96570b734083f4a58e8f7c5f2a,1.1832e+17,23320,...,,3929836,23320,2031017,0x1186ec33,"deposit(address recipient,address originToken,...",2023-04-29 04:28:23,04:28:23,2023-04-29,4
2,17932219,1692247283,0x917b569c0740d61555781173c18b1dabd9370566b996...,157,0x6669a0555cec9767cda3e61651880e076d7dd824d62e...,53,0x964edb5c66f6f974684cfcd22738c49664f1bf16,0xe35e9842fceaca96570b734083f4a58e8f7c5f2a,4.849222e+16,23320,...,,5929118,23320,1248176,0x1186ec33,"deposit(address recipient,address originToken,...",2023-08-17 04:41:23,04:41:23,2023-08-17,4


In [200]:
eth_data['ethValusd'] = eth_data['dateOnly'].map(price_data['Close'])
eth_data['volatility'] = eth_data['dateOnly'].map(price_data['volatility'])
eth_data['dayChange'] = eth_data['dateOnly'].map(price_data['dayChange'])
eth_data['perc75_Neg'] = eth_data['dateOnly'].map(price_data['perc75_Neg'])
eth_data['valueUSD'] = eth_data['value']*eth_data['ethValusd']/(1000000000000000000)
eth_data.head(5)

Unnamed: 0,blockNumber,timeStamp,hash,nonce,blockHash,transactionIndex,from,to,value,gas,...,functionName,dateTime,timeOnly,dateOnly,hoursOftheday,ethValusd,volatility,dayChange,perc75_Neg,valueUSD
0,17136393,1682584823,0x8366585ce739c108d93741d776596f93c16983d560b0...,36,0x7bcdb8aec9a817b3cde1cd7892608a7cb089e682a82a...,33,0x1833f626c2fbb5c51b7feeca189eb2ca95fb5aa6,0xe35e9842fceaca96570b734083f4a58e8f7c5f2a,7.88177e+17,23332,...,"deposit(address recipient,address originToken,...",2023-04-27 08:40:23,08:40:23,2023-04-27,8,1908.786377,43.753526,16.27356,0.0,1504.461461
1,17149378,1682742503,0xfa448c24afe28231f1e9d7ffa02197212e3eaaa7e7a3...,386,0xd81a9c262b025c1d9a55a4508a396094bc52ea973581...,44,0x980a9864331af230d08287a0e5f67fdb9bde4ce7,0xe35e9842fceaca96570b734083f4a58e8f7c5f2a,1.1832e+17,23320,...,"deposit(address recipient,address originToken,...",2023-04-29 04:28:23,04:28:23,2023-04-29,4,1908.916992,57.414384,31.992676,0.0,225.863103
2,17932219,1692247283,0x917b569c0740d61555781173c18b1dabd9370566b996...,157,0x6669a0555cec9767cda3e61651880e076d7dd824d62e...,53,0x964edb5c66f6f974684cfcd22738c49664f1bf16,0xe35e9842fceaca96570b734083f4a58e8f7c5f2a,4.849222e+16,23320,...,"deposit(address recipient,address originToken,...",2023-08-17 04:41:23,04:41:23,2023-08-17,4,1684.933472,21.006345,23.988403,0.0,81.70617
3,18206076,1695563579,0x172f6254a710be70e605d6154c67c33c0d1d9ff422aa...,2108,0xca7e163f2c2f57770a6fb204acbc1e5abcefad275e08...,30,0x9f9ebce72c0715cdbad4d589986eb22f6782a1ce,0xe35e9842fceaca96570b734083f4a58e8f7c5f2a,1000000000000000.0,34980,...,"deposit(address recipient,address originToken,...",2023-09-24 13:52:59,13:52:59,2023-09-24,13,1580.853394,35.948841,-7.469482,0.0,1.580853
4,18873557,1703638367,0xfb1704cbfa2b8b8c97ecfaee4849c0bdc60f6901aa84...,34,0x15ffd876b18b93d921be759f929d528c0e1656717ac8...,58,0x2dacd3c3ffe8bf9bfca57de0785e934bc689445d,0xe35e9842fceaca96570b734083f4a58e8f7c5f2a,0.0,701084,...,"deposit(address recipient,address originToken,...",2023-12-27 00:52:47,00:52:47,2023-12-27,0,2378.73999,75.191229,31.173828,0.0,0.0


In [206]:
eth_data.drop(columns=['gas','txreceipt_status'], inplace=True) ##probably not needed

In [207]:
eth_data.to_csv('../data/eth_trans_data_clean.csv')