# Feature Engineering

In [None]:
import pandas as pd
import numpy as np
import os
from stockstats import StockDataFrame as Sdf
import stockstats

## ETH


### Creating Target Variable

In [2]:
df = pd.read_csv(r'C:\Users\madha\Desktop\Dissertation\Data\Master Data\Merged\ETH_merged.csv')

df['date'] = pd.to_datetime(df['date'])
df.sort_values(by='date', inplace=True)

In [3]:
df.tail()

Unnamed: 0.1,Unnamed: 0,date,name,open,high,low,close,volume,marketCap,daily_weighted_sentiment,comment_volume,ETH_trends,gold_spot,gspc_spot,ndx_spot
360,4,2025-07-11,2781,2954.832779,3038.14115,2916.956467,2957.886198,36226560000.0,357117400000.0,0.313511,113.0,62.0,3356.0,6259.75,22780.59961
361,3,2025-07-12,2781,2958.333774,2979.779932,2907.193557,2942.911663,16317200000.0,355247200000.0,-0.004126,55.0,62.0,3356.0,6259.75,22780.59961
362,2,2025-07-13,2781,2942.853579,3016.393465,2938.736541,2973.358989,17361750000.0,358913100000.0,-0.129803,74.0,100.0,3356.0,6259.75,22780.59961
363,1,2025-07-14,2781,2973.225148,3079.985511,2965.32379,3013.350859,36349290000.0,363749600000.0,0.040624,44.0,100.0,3351.5,6268.560059,22855.63086
364,0,2025-07-15,2781,3013.293577,3142.427222,2934.371234,3139.889718,39013660000.0,379030200000.0,-0.065852,44.0,100.0,3329.800049,6243.759766,22884.58984


In [4]:
def assign_target(change):
    if pd.isna(change):
        return np.nan  
    elif change > 0:
        return 1       
    else:
        return -1     

In [5]:
df['price_change'] = df['close'].diff()
df['next_day_price_change'] = df['price_change'].shift(-1) 

In [6]:
df['target'] = df['next_day_price_change'].apply(assign_target)
df.drop(columns=['next_day_price_change'], inplace=True)
df.dropna(subset=['target'], inplace=True)

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,name,open,high,low,close,volume,marketCap,daily_weighted_sentiment,comment_volume,ETH_trends,gold_spot,gspc_spot,ndx_spot,price_change,target
0,364,2024-07-16,2781,3486.144073,3498.219053,3351.779683,3443.513502,20446660000.0,413977800000.0,0.030913,17.0,57.0,2462.399902,5667.200195,20398.61914,,-1.0
1,363,2024-07-17,2781,3446.736719,3516.10421,3379.099525,3388.752373,16739120000.0,407399900000.0,0.030913,17.0,57.0,2454.800049,5588.27002,19799.14063,-54.761129,1.0
2,362,2024-07-18,2781,3388.030068,3488.715655,3374.99043,3426.258591,15035620000.0,411914200000.0,0.089105,45.0,57.0,2451.800049,5544.589844,19705.08984,37.506218,1.0
3,361,2024-07-19,2781,3425.90881,3540.588346,3377.877575,3505.734841,17705630000.0,421472500000.0,0.016569,33.0,57.0,2395.5,5505.0,19522.61914,79.47625,1.0
4,360,2024-07-20,2781,3505.721336,3539.904292,3482.488908,3519.295344,10360200000.0,423126700000.0,0.0,32.0,57.0,2395.5,5505.0,19522.61914,13.560503,1.0


### Percentage Changes (24h, 3days, 7days)

In [8]:
df['percent_change_24h'] = df['close'].pct_change() * 100
df['percent_change_3d'] = df['close'].pct_change(periods=3) * 100
df['percent_change_7d'] = df['close'].pct_change(periods=7) * 100
df['percent_change_14d'] = df['close'].pct_change(periods=14) * 100

### Moving Average

ma_7d: This feature tells the model about the level of the recent price trend. It acts as a smoothed, less noisy version of the price itself. Currently taken for 7 Days.

price_vs_ma7d: This feature tells the model about the deviation from the recent trend. It measures how "overextended" or "oversold" the current price is compared to its recent average. A model might learn that when this value is extremely high (e.g., +15%), the price is likely to come back down, even if the trend is generally up.

In [9]:
df['ma_7d'] = df['close'].rolling(window=7).mean()
df['price_vs_ma7d'] = (df['close'] - df['ma_7d']) / df['ma_7d'] * 100

In [10]:
df['ma_14d'] = df['close'].rolling(window=14).mean()
df['price_vs_ma14d'] = (df['close'] - df['ma_14d']) / df['ma_14d'] * 100

In [11]:
df.tail()

Unnamed: 0.1,Unnamed: 0,date,name,open,high,low,close,volume,marketCap,daily_weighted_sentiment,...,price_change,target,percent_change_24h,percent_change_3d,percent_change_7d,percent_change_14d,ma_7d,price_vs_ma7d,ma_14d,price_vs_ma14d
359,5,2025-07-10,2781,2770.737301,2995.152156,2757.266597,2954.845218,33929200000.0,356696200000.0,-0.205187,...,184.067584,1.0,6.643174,16.194645,14.042336,22.29577,2640.16808,11.91883,2564.122348,15.238074
360,4,2025-07-11,2781,2954.832779,3038.14115,2916.956467,2957.886198,36226560000.0,357117400000.0,0.313511,...,3.040981,-1.0,0.102915,13.09041,17.913683,22.031712,2704.363508,9.374579,2602.266587,13.665764
361,3,2025-07-12,2781,2958.333774,2979.779932,2907.193557,2942.911663,16317200000.0,355247200000.0,-0.004126,...,-14.974536,1.0,-0.506258,6.212481,16.908395,20.754184,2765.168029,6.427951,2638.395314,11.541726
362,2,2025-07-13,2781,2942.853579,3016.393465,2938.736541,2973.358989,17361750000.0,358913100000.0,-0.129803,...,30.447326,1.0,1.034599,0.626556,15.639266,18.888705,2822.614101,5.340613,2672.138097,11.272654
363,1,2025-07-14,2781,2973.225148,3079.985511,2965.32379,3013.350859,36349290000.0,363749600000.0,0.040624,...,39.991871,1.0,1.345006,1.875145,18.495287,21.19019,2889.805189,4.275225,2709.772848,11.20308


In [12]:
df.head(15)

Unnamed: 0.1,Unnamed: 0,date,name,open,high,low,close,volume,marketCap,daily_weighted_sentiment,...,price_change,target,percent_change_24h,percent_change_3d,percent_change_7d,percent_change_14d,ma_7d,price_vs_ma7d,ma_14d,price_vs_ma14d
0,364,2024-07-16,2781,3486.144073,3498.219053,3351.779683,3443.513502,20446660000.0,413977800000.0,0.030913,...,,-1.0,,,,,,,,
1,363,2024-07-17,2781,3446.736719,3516.10421,3379.099525,3388.752373,16739120000.0,407399900000.0,0.030913,...,-54.761129,1.0,-1.590269,,,,,,,
2,362,2024-07-18,2781,3388.030068,3488.715655,3374.99043,3426.258591,15035620000.0,411914200000.0,0.089105,...,37.506218,1.0,1.106785,,,,,,,
3,361,2024-07-19,2781,3425.90881,3540.588346,3377.877575,3505.734841,17705630000.0,421472500000.0,0.016569,...,79.47625,1.0,2.319622,1.806914,,,,,,
4,360,2024-07-20,2781,3505.721336,3539.904292,3482.488908,3519.295344,10360200000.0,423126700000.0,0.0,...,13.560503,1.0,0.386809,3.852243,,,,,,
5,359,2024-07-21,2781,3519.426557,3546.619238,3415.443789,3536.60544,13845910000.0,425202700000.0,0.0,...,17.310096,-1.0,0.491863,3.220622,,,,,,
6,358,2024-07-22,2781,3536.627172,3560.075505,3425.795747,3440.419892,18723200000.0,413677800000.0,0.170764,...,-96.185547,1.0,-2.719714,-1.863089,,,3465.79714,-0.73222,,
7,357,2024-07-23,2781,3440.76842,3539.531735,3395.420892,3481.995954,24468410000.0,418653300000.0,0.44457,...,41.576062,-1.0,1.208459,-1.059854,1.117535,,3471.294634,0.30828,,
8,356,2024-07-24,2781,3482.151959,3487.653057,3304.039213,3336.339424,16040950000.0,401125200000.0,-0.276593,...,-145.65653,-1.0,-4.183133,-5.662662,-1.546674,,3463.807069,-3.679987,,
9,355,2024-07-25,2781,3336.362309,3341.438456,3088.76449,3174.427315,25293750000.0,381610800000.0,-0.30181,...,-161.912108,1.0,-4.852987,-7.731399,-7.350037,,3427.831173,-7.392542,,


### Stockstats Features

In [14]:
df.rename(columns={
    'Close': 'close',
    'High': 'high',
    'Low': 'low',
    'Open': 'open',
    'Volume': 'volume'
}, inplace=True)

df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

df.drop('Unnamed: 0', axis=1, inplace=True)

In [15]:
stock_df = Sdf.retype(df)

In [16]:
stock_df['close_7_ema']
stock_df['close_14_ema']
stock_df['adx']
stock_df['rsi_7']
stock_df['rsi']
stock_df['stochrsi']
stock_df['atr']
stock_df['mfi']

date
2024-07-16    0.500000
2024-07-17    0.500000
2024-07-18    0.500000
2024-07-19    0.500000
2024-07-20    0.500000
                ...   
2025-07-10    0.708691
2025-07-11    0.789401
2025-07-12    0.732622
2025-07-13    0.739863
2025-07-14    0.761125
Name: mfi, Length: 364, dtype: float64

In [18]:
stock_df['boll']
stock_df['boll_width'] = stock_df['boll_ub'] - stock_df['boll_lb']

In [19]:
stock_df['kdjk']
stock_df['kdjd']

#Checking if k line is over d line
stock_df['k_above_d'] = stock_df['kdjk'] > stock_df['kdjd']
stock_df['crossover'] = stock_df['k_above_d'].diff()

stock_df['kdj_signal'] = 0.0
# For Bullish signal
stock_df.loc[stock_df['crossover'] & stock_df['k_above_d'], 'kdj_signal'] = 1.0
# For Bearish signal
stock_df.loc[stock_df['crossover'] & ~stock_df['k_above_d'], 'kdj_signal'] = -1.0

stock_df = stock_df.drop(['k_above_d', 'crossover'], axis=1)

print(stock_df[['kdjk', 'kdjd', 'kdj_signal']].tail(10))


                 kdjk       kdjd  kdj_signal
date                                        
2025-07-05  64.206001  69.521705         0.0
2025-07-06  67.835674  68.959695         0.0
2025-07-07  66.591984  68.170458         0.0
2025-07-08  75.172632  70.504516         1.0
2025-07-09  81.546672  74.185235         0.0
2025-07-10  85.519353  77.963274         0.0
2025-07-11  85.589425  80.505324         0.0
2025-07-12  84.748580  81.919743         0.0
2025-07-13  85.900037  83.246507         0.0
2025-07-14  86.734556  84.409190         0.0


In [20]:
window=stockstats.set_dft_window('macd', (8, 20, 9))
stockstats.set_dft_window('macd', window)

(8, 20, 9)

In [21]:
stock_df = Sdf.retype(stock_df)

In [22]:
stock_df['macd']

date
2024-07-16      0.000000
2024-07-17     -1.228615
2024-07-18     -0.402434
2024-07-19      2.852652
2024-07-20      5.100965
                 ...    
2025-07-10     66.939518
2025-07-11     88.561261
2025-07-12    103.297578
2025-07-13    116.094792
2025-07-14    127.988316
Name: macd, Length: 364, dtype: float64

In [23]:
stock_df.head()

Unnamed: 0_level_0,name,open,high,low,close,volume,marketCap,daily_weighted_sentiment,comment_volume,ETH_trends,...,boll,boll_ub,boll_lb,boll_width,kdjk,kdjd,kdj_signal,macd,macds,macdh
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-07-16,2781,3486.144073,3498.219053,3351.779683,3443.513502,20446660000.0,413977800000.0,0.030913,17.0,57.0,...,3443.513502,,,,54.214289,51.404763,0.0,0.0,0.0,0.0
2024-07-17,2781,3446.736719,3516.10421,3379.099525,3388.752373,16739120000.0,407399900000.0,0.030913,17.0,57.0,...,3416.132937,3493.576868,3338.689007,154.887862,43.642793,48.817439,-1.0,-1.228615,-0.682564,-0.546051
2024-07-18,2781,3388.030068,3488.715655,3374.99043,3426.258591,15035620000.0,411914200000.0,0.089105,45.0,57.0,...,3419.508155,3475.503569,3363.512742,111.990827,44.203288,47.279389,0.0,-0.402434,-0.567757,0.165323
2024-07-19,2781,3425.90881,3540.588346,3377.877575,3505.734841,17705630000.0,421472500000.0,0.016569,33.0,57.0,...,3441.064827,3538.662806,3343.466847,195.195959,56.64896,50.402579,1.0,2.852652,0.590918,2.261733
2024-07-20,2781,3505.721336,3539.904292,3482.488908,3519.295344,10360200000.0,423126700000.0,0.0,32.0,57.0,...,3456.71093,3566.438025,3346.983836,219.454189,67.340122,56.048427,0.0,5.100965,1.932555,3.16841


In [24]:
all_features_df = pd.DataFrame(stock_df)

In [25]:
all_features_df.dropna(inplace=True)

In [26]:
all_features_df.to_csv(r'C:\Users\madha\Desktop\Dissertation\Data\Master Data\Feature Engineering\FeatureEngineered_ETH.csv')