# Feature Engineering

In [None]:
import pandas as pd
import numpy as np
import os
from stockstats import StockDataFrame as Sdf
import stockstats

## BTC

### Creating Target Variable

In [2]:
df = pd.read_csv(r'C:\Users\madha\Desktop\Dissertation\Data\Master Data\Merged\BTC_merged.csv')

df['date'] = pd.to_datetime(df['date'])
df.sort_values(by='date', inplace=True)

In [3]:
df.tail()

Unnamed: 0.1,Unnamed: 0,date,name,open,high,low,close,volume,marketCap,daily_weighted_sentiment,comment_volume,BTC_trends,gold_spot,gspc_spot,ndx_spot
360,4,2025-07-11,2781,115986.234797,118856.473739,115245.686349,117516.993668,86928360000.0,2337810000000.0,0.178156,84.0,45.0,3356.0,6259.75,22780.59961
361,3,2025-07-12,2781,117530.712896,118219.900043,116977.023698,117435.230053,45524560000.0,2335906000000.0,-0.539831,91.0,45.0,3356.0,6259.75,22780.59961
362,2,2025-07-13,2781,117432.200846,119449.571906,117265.437865,119116.117549,49021090000.0,2369445000000.0,-0.436295,76.0,53.0,3356.0,6259.75,22780.59961
363,1,2025-07-14,2781,119115.78751,123091.612801,118959.196786,119849.70572,181746400000.0,2384183000000.0,-0.157348,138.0,53.0,3351.5,6268.560059,22855.63086
364,0,2025-07-15,2781,119853.848476,119935.560408,115765.686727,117777.188995,98321660000.0,2342931000000.0,-0.212001,46.0,53.0,3329.800049,6243.759766,22884.58984


Changing target from 1,-1 to 1,0 due to deep learning model constraints

In [4]:
def assign_target(change):
    if pd.isna(change):
        return np.nan  
    elif change > 0:
        return 1       
    else:
        return -1     

In [5]:
df['price_change'] = df['close'].diff()
df['next_day_price_change'] = df['price_change'].shift(-1) 

In [6]:
df['target'] = df['next_day_price_change'].apply(assign_target)
df.drop(columns=['next_day_price_change'], inplace=True)
df.dropna(subset=['target'], inplace=True)

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,name,open,high,low,close,volume,marketCap,daily_weighted_sentiment,comment_volume,BTC_trends,gold_spot,gspc_spot,ndx_spot,price_change,target
0,364,2024-07-16,2781,64784.418843,65354.3382,62487.96711,65097.146831,41617350000.0,1284145000000.0,-0.092346,59.0,36.0,2462.399902,5667.200195,20398.61914,,-1.0
1,363,2024-07-17,2781,65091.830709,66066.733027,63896.087193,64118.792669,32525070000.0,1264877000000.0,0.032045,29.0,36.0,2454.800049,5588.27002,19799.14063,-978.354161,-1.0
2,362,2024-07-18,2781,64104.737679,65104.661419,63246.164439,63974.066684,27239310000.0,1262047000000.0,-0.000932,24.0,36.0,2451.800049,5544.589844,19705.08984,-144.725985,1.0
3,361,2024-07-19,2781,63972.325499,67442.638116,63329.342025,66710.154313,37003860000.0,1316051000000.0,0.033252,30.0,36.0,2395.5,5505.0,19522.61914,2736.087629,1.0
4,360,2024-07-20,2781,66709.925615,67610.734373,66299.618373,67163.644924,19029580000.0,1325061000000.0,0.0,2.0,36.0,2395.5,5505.0,19522.61914,453.490611,1.0


### Percentage Changes (24h, 3days, 7days)

In [8]:
df['percent_change_24h'] = df['close'].pct_change() * 100
df['percent_change_3d'] = df['close'].pct_change(periods=3) * 100
df['percent_change_7d'] = df['close'].pct_change(periods=7) * 100
df['percent_change_14d'] = df['close'].pct_change(periods=14) * 100

### Moving Average

ma_7d: This feature tells the model about the level of the recent price trend. It acts as a smoothed, less noisy version of the price itself. Currently taken for 7 Days.

price_vs_ma7d: This feature tells the model about the deviation from the recent trend. It measures how "overextended" or "oversold" the current price is compared to its recent average. A model might learn that when this value is extremely high (e.g., +15%), the price is likely to come back down, even if the trend is generally up.

In [9]:
df['ma_7d'] = df['close'].rolling(window=7).mean()
df['price_vs_ma7d'] = (df['close'] - df['ma_7d']) / df['ma_7d'] * 100

In [10]:
df['ma_14d'] = df['close'].rolling(window=14).mean()
df['price_vs_ma14d'] = (df['close'] - df['ma_14d']) / df['ma_14d'] * 100

In [11]:
df.tail()

Unnamed: 0.1,Unnamed: 0,date,name,open,high,low,close,volume,marketCap,daily_weighted_sentiment,...,price_change,target,percent_change_24h,percent_change_3d,percent_change_7d,percent_change_14d,ma_7d,price_vs_ma7d,ma_14d,price_vs_ma14d
359,5,2025-07-10,2781,111329.195981,116608.784676,110660.749453,115987.206197,95911610000.0,2307032000000.0,0.525458,...,4660.654928,1.0,4.186472,7.098214,5.781436,8.439793,110008.782124,5.434497,108871.720976,6.53566
360,4,2025-07-11,2781,115986.234797,118856.473739,115245.686349,117516.993668,86928360000.0,2337810000000.0,0.178156,...,1529.787472,-1.0,1.318928,7.862961,8.777445,9.738273,111363.44707,5.525643,109616.618365,7.207279
361,3,2025-07-12,2781,117530.712896,118219.900043,116977.023698,117435.230053,45524560000.0,2335906000000.0,-0.539831,...,-81.763615,1.0,-0.069576,5.487171,8.504063,9.417444,112678.311079,4.221681,110338.58457,6.431699
362,2,2025-07-13,2781,117432.200846,119449.571906,117265.437865,119116.117549,49021090000.0,2369445000000.0,-0.436295,...,1680.887496,1.0,1.431332,2.697635,9.048668,9.900347,114090.31791,4.405106,111105.052246,7.210352
363,1,2025-07-14,2781,119115.78751,123091.612801,118959.196786,119849.70572,181746400000.0,2384183000000.0,-0.157348,...,733.588171,-1.0,0.61586,1.985,10.6647,11.86758,115740.297143,3.550543,112013.221618,6.996035


In [12]:
df.head(15)

Unnamed: 0.1,Unnamed: 0,date,name,open,high,low,close,volume,marketCap,daily_weighted_sentiment,...,price_change,target,percent_change_24h,percent_change_3d,percent_change_7d,percent_change_14d,ma_7d,price_vs_ma7d,ma_14d,price_vs_ma14d
0,364,2024-07-16,2781,64784.418843,65354.3382,62487.96711,65097.146831,41617350000.0,1284145000000.0,-0.092346,...,,-1.0,,,,,,,,
1,363,2024-07-17,2781,65091.830709,66066.733027,63896.087193,64118.792669,32525070000.0,1264877000000.0,0.032045,...,-978.354161,-1.0,-1.502914,,,,,,,
2,362,2024-07-18,2781,64104.737679,65104.661419,63246.164439,63974.066684,27239310000.0,1262047000000.0,-0.000932,...,-144.725985,1.0,-0.225715,,,,,,,
3,361,2024-07-19,2781,63972.325499,67442.638116,63329.342025,66710.154313,37003860000.0,1316051000000.0,0.033252,...,2736.087629,1.0,4.27687,2.477847,,,,,,
4,360,2024-07-20,2781,66709.925615,67610.734373,66299.618373,67163.644924,19029580000.0,1325061000000.0,0.0,...,453.490611,1.0,0.679792,4.748767,,,,,,
5,359,2024-07-21,2781,67164.911922,68372.905166,65842.2988,68154.522906,26652190000.0,1344591000000.0,0.0,...,990.877982,-1.0,1.475319,6.534611,,,,,,
6,358,2024-07-22,2781,68152.976133,68480.062791,66611.299868,67585.249972,42649110000.0,1333416000000.0,-0.330237,...,-569.272934,-1.0,-0.835268,1.311788,,,66114.7969,2.224091,,
7,357,2024-07-23,2781,67584.804355,67779.017554,65484.462081,65927.672361,35605670000.0,1300763000000.0,0.0,...,-1657.577611,-1.0,-2.452573,-1.84024,1.275825,,66233.443404,-0.461657,,
8,356,2024-07-24,2781,65927.858075,67113.984313,65146.994976,65372.133234,27470940000.0,1289901000000.0,0.0,...,-555.539128,1.0,-0.842649,-4.082473,1.954716,,66412.492056,-1.566511,,
9,355,2024-07-25,2781,65375.873485,66112.420035,63473.473285,65777.22394,38315760000.0,1297746000000.0,-0.002802,...,405.090706,1.0,0.619669,-2.675178,2.818575,,66670.08595,-1.339224,,


### Stockstats Features

Implementing the stockstats library to generate financial indicator features.

Columns have to be renamed to fit the stockstats wrapper convention.

In [14]:
df.rename(columns={
    'Close': 'close',
    'High': 'high',
    'Low': 'low',
    'Open': 'open',
    'Volume': 'volume'
}, inplace=True)

df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

df.drop('Unnamed: 0', axis=1, inplace=True)

In [15]:
stock_df = Sdf.retype(df)

In [16]:
stock_df['close_7_ema']
stock_df['close_14_ema']
stock_df['adx']
stock_df['rsi_7']
stock_df['rsi']
stock_df['stochrsi']
stock_df['atr']
stock_df['mfi']

date
2024-07-16    0.500000
2024-07-17    0.500000
2024-07-18    0.500000
2024-07-19    0.500000
2024-07-20    0.500000
                ...   
2025-07-10    0.624905
2025-07-11    0.713836
2025-07-12    0.721526
2025-07-13    0.728298
2025-07-14    0.822124
Name: mfi, Length: 364, dtype: float64

In [17]:
stock_df['boll']
stock_df['boll_width'] = stock_df['boll_ub'] - stock_df['boll_lb']

In [18]:
stock_df['kdjk']
stock_df['kdjd']

#Checking if k line is over d line
stock_df['k_above_d'] = stock_df['kdjk'] > stock_df['kdjd']
stock_df['crossover'] = stock_df['k_above_d'].diff()

stock_df['kdj_signal'] = 0.0
# For Bullish signal
stock_df.loc[stock_df['crossover'] & stock_df['k_above_d'], 'kdj_signal'] = 1.0
# For Bearish signal
stock_df.loc[stock_df['crossover'] & ~stock_df['k_above_d'], 'kdj_signal'] = -1.0

stock_df = stock_df.drop(['k_above_d', 'crossover'], axis=1)

print(stock_df[['kdjk', 'kdjd', 'kdj_signal']].tail(10))


                 kdjk       kdjd  kdj_signal
date                                        
2025-07-05  67.044894  73.916183         0.0
2025-07-06  69.923343  72.585237         0.0
2025-07-07  66.070843  70.413772         0.0
2025-07-08  67.529358  69.452301         0.0
2025-07-09  75.403598  71.436066         1.0
2025-07-10  81.793074  74.888402         0.0
2025-07-11  83.999681  77.925495         0.0
2025-07-12  85.234989  80.361993         0.0
2025-07-13  89.226523  83.316836         0.0
2025-07-14  85.887001  84.173558         0.0


In [None]:
#changing default window for MACD

window=stockstats.set_dft_window('macd', (8, 20, 9))
stockstats.set_dft_window('macd', window)

In [20]:
stock_df = Sdf.retype(stock_df)

In [21]:
stock_df['macd']

date
2024-07-16       0.000000
2024-07-17     -21.950254
2024-07-18     -32.456924
2024-07-19      62.345275
2024-07-20     130.592282
                 ...     
2025-07-10    1644.884702
2025-07-11    2150.652051
2025-07-12    2515.877212
2025-07-13    2907.439207
2025-07-14    3239.605192
Name: macd, Length: 364, dtype: float64

In [22]:
stock_df.head()

Unnamed: 0_level_0,name,open,high,low,close,volume,marketCap,daily_weighted_sentiment,comment_volume,BTC_trends,...,boll,boll_ub,boll_lb,boll_width,kdjk,kdjd,kdj_signal,macd,macds,macdh
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-07-16,2781,64784.418843,65354.3382,62487.96711,65097.146831,41617350000.0,1284145000000.0,-0.092346,59.0,36.0,...,65097.146831,,,,63.675761,54.558587,0.0,0.0,0.0,0.0
2024-07-17,2781,65091.830709,66066.733027,63896.087193,64118.792669,32525070000.0,1264877000000.0,0.032045,29.0,36.0,...,64607.96975,65991.571474,63224.368026,2767.203448,57.640339,55.585838,0.0,-21.950254,-12.194585,-9.755668
2024-07-18,2781,64104.737679,65104.661419,63246.164439,63974.066684,27239310000.0,1262047000000.0,-0.000932,24.0,36.0,...,64396.668728,65618.53383,63174.803626,2443.730204,52.268718,54.480131,-1.0,-32.456924,-20.498823,-11.958102
2024-07-19,2781,63972.325499,67442.638116,63329.342025,66710.154313,37003860000.0,1316051000000.0,0.033252,30.0,36.0,...,64975.040124,67494.468321,62455.611928,5038.856393,63.251245,57.403836,1.0,62.345275,7.564896,54.780379
2024-07-20,2781,66709.925615,67610.734373,66299.618373,67163.644924,19029580000.0,1325061000000.0,0.0,2.0,36.0,...,65412.761084,68344.080173,62481.441995,5862.638178,72.591663,62.466445,0.0,130.592282,44.162762,86.42952


In [23]:
all_features_df = pd.DataFrame(stock_df)

In [24]:
all_features_df.dropna(inplace=True)

In [26]:
all_features_df.to_csv(r'C:\Users\madha\Desktop\Dissertation\Data\Master Data\Feature Engineering\FeatureEngineered_BTC.csv')