## Data Ingestion and Wrangling

### Importing Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import talib
import math

In [2]:
# Load the cleaned hourly VN30F bars. index_col=0 makes the file's first column
# the initial index; the 'time' column is promoted to the index in the next cell.
df = pd.read_csv('vn30f_hourly_cleaned.csv', index_col=0)

In [3]:
# Index the bars by their 'time' column (values are kept exactly as read from the
# CSV; no datetime parsing is requested anywhere in this notebook).
# Not re-runnable on its own: once 'time' is the index it is no longer a column.
df = df.set_index(keys='time')

In [4]:
# Preview the frame after indexing by 'time' (the truncated display shows only the first and last rows).
df

Unnamed: 0_level_0,open,close,high,low,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-11-06 10:00:00,841.3,841.2,841.3,841.2,3851.0
2017-11-06 11:00:00,842.3,842.4,842.4,842.3,4137.0
2017-11-06 13:30:00,842.3,842.0,842.3,842.0,4288.0
2017-11-06 14:30:00,844.8,844.8,844.8,844.8,4489.0
2017-11-07 10:00:00,846.2,846.6,846.6,846.2,5253.0
...,...,...,...,...,...
2023-06-22 14:30:00,1114.5,1111.7,1114.5,1111.7,83579.0
2023-06-23 10:00:00,1117.8,1118.2,1118.2,1117.7,40868.0
2023-06-23 11:00:00,1121.3,1120.5,1121.3,1120.4,37133.0
2023-06-23 13:30:00,1114.1,1115.0,1115.3,1114.1,52393.0


In [5]:
# Discard every bar that has at least one missing value
# (dropna defaults: axis=0 / rows, how='any').
df = df.dropna()

In [6]:
# Confirm the column dtypes before deriving features from them.
df.dtypes

open      float64
close     float64
high      float64
low       float64
volume    float64
dtype: object

### Getting price changes in previous periods

In [7]:
# Trailing percentage change of the close over the previous 1..10 bars,
# stored as columns 't1change' ... 't10change'.
for i in range(1, 11):
    df[f't{i}change'] = df['close'].pct_change(periods=i)

In [8]:
# pct_change(i) leaves the first i rows without a value, so remove every row
# that still contains a NaN before computing indicators.
df = df.dropna(axis='index')

### Getting technical indicators

TA-Lib organises its technical indicators into groups, including:
* Overlap Studies
* Momentum Indicators
* Volume Indicators
* Volatility Indicators
* Cycle Indicators
* Pattern Recognition

In [9]:
# Shorthands for the price/volume series passed to the TA-Lib calls below.
# ('open' is not aliased; the one indicator that needs it reads df['open'] directly.)
close = df['close']
high = df['high']
low = df['low']
volume = df['volume']

In [10]:
# Overlap Studies indicators (TA-Lib).
# Every indicator here is computed from the close series, except MIDPRICE which
# takes high and low. Each result becomes a new df column; the order of these
# assignments is the column order of the exported feature table.
# Rows left with NaN by any indicator are removed by the dropna cell that
# follows the indicator cells.
df['upperband'], df['middleband'], df['lowerband'] = talib.BBANDS(close, timeperiod=5, nbdevup=2, nbdevdn=2, matype=0) #Bollinger Bands
df['dema'] = talib.DEMA(close, timeperiod=30) #Double Exponential Moving Average
df['ema'] = talib.EMA(close, timeperiod=30) #Exponential Moving Average
df['ht_trendline'] = talib.HT_TRENDLINE(close) #Hilbert Transform - Instantaneous Trendline
df['kama'] = talib.KAMA(close, timeperiod=30) #Kaufman Adaptive Moving Average
df['ma'] = talib.MA(close, timeperiod=30, matype=0) #Moving Average
df['midpoint'] = talib.MIDPOINT(close, timeperiod=14) #MidPoint over period
df['midprice'] = talib.MIDPRICE(high, low, timeperiod=14) #Midpoint Price over period
df['sma'] = talib.SMA(close, timeperiod=30) #Simple Moving Average
df['t3'] = talib.T3(close, timeperiod=5, vfactor=0) #Triple Exponential Moving Average (T3)
df['tema'] = talib.TEMA(close, timeperiod=30) #Triple Exponential Moving Average
df['trima'] = talib.TRIMA(close, timeperiod=30) #Triangular Moving Average
df['wma'] = talib.WMA(close, timeperiod=30) #Weighted Moving Average

In [11]:
# Momentum Indicators (TA-Lib).
# Inputs are the high/low/close/volume shorthands defined earlier; BOP is the
# only call that also needs the open price and reads it straight from df.
# Calls that return several series (AROON, MACD, STOCH, STOCHF, STOCHRSI) are
# unpacked into one column per returned series, in the order TA-Lib returns them.
df['adx'] = talib.ADX(high, low, close, timeperiod=14) #Average Directional Movement Index
df['adxr'] = talib.ADXR(high, low, close, timeperiod=14) #Average Directional Movement Index Rating
df['apo'] = talib.APO(close, fastperiod=12, slowperiod=26, matype=0) #Absolute Price Oscillator
df['aroondown'], df['aroonup'] = talib.AROON(high, low, timeperiod=14) #Aroon
df['aroonosc'] = talib.AROONOSC(high, low, timeperiod=14) #Aroon Oscillator
df['bop'] = talib.BOP(df['open'], high, low, close) #Balance Of Power
df['cci'] = talib.CCI(high, low, close, timeperiod=14) #Commodity Channel Index
df['cmo'] = talib.CMO(close, timeperiod=14) #Chande Momentum Oscillator
df['dx'] = talib.DX(high, low, close, timeperiod=14) #Directional Movement Index
df['macd'], df['macdsignal'], df['macdhist'] = talib.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9) #Moving Average Convergence/Divergence
df['mfi'] = talib.MFI(high, low, close, volume, timeperiod=14) #Money Flow Index
df['minus_di'] = talib.MINUS_DI(high, low, close, timeperiod=14) #Minus Directional Indicator
df['mom'] = talib.MOM(close, timeperiod=10) #Momentum
df['plus_di'] = talib.PLUS_DI(high, low, close, timeperiod=14) #Plus Directional Indicator
df['ppo'] = talib.PPO(close, fastperiod=12, slowperiod=26, matype=0) #Percentage Price Oscillator
df['roc'] = talib.ROC(close, timeperiod=10) #Rate of Change
df['rsi'] = talib.RSI(close, timeperiod=14) #Relative Strength Index
df['slowk'], df['slowd'] = talib.STOCH(high, low, close, fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0) #Stochastic
df['fastk'], df['fastd'] = talib.STOCHF(high, low, close, fastk_period=5, fastd_period=3, fastd_matype=0) #Stochastic Fast
df['fastk_rsi'], df['fastd_rsi'] = talib.STOCHRSI(close, timeperiod=14, fastk_period=5, fastd_period=3, fastd_matype=0) #Stochastic Relative Strength Index
df['ultosc'] = talib.ULTOSC(high, low, close, timeperiod1=7, timeperiod2=14, timeperiod3=28) #Ultimate Oscillator
df['willr'] = talib.WILLR(high, low, close, timeperiod=14) #Williams' %R

In [12]:
# Volume Indicators (TA-Lib): each call combines the price series with the
# traded volume and is stored as one new df column.
df['ad'] = talib.AD(high, low, close, volume) #Chaikin A/D Line
df['adosc'] = talib.ADOSC(high, low, close, volume, fastperiod=3, slowperiod=10) #Chaikin A/D Oscillator
df['obv'] = talib.OBV(close, volume) #On Balance Volume

In [13]:
# Volatility Indicators (TA-Lib).
# NOTE(review): the ATR output is stored under the column name 'adr', not 'atr'.
# The name is left as-is because it is part of the exported CSV's column set;
# rename only together with every consumer of that file.
df['adr'] = talib.ATR(high, low, close, timeperiod=14) # Average True Range
df['natr'] = talib.NATR(high, low, close, timeperiod=14) # Normalized Average True Range
df['trange'] = talib.TRANGE(high, low, close) # True Range

In [14]:
# Cycle Indicators (TA-Lib Hilbert Transform family), all computed from close.
# NOTE(review): two column names do not match the function they hold:
#   'ht_decperiod' stores HT_DCPERIOD, and 'integer' stores HT_TRENDMODE
#   (values range from 0 to 1 in this notebook's describe() output).
# They are kept unchanged because they are part of the exported CSV's column set.
df['ht_decperiod'] = talib.HT_DCPERIOD(close) # Dominant Cycle Period
df['ht_dcphase'] = talib.HT_DCPHASE(close) # Dominant Cycle Phase
df['inphase'], df['quadrature'] = talib.HT_PHASOR(close) # Phasor Components
df['sine'], df['leadsine'] = talib.HT_SINE(close) # SineWave
df['integer'] = talib.HT_TRENDMODE(close) # Trend vs Cycle Mode

In [15]:
# Keep only the bars for which every indicator column has a value.
df = df.dropna(how='any')

In [16]:
# Summary statistics for every column, taken after the rows containing NaN were dropped.
df.describe()

Unnamed: 0,open,close,high,low,volume,t1change,t2change,t3change,t4change,t5change,...,adr,natr,trange,ht_decperiod,ht_dcphase,inphase,quadrature,sine,leadsine,integer
count,5531.0,5531.0,5531.0,5531.0,5531.0,5531.0,5531.0,5531.0,5531.0,5531.0,...,5531.0,5531.0,5531.0,5531.0,5531.0,5531.0,5531.0,5531.0,5531.0,5531.0
mean,1070.275484,1070.263551,1070.64397,1069.899168,38559.102152,5.9e-05,0.000119,0.000183,0.000246,0.000305,...,5.191108,0.495087,5.186115,22.48789,129.222844,0.266969,0.093831,-0.0416,-0.102351,0.606581
std,237.533181,237.518041,237.6143,237.4376,28353.662602,0.007252,0.0106,0.013194,0.015404,0.017131,...,2.813493,0.281174,5.799836,5.16479,99.240074,13.859603,21.837514,0.580559,0.738713,0.488553
min,573.6,574.5,574.7,573.5,668.0,-0.057452,-0.07541,-0.078603,-0.091015,-0.101907,...,0.767283,0.086377,0.0,12.102677,-44.98982,-85.817993,-119.976043,-0.999999,-1.0,0.0
25%,885.9,885.85,886.1,885.8,18963.5,-0.002506,-0.003996,-0.0054,-0.00638,-0.007132,...,3.007042,0.29074,1.5,18.709498,28.108028,-5.455512,-9.107653,-0.510321,-0.824074,0.0
50%,1008.4,1008.4,1008.6,1007.5,31535.0,9.9e-05,0.000228,0.000589,0.00072,0.001005,...,4.560555,0.408041,3.3,21.591973,159.751605,0.801017,-0.563003,-0.067092,-0.280749,1.0
75%,1230.7,1230.85,1231.35,1230.15,51054.0,0.003116,0.004997,0.006498,0.007859,0.00874,...,6.608154,0.61483,6.7,25.549728,204.262914,6.771238,8.094429,0.414302,0.673846,1.0
max,1572.7,1572.2,1572.8,1571.9,187676.0,0.046956,0.079029,0.088922,0.103647,0.099735,...,16.0499,1.884217,59.0,44.944129,314.968492,71.012613,209.435266,1.0,1.0,1.0


In [17]:
# Preview the full feature table (prices, lagged returns and indicators).
df

Unnamed: 0_level_0,open,close,high,low,volume,t1change,t2change,t3change,t4change,t5change,...,adr,natr,trange,ht_decperiod,ht_dcphase,inphase,quadrature,sine,leadsine,integer
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-08 11:00:00,938.0,939.0,939.0,938.0,3622.0,0.003634,-0.002867,0.002884,0.000320,-0.001064,...,5.976465,0.636471,3.4,17.918461,6.823577,-10.873480,28.049724,0.118813,0.786111,1
2017-12-08 13:30:00,939.0,939.0,939.0,939.0,4399.0,0.000000,0.003634,-0.002867,0.002884,0.000320,...,5.549575,0.591009,0.0,17.530451,20.384919,-0.478912,15.657483,0.348325,0.909127,1
2017-12-08 14:30:00,939.0,939.0,939.0,939.0,4321.0,0.000000,0.000000,0.003634,-0.002867,0.002884,...,5.153177,0.548794,0.0,17.361211,41.152165,0.157037,7.717505,0.658061,0.997746,0
2017-12-11 10:00:00,922.5,923.0,923.0,922.5,3999.0,-0.017039,-0.017039,-0.017039,-0.013467,-0.019858,...,5.963664,0.646117,16.5,17.346277,50.784492,3.571910,3.367185,0.774773,0.994908,1
2017-12-11 11:00:00,926.0,926.9,926.9,926.0,2239.0,0.004225,-0.012886,-0.012886,-0.012886,-0.009299,...,5.816260,0.627496,3.9,17.420426,56.963424,1.556506,-4.703891,0.838323,0.978280,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-22 14:30:00,1114.5,1111.7,1114.5,1111.7,83579.0,-0.001975,-0.002602,-0.002691,0.004790,0.012569,...,3.543252,0.318724,2.8,22.761381,66.154245,4.036894,6.630768,0.914637,0.932612,0
2023-06-23 10:00:00,1117.8,1118.2,1118.2,1117.7,40868.0,0.005847,0.003860,0.003230,0.003140,0.010665,...,3.754448,0.335758,6.5,22.609304,82.038206,7.412469,14.682855,0.990361,0.798234,0
2023-06-23 11:00:00,1121.3,1120.5,1121.3,1120.4,37133.0,0.002057,0.007916,0.005925,0.005293,0.005203,...,3.707702,0.330897,3.1,22.475532,105.312131,14.205266,10.038600,0.964502,0.495275,0
2023-06-23 13:30:00,1114.1,1115.0,1115.3,1114.1,52393.0,-0.004909,-0.002862,0.002968,0.000988,0.000359,...,3.900009,0.349777,6.4,22.621388,113.256227,15.137063,-3.518856,0.918748,0.370456,0


### Creating a y target

In [18]:
# Change in the close from the previous bar to this one.
df['price_change'] = df['close'].diff()

# Label every bar with the direction of the NEXT bar's move:
#   1 -> the next close is strictly higher
#   0 -> the next close is lower or unchanged
# shift(-1) pulls the following bar's flag onto the current row, which leaves
# the final row without a label (NaN) and turns the column into floats.
df['y_target'] = df['price_change'].gt(0).astype(int).shift(-1)

In [19]:
# Class balance of the target as fractions (normalize=True); value_counts skips
# NaN by default, so a missing label is not counted.
df['y_target'].value_counts(normalize = True)

1.0    0.502893
0.0    0.497107
Name: y_target, dtype: float64

In [20]:
# The final bar has no following bar, so its shifted label is missing: drop that
# row, set the labels aside as y, then strip the helper and label columns so
# that df holds features only.
df = df.drop(index=df.index[-1])
y = df['y_target']
df = df.drop(columns=['price_change', 'y_target'])

### Min-Max Scaling

In [21]:
# Min-Max scaling: map every feature column onto [0, 1] using that column's own
# minimum and maximum.
# NOTE(review): the statistics are taken over the whole history, so any later
# train/test split will have seen the test period's min/max (look-ahead). If the
# downstream model is evaluated out-of-sample, compute min_val/max_val on the
# training rows only and reuse them for the rest.
min_val = df.min()
max_val = df.max()

# A column that never changes has max == min. Dividing by that zero range would
# turn the whole column into NaN, and the later dropna(how='any') would then
# discard every row. Use a divisor of 1 for such columns so they scale to 0;
# all other columns are scaled exactly as before.
value_range = (max_val - min_val).replace(0, 1)
df_scaled = (df - min_val) / value_range

### Attaching the target and exporting the dataset

In [22]:
# Re-attach the (unscaled) labels to the scaled features. The values are passed
# as a plain list, so they are matched to rows by position rather than by index.
df_scaled = df_scaled.assign(y_target=y.to_list())

In [23]:
# Final safety net: remove any row that still contains a missing value.
df_scaled = df_scaled.dropna()

In [24]:
# Write the scaled features plus 'y_target' to disk; the 'time' index is written
# as the first column of the file.
df_scaled.to_csv('data_non_pca_new.csv')