In [1]:
from utils import ForexPreprocessor
import numpy as np

In [2]:
import seaborn as sns
# Global plot styling for the notebook: apply the 'whitegrid' theme first,
# then switch the default colour cycle to the 'colorblind' palette.
# NOTE(review): the order presumably matters because set_theme also sets a
# palette; keep set_palette last unless confirmed otherwise.
sns.set_theme(style='whitegrid')
sns.set_palette('colorblind')

In [3]:
%matplotlib inline
from feature_cleaning import missing_data as ms

## Load dataset

In [4]:
# Bar size: used both to configure the preprocessor and to pick the CSV file.
# NOTE(review): 1440 presumably means minutes per bar (24 h x 60), i.e. daily
# data, which matches the one-row-per-day preview below. Confirm against
# ForexPreprocessor.
timeframe = 1440
processor = ForexPreprocessor(timeframe)

# drop_na=False keeps the rows that contain NaNs (see the non-null counts
# printed by info()), so the missing-data treatments below have something
# to work on.
raw_data = processor.load_and_preprocess(
    f'data/GBPUSD/GBPUSD_{timeframe}.csv',
    drop_na=False,
)

# Column dtypes and non-null counts of the loaded frame.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5071 entries, 0 to 5070
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   time              5071 non-null   datetime64[ns]
 1   close             5071 non-null   float64       
 2   volume            5071 non-null   int64         
 3   change            5070 non-null   float64       
 4   hour              5071 non-null   int32         
 5   day_of_week       5071 non-null   int32         
 6   month             5071 non-null   int32         
 7   is_weekend        5071 non-null   int64         
 8   sma_20            5048 non-null   float64       
 9   sma_50            5024 non-null   float64       
 10  rsi               5057 non-null   float64       
 11  high_low_pct      5048 non-null   float64       
 12  volatility        5047 non-null   float64       
 13  price_position    5071 non-null   int64         
 14  volume_ma         5048 n

In [5]:
# (rows, columns) of the freshly loaded frame; baseline for the row counts
# reported after the missing-data treatments below.
raw_data.shape

(5071, 27)

In [6]:
# Preview the first 8 rows; the leading rows carry NaNs in several derived
# columns (e.g. sma_20, sma_50, change_5d).
raw_data.head(8)

Unnamed: 0,time,close,volume,change,hour,day_of_week,month,is_weekend,sma_20,sma_50,...,change_2d,change_5d,future_change,price_trend,volatility_level,volume_level,rsi_zone,trading_session,trend_position,target
0,2007-08-11,2.02278,3913,,0,5,8,1,,,...,,,-0.562592,,,,,AM,False,0
1,2007-08-13,2.0114,1338156,-0.562592,0,0,8,0,,,...,,,-0.556826,Down,,,,AM,False,0
2,2007-08-14,2.0002,1936844,-0.556826,0,1,8,0,,,...,,,-0.429957,Down,,,,AM,False,0
3,2007-08-15,1.9916,2079347,-0.429957,0,2,8,0,,,...,-0.562592,,-0.707973,Flat,,,,AM,False,0
4,2007-08-16,1.9775,1861756,-0.707973,0,3,8,0,,,...,-0.556826,,0.166877,Down,,,,AM,False,1
5,2007-08-17,1.9808,1263617,0.166877,0,4,8,0,,,...,-0.429957,,-0.020194,Flat,,,,AM,False,0
6,2007-08-18,1.9804,89159,-0.020194,0,5,8,1,,,...,-0.707973,-0.562592,0.126237,Flat,,,,AM,False,1
7,2007-08-20,1.9829,741363,0.126237,0,0,8,0,,,...,0.166877,-0.556826,-0.055474,Flat,,,,AM,False,0


## Missing value checking
Check the total number and proportion of missing values
for each variable of a pandas DataFrame.

In [7]:
# Per-column count and proportion of missing values in the raw frame.
# NOTE(review): output_path is handed to check_missing; presumably a report
# is written under ./output/, so that directory may need to exist. Confirm.
missing_check = ms.check_missing(
    data=raw_data,
    output_path='./output/',
)
print("Raw data missing values:", missing_check, sep="\n")

Raw data missing values:
                  total missing  proportion
time                          0    0.000000
close                         0    0.000000
volume                        0    0.000000
change                        1    0.000197
hour                          0    0.000000
day_of_week                   0    0.000000
month                         0    0.000000
is_weekend                    0    0.000000
sma_20                       23    0.004536
sma_50                       47    0.009268
rsi                          14    0.002761
high_low_pct                 23    0.004536
volatility                   24    0.004733
price_position                0    0.000000
volume_ma                    23    0.004536
volume_ratio                 23    0.004536
change_1d                     2    0.000394
change_2d                     3    0.000592
change_5d                     6    0.001183
future_change                 1    0.000197
price_trend                   1    0.000197
volatil

In [8]:
# Remove incomplete observations from the raw frame. The result is bound to a
# new name so raw_data stays available for the imputation demos below.
data_clean = ms.drop_missing(data=raw_data)
# Report the resulting (rows, columns) to show how many rows were lost.
print(f"Shape after dropping missing: {data_clean.shape}")

Shape after dropping missing: (5023, 27)


## Listwise deletion  
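Exclude every case (row) that has a missing value in any variable. The `ms.drop_missing` call in the cell above is the listwise-deletion step; the cell below only prepares a copy of the data with artificial gaps in `close` for the imputation demos that follow.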

In [9]:
# Build a copy of the raw frame with artificial gaps in `close`, so the
# imputation demos below have missing prices to work on.
#
# The rows to blank are drawn from a locally seeded generator: with the
# unseeded global np.random state, a different set of 100 rows was removed on
# every run, so none of the downstream imputation results were reproducible
# under Restart & Run All.
GAP_SEED = 42
gap_rng = np.random.default_rng(GAP_SEED)

raw_data_with_gaps = raw_data.copy()
# Sample 100 distinct index labels (replace=False means no row is picked twice).
gap_indices = gap_rng.choice(raw_data.index.to_numpy(), size=100, replace=False)
raw_data_with_gaps.loc[gap_indices, 'close'] = np.nan

## Add a variable to denote NA
Create an additional indicator variable that flags whether the value was missing for that observation.

In [10]:
# Add an indicator column for missing `close` values, then tally it to
# confirm how many rows were flagged (expected: the 100 injected gaps).
data_with_na_flag = ms.add_var_denote_NA(
    data=raw_data_with_gaps,
    NA_col=['close'],
)
print(data_with_na_flag['close_is_NA'].value_counts())

close_is_NA
0    4971
1     100
Name: count, dtype: int64


In [11]:
# Preview the flagged frame; the new close_is_NA column appears last.
data_with_na_flag.head(8)

Unnamed: 0,time,close,volume,change,hour,day_of_week,month,is_weekend,sma_20,sma_50,...,change_5d,future_change,price_trend,volatility_level,volume_level,rsi_zone,trading_session,trend_position,target,close_is_NA
0,2007-08-11,2.02278,3913,,0,5,8,1,,,...,,-0.562592,,,,,AM,False,0,0
1,2007-08-13,2.0114,1338156,-0.562592,0,0,8,0,,,...,,-0.556826,Down,,,,AM,False,0,0
2,2007-08-14,2.0002,1936844,-0.556826,0,1,8,0,,,...,,-0.429957,Down,,,,AM,False,0,0
3,2007-08-15,1.9916,2079347,-0.429957,0,2,8,0,,,...,,-0.707973,Flat,,,,AM,False,0,0
4,2007-08-16,1.9775,1861756,-0.707973,0,3,8,0,,,...,,0.166877,Down,,,,AM,False,1,0
5,2007-08-17,1.9808,1263617,0.166877,0,4,8,0,,,...,,-0.020194,Flat,,,,AM,False,0,0
6,2007-08-18,1.9804,89159,-0.020194,0,5,8,1,,,...,-0.562592,0.126237,Flat,,,,AM,False,1,0
7,2007-08-20,1.9829,741363,0.126237,0,0,8,0,,,...,-0.556826,-0.055474,Flat,,,,AM,False,0,0


## Arbitrary Value Imputation
Replace the NA with an arbitrary, fixed value.

In [12]:
# Fill the missing `close` values with a fixed sentinel. -999 lies far
# outside the prices seen in the preview (around 2.0), so imputed rows
# remain easy to identify.
data_arbitrary = ms.impute_NA_with_arbitrary(
    data=raw_data_with_gaps,
    impute_value=-999,
    NA_col=['close'],
)
data_arbitrary.head(8)

Unnamed: 0,time,close,volume,change,hour,day_of_week,month,is_weekend,sma_20,sma_50,...,change_2d,change_5d,future_change,price_trend,volatility_level,volume_level,rsi_zone,trading_session,trend_position,target
0,2007-08-11,2.02278,3913,,0,5,8,1,,,...,,,-0.562592,,,,,AM,False,0
1,2007-08-13,2.0114,1338156,-0.562592,0,0,8,0,,,...,,,-0.556826,Down,,,,AM,False,0
2,2007-08-14,2.0002,1936844,-0.556826,0,1,8,0,,,...,,,-0.429957,Down,,,,AM,False,0
3,2007-08-15,1.9916,2079347,-0.429957,0,2,8,0,,,...,-0.562592,,-0.707973,Flat,,,,AM,False,0
4,2007-08-16,1.9775,1861756,-0.707973,0,3,8,0,,,...,-0.556826,,0.166877,Down,,,,AM,False,1
5,2007-08-17,1.9808,1263617,0.166877,0,4,8,0,,,...,-0.429957,,-0.020194,Flat,,,,AM,False,0
6,2007-08-18,1.9804,89159,-0.020194,0,5,8,1,,,...,-0.707973,-0.562592,0.126237,Flat,,,,AM,False,1
7,2007-08-20,1.9829,741363,0.126237,0,0,8,0,,,...,0.166877,-0.556826,-0.055474,Flat,,,,AM,False,0


## Mean/Median/Mode Imputation
Replace the NA with the mean, median, or mode of that variable.

In [13]:
# Fill the missing `close` values using the 'median' strategy of the
# averaging imputer.
data_median = ms.impute_NA_with_avg(
    data=raw_data_with_gaps,
    strategy='median',
    NA_col=['close'],
)
data_median.head(8)


Unnamed: 0,time,close,volume,change,hour,day_of_week,month,is_weekend,sma_20,sma_50,...,change_2d,change_5d,future_change,price_trend,volatility_level,volume_level,rsi_zone,trading_session,trend_position,target
0,2007-08-11,2.02278,3913,,0,5,8,1,,,...,,,-0.562592,,,,,AM,False,0
1,2007-08-13,2.0114,1338156,-0.562592,0,0,8,0,,,...,,,-0.556826,Down,,,,AM,False,0
2,2007-08-14,2.0002,1936844,-0.556826,0,1,8,0,,,...,,,-0.429957,Down,,,,AM,False,0
3,2007-08-15,1.9916,2079347,-0.429957,0,2,8,0,,,...,-0.562592,,-0.707973,Flat,,,,AM,False,0
4,2007-08-16,1.9775,1861756,-0.707973,0,3,8,0,,,...,-0.556826,,0.166877,Down,,,,AM,False,1
5,2007-08-17,1.9808,1263617,0.166877,0,4,8,0,,,...,-0.429957,,-0.020194,Flat,,,,AM,False,0
6,2007-08-18,1.9804,89159,-0.020194,0,5,8,1,,,...,-0.707973,-0.562592,0.126237,Flat,,,,AM,False,1
7,2007-08-20,1.9829,741363,0.126237,0,0,8,0,,,...,0.166877,-0.556826,-0.055474,Flat,,,,AM,False,0


##  End of distribution Imputation
Replace the NA with a value at the far end of the variable's distribution,
calculated as mean + 3 × std.

In [14]:
# Fill the missing `close` values with an end-of-distribution value.
# NOTE(review): per the section text this is mean + 3*std of the column;
# confirm against impute_NA_with_end_of_distribution.
data_end_dist = ms.impute_NA_with_end_of_distribution(
    data=raw_data_with_gaps,
    NA_col=['close'],
)
data_end_dist.head(8)


Unnamed: 0,time,close,volume,change,hour,day_of_week,month,is_weekend,sma_20,sma_50,...,change_2d,change_5d,future_change,price_trend,volatility_level,volume_level,rsi_zone,trading_session,trend_position,target
0,2007-08-11,2.02278,3913,,0,5,8,1,,,...,,,-0.562592,,,,,AM,False,0
1,2007-08-13,2.0114,1338156,-0.562592,0,0,8,0,,,...,,,-0.556826,Down,,,,AM,False,0
2,2007-08-14,2.0002,1936844,-0.556826,0,1,8,0,,,...,,,-0.429957,Down,,,,AM,False,0
3,2007-08-15,1.9916,2079347,-0.429957,0,2,8,0,,,...,-0.562592,,-0.707973,Flat,,,,AM,False,0
4,2007-08-16,1.9775,1861756,-0.707973,0,3,8,0,,,...,-0.556826,,0.166877,Down,,,,AM,False,1
5,2007-08-17,1.9808,1263617,0.166877,0,4,8,0,,,...,-0.429957,,-0.020194,Flat,,,,AM,False,0
6,2007-08-18,1.9804,89159,-0.020194,0,5,8,1,,,...,-0.707973,-0.562592,0.126237,Flat,,,,AM,False,1
7,2007-08-20,1.9829,741363,0.126237,0,0,8,0,,,...,0.166877,-0.556826,-0.055474,Flat,,,,AM,False,0


##  Random Imputation
Replace the NA with a random sample drawn from the pool of available observations of the variable.


In [15]:
# Fill the missing `close` values by random imputation.
# NOTE(review): no seed is passed here, so reproducibility depends on
# whatever default impute_NA_with_random uses internally. Confirm.
data_random = ms.impute_NA_with_random(
    data=raw_data_with_gaps,
    NA_col=['close'],
)
data_random.head(8)

Unnamed: 0,time,close,volume,change,hour,day_of_week,month,is_weekend,sma_20,sma_50,...,change_2d,change_5d,future_change,price_trend,volatility_level,volume_level,rsi_zone,trading_session,trend_position,target
0,2007-08-11,2.02278,3913,,0,5,8,1,,,...,,,-0.562592,,,,,AM,False,0
1,2007-08-13,2.0114,1338156,-0.562592,0,0,8,0,,,...,,,-0.556826,Down,,,,AM,False,0
2,2007-08-14,2.0002,1936844,-0.556826,0,1,8,0,,,...,,,-0.429957,Down,,,,AM,False,0
3,2007-08-15,1.9916,2079347,-0.429957,0,2,8,0,,,...,-0.562592,,-0.707973,Flat,,,,AM,False,0
4,2007-08-16,1.9775,1861756,-0.707973,0,3,8,0,,,...,-0.556826,,0.166877,Down,,,,AM,False,1
5,2007-08-17,1.9808,1263617,0.166877,0,4,8,0,,,...,-0.429957,,-0.020194,Flat,,,,AM,False,0
6,2007-08-18,1.9804,89159,-0.020194,0,5,8,1,,,...,-0.707973,-0.562592,0.126237,Flat,,,,AM,False,1
7,2007-08-20,1.9829,741363,0.126237,0,0,8,0,,,...,0.166877,-0.556826,-0.055474,Flat,,,,AM,False,0


In [16]:
# Simulate structurally missing data: blank out `volume` on weekend rows.
# pandas numbers weekdays Monday=0 ... Sunday=6, so >= 5 selects Saturday
# and Sunday.
weekend_mask = raw_data['time'].dt.dayofweek >= 5

# `volume` is loaded as int64, which cannot hold NaN. Assigning NaN through
# .loc relied on pandas silently upcasting the column, which is deprecated
# (FutureWarning on pandas >= 2.1, slated to become an error). Cast to
# float64 explicitly instead; astype returns a new frame, so raw_data itself
# is left untouched, exactly as with the previous .copy().
raw_data_weekend_gaps = raw_data.astype({'volume': 'float64'})
raw_data_weekend_gaps.loc[weekend_mask, 'volume'] = np.nan


In [None]:
# Fill the blanked weekend `volume` values using the 'median' strategy of
# the averaging imputer, then preview the result.
data_weekend = ms.impute_NA_with_avg(
    data=raw_data_weekend_gaps,
    strategy='median',
    NA_col=['volume'],
)
data_weekend.head(8)


Unnamed: 0,time,close,volume,change,hour,day_of_week,month,is_weekend,sma_20,sma_50,...,change_2d,change_5d,future_change,price_trend,volatility_level,volume_level,rsi_zone,trading_session,trend_position,target
0,2007-08-11,2.02278,160088.0,,0,5,8,1,,,...,,,-0.562592,,,,,AM,False,0
1,2007-08-13,2.0114,1338156.0,-0.562592,0,0,8,0,,,...,,,-0.556826,Down,,,,AM,False,0
2,2007-08-14,2.0002,1936844.0,-0.556826,0,1,8,0,,,...,,,-0.429957,Down,,,,AM,False,0
3,2007-08-15,1.9916,2079347.0,-0.429957,0,2,8,0,,,...,-0.562592,,-0.707973,Flat,,,,AM,False,0
4,2007-08-16,1.9775,1861756.0,-0.707973,0,3,8,0,,,...,-0.556826,,0.166877,Down,,,,AM,False,1
5,2007-08-17,1.9808,1263617.0,0.166877,0,4,8,0,,,...,-0.429957,,-0.020194,Flat,,,,AM,False,0
6,2007-08-18,1.9804,160088.0,-0.020194,0,5,8,1,,,...,-0.707973,-0.562592,0.126237,Flat,,,,AM,False,1
7,2007-08-20,1.9829,741363.0,0.126237,0,0,8,0,,,...,0.166877,-0.556826,-0.055474,Flat,,,,AM,False,0
