In [1]:
import os

# Resolve the notebook's directory once and switch the working directory to
# its parent, so the relative 'config/...' and 'data/...' paths used by later
# cells resolve from there.
try:
    # On a re-run `file_path` already exists: just echo it, so the working
    # directory is not moved up another level.
    print(file_path)
except NameError:
    # First run in a fresh kernel: `file_path` is not defined yet.
    # Only NameError is caught, so unrelated failures (or a keyboard
    # interrupt) are no longer silently swallowed by a bare `except:`.
    file_path = os.path.abspath('')
    os.chdir(os.path.dirname(file_path))
    print(file_path)

/root/code_Bao/Vietnamese_stocks_forecasting/notebook


In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Draw
import math

from vnstock import * #Load vietnamese data
import holidays # Load Vietnamese holidays

from tqdm import tqdm
import yaml

from src.utils.config_parse import DataConfigReader
from src.database.feature_engineer import *
from src.database.ts_fill_data import *
from src.utils.convert_data_type import *

# Read data

In [3]:
# Ticker symbol plus start/end dates, both written as YYYY-MM-DD strings.
symbol = "MWG"
start_date, end_date = "2021-01-01", "2023-07-18"

In [4]:
# Reader for the 'Historical_price' section of the raw-data config, paired
# with the dtype dictionary; later cells ask it for column names, dtypes and
# column roles.
data_config = DataConfigReader(section='Historical_price',
                               data_config_path='config/raw_data_config.yaml',
                               dtype_config_path='config/data_type_dict.yaml')

In [5]:
# Load the raw price history for `symbol`, restricted to the columns and
# dtypes declared in the data config. The path is derived from `symbol`
# instead of repeating the ticker, so changing the ticker in the parameters
# cell also changes which file is read.
data_path = f'data/raw/historical_price/{symbol}.csv'
df = pd.read_csv(
    data_path,
    usecols=data_config.read_column_names(),
    dtype=data_config.read_data_type(),
)
df.head()

Unnamed: 0,time,open,high,low,close,volume,ticker
0,2021-01-04,39080,39430,39010,39430,1147400,MWG
1,2021-01-05,39230,40090,39100,39860,967500,MWG
2,2021-01-06,40210,40210,39460,39660,995500,MWG
3,2021-01-07,39400,39630,39130,39500,1118100,MWG
4,2021-01-08,39500,40180,39500,39830,1277600,MWG


In [6]:
# Column-role mapping read from the config; later cells pass it to the type
# converter and index it ('ma_cols', 'pct_cols') to pick feature columns.
data_purpose = data_config.read_data_purpose()

In [7]:
# Display the mapping to check which columns are assigned to each role.
data_purpose

{'num_cols': ['open', 'high', 'low', 'close', 'volume'],
 'cat_cols': ['ticker'],
 'date_cols': ['time'],
 'ma_cols': ['close', 'volume'],
 'pct_cols': ['close'],
 'lag_cols': ['close', 'volume']}

In [8]:
# Run the project's duplicate-row check on the freshly loaded frame and keep
# the frame it returns (it prints how many duplicates it found).
duplicate_checker = DuplicateCheck(df)
df = duplicate_checker.remove_duplicate_rows()

There are 0 duplicate rows.
Duplicated rows has been removed!


In [9]:
# Inspect dtypes and non-null counts before type conversion; at this point
# `time` is still a plain object column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   time    132 non-null    object  
 1   open    132 non-null    int64   
 2   high    132 non-null    int64   
 3   low     132 non-null    int64   
 4   close   132 non-null    int64   
 5   volume  132 non-null    int64   
 6   ticker  132 non-null    category
dtypes: category(1), int64(5), object(1)
memory usage: 6.4+ KB


In [10]:
# Convert column types using the column roles in `data_purpose`.
# NOTE(review): presumably this parses the 'date_cols' into datetimes;
# confirm against TypeConverter.
type_converter = TypeConverter(df, data_purpose)
df = type_converter.convert_types()

In [11]:
# Check that the column set is unchanged after type conversion.
df.columns

Index(['time', 'open', 'high', 'low', 'close', 'volume', 'ticker'], dtype='object')

In [12]:
# Repeat the duplicate-row check on the type-converted frame.
# NOTE(review): the same check already ran right after loading and reported
# 0 duplicates; confirm this second pass is still needed.
duplicate_checker = DuplicateCheck(df)
df = duplicate_checker.remove_duplicate_rows()

There are 0 duplicate rows.
Duplicated rows has been removed!


In [13]:
# Preview the frame after type conversion and the second duplicate check.
df.head()

Unnamed: 0,time,open,high,low,close,volume,ticker
0,2021-01-04,39080,39430,39010,39430,1147400,MWG
1,2021-01-05,39230,40090,39100,39860,967500,MWG
2,2021-01-06,40210,40210,39460,39660,995500,MWG
3,2021-01-07,39400,39630,39130,39500,1118100,MWG
4,2021-01-08,39500,40180,39500,39830,1277600,MWG


In [14]:
# Rolling-window sizes (in rows) used for the moving-average features.
window_widths = [2, 4]

In [15]:
# Add one moving-average column per (column, window) pair for the configured
# 'ma_cols'. Gaps are filled with a backward-fill strategy limited to the
# largest window width.
# NOTE(review): in the recorded output the leading rows carry values taken
# from later rows (row 0 has close_MA_2 = 39645.0, the mean of the first two
# closes), so the earliest features include future information. Confirm this
# is acceptable for a forecasting model.
ma_generator = RollingMACalculator(df,MA_cols=data_purpose['ma_cols'], window_widths=window_widths,
                                   fill_method = BackwardFillStrategy(limit = max(window_widths)))
df = ma_generator.calculate_MA()

calculate_MA took 0.01 seconds to execute.


In [16]:
# Preview the frame with the new *_MA_<window> columns.
df.head()

Unnamed: 0,ticker,time,open,high,low,close,volume,close_MA_2,volume_MA_2,close_MA_4,volume_MA_4
0,MWG,2021-01-04,39080,39430,39010,39430,1147400,39645.0,1057450.0,39612.5,1057125.0
1,MWG,2021-01-05,39230,40090,39100,39860,967500,39645.0,1057450.0,39612.5,1057125.0
2,MWG,2021-01-06,40210,40210,39460,39660,995500,39760.0,981500.0,39612.5,1057125.0
3,MWG,2021-01-07,39400,39630,39130,39500,1118100,39580.0,1056800.0,39612.5,1057125.0
4,MWG,2021-01-08,39500,40180,39500,39830,1277600,39665.0,1197850.0,39712.5,1089675.0


In [17]:
# Add percentage-change columns for the configured 'pct_cols'. A constant
# fill of 0 is supplied as the fill strategy; in the recorded output the
# first row's close_pct is 0.0.
pct_generator = PercentageChangeCalculator(df, pct_cols=data_purpose['pct_cols'], 
                                         fill_method=ConstantFillStrategy(constant_value=0))
df=pct_generator.calculate_pct_change()

calculate_pct_change took 0.01 seconds to execute.


In [18]:
# Preview the frame with the new close_pct column.
df.head()

Unnamed: 0,ticker,time,open,high,low,close,volume,close_MA_2,volume_MA_2,close_MA_4,volume_MA_4,close_pct
0,MWG,2021-01-04,39080,39430,39010,39430.0,1147400,39645.0,1057450.0,39612.5,1057125.0,0.0
1,MWG,2021-01-05,39230,40090,39100,39860.0,967500,39645.0,1057450.0,39612.5,1057125.0,0.010905
2,MWG,2021-01-06,40210,40210,39460,39660.0,995500,39760.0,981500.0,39612.5,1057125.0,-0.005018
3,MWG,2021-01-07,39400,39630,39130,39500.0,1118100,39580.0,1056800.0,39612.5,1057125.0,-0.004034
4,MWG,2021-01-08,39500,40180,39500,39830.0,1277600,39665.0,1197850.0,39712.5,1089675.0,0.008354


In [19]:
# Lag offsets (in rows) to generate for each column. The columns come from
# the config's 'lag_cols' entry, the same way the moving-average and
# percentage-change cells read 'ma_cols' and 'pct_cols', instead of being
# repeated by hand here.
lag_dict = {col: [1, 2, 3] for col in data_purpose['lag_cols']}

In [20]:
# Add lagged copies of the columns listed in `lag_dict`, with a forward-fill
# strategy supplied for gaps.
# NOTE(review): in the recorded output the leading rows of the lagged columns
# are still NaN after this step (there is no earlier value to carry forward);
# confirm that downstream code drops or otherwise handles those rows.
lag_generator = LagEngine(df, lag_dict=lag_dict,
                          fill_method=ForwardFillStrategy())
df = lag_generator.create_lag_features()

create_lag_features took 0.01 seconds to execute.


In [21]:
# Preview the frame with the new *_lagged_by_<n> columns.
df.head()

Unnamed: 0,ticker,time,open,high,low,close,volume,close_MA_2,volume_MA_2,close_MA_4,volume_MA_4,close_pct,close_lagged_by_1,close_lagged_by_2,close_lagged_by_3,volume_lagged_by_1,volume_lagged_by_2,volume_lagged_by_3
0,MWG,2021-01-04,39080,39430,39010,39430.0,1147400,39645.0,1057450.0,39612.5,1057125.0,0.0,,,,,,
1,MWG,2021-01-05,39230,40090,39100,39860.0,967500,39645.0,1057450.0,39612.5,1057125.0,0.010905,39430.0,,,1147400.0,,
2,MWG,2021-01-06,40210,40210,39460,39660.0,995500,39760.0,981500.0,39612.5,1057125.0,-0.005018,39860.0,39430.0,,967500.0,1147400.0,
3,MWG,2021-01-07,39400,39630,39130,39500.0,1118100,39580.0,1056800.0,39612.5,1057125.0,-0.004034,39660.0,39860.0,39430.0,995500.0,967500.0,1147400.0
4,MWG,2021-01-08,39500,40180,39500,39830.0,1277600,39665.0,1197850.0,39712.5,1089675.0,0.008354,39500.0,39660.0,39860.0,1118100.0,995500.0,967500.0
