# Data Ingestion & Wrangling Code

## Importing Modules & Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw VN30 index export (plain string: the path has no placeholders, so no f-prefix is needed)
df = pd.read_csv("../data/VN30INDEX-20230627.csv")

In [3]:
# Preview the first rows of the raw frame to check the column layout
df.head()

Unnamed: 0,SYMBOL,TRADING_DATE,OPEN_PRICE/1000,CLOSE_PRICE/1000,HIGH_PRICE/1000,LOW_PRICE/1000,OPEN_INTEREST,VOLUME,BUY_VOLUME,SELL_VOLUME
0,VN30,20230626 14:45:01,1131.23,1131.23,1131.23,1131.23,0,10136300,0,0
1,VN30,20230626 14:30:01,1128.91,1128.91,1128.91,1128.91,0,39500,0,0
2,VN30,20230626 14:29:56,1129.01,1128.45,1129.24,1128.45,0,1426300,0,0
3,VN30,20230626 14:28:58,1128.24,1129.04,1129.14,1127.72,0,1364400,0,0
4,VN30,20230626 14:27:58,1128.34,1128.32,1128.52,1128.02,0,1616400,0,0


In [4]:
# (rows, columns) of the raw frame
df.shape

(140122, 10)

## Time Data Cleaning
### 1. Function to round the datetime to the nearest minute

In [5]:
# Pass in a datetime series column in format YYYYMMDD HH:mm:ss
def round_datetime(dt) -> pd.Series:
    """Parse ``YYYYMMDD HH:MM:SS`` strings and round them to the nearest minute.

    Parameters
    ----------
    dt : pd.Series
        String timestamps such as ``"20230626 14:29:56"``.

    Returns
    -------
    pd.Series
        Unnamed datetime series, aligned with ``dt``, rounded to the minute.

    Raises
    ------
    ValueError
        If a value does not match the ``%Y%m%d %H:%M:%S`` layout.
    """
    # An explicit format replaces slicing the string apart field by field
    parsed = pd.Series(pd.to_datetime(dt, format="%Y%m%d %H:%M:%S"))
    rounded = parsed.dt.round("min")
    # Keep the result unnamed so using it as an index does not carry the
    # source column's label along
    rounded.name = None
    return rounded

  def round_datetime(dt) -> pd.Series():


### 2. Functions to create template dataframe on trading days

In [6]:
# Dates (2017-2023) handed to CustomBusinessDay as non-trading days, written as MM-DD-YYYY.
# The explicit format keeps ambiguous strings such as '01-02-2017' from ever being read day-first.
custom_holidays = pd.to_datetime([
    '01-02-2017', '01-27-2017', '01-30-2017', '01-31-2017', '02-01-2017', '02-02-2017', '04-06-2017', '05-01-2017', '05-02-2017', '09-04-2017',
    '01-01-2018', '02-14-2018', '02-15-2018', '02-16-2018', '02-17-2018', '02-18-2018', '02-19-2018', '02-20-2018', '04-25-2018', '04-30-2018', '05-01-2018', '09-03-2018',
    '12-31-2018', '01-01-2019', '02-04-2019', '02-05-2019', '02-06-2019', '02-07-2019', '02-08-2019', '04-15-2019', '04-29-2019', '04-30-2019', '05-01-2019', '09-02-2019',
    '01-01-2020', '01-23-2020', '01-24-2020', '01-25-2020', '01-26-2020', '01-27-2020', '01-28-2020', '01-29-2020', '04-02-2020', '04-30-2020', '05-01-2020', '09-02-2020',
    '01-01-2021', '02-10-2021', '02-11-2021', '02-12-2021', '02-15-2021', '02-16-2021', '04-21-2021', '04-30-2021', '05-03-2021', '09-02-2021', '09-03-2021',
    '01-03-2022', '01-31-2022', '02-01-2022', '02-02-2022', '02-03-2022', '02-04-2022', '04-11-2022', '05-02-2022', '05-03-2022', '09-01-2022', '09-02-2022',
    '01-02-2023', '01-20-2023', '01-21-2023', '01-22-2023', '01-23-2023', '01-24-2023', '01-25-2023', '01-26-2023', '05-01-2023', '05-02-2023', '05-03-2023', '09-04-2023', '09-05-2023'
], format='%m-%d-%Y')

In [7]:
from pandas.tseries.offsets import CustomBusinessDay
# Create a Pandas series with minute frequency between 9:16-11:30AM and 1-2:30PM each trading day for specified period
def get_tradingperiods(start_date, end_date, holidays=None) -> pd.Series:
    """Build the minute-by-minute timestamps of every trading session in a period.

    Parameters
    ----------
    start_date, end_date : datetime-like
        Inclusive bounds. Only the calendar date is used; any time-of-day
        component is ignored.
    holidays : list-like of datetime-like, optional
        Weekdays to skip. Defaults to the notebook-level ``custom_holidays``.

    Returns
    -------
    pd.Series
        Timestamps from 09:16 to 11:30 and from 13:00 to 14:30 (226 per day)
        for each business day, in chronological order. Empty (datetime dtype)
        when the period contains no business day.
    """
    if holidays is None:
        # Fall back to the holiday calendar defined earlier in the notebook
        holidays = custom_holidays
    custom_bday = CustomBusinessDay(holidays=holidays)
    # bdate_range normalizes both bounds to midnight, so the end date is kept
    # even when its time-of-day is earlier than the start's
    business_days = pd.bdate_range(start=start_date, end=end_date, freq=custom_bday)

    minutes = []
    # Walk the business days directly instead of testing every calendar day
    # for membership in the business-day array
    for day in business_days:
        minutes.extend(pd.date_range(start=day.replace(hour=9, minute=16), end=day.replace(hour=11, minute=30), freq='min'))
        minutes.extend(pd.date_range(start=day.replace(hour=13, minute=0), end=day.replace(hour=14, minute=30), freq='min'))

    if not minutes:
        # An untyped empty Series would not be a datetime series
        return pd.Series([], dtype='datetime64[ns]')
    return pd.Series(minutes)

  def get_tradingperiods(start_date, end_date) -> pd.Series():


### 3. Function to join dataframes based on datetime & fill NaN values

In [8]:
# Left join on the index, then forward-fill the gaps
def join_by_datetime(df1, df2) -> pd.DataFrame:
    """Left-join ``df2`` onto ``df1`` by index and forward-fill missing values.

    Parameters
    ----------
    df1 : pd.DataFrame
        Frame whose index defines the rows of the result.
    df2 : pd.DataFrame
        Frame whose columns are attached where its index matches ``df1``'s.

    Returns
    -------
    pd.DataFrame
        The merged frame with NaN values replaced by the last preceding
        non-null value in each column (leading NaNs stay NaN).
    """
    merged = df1.merge(df2, how='left', left_index=True, right_index=True)
    # The fill returns a new frame; it has to be returned rather than discarded
    return merged.ffill()

## Creating a date-cleaned dataframe

In [9]:
# Index the raw rows by their TRADING_DATE stamp after passing it through round_datetime
rounded_index = round_datetime(df["TRADING_DATE"])
df = df.set_index(rounded_index)

In [10]:
# Several raw rows can share one rounded timestamp; keep only the last row (in file order) for each
duplicates_index = df.index.duplicated(keep='last')
df = df.loc[~duplicates_index]

In [11]:
# Make an empty dataframe with the right datetime as index
start = df.index.min()
end = df.index.max() - pd.DateOffset(days=1)
df1 = pd.DataFrame(index = get_tradingperiods(start, end), data = None)

In [12]:
# Attach the observed rows to the template; template minutes with no observation become NaN rows
df = pd.merge(df1, df, how='left', left_index=True, right_index=True)

In [13]:
# Forward fill for NaN values (ffill() replaces the deprecated fillna(method="ffill"))
# NOTE(review): this also repeats the previous row's VOLUME / BUY_VOLUME / SELL_VOLUME into
# minutes that had no source row, and the later 'sum' aggregation counts them again - confirm intended.
df = df.ffill()

In [14]:
# Lower-case every column label (e.g. "CLOSE_PRICE/1000" -> "close_price/1000");
# the rename and aggregation keys used further down are written in lower case
df.columns = df.columns.str.lower()

## Create new date & time features
### Calculate number of days until maturity

In [15]:
def trading_days_to_maturity(dt, holidays=None):
    """Count the trading days from each timestamp's date to its maturity date.

    Maturity is the third Thursday of the timestamp's own month, or of the
    following month once that Thursday has passed. The count includes both
    the current date and the maturity date when they are business days.

    Parameters
    ----------
    dt : pd.DatetimeIndex
        Timestamps to evaluate (anything ``pd.DatetimeIndex`` accepts).
    holidays : list-like of datetime-like, optional
        Weekdays that are not trading days. Defaults to the notebook-level
        ``custom_holidays``.

    Returns
    -------
    list of int
        One count per element of ``dt``, in the same order as ``dt``.
    """
    if holidays is None:
        # Fall back to the holiday calendar defined earlier in the notebook
        holidays = custom_holidays
    # Built once: the offset does not change between dates
    custom_bday = CustomBusinessDay(holidays=holidays)

    days = pd.DatetimeIndex(dt).normalize()
    days_by_date = {}
    for day in days.unique():
        # replace(day=1) stays in the same month even when `day` is already
        # the 1st; subtracting a MonthBegin offset would jump back a month
        month_start = day.replace(day=1)
        maturity = _third_thursday(month_start)
        if day > maturity:
            maturity = _third_thursday(month_start + pd.offsets.MonthBegin())
        days_by_date[day] = len(pd.bdate_range(start=day, end=maturity, freq=custom_bday))

    # Look each row up by its date so the result always lines up with `dt`
    return [days_by_date[day] for day in days]


def _third_thursday(month_start):
    """Return the third Thursday of the month that begins at ``month_start``."""
    # weekday(): Monday == 0, so Thursday == 3
    first_thursday_offset = (3 - month_start.weekday()) % 7
    return month_start + pd.Timedelta(days=first_thursday_offset + 14)

In [16]:
# Compute the trading-day count to maturity for every timestamp in the frame's index
days_to_maturity = trading_days_to_maturity(df.index)

In [17]:
# Attach the counts as a column; a plain list is assigned by position, so it must be in df's row order
df["days_to_maturity"] = days_to_maturity

## Other datetime variables

In [18]:
# Calendar attributes read straight off the datetime index
for feature in ('day_of_week', 'day_of_year'):
    df[feature] = getattr(df.index, feature)
# Month-boundary flags are stored as 0/1 integers rather than booleans
for feature in ('is_month_end', 'is_month_start'):
    df[feature] = getattr(df.index, feature).astype(int)

In [19]:
# 1 for rows stamped before 12:00, 0 otherwise
df['is_morning'] = (df.index.hour < 12).astype(int)

## Resampling to get other intervals

In [20]:
# Shorter labels for the price columns, the raw timestamp string and open interest
column_renames = {
    "open_price/1000": "open",
    "close_price/1000": "close",
    "high_price/1000": "high",
    "low_price/1000": "low",
    "trading_date": "raw_date",
    "open_interest": "eod_open_interest",
}
df.rename(columns=column_renames, inplace=True)

In [21]:
# How each 1-minute column is combined when rows are merged into a wider bar.
# Columns not listed here are absent from the resampled frames.
aggregations = {
    'open': 'first',  # a bar opens at its first minute's open, not its last
    'close': 'last',
    'high' : 'max',
    'low' : 'min',
    'eod_open_interest' : 'last',
    'volume' : 'sum',
    'buy_volume': 'sum',
    'sell_volume': 'sum',
    'day_of_week': 'last',
    'day_of_year': 'last',
    'is_month_end': 'last',
    'is_month_start': 'last',
    'is_morning': 'last',
    'days_to_maturity': 'last',
}

# df_1min is the same object as df, not a copy
df_1min = df
# Right-closed, right-labelled bins: each bar is stamped with the end of its interval
df_5min = df.resample('5min', label='right', closed='right').agg(aggregations)
df_10min = df.resample('10min', label='right', closed='right').agg(aggregations)
df_15min = df.resample('15min', label='right', closed='right').agg(aggregations)
df_30min = df.resample('30min', label='right', closed='right').agg(aggregations)
# Lower-case 'h' is the current hourly alias; upper-case 'H' is deprecated
df_1hour = df.resample('1h', label='right', closed='right').agg(aggregations)

In [22]:
# Remove, in place, every resampled bar that has a NaN in any column
for frame in (df_5min, df_10min, df_15min, df_30min, df_1hour):
    frame.dropna(axis=0, how='any', inplace=True)

## Export results

In [23]:
# Checking the number of data rows
num_days = pd.Series(df.index.date).nunique()
print(f"The data series has {num_days} days")
print(f"The data series has {df.shape[0]} periods")
print(f"Each day the series has {df.shape[0] / num_days} rows")

The data series has 617 days
The data series has 139442 periods
Each day the series has 226.0 rows


In [24]:
# Write one CSV per interval next to the raw data
exports = [
    (df_1min, r"../data/VN30_1min_cleaned.csv"),
    (df_5min, r"../data/VN30_5min_cleaned.csv"),
    (df_10min, r"../data/VN30_10min_cleaned.csv"),
    (df_15min, r"../data/VN30_15min_cleaned.csv"),
    (df_30min, r"../data/VN30_30min_cleaned.csv"),
    (df_1hour, r"../data/VN30_1hour_cleaned.csv"),
]
for frame, path in exports:
    frame.to_csv(path)

## Other Ideas