# Python notebook for dataset synthetics

## 0. Notebook Configuration

In [3]:
import pandas as pd

In [8]:
DATASET_BASE_DIR = r'../../data/'
RAW_DATA_DIR = DATASET_BASE_DIR + r'/raw/'
SYNTHETIZED_DATA_DIR = DATASET_BASE_DIR + r'/synthesized/'

## 1. General Description

| Feature | Detail |
|----|----|
|Time Duration| Jan 1st 2020 - Sep 30th 2025|
|Datetime Format| `dd/mm/yyyy`|

## 2. Data Synthesis

### 2.1. Macro Indices

#### 2.1.1. CPI index

In [39]:
CPI_PATH = RAW_DATA_DIR + r'macro/vietnam_cpi.csv'
cpi_index = pd.read_csv(CPI_PATH, parse_dates=['Ngày Phát Hành'])
cpi_index.head(5)

Unnamed: 0,Ngày Phát Hành,Thời gian,Thực tế,Dự báo,Trước đó
0,2025-10-06,02:00,3.38%,,3.24%
1,2025-09-06,02:00,3.24%,,3.19%
2,2025-08-06,02:00,3.19%,,3.57%
3,2025-07-05,04:30,3.57%,,3.24%
4,2025-06-06,02:00,3.24%,,3.12%


In [40]:
def get_cpi_data(
    cpi_index,
    begin_date = '01/01/2020',
    end_date = '30/09/2025'
):
    # Rename columns and select relevant ones
    cpi_index = cpi_index.rename(columns={
        'Ngày Phát Hành': 'date',
        'Thực tế': 'cpi_rate'
    })
    cpi_index = cpi_index[['date', 'cpi_rate']]
    
    # Reformat date column, from yyyy-mm-dd to dd/mm/yyyy
    cpi_index['date'] = pd.to_datetime(cpi_index['date'], format='%Y-%m-%d').dt.strftime('%d/%m/%Y')

    # Reformat cpi_rate to float, for example 2.82% -> 0.0282
    cpi_index['cpi_rate'] = cpi_index['cpi_rate'].str.rstrip('%').astype('float') / 100.0
    return cpi_index

get_cpi_data(cpi_index)


Unnamed: 0,date,cpi_rate
0,06/10/2025,0.0338
1,06/09/2025,0.0324
2,06/08/2025,0.0319
3,05/07/2025,0.0357
4,06/06/2025,0.0324
...,...,...
77,29/04/2019,0.0290
78,29/03/2019,0.0270
79,28/02/2019,0.0264
80,29/01/2019,0.0256


#### 2.1.2. USD-VND exchange rate

In [41]:
USD_VND_exchange_rate_path = RAW_DATA_DIR + r'macro/USD_VND.csv'
usd_vnd_exchange_rate = pd.read_csv(USD_VND_exchange_rate_path, parse_dates=['Ngày'])
usd_vnd_exchange_rate.head(5)

Unnamed: 0,Ngày,Lần cuối,Mở,Cao,Thấp,KL,% Thay đổi
0,09/10/2025,26347.5,26354.0,26363.0,26346.5,,-0.05%
1,08/10/2025,26360.0,26357.5,26373.5,26342.0,,-0.02%
2,07/10/2025,26365.0,26357.5,26385.0,26347.5,,0.00%
3,06/10/2025,26365.0,26372.5,26406.5,26349.0,,-0.06%
4,03/10/2025,26381.0,26377.5,26399.0,26363.5,,-0.05%


In [42]:
def get_usd_vnd_exchange_rate_data(
    usd_vnd_exchange_rate,
    begin_date = '01/01/2020',
    end_date = '30/09/2025'
):
    # Rename columns and select relevant ones
    usd_vnd_exchange_rate = usd_vnd_exchange_rate.rename(columns={
        'Ngày': 'date',
        'Cao': 'usd_vnd_rate'
    })
    usd_vnd_exchange_rate = usd_vnd_exchange_rate[['date', 'usd_vnd_rate']]
    
    # Reformat usd_vnd_rate to float, removing commas
    usd_vnd_exchange_rate['usd_vnd_rate'] = usd_vnd_exchange_rate['usd_vnd_rate'].str.replace(',', '').astype('float')
    return usd_vnd_exchange_rate

In [43]:
get_usd_vnd_exchange_rate_data(
    usd_vnd_exchange_rate,
    begin_date = '01/01/2020',
    end_date = '30/09/2025'
)

Unnamed: 0,date,usd_vnd_rate
0,09/10/2025,26363.0
1,08/10/2025,26373.5
2,07/10/2025,26385.0
3,06/10/2025,26406.5
4,03/10/2025,26399.0
...,...,...
4895,05/01/2007,16045.0
4896,04/01/2007,16040.0
4897,03/01/2007,16058.0
4898,02/01/2007,16056.0


#### 2.1.3. Annual GDP

In [45]:
GDP_PATH = RAW_DATA_DIR + r'macro/vietnam_GDP.csv'
gdp = pd.read_csv(GDP_PATH, parse_dates=['Year'])
gdp.head(5)

Unnamed: 0,Year,GDP_in_USD
0,2010-01-01,147201200000.0
1,2011-01-01,172595000000.0
2,2012-01-01,195590700000.0
3,2013-01-01,213708800000.0
4,2014-01-01,233451500000.0
