# Python notebook for dataset synthetics

## 0. Notebook Configuration

In [2]:
import pandas as pd
from functools import reduce


In [40]:
DATASET_BASE_DIR = r'../data/'
RAW_DATA_DIR = DATASET_BASE_DIR + r'raw/'
SYNTHETIZED_DATA_DIR = DATASET_BASE_DIR

## 1. General Description

| Feature | Detail |
|----|----|
|Time Duration| Jan 1st 2020 - Sep 30th 2025|
|Datetime Format| `dd/mm/yyyy`|

## 2. Data Synthesis

### 2.1. Macro Indices

#### 2.1.1. CPI Index

In [41]:
CPI_PATH = RAW_DATA_DIR + r'macro/vietnam_cpi.csv'
cpi_index = pd.read_csv(CPI_PATH, parse_dates=['Ngày Phát Hành'])
cpi_index.head(5)

Unnamed: 0,Ngày Phát Hành,Thời gian,Thực tế,Dự báo,Trước đó
0,2025-10-06,02:00,3.38%,,3.24%
1,2025-09-06,02:00,3.24%,,3.19%
2,2025-08-06,02:00,3.19%,,3.57%
3,2025-07-05,04:30,3.57%,,3.24%
4,2025-06-06,02:00,3.24%,,3.12%


In [8]:
def get_cpi_data(
    cpi_index
):
    # Rename columns and select relevant ones
    cpi_index = cpi_index.rename(columns={
        'Ngày Phát Hành': 'date',
        'Thực tế': 'cpi_rate'
    })
    cpi_index = cpi_index[['date', 'cpi_rate']]

    # Reformat cpi_rate to float, for example 2.82% -> 0.0282
    cpi_index['cpi_rate'] = cpi_index['cpi_rate'].str.rstrip('%').astype('float') / 100.0
    
    # Reformat 'date' column, from yyyy-mm-dd to dd/mm/yyyy (to datetime)
    # Rearrange by date and reset index
    # Finally, convert 'date' back to string in dd/mm/yyyy format
    cpi_index['date'] = pd.to_datetime(cpi_index['date'], format='%Y-%m-%d')
    cpi_index = cpi_index.sort_values(by='date').reset_index(drop=True)
    cpi_index['date'] = cpi_index['date'].dt.strftime('%d/%m/%Y')
    

    
    return cpi_index

In [9]:
df_cpi = get_cpi_data(cpi_index)
df_cpi

Unnamed: 0,date,cpi_rate
0,27/12/2018,0.0298
1,29/01/2019,0.0256
2,28/02/2019,0.0264
3,29/03/2019,0.0270
4,29/04/2019,0.0290
...,...,...
77,06/06/2025,0.0324
78,05/07/2025,0.0357
79,06/08/2025,0.0319
80,06/09/2025,0.0324


#### 2.1.2. USD-VND Exchange Rate

In [10]:
USD_VND_exchange_rate_path = RAW_DATA_DIR + r'macro/USD_VND.csv'
usd_vnd_exchange_rate = pd.read_csv(USD_VND_exchange_rate_path, parse_dates=['Ngày'])
usd_vnd_exchange_rate.head(5)

Unnamed: 0,Ngày,Lần cuối,Mở,Cao,Thấp,KL,% Thay đổi
0,09/10/2025,26347.5,26354.0,26363.0,26346.5,,-0.05%
1,08/10/2025,26360.0,26357.5,26373.5,26342.0,,-0.02%
2,07/10/2025,26365.0,26357.5,26385.0,26347.5,,0.00%
3,06/10/2025,26365.0,26372.5,26406.5,26349.0,,-0.06%
4,03/10/2025,26381.0,26377.5,26399.0,26363.5,,-0.05%


In [11]:
def get_usd_vnd_exchange_rate_data(
    usd_vnd_exchange_rate
):
    # Rename columns and select relevant ones
    usd_vnd_exchange_rate = usd_vnd_exchange_rate.rename(columns={
        'Ngày': 'date',
        'Cao': 'usd_vnd_rate'
    })
    usd_vnd_exchange_rate = usd_vnd_exchange_rate[['date', 'usd_vnd_rate']]
    
    
    # Reformat usd_vnd_rate to float, removing commas
    # Convert 'date' column to datetime objects
    # Sort the DataFrame chronologically now that 'date' is a datetime
    # Convert the date back to a string format 'dd/mm/YYYY'
    usd_vnd_exchange_rate['usd_vnd_rate'] = usd_vnd_exchange_rate['usd_vnd_rate'].str.replace(',', '').astype('float')
    usd_vnd_exchange_rate['date'] = pd.to_datetime(usd_vnd_exchange_rate['date'], format='%d/%m/%Y')
    usd_vnd_exchange_rate = usd_vnd_exchange_rate.sort_values(by='date').reset_index(drop=True)
    usd_vnd_exchange_rate['date'] = usd_vnd_exchange_rate['date'].dt.strftime('%d/%m/%Y')
    
    return usd_vnd_exchange_rate

In [12]:
df_usd_vnd = get_usd_vnd_exchange_rate_data(
    usd_vnd_exchange_rate
)
df_usd_vnd

Unnamed: 0,date,usd_vnd_rate
0,01/01/2007,16051.0
1,02/01/2007,16056.0
2,03/01/2007,16058.0
3,04/01/2007,16040.0
4,05/01/2007,16045.0
...,...,...
4895,03/10/2025,26399.0
4896,06/10/2025,26406.5
4897,07/10/2025,26385.0
4898,08/10/2025,26373.5


#### 2.1.3. Annual GDP

In [13]:
GDP_PATH = RAW_DATA_DIR + r'macro/vietnam_GDP.csv'
gdp = pd.read_csv(GDP_PATH, parse_dates=['Year'])
gdp.head(5)

Unnamed: 0,Year,GDP_in_USD
0,2010-01-01,147201200000.0
1,2011-01-01,172595000000.0
2,2012-01-01,195590700000.0
3,2013-01-01,213708800000.0
4,2014-01-01,233451500000.0


In [14]:
def get_gdp_data(
    gdp
):
    # Rename columns and select relevant ones
    gdp = gdp.rename(columns={
        'Year': 'date',
        'GDP_in_USD': 'gdp_value'
    })
    gdp_final = gdp[['date', 'gdp_value']]
    
    # Change date format (YYYY -> 31/12/YYYY)
    gdp_final['date'] = pd.to_datetime(gdp_final['date'], format='%Y') + pd.offsets.YearEnd(0)
    
    # Reformat 'date' column to string dd/mm/YYYY
    gdp_final['date'] = gdp_final['date'].dt.strftime('%d/%m/%Y')

    return gdp_final

In [15]:
df_gdp = get_gdp_data(
    gdp
)
df_gdp

Unnamed: 0,date,gdp_value
0,31/12/2010,147201200000.0
1,31/12/2011,172595000000.0
2,31/12/2012,195590700000.0
3,31/12/2013,213708800000.0
4,31/12/2014,233451500000.0
5,31/12/2015,239258300000.0
6,31/12/2016,257096000000.0
7,31/12/2017,281353600000.0
8,31/12/2018,310106500000.0
9,31/12/2019,334365300000.0


#### 2.1.4. XAU-USD Exchange Rate

In [16]:
XAU_USD_PATH = RAW_DATA_DIR + r'macro/XAU_USD.csv'
xau_usd = pd.read_csv(XAU_USD_PATH, parse_dates=['Ngày'])
xau_usd.head(5)

Unnamed: 0,Ngày,Lần cuối,Mở,Cao,Thấp,KL,% Thay đổi
0,10/11/2025,4087.86,4011.89,4088.94,4011.27,,1.89%
1,09/11/2025,4011.85,4006.44,4012.28,4004.39,,0.30%
2,07/11/2025,3999.72,3977.92,4027.63,3974.41,,0.55%
3,06/11/2025,3977.87,3982.59,4019.84,3964.46,,-0.12%
4,05/11/2025,3982.62,3932.06,3990.6,3929.61,,1.29%


In [17]:
def get_xau_usd_exchange_rate_data(
    xau_usd,
    begin_date = '01/01/2020',
    end_date = '30/09/2025'
):
    # 1. Đổi tên cột và chọn các cột liên quan
    xau_usd = xau_usd.rename(columns={
        'Ngày': 'date',
        'Cao': 'xau_usd_rate'
    })
    xau_usd = xau_usd[['date', 'xau_usd_rate']]
    
    # 2. Dọn dẹp cột 'xau_usd_rate' (xóa dấu phẩy, chuyển sang float)
    xau_usd['xau_usd_rate'] = xau_usd['xau_usd_rate'].str.replace(',', '').astype('float')

    # 3. Chuyển 'date' sang định dạng datetime (Đây là bước quan trọng)
    # Thao tác này cho phép sắp xếp và lọc chính xác
    xau_usd['date'] = pd.to_datetime(xau_usd['date'], format="%d/%m/%Y")
    
    # 4. Sắp xếp theo cột datetime (BÂY GIỜ sắp xếp mới đúng)
    xau_usd = xau_usd.sort_values(by='date', ascending=True)

    # 5. Lọc DataFrame theo dải ngày mong muốn (Sử dụng tham số)
    # Chuyển đổi các tham số đầu vào sang datetime để so sánh
    start = pd.to_datetime(begin_date, format='%d/%m/%Y')
    end = pd.to_datetime(end_date, format='%d/%m/%Y')
    
    # Giữ lại các hàng nằm trong khoảng [start, end]
    xau_usd = xau_usd[(xau_usd['date'] >= start) & (xau_usd['date'] <= end)]

    # 6. Chuyển đổi 'date' về lại string (thực hiện sau cùng)
    xau_usd['date'] = xau_usd['date'].dt.strftime('%d/%m/%Y')
    
    # 7. Đặt lại chỉ mục (index)
    xau_usd = xau_usd.reset_index(drop=True)

    return xau_usd

In [18]:
df_xau_usd = get_xau_usd_exchange_rate_data(
        xau_usd
)
df_xau_usd

Unnamed: 0,date,xau_usd_rate
0,01/01/2020,1517.48
1,02/01/2020,1531.30
2,03/01/2020,1553.45
3,06/01/2020,1582.69
4,07/01/2020,1577.38
...,...,...
1490,24/09/2025,3779.54
1491,25/09/2025,3761.66
1492,26/09/2025,3783.88
1493,29/09/2025,3834.58


#### 2.1.5. Macroeconomics Indicator Synthesis

In [19]:
def merge_multiple_dfs(df_list, on_column, how='outer'):
    """
    Merges a list of pandas DataFrames on a common column.

        Args:
            df_list (list): The list of pandas DataFrames to merge.
            on_column (str): The name of the common column to merge on (e.g., 'date').
            how (str, optional): The merge method ('outer', 'inner', 'left', 'right').
                                Defaults to 'outer' to keep all rows.

        Returns:
            pd.DataFrame: A single, merged DataFrame.
    """
    if not df_list:
        return pd.DataFrame()
    
    merged_df = reduce(lambda left, right: pd.merge(left, right, on=on_column, how=how), df_list)
    
    return merged_df

In [20]:
macro = merge_multiple_dfs(
    df_list=[df_cpi, df_gdp, df_usd_vnd, df_xau_usd],
    on_column='date',
    how='outer',
)
print(macro)

            date  cpi_rate     gdp_value  usd_vnd_rate  xau_usd_rate
0     01/01/2007       NaN           NaN       16051.0           NaN
1     01/01/2008       NaN           NaN       16018.0           NaN
2     01/01/2009       NaN           NaN       17480.0           NaN
3     01/01/2010       NaN           NaN       18469.0           NaN
4     01/01/2013       NaN           NaN       20820.0           NaN
...          ...       ...           ...           ...           ...
4921  31/12/2021       NaN  3.664748e+11       22857.5       1830.58
4922  31/12/2022       NaN  4.134452e+11           NaN           NaN
4923  31/12/2023       NaN  4.338577e+11           NaN           NaN
4924  31/12/2024       NaN  4.763882e+11       25512.5       2627.85
4925  31/12/2025       NaN  4.847300e+11           NaN           NaN

[4926 rows x 5 columns]


### 2.2. ICT Industry Metrics

#### 2.2.1. P/E & Market Cap

In [21]:
PRICE_PER_EARNING_PATH = RAW_DATA_DIR + r'industry/price_per_earning.csv'
pe = pd.read_csv(PRICE_PER_EARNING_PATH, parse_dates=['Date'])
pe.head(5)

Unnamed: 0,Date,Market Cap,Revenue,Earnings,PE,Absolute PE,PS
0,09-11-2025,₫193.8t,₫112.0t,₫9.5t,7.8x,20.4x,1.7x
1,24-07-2025,₫214.7t,₫105.5t,₫8.6t,20.8x,24.9x,2x
2,07-04-2025,₫184.3t,₫102.7t,₫8.2t,20.8x,22.4x,1.8x
3,20-12-2024,₫241.8t,₫97.8t,₫7.8t,37x,31.1x,2.5x
4,03-09-2024,₫221.3t,₫94.5t,₫7.4t,32.5x,29.9x,2.3x


In [22]:
def get_pe_data(
    pe
):
    pe = pe.rename(columns={
        'Date': 'date',
        'Market Cap': 'market_cap',
        'Absolute PE': 'pe_ratio'
    })
    pe = pe[['date', 'market_cap','pe_ratio']]
    
    pe['date'] = pd.to_datetime(pe['date'], format='%d-%m-%Y')
    pe = pe.sort_values(by='date').reset_index(drop=True)
    pe['date'] = pe['date'].dt.strftime('%d/%m/%Y')
    
    return pe

In [23]:
def get_pe_data(
    pe
):
    # 1. Đổi tên cột
    pe = pe.rename(columns={
        'Date': 'date',
        'Market Cap': 'market_cap',
        'Absolute PE': 'pe_ratio'
    })
    pe = pe[['date', 'market_cap','pe_ratio']]
    
    # 2. Xử lý cột 'date'
    # (Giả sử format đầu vào là dd-mm-YYYY như lỗi trước)
    pe['date'] = pd.to_datetime(pe['date'], format='%d-%m-%Y')
    pe = pe.sort_values(by='date').reset_index(drop=True)
    
    # 3. (ĐÃ SỬA) Dọn dẹp cột 'market_cap'
    # Bỏ ký tự '₫' và 't'
    # Chuyển sang dạng số (float)
    # Nhân với 1 ngàn tỷ (1_000_000_000_000)
    pe['market_cap'] = (
        pe['market_cap']
        .str.replace('₫', '')
        .str.replace('t', '')
        .astype(float) * 1_000_000_000_000
    )
    
    # 4. Dọn dẹp cột 'pe_ratio'
    # Bỏ ký tự 'x'
    pe['pe_ratio'] = pe['pe_ratio'].str.replace('x', '').astype(float)

    # 5. Chuyển 'date' về string (làm sau cùng)
    pe['date'] = pe['date'].dt.strftime('%d/%m/%Y')
    
    return pe

In [24]:
df_pe = get_pe_data(
    pe
)
print(df_pe)

          date    market_cap  pe_ratio
0   06/02/2016  2.190000e+13      10.5
1   24/05/2016  2.290000e+13      11.4
2   09/09/2016  2.340000e+13      11.5
3   26/12/2016  2.350000e+13      10.9
4   13/04/2017  2.510000e+13      11.5
5   30/07/2017  2.830000e+13      12.7
6   15/11/2017  3.010000e+13      13.1
7   03/03/2018  3.620000e+13      11.6
8   19/06/2018  3.230000e+13      10.1
9   05/10/2018  3.270000e+13       9.6
10  21/01/2019  3.000000e+13      10.8
11  09/05/2019  3.540000e+13      12.2
12  25/08/2019  3.820000e+13      12.4
13  11/12/2019  4.440000e+13      13.4
14  28/03/2020  4.300000e+13      13.1
15  14/07/2020  4.210000e+13      11.9
16  30/10/2020  4.670000e+13      13.2
17  15/02/2021  5.750000e+13      15.1
18  03/06/2021  8.880000e+13      22.0
19  19/09/2021  1.064000e+14      25.2
20  05/01/2022  1.060000e+14      23.8
21  23/04/2022  1.251000e+14      25.5
22  09/08/2022  1.151000e+14      20.4
23  25/11/2022  9.210000e+13      15.2
24  13/03/2023  1.037000e

#### 2.2.2. Industry Metrics Synthesis

In [25]:
industry = merge_multiple_dfs(
    df_list=[df_pe],
    on_column='date',
    how='outer',
)

In [26]:
print(
    industry
)

          date    market_cap  pe_ratio
0   06/02/2016  2.190000e+13      10.5
1   24/05/2016  2.290000e+13      11.4
2   09/09/2016  2.340000e+13      11.5
3   26/12/2016  2.350000e+13      10.9
4   13/04/2017  2.510000e+13      11.5
5   30/07/2017  2.830000e+13      12.7
6   15/11/2017  3.010000e+13      13.1
7   03/03/2018  3.620000e+13      11.6
8   19/06/2018  3.230000e+13      10.1
9   05/10/2018  3.270000e+13       9.6
10  21/01/2019  3.000000e+13      10.8
11  09/05/2019  3.540000e+13      12.2
12  25/08/2019  3.820000e+13      12.4
13  11/12/2019  4.440000e+13      13.4
14  28/03/2020  4.300000e+13      13.1
15  14/07/2020  4.210000e+13      11.9
16  30/10/2020  4.670000e+13      13.2
17  15/02/2021  5.750000e+13      15.1
18  03/06/2021  8.880000e+13      22.0
19  19/09/2021  1.064000e+14      25.2
20  05/01/2022  1.060000e+14      23.8
21  23/04/2022  1.251000e+14      25.5
22  09/08/2022  1.151000e+14      20.4
23  25/11/2022  9.210000e+13      15.2
24  13/03/2023  1.037000e

In [27]:
final_dataset = merge_multiple_dfs(
    df_list=[macro, industry],
    on_column='date',
    how='outer',
)
final_dataset

Unnamed: 0,date,cpi_rate,gdp_value,usd_vnd_rate,xau_usd_rate,market_cap,pe_ratio
0,01/01/2007,,,16051.0,,,
1,01/01/2008,,,16018.0,,,
2,01/01/2009,,,17480.0,,,
3,01/01/2010,,,18469.0,,,
4,01/01/2013,,,20820.0,,,
...,...,...,...,...,...,...,...
4931,31/12/2021,,3.664748e+11,22857.5,1830.58,,
4932,31/12/2022,,4.134452e+11,,,,
4933,31/12/2023,,4.338577e+11,,,,
4934,31/12/2024,,4.763882e+11,25512.5,2627.85,,


### 2.3. FPT Corporation Financial Metrics

#### 2.3.1. Financial Report

In [28]:
FINANCIAL_REPORT_PATH = RAW_DATA_DIR + r'fpt/fpt_income_statement.csv'
financial_report = pd.read_csv(FINANCIAL_REPORT_PATH, parse_dates=['Năm', 'Kỳ'])
financial_report.head()

  financial_report = pd.read_csv(FINANCIAL_REPORT_PATH, parse_dates=['Năm', 'Kỳ'])


Unnamed: 0,CP,Năm,Kỳ,Tăng trưởng doanh thu (%),Doanh thu (đồng),Lợi nhuận sau thuế của Cổ đông công ty mẹ (đồng),Tăng trưởng lợi nhuận (%),Thu nhập tài chính,Chi phí tiền lãi vay,Doanh thu bán hàng và cung cấp dịch vụ,...,Thu nhập khác,"Lãi lỗ trong công ty liên doanh, liên kết",Thu nhập/Chi phí khác,Lợi nhuận khác,LN trước thuế,Chi phí thuế TNDN hiện hành,Chi phí thuế TNDN hoãn lại,Lợi nhuận thuần,Cổ đông thiểu số,Cổ đông của Công ty mẹ
0,FPT,2025-01-01,2,0.092644,16658335760739,2257462588123,0.203878,1237012067887,-216761295705,16658335760739,...,28751046396,0,-17127043966,11624002430,3141032584460,-463961750401,63200566619,2740271400678,482808812555,2257462588123
1,FPT,2025-01-01,1,0.139902,16064980391264,2174301386525,0.209268,573331893191,-152389538333,16064980391264,...,39212521539,0,-8362389503,30850132036,3024693510849,-377287599563,-51848430977,2595557480309,421256093784,2174301386525
2,FPT,2024-01-01,4,0.201095,17651065378939,2094725967705,0.211945,582674583940,-134854268998,17651065378939,...,68481380356,0,-30860884043,37620496313,2958495178069,-530698618727,73049924176,2500846483518,406120515813,2094725967705
3,FPT,2024-01-01,3,0.16063,15972397069700,2088852212408,0.200945,326689006856,-128825950601,15972397069700,...,34674999189,0,-74280370385,-39605371196,2908620633374,-441224951554,11201940680,2478597622500,389745410092,2088852212408
4,FPT,2024-01-01,2,0.221184,15245892288520,1875158768537,0.242469,569292544523,-150125422101,15245892288520,...,27511312610,0,1004275617,28515588227,2668968551824,-561248620389,179993203349,2287713134784,412554366247,1875158768537


In [29]:
financial_report.columns

Index(['CP', 'Năm', 'Kỳ', 'Tăng trưởng doanh thu (%)', 'Doanh thu (đồng)',
       'Lợi nhuận sau thuế của Cổ đông công ty mẹ (đồng)',
       'Tăng trưởng lợi nhuận (%)', 'Thu nhập tài chính',
       'Chi phí tiền lãi vay', 'Doanh thu bán hàng và cung cấp dịch vụ',
       'Các khoản giảm trừ doanh thu', 'Doanh thu thuần', 'Giá vốn hàng bán',
       'Lãi gộp', 'Chi phí tài chính', 'Lãi/lỗ từ công ty liên doanh',
       'Chi phí bán hàng', 'Chi phí quản lý DN',
       'Lãi/Lỗ từ hoạt động kinh doanh', 'Thu nhập khác',
       'Lãi lỗ trong công ty liên doanh, liên kết', 'Thu nhập/Chi phí khác',
       'Lợi nhuận khác', 'LN trước thuế', 'Chi phí thuế TNDN hiện hành',
       'Chi phí thuế TNDN hoãn lại', 'Lợi nhuận thuần', 'Cổ đông thiểu số',
       'Cổ đông của Công ty mẹ'],
      dtype='object')

In [42]:
def get_financial_report_data(
    financial_report
):
    # --- Đổi tên cột ---
    financial_report = financial_report.rename(columns={
        'Doanh thu thuần': 'fpt_net_revenue',
        'Lãi gộp': 'fpt_gross_profit',
        'Lãi/Lỗ từ hoạt động kinh doanh': 'fpt_operating_profit',
        'Lợi nhuận sau thuế của Cổ đông công ty mẹ (đồng)': 'fpt_net_profit',
        
        'Năm': 'year',
        'Kỳ': 'quarter'
    })
    
    # --- Chọn các cột cần thiết ---
    cols_to_keep = ['year', 'quarter', 'fpt_net_revenue', 'fpt_gross_profit', 'fpt_operating_profit', 'fpt_net_profit']
    financial_report = financial_report[cols_to_keep]
    
    temp_date_col = pd.to_datetime(financial_report['year'])
    year_str = temp_date_col.dt.year.astype(str)
    period_str = year_str + 'Q' + financial_report['quarter'].astype(str)
    financial_report['date'] = pd.PeriodIndex(period_str, freq='Q').to_timestamp(how='end')
    financial_report = financial_report.sort_values(by='date', ascending=True)
    financial_report['date'] = financial_report['date'].dt.strftime('%d/%m/%Y')
    final_cols = ['date', 'fpt_net_revenue', 'fpt_gross_profit', 'fpt_operating_profit', 'fpt_net_profit']
    financial_report = financial_report[final_cols]
    
    # --- Đặt lại chỉ mục ---
    financial_report = financial_report.reset_index(drop=True)

    return financial_report

In [43]:
(
    get_financial_report_data(
        financial_report
    )
)

Unnamed: 0,date,fpt_net_revenue,fpt_gross_profit,fpt_operating_profit,fpt_net_profit
0,31/03/2007,5663584718970,402931492406,162197301132,155046633674
1,30/06/2007,6070562308072,561245558702,268288470953,210921693967
2,31/03/2013,8611039318749,1193175950373,551903080045,358322731666
3,30/06/2013,9336561564148,1365206492783,675879966784,444785653324
4,30/09/2013,12000599396256,1346075028103,546818928836,330589868211
5,31/12/2013,11283426979266,1638113635317,676635490863,474009965781
6,31/03/2014,10867321675354,1345058640185,566862962628,361517383294
7,30/06/2014,10685411011162,1554003572904,628383604663,427585557091
8,30/09/2014,13477214908110,1606660749355,590932786533,389097954831
9,31/12/2014,15333191333048,1768215975970,612409548679,450528882174


#### 2.3.2. FPT Corp Financial Metric Synthesis

In [44]:
fpt = merge_multiple_dfs(
    df_list=[get_financial_report_data(financial_report)],
    on_column='date',
    how='outer',
)

In [45]:
fpt

Unnamed: 0,date,fpt_net_revenue,fpt_gross_profit,fpt_operating_profit,fpt_net_profit
0,31/03/2007,5663584718970,402931492406,162197301132,155046633674
1,30/06/2007,6070562308072,561245558702,268288470953,210921693967
2,31/03/2013,8611039318749,1193175950373,551903080045,358322731666
3,30/06/2013,9336561564148,1365206492783,675879966784,444785653324
4,30/09/2013,12000599396256,1346075028103,546818928836,330589868211
5,31/12/2013,11283426979266,1638113635317,676635490863,474009965781
6,31/03/2014,10867321675354,1345058640185,566862962628,361517383294
7,30/06/2014,10685411011162,1554003572904,628383604663,427585557091
8,30/09/2014,13477214908110,1606660749355,590932786533,389097954831
9,31/12/2014,15333191333048,1768215975970,612409548679,450528882174


### 2.4. FPT Corp Stock Price

In [46]:
STOCK_PRICE_PATH = RAW_DATA_DIR + r'fpt/fpt_historical_price.csv'
stock_price = pd.read_csv(STOCK_PRICE_PATH, parse_dates=['time'])
stock_price.head(5)

Unnamed: 0,time,open,high,low,close,volume
0,2007-01-02,11.09,11.09,10.85,10.85,81170
1,2007-01-03,10.85,11.09,10.85,11.09,74660
2,2007-01-04,11.64,11.64,11.64,11.64,148510
3,2007-01-05,12.2,12.2,12.2,12.2,377800
4,2007-01-08,12.57,12.57,12.08,12.08,351470


In [47]:
def get_stock_price(
    stock_price
):
    stock_price = stock_price.rename(columns={
        'time': 'date',
        'close': 'fpt_stock_price',
        'volume': 'fpt_stock_volume'
    })
    stock_price = stock_price[['date', 'fpt_stock_price', 'fpt_stock_volume']]
    
    stock_price['date'] = pd.to_datetime(stock_price['date'], format='%d/%m/%Y')
    stock_price = stock_price.sort_values(by='date').reset_index(drop=True)
    stock_price['date'] = stock_price['date'].dt.strftime('%d/%m/%Y')
    
    return stock_price


In [48]:
get_stock_price(
    stock_price
)

Unnamed: 0,date,fpt_stock_price,fpt_stock_volume
0,02/01/2007,10.85,81170
1,03/01/2007,11.09,74660
2,04/01/2007,11.64,148510
3,05/01/2007,12.20,377800
4,08/01/2007,12.08,351470
...,...,...,...
4673,03/10/2025,93.40,7681300
4674,06/10/2025,95.50,6580000
4675,07/10/2025,95.50,5997500
4676,08/10/2025,94.50,5132100


### 2.5. Data Synthesis

In [49]:
dataset = merge_multiple_dfs(
    df_list=[
        macro, 
        industry, 
        fpt, 
        get_stock_price(stock_price)
    ],
    on_column='date',
    how='outer',
)
dataset

Unnamed: 0,date,cpi_rate,gdp_value,usd_vnd_rate,xau_usd_rate,market_cap,pe_ratio,fpt_net_revenue,fpt_gross_profit,fpt_operating_profit,fpt_net_profit,fpt_stock_price,fpt_stock_volume
0,01/01/2007,,,16051.0,,,,,,,,,
1,01/01/2008,,,16018.0,,,,,,,,,
2,01/01/2009,,,17480.0,,,,,,,,,
3,01/01/2010,,,18469.0,,,,,,,,,
4,01/01/2013,,,20820.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4944,31/12/2021,,3.664748e+11,22857.5,1830.58,,,1.070406e+13,4.094368e+12,1.739746e+12,1.301522e+12,47.70,1607400.0
4945,31/12/2022,,4.134452e+11,,,,,1.304243e+13,5.241274e+12,1.978262e+12,1.351601e+12,,
4946,31/12/2023,,4.338577e+11,,,,,1.469041e+13,5.631864e+12,2.386698e+12,1.728400e+12,,
4947,31/12/2024,,4.763882e+11,25512.5,2627.85,,,1.760782e+13,6.377570e+12,2.920875e+12,2.094726e+12,131.49,3612655.0


In [50]:
SYNTHETIZED_DATA_OUTPUT_DIR = SYNTHETIZED_DATA_DIR + r'synthetized_dataset.csv'
print(SYNTHETIZED_DATA_OUTPUT_DIR)

../data/synthetized_dataset.csv


In [None]:
dataset['date'] = pd.to_datetime(dataset['date'], format='%d/%m/%Y')
dataset['index'] = dataset['date']
dataset_final = dataset.set_index('index').sort_index()


In [58]:
df_filtered = dataset_final[(dataset_final.index >= '2020-01-01') & (dataset_final.index <= '2025-09-30')]
df_filtered

Unnamed: 0_level_0,date,cpi_rate,gdp_value,usd_vnd_rate,xau_usd_rate,market_cap,pe_ratio,fpt_net_revenue,fpt_gross_profit,fpt_operating_profit,fpt_net_profit,fpt_stock_price,fpt_stock_volume
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-01-01,2020-01-01,,,23171.0,1517.48,,,,,,,,
2020-01-02,2020-01-02,,,23175.5,1531.30,,,,,,,21.39,896720.0
2020-01-03,2020-01-03,,,23174.5,1553.45,,,,,,,21.03,2047880.0
2020-01-06,2020-01-06,,,23178.0,1582.69,,,,,,,20.81,1091660.0
2020-01-07,2020-01-07,,,23179.0,1577.38,,,,,,,21.21,837240.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-09-24,2025-09-24,,,26432.0,3779.54,,,,,,,99.50,7421100.0
2025-09-25,2025-09-25,,,26428.0,3761.66,,,,,,,98.00,8627700.0
2025-09-26,2025-09-26,,,26422.5,3783.88,,,,,,,97.50,8080800.0
2025-09-29,2025-09-29,,,26433.5,3834.58,,,,,,,95.50,10430900.0


In [59]:
df_filtered.to_csv(
    SYNTHETIZED_DATA_DIR + r'synthetized_dataset.csv',
    index=False
)

In [56]:
dataset_final.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4949 entries, 2007-01-01 to 2025-12-31
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   date                  4949 non-null   datetime64[ns]
 1   cpi_rate              82 non-null     float64       
 2   gdp_value             16 non-null     float64       
 3   usd_vnd_rate          4900 non-null   float64       
 4   xau_usd_rate          1495 non-null   float64       
 5   market_cap            34 non-null     float64       
 6   pe_ratio              34 non-null     float64       
 7   fpt_net_revenue       52 non-null     float64       
 8   fpt_gross_profit      52 non-null     float64       
 9   fpt_operating_profit  52 non-null     float64       
 10  fpt_net_profit        52 non-null     float64       
 11  fpt_stock_price       4678 non-null   float64       
 12  fpt_stock_volume      4678 non-null   float64       
dtype