## Import Các Thư Viện Cần Thiết

In [1]:
import numpy as np
import pandas as pd

from datetime import datetime
import os
import glob

## 1. Đọc dữ liệu

In [2]:
base_dir = os.getcwd()

# Nhảy lên 1 cấp để tới project/
project_dir = os.path.dirname(base_dir)

# Ghép tới thư mục data
market_data_dir = os.path.join(project_dir, "data/sliver/market_data")
internal_data_dir = os.path.join(project_dir, "data/sliver/internal_data")
growth_and_inflation_data_dir = os.path.join(project_dir, "data/sliver/macro_economic_data/growth_and_inflation")
policy_interest_rate_dir = os.path.join(project_dir, "data/sliver/macro_economic_data/policy_interest_rate")

# Hàm để đọc tất cả file CSV trong một thư mục
def read_csv_files_from_directory(directory_path):
    """Đọc tất cả file CSV trong thư mục và trả về dictionary với tên file làm key"""
    csv_files = glob.glob(os.path.join(directory_path, "*.csv"))
    dataframes = {}
    
    for csv_file in csv_files:
        file_name = os.path.basename(csv_file)
        try:
            df = pd.read_csv(csv_file)
            dataframes[file_name] = df
            print(f"Đã đọc thành công: {file_name} - Shape: {df.shape}")
        except Exception as e:
            print(f"Lỗi khi đọc {file_name}: {e}")
    
    return dataframes

# Đọc dữ liệu từ các thư mục
print("=== ĐỌC DỮ LIỆU MARKET ===")
market_dataframes = read_csv_files_from_directory(market_data_dir)

print("\n=== ĐỌC DỮ LIỆU INTERNAL ===")
internal_dataframes = read_csv_files_from_directory(internal_data_dir)

print("\n=== ĐỌC DỮ LIỆU GROWTH & INFLATION ===")
growth_inflation_dataframes = read_csv_files_from_directory(growth_and_inflation_data_dir)

print("\n=== ĐỌC DỮ LIỆU POLICY INTEREST RATE ===")
policy_interest_dataframes = read_csv_files_from_directory(policy_interest_rate_dir)

=== ĐỌC DỮ LIỆU MARKET ===
Đã đọc thành công: CBOE_Volatility_Index_FRED.csv - Shape: (1003, 2)
Đã đọc thành công: CDS_5Y_CS_1D.csv - Shape: (1003, 2)
Đã đọc thành công: HNXINDEX_1D.csv - Shape: (1553, 3)
Đã đọc thành công: PRICE_CS_1D.csv - Shape: (1003, 3)
Đã đọc thành công: SP_500_1D.csv - Shape: (1003, 3)
Đã đọc thành công: SX7E_STOXX_Banks_EUR_Price.csv - Shape: (1003, 2)
Đã đọc thành công: VNINDEX_1D.csv - Shape: (1553, 3)

=== ĐỌC DỮ LIỆU INTERNAL ===
Đã đọc thành công: Internal_Data_Financial_Report.csv - Shape: (1003, 11)

=== ĐỌC DỮ LIỆU GROWTH & INFLATION ===
Đã đọc thành công: ECONOMICS_ USCPI-1D.csv - Shape: (1003, 2)
Đã đọc thành công: ECONOMICS_CHCPI-1D.csv - Shape: (1003, 2)
Đã đọc thành công: ECONOMICS_CHGDPCP_1D.csv - Shape: (1003, 2)
Đã đọc thành công: ECONOMICS_EUCPI-1D.csv - Shape: (1003, 2)
Đã đọc thành công: ECONOMICS_EUGDPCP_1D.csv - Shape: (1003, 2)
Đã đọc thành công: ECONOMICS_USGDPCP_1D.csv - Shape: (1003, 2)

=== ĐỌC DỮ LIỆU POLICY INTEREST RATE ===
Đã đọc t

In [3]:
# Tạo các dataframe chính
print("\n=== TẠO CÁC DATAFRAME CHÍNH ===")

# Khởi tạo dataframe chính với cột time từ dataframe đầu tiên
main_df = None

# Danh sách các dataframe để merge
dataframes_to_merge = []

# Market data
if 'PRICE_CS_1D.csv' in market_dataframes:
    df_price_cs = market_dataframes['PRICE_CS_1D.csv'].copy()
    # Đổi tên các cột (trừ cột time)
    df_price_cs = df_price_cs.rename(columns={col: f"{col}_PRICE_CS_1D" for col in df_price_cs.columns if col != 'time'})
    dataframes_to_merge.append(('PRICE_CS_1D', df_price_cs))

if 'SP_500_1D.csv' in market_dataframes:
    df_sp500 = market_dataframes['SP_500_1D.csv'].copy()
    df_sp500 = df_sp500.rename(columns={col: f"{col}_SP_500_1D" for col in df_sp500.columns if col != 'time'})
    dataframes_to_merge.append(('SP_500_1D', df_sp500))

if 'CDS_5Y_CS_1D.csv' in market_dataframes:
    df_cds = market_dataframes['CDS_5Y_CS_1D.csv'].copy()
    df_cds = df_cds.rename(columns={col: f"{col}_CDS_5Y_CS_1D" for col in df_cds.columns if col != 'time'})
    dataframes_to_merge.append(('CDS_5Y_CS_1D', df_cds))

if 'CBOE_Volatility_Index_FRED.csv' in market_dataframes:
    df_vix = market_dataframes['CBOE_Volatility_Index_FRED.csv'].copy()
    df_vix = df_vix.rename(columns={col: f"{col}_CBOE_Volatility_Index_FRED" for col in df_vix.columns if col != 'time'})
    dataframes_to_merge.append(('CBOE_Volatility_Index_FRED', df_vix))

if 'SX7E_STOXX_Banks_EUR_Price.csv' in market_dataframes:
    df_sx7e = market_dataframes['SX7E_STOXX_Banks_EUR_Price.csv'].copy()
    df_sx7e = df_sx7e.rename(columns={col: f"{col}_SX7E_STOXX_Banks_EUR_Price" for col in df_sx7e.columns if col != 'time'})
    dataframes_to_merge.append(('SX7E_STOXX_Banks_EUR_Price', df_sx7e))

# Internal data
if 'Internal_Data_Financial_Report.csv' in internal_dataframes:
    df_financial_report = internal_dataframes['Internal_Data_Financial_Report.csv'].copy()
    df_financial_report = df_financial_report.rename(columns={col: f"{col}_Internal_Data_Financial_Report" for col in df_financial_report.columns if col != 'time'})
    dataframes_to_merge.append(('Internal_Data_Financial_Report', df_financial_report))

# Growth and Inflation data
# United States Data
if 'ECONOMICS_USGDPCP_1D.csv' in growth_inflation_dataframes:
    df_us_gdp = growth_inflation_dataframes['ECONOMICS_USGDPCP_1D.csv'].copy()
    df_us_gdp = df_us_gdp.rename(columns={col: f"{col}_ECONOMICS_USGDPCP_1D" for col in df_us_gdp.columns if col != 'time'})
    dataframes_to_merge.append(('ECONOMICS_USGDPCP_1D', df_us_gdp))

if 'ECONOMICS_USCPI-1D.csv' in growth_inflation_dataframes:
    df_us_cpi = growth_inflation_dataframes['ECONOMICS_USCPI-1D.csv'].copy()
    df_us_cpi = df_us_cpi.rename(columns={col: f"{col}_ECONOMICS_USCPI_1D" for col in df_us_cpi.columns if col != 'time'})
    dataframes_to_merge.append(('ECONOMICS_USCPI_1D', df_us_cpi))

# European Union Data
if 'ECONOMICS_EUGDPCP_1D.csv' in growth_inflation_dataframes:
    df_eu_gdp = growth_inflation_dataframes['ECONOMICS_EUGDPCP_1D.csv'].copy()
    df_eu_gdp = df_eu_gdp.rename(columns={col: f"{col}_ECONOMICS_EUGDPCP_1D" for col in df_eu_gdp.columns if col != 'time'})
    dataframes_to_merge.append(('ECONOMICS_EUGDPCP_1D', df_eu_gdp))

if 'ECONOMICS_EUCPI-1D.csv' in growth_inflation_dataframes:
    df_eu_cpi = growth_inflation_dataframes['ECONOMICS_EUCPI-1D.csv'].copy()
    df_eu_cpi = df_eu_cpi.rename(columns={col: f"{col}_ECONOMICS_EUCPI_1D" for col in df_eu_cpi.columns if col != 'time'})
    dataframes_to_merge.append(('ECONOMICS_EUCPI_1D', df_eu_cpi))

# Switzerland Data
if 'ECONOMICS_CHGDPCP_1D.csv' in growth_inflation_dataframes:
    df_ch_gdp = growth_inflation_dataframes['ECONOMICS_CHGDPCP_1D.csv'].copy()
    df_ch_gdp = df_ch_gdp.rename(columns={col: f"{col}_ECONOMICS_CHGDPCP_1D" for col in df_ch_gdp.columns if col != 'time'})
    dataframes_to_merge.append(('ECONOMICS_CHGDPCP_1D', df_ch_gdp))

if 'ECONOMICS_CHCPI-1D.csv' in growth_inflation_dataframes:
    df_ch_cpi = growth_inflation_dataframes['ECONOMICS_CHCPI-1D.csv'].copy()
    df_ch_cpi = df_ch_cpi.rename(columns={col: f"{col}_ECONOMICS_CHCPI_1D" for col in df_ch_cpi.columns if col != 'time'})
    dataframes_to_merge.append(('ECONOMICS_CHCPI_1D', df_ch_cpi))

# Policy Interest Rate data
if 'FED_FUNDS.csv' in policy_interest_dataframes:
    df_fed_funds = policy_interest_dataframes['FED_FUNDS.csv'].copy()
    df_fed_funds = df_fed_funds.rename(columns={col: f"{col}_FED_FUNDS" for col in df_fed_funds.columns if col != 'time'})
    dataframes_to_merge.append(('FED_FUNDS', df_fed_funds))

if 'ECB_INTEREST_RATE_FRED.csv' in policy_interest_dataframes:
    df_ecb_rate = policy_interest_dataframes['ECB_INTEREST_RATE_FRED.csv'].copy()
    df_ecb_rate = df_ecb_rate.rename(columns={col: f"{col}_ECB_INTEREST_RATE_FRED" for col in df_ecb_rate.columns if col != 'time'})
    dataframes_to_merge.append(('ECB_INTEREST_RATE_FRED', df_ecb_rate))

# Merge all dataframes
print(f"Số lượng dataframes cần merge: {len(dataframes_to_merge)}")

if dataframes_to_merge:
    # Bắt đầu với dataframe đầu tiên
    main_df = dataframes_to_merge[0][1].copy()
    print(f"Khởi tạo với {dataframes_to_merge[0][0]}: {main_df.shape}")
    
    # Merge các dataframe còn lại
    for name, df in dataframes_to_merge[1:]:
        print(f"Đang merge {name}: {df.shape}")
        main_df = pd.merge(main_df, df, on='time', how='outer')
        print(f"Sau khi merge {name}: {main_df.shape}")

    print(f"\nDataframe chính cuối cùng: {main_df.shape}")
    print(f"Các cột trong dataframe chính:")
    for i, col in enumerate(main_df.columns):
        print(f"{i+1:2d}. {col}")
    
    # Kiểm tra số lượng missing values
    print(f"\nSố lượng missing values trong từng cột:")
    missing_counts = main_df.isnull().sum()
    for col, count in missing_counts.items():
        if count > 0:
            print(f"{col}: {count}")
    
    # Sắp xếp theo thời gian
    main_df = main_df.sort_values('time').reset_index(drop=True)
    print(f"\nDataframe đã được sắp xếp theo thời gian")
    print(f"Khoảng thời gian: {main_df['time'].min()} đến {main_df['time'].max()}")
    
else:
    print("Không có dataframe nào để merge!")


=== TẠO CÁC DATAFRAME CHÍNH ===
Số lượng dataframes cần merge: 13
Khởi tạo với PRICE_CS_1D: (1003, 3)
Đang merge SP_500_1D: (1003, 3)
Sau khi merge SP_500_1D: (1003, 5)
Đang merge CDS_5Y_CS_1D: (1003, 2)
Sau khi merge CDS_5Y_CS_1D: (1003, 6)
Đang merge CBOE_Volatility_Index_FRED: (1003, 2)
Sau khi merge CBOE_Volatility_Index_FRED: (1003, 7)
Đang merge SX7E_STOXX_Banks_EUR_Price: (1003, 2)
Sau khi merge SX7E_STOXX_Banks_EUR_Price: (1003, 8)
Đang merge Internal_Data_Financial_Report: (1003, 11)
Sau khi merge Internal_Data_Financial_Report: (1003, 18)
Đang merge ECONOMICS_USGDPCP_1D: (1003, 2)
Sau khi merge ECONOMICS_USGDPCP_1D: (1003, 19)
Đang merge ECONOMICS_EUGDPCP_1D: (1003, 2)
Sau khi merge ECONOMICS_EUGDPCP_1D: (1003, 20)
Đang merge ECONOMICS_EUCPI_1D: (1003, 2)
Sau khi merge ECONOMICS_EUCPI_1D: (1003, 21)
Đang merge ECONOMICS_CHGDPCP_1D: (1003, 2)
Sau khi merge ECONOMICS_CHGDPCP_1D: (1003, 22)
Đang merge ECONOMICS_CHCPI_1D: (1003, 2)
Sau khi merge ECONOMICS_CHCPI_1D: (1003, 23)
Đa

In [4]:
 # Hiển thị thông tin tóm tắt
if main_df is not None:
    print("\n" + "="*50)
    print("THÔNG TIN DATAFRAME CHÍNH")
    print("="*50)
    print(main_df.info())


THÔNG TIN DATAFRAME CHÍNH
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1003 entries, 0 to 1002
Data columns (total 25 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   time                                                                      1003 non-null   object 
 1   close_PRICE_CS_1D                                                         1003 non-null   float64
 2   volume_PRICE_CS_1D                                                        1003 non-null   float64
 3   close_SP_500_1D                                                           1003 non-null   float64
 4   volume_SP_500_1D                                                          1003 non-null   float64
 5   close_CDS_5Y_CS_1D                                                        1003 non-null   float64
 6   close_CBOE_Volatility_Index_FRED     

In [5]:
main_df.head(10)

Unnamed: 0,time,close_PRICE_CS_1D,volume_PRICE_CS_1D,close_SP_500_1D,volume_SP_500_1D,close_CDS_5Y_CS_1D,close_CBOE_Volatility_Index_FRED,close_SX7E_STOXX_Banks_EUR_Price,Provision for credit losses_Internal_Data_Financial_Report,Non-accrual loans / Gross loans_Internal_Data_Financial_Report,...,Tier 1 leverage ratio_Internal_Data_Financial_Report,Cost/income ratio_Internal_Data_Financial_Report,Return on Equity (ROE)_Internal_Data_Financial_Report,close_ECONOMICS_USGDPCP_1D,close_ECONOMICS_EUGDPCP_1D,close_ECONOMICS_EUCPI_1D,close_ECONOMICS_CHGDPCP_1D,close_ECONOMICS_CHCPI_1D,close_FED_FUNDS,close_ECB_INTEREST_RATE_FRED
0,2020-10-01,9.91,1629038.0,3380.8,2355862000.0,57.69,26.7,53.99,94.0,0.008,...,0.063,0.827,0.048,20511780000000.0,2768364000000.0,104.96,178392900000.0,100.34,0.09,0.0
1,2020-10-02,10.07,2478219.0,3348.44,2350091000.0,56.2,27.63,54.15,94.0,0.008,...,0.063,0.827,0.048,20511780000000.0,2768364000000.0,104.96,178392900000.0,100.34,0.09,0.0
2,2020-10-03,10.07,2478219.0,3348.44,2350091000.0,56.2,27.63,54.15,94.0,0.008,...,0.063,0.827,0.048,20511780000000.0,2768364000000.0,104.96,178392900000.0,100.34,0.09,0.0
3,2020-10-04,10.07,2478219.0,3348.44,2350091000.0,56.2,27.63,54.15,94.0,0.008,...,0.063,0.827,0.048,20511780000000.0,2768364000000.0,104.96,178392900000.0,100.34,0.09,0.0
4,2020-10-05,10.32,2244035.0,3408.63,2040148000.0,54.94,27.96,55.31,94.0,0.008,...,0.063,0.827,0.048,20511780000000.0,2768364000000.0,104.96,178392900000.0,100.34,0.09,0.0
5,2020-10-06,10.3,2938302.0,3360.95,2549973000.0,52.94,29.48,57.85,94.0,0.008,...,0.063,0.827,0.048,20511780000000.0,2768364000000.0,104.96,178392900000.0,100.34,0.09,0.0
6,2020-10-07,10.48,2255763.0,3419.45,2074262000.0,52.18,28.06,57.37,94.0,0.008,...,0.063,0.827,0.048,20511780000000.0,2768364000000.0,104.96,178392900000.0,100.34,0.09,0.0
7,2020-10-08,10.57,1648040.0,3446.83,2088860000.0,51.2,26.36,58.4,94.0,0.008,...,0.063,0.827,0.048,20511780000000.0,2768364000000.0,104.96,178392900000.0,100.34,0.09,0.0
8,2020-10-09,10.47,1494779.0,3477.13,2227950000.0,49.45,25.0,57.9,94.0,0.008,...,0.063,0.827,0.048,20511780000000.0,2768364000000.0,104.96,178392900000.0,100.34,0.09,0.0
9,2020-10-10,10.47,1494779.0,3477.13,2227950000.0,49.45,25.0,57.9,94.0,0.008,...,0.063,0.827,0.048,20511780000000.0,2768364000000.0,104.96,178392900000.0,100.34,0.09,0.0


In [6]:
# Tạo thư mục đích nếu chưa có
sliver_dir = os.path.join(project_dir, "data/gold")
os.makedirs(sliver_dir, exist_ok=True)

# Ghép path cho file xuất ra
output_path = os.path.join(sliver_dir, "gold_df.csv")

# Xuất df_sliver ra CSV
main_df.to_csv(output_path, index=False)

print(f"Đã lưu file tại: {output_path}")

Đã lưu file tại: d:\Git\ToanMoHinh\tmh2025-vong2\data/gold\gold_df.csv
