In [77]:
import pandas as pd
import glob
import os
import numpy as np
from pathlib import Path
from datetime import datetime
import logging


## Tính tỷ lệ FF

In [78]:
def process_free_float_data(tennganh, external_market_cap_file=None):
    folder_path = Path(f"D:\\nckh\\data\\{tennganh}")
    excel_files = glob.glob(str(folder_path / "*.xlsx"))

    if not excel_files:
        print(f"No .xlsx files found in {folder_path}")
        return None

    dataframes = []
    for file in excel_files:
        print(f"Reading {file}...")
        try:
            df = pd.read_excel(file, header=7)
            df['SourceFile'] = os.path.basename(file)
            dataframes.append(df)
        except (pd.errors.ParserError, FileNotFoundError) as e:
            print(f"Error reading {file}: {e}")
            continue

    if not dataframes:
        print(f"No valid DataFrames created from files in {folder_path}")
        return None

    df = pd.concat(dataframes, ignore_index=True)

    # Drop rows with all critical metadata missing
    initial_rows = len(df)
    df = df.dropna(subset=['STT', 'Mã', 'Ngày'], how='any')
    print(f"Dropped {initial_rows - len(df)} rows with missing Stock_Code, STT, and Date.")

    # Clean messy header names
    df.columns = df.columns.str.replace('\nĐơn vị:.*', '', regex=True).str.strip()

    # Standardize column names
    df = df.rename(columns={
        'Số CP Free Float': 'Free_Float',
        'Số CP lưu hành hiện thời': 'Luu_Hanh',
        'Số CP niêm yết': 'Niem_Yet',
        'Ngày': 'Date',
        'Mã': 'Stock_Code',
        'Tên công ty': 'Company_Name',
        'Vốn hóa thị trường': 'Market_Cap',
        'STT': 'Index',
        'SourceFile': 'Source_File'
    })

    # Convert data types
    numeric_cols = ['Index', 'Free_Float', 'Luu_Hanh', 'Niem_Yet', 'Market_Cap']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

    # Merge external Market_Cap if provided
    if external_market_cap_file:
        print(f"Loading external Market_Cap from {external_market_cap_file}")
        external_data = pd.read_csv(external_market_cap_file)
        external_data['Date'] = pd.to_datetime(external_data['Date'], errors='coerce')
        external_data = external_data[['Stock_Code', 'Date', 'Market_Cap']]
        df = df.drop(columns=['Market_Cap'], errors='ignore')
        df = df.merge(external_data, on=['Stock_Code', 'Date'], how='left')
        print("\n=== External Market_Cap Merged ===")
        print(df[['Stock_Code', 'Date', 'Market_Cap']].head())

    # Print initial structure
    print("\n=== First 5 rows ===")
    print(df.head())
    print("\n=== Column names and info ===")
    print(df.info())

    # Log missing data
    print("\n=== Missing Data Summary ===")
    for col in df.columns:
        missing_count = df[col].isna().sum()
        if missing_count > 0:
            print(f"Column {col}: {missing_count} missing values")

    # Sort by Stock_Code and Date
    df = df.sort_values(['Stock_Code', 'Date'])

    # --- Conditional forward-fill ---
    max_gap_days = 5  # <= define max gap for conditional fill

    def conditional_ffill(series, dates, max_gap_days=max_gap_days):
        filled = series.copy()
        last_valid = None
        last_date = None
        for i in range(len(series)):
            val = series.iloc[i]
            d = dates.iloc[i]
            if pd.notna(val):
                last_valid = val
                last_date = d
            elif last_valid is not None and last_date is not None:
                gap = (d - last_date).days
                if gap <= max_gap_days:
                    filled.iloc[i] = last_valid
                else:
                    last_valid = None
                    last_date = None
        return filled

    # Apply conditional forward fill within each stock
    for col in ['Free_Float', 'Luu_Hanh', 'Market_Cap']:
        if col in df.columns and df[col].isna().any():
            print(f"⏩ Conditionally filling {col} within Stock_Code (max {max_gap_days}-day gap).")
            df[col] = df.groupby('Stock_Code', group_keys=False).apply(
                lambda g: conditional_ffill(g[col], g['Date'], max_gap_days=max_gap_days)
            )

    # Investigate remaining missing Free_Float
    if df['Free_Float'].isna().any():
        print("\n=== Stocks with Missing Free_Float ===")
        print(df[df['Free_Float'].isna()]['Stock_Code'].value_counts().head())
        print("\n=== Dates with Missing Free_Float ===")
        print(df[df['Free_Float'].isna()]['Date'].value_counts().sort_index().head())

    # Drop remaining missing values
    total_missing = df[['Free_Float', 'Luu_Hanh', 'Market_Cap']].isna().any(axis=1).sum()
    print(f"Dropping {total_missing} rows with any missing key values.")
    df = df.dropna(subset=['Free_Float', 'Market_Cap', 'Luu_Hanh'])

    # Handle invalid dates
    invalid_dates = df['Date'].isna().sum()
    if invalid_dates > 0:
        print(f"Dropping {invalid_dates} rows with invalid dates.")
        df = df.dropna(subset=['Date'])

    # Filter invalid data
    df_long = df[df['Luu_Hanh'] > 0]
    dropped_luu_hanh = len(df) - len(df_long)
    if dropped_luu_hanh > 0:
        print(f"Dropped {dropped_luu_hanh} rows with Luu_Hanh <= 0.")
    df_long = df_long[df_long['Free_Float'] <= df_long['Luu_Hanh']]
    df_long = df_long[df_long['Free_Float'] >= 0]

    # --- Date coverage summary ---
    print("\n=== Missing-Date Coverage Summary ===")
    coverage = (
        df_long.groupby('Stock_Code')['Date']
              .agg(['min', 'max', 'count'])
              .assign(days=lambda x: (x['max'] - x['min']).dt.days + 1)
    )
    coverage['coverage_%'] = (coverage['count'] / coverage['days'] * 100).round(1)
    print(coverage.head(10))

    # Log data retention
    print(f"\nRemaining rows after filtering: {len(df_long)} ({len(df_long)/initial_rows*100:.2f}% of original)")
    print(f"Unique stocks: {df_long['Stock_Code'].nunique()}")
    print(f"Unique dates: {df_long['Date'].nunique()}")

    print("\n=== After Pre-Processing ===")
    print(df_long.head())
    print("\n=== Debug Data ===")
    print("Total Free_Float = 0:", (df_long['Free_Float'] == 0).sum())
    print("Total Luu_Hanh = 0:", (df_long['Luu_Hanh'] == 0).sum())
    print("Free_Float sum by Date:", df_long.groupby('Date')['Free_Float'].sum().head())
    print("Luu_Hanh sum by Date:", df_long.groupby('Date')['Luu_Hanh'].sum().head())
    print("\n=== Missing Data After Preprocessing ===")
    print(df_long.isna().sum())

    # --- Calculate industry Free Float ratio ---
    nghanh_freefloat = df_long.groupby('Date').apply(
        lambda x: x['Free_Float'].sum() / x['Luu_Hanh'].sum() if x['Luu_Hanh'].sum() != 0 else 0
    )
    zero_luu_hanh_dates = df_long.groupby('Date').filter(lambda x: x['Luu_Hanh'].sum() == 0)['Date'].unique()
    for date in zero_luu_hanh_dates:
        print(f"Warning: Zero Luu_Hanh sum for date {date}")
    nghanh_freefloat = pd.DataFrame(nghanh_freefloat, columns=['Tỷ lệ FF']).reset_index()

    # --- Save results ---
    output = f'ty_le_freefloat_{tennganh}'
    output_path = Path("data") / "processed" / f"{output}.xlsx"
    os.makedirs(output_path.parent, exist_ok=True)
    nghanh_freefloat.to_excel(output_path, index=False)
    print(f"✅ Saved to {output_path}")

    return df_long, nghanh_freefloat


if __name__ == "__main__":
    list_nganh = [
        'hangtieudung', 'duocphamyte', 'taichinh', 'nganhang', 'congnghiep',
        'dichvutieudung', 'tienichcd', 'nguyenvatlieu', 'vienthong', 'cntt'
    ]

    # external_file = "D:\\nckh\\data\\external_market_cap.csv"  # Optional external file
    for nganh in list_nganh:
        result_df, result_ff = process_free_float_data(nganh)
        if result_df is not None:
            print(f"\n✅ Processing completed for {nganh}. Results saved.\n")


Reading D:\nckh\data\hangtieudung\hangtieudung_1.xlsx...
Reading D:\nckh\data\hangtieudung\hangtieudung_2.xlsx...
Reading D:\nckh\data\hangtieudung\hangtieudung_3.xlsx...
Reading D:\nckh\data\hangtieudung\hangtieudung_4.xlsx...
Reading D:\nckh\data\hangtieudung\hangtieudung_5.xlsx...
Reading D:\nckh\data\hangtieudung\hangtieudung_6.xlsx...
Reading D:\nckh\data\hangtieudung\hangtieudung_7.xlsx...
Dropped 77 rows with missing Stock_Code, STT, and Date.

=== First 5 rows ===
   Index Stock_Code               Company_Name       Date   Free_Float  \
0      1        CTF                  City Auto 2025-10-01   43044236.0   
1      2        CTP            Hoà Bình Takara 2025-10-01    7864994.0   
2      3        DAS   Thiết bị Dầu khí Đà Nẵng 2025-10-01    2310000.0   
3      4        DAT  ĐT Du lịch và PT Thủy sản 2025-10-01    8999684.0   
4      5        DBC            Tập đoàn DABACO 2025-10-01  250163333.0   

      Luu_Hanh     Niem_Yet    Market_Cap          Source_File  
0   95653859.

  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Luu_Hanh within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Market_Cap within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(



=== Stocks with Missing Free_Float ===
Stock_Code
BHS    1479
BAS    1479
NPS    1479
FBT    1479
AGC    1473
Name: count, dtype: int64

=== Dates with Missing Free_Float ===
Date
2008-12-23     45
2008-12-24     45
2008-12-25     46
2008-12-27     46
2008-12-31    131
Name: count, dtype: int64
Dropping 263292 rows with any missing key values.
Dropped 77362 rows with Luu_Hanh <= 0.

=== Missing-Date Coverage Summary ===
                  min        max  count  days  coverage_%
Stock_Code                                               
AGC        2014-08-27 2022-11-23   1909  3011        63.4
AGD        2014-08-27 2022-11-23   1849  3011        61.4
AJC        2016-05-04 2022-11-23   1356  2395        56.6
ANCO       2017-06-29 2022-11-23   1195  1974        60.5
AQN        2018-08-14 2022-11-23    849  1563        54.3
ASIASILK   2016-05-04 2022-11-23   1356  2395        56.6
ATD        2017-09-06 2022-11-23   1250  1905        65.6
AUTO32     2016-05-04 2022-11-23   1356  2395        

  nghanh_freefloat = df_long.groupby('Date').apply(


✅ Saved to data\processed\ty_le_freefloat_hangtieudung.xlsx

✅ Processing completed for hangtieudung. Results saved.

Reading D:\nckh\data\duocphamyte\duocphamyte1.xlsx...
Reading D:\nckh\data\duocphamyte\duocphamyte2.xlsx...
Dropped 22 rows with missing Stock_Code, STT, and Date.

=== First 5 rows ===
   Index Stock_Code           Company_Name       Date   Free_Float  \
0      1        AGP  Dược phẩm AGIMEXPHARM 2025-10-01   11976871.0   
1      2        AMP              Armephaco 2025-10-01    1950000.0   
2      3        AMV   Dược-TB Y tế Việt Mỹ 2025-10-01  131105650.0   
3      4        APC        Chiếu xạ An Phú 2025-10-01   19905025.0   
4      5        BCH          Dược Bảo Châu 2025-10-01          0.0   

      Luu_Hanh    Market_Cap     Niem_Yet        Source_File  
0   26615268.0  1.102910e+12   26615268.0  duocphamyte1.xlsx  
1   13000000.0  1.729000e+11   13000000.0  duocphamyte1.xlsx  
2  131105650.0  2.622113e+11  131105650.0  duocphamyte1.xlsx  
3   19905025.0  1.49805

  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Luu_Hanh within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Market_Cap within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(



=== Stocks with Missing Free_Float ===
Stock_Code
DHG    1404
OPC    1404
DCL    1404
IMP    1404
DMC    1404
Name: count, dtype: int64

=== Dates with Missing Free_Float ===
Date
2008-12-23     7
2008-12-24     7
2008-12-25     7
2008-12-27     7
2008-12-31    31
Name: count, dtype: int64
Dropping 49739 rows with any missing key values.
Dropped 14666 rows with Luu_Hanh <= 0.

=== Missing-Date Coverage Summary ===
                  min        max  count  days  coverage_%
Stock_Code                                               
AGP        2015-05-28 2025-10-01   2590  3780        68.5
AMP        2016-05-04 2025-10-01   2207  3438        64.2
AMV        2014-08-27 2025-10-01   2772  4054        68.4
APC        2014-08-27 2025-10-01   2763  4054        68.2
BCH        2018-08-14 2022-11-23    718  1563        45.9
BCP        2015-09-08 2025-10-01   2518  3677        68.5
BIO        2016-08-22 2025-10-01   2015  3328        60.5
CBV        2018-08-14 2022-11-23    662  1563        42.4
C

  nghanh_freefloat = df_long.groupby('Date').apply(


✅ Saved to data\processed\ty_le_freefloat_duocphamyte.xlsx

✅ Processing completed for duocphamyte. Results saved.

Reading D:\nckh\data\taichinh\taichinh1.xlsx...
Reading D:\nckh\data\taichinh\taichinh2.xlsx...
Reading D:\nckh\data\taichinh\taichinh3.xlsx...
Reading D:\nckh\data\taichinh\taichinh4.xlsx...
Reading D:\nckh\data\taichinh\taichinh5.xlsx...
Reading D:\nckh\data\taichinh\taichinh6.xlsx...
Dropped 2171 rows with missing Stock_Code, STT, and Date.

=== First 5 rows ===
   Index Stock_Code             Company_Name       Date   Free_Float  \
0      1        AAS  Chứng khoán SmartInvest 2025-10-20  229999521.0   
1      2        AAV     Việt Tiên Sơn Địa ốc 2025-10-20   51740745.0   
2      3        ABI        BH NH Nông Nghiệp 2025-10-20   35471671.0   
3      4        ABW      Chứng khoán An Bình 2025-10-20   25287500.0   
4      5       ACBS          Chứng khoán ACB 2025-10-20          0.0   

       Luu_Hanh      Niem_Yet  Market_Cap     Source_File  
0  2.299995e+08  2.2999

  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Luu_Hanh within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Market_Cap within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(



=== Stocks with Missing Free_Float ===
Stock_Code
SSI    1405
HAC    1404
NTL    1404
HDC    1404
KBC    1404
Name: count, dtype: int64

=== Dates with Missing Free_Float ===
Date
2008-12-23     20
2008-12-24     20
2008-12-25     20
2008-12-27     20
2008-12-31    177
Name: count, dtype: int64
Dropping 330524 rows with any missing key values.
Dropped 122106 rows with Luu_Hanh <= 0.

=== Missing-Date Coverage Summary ===
                  min        max  count  days  coverage_%
Stock_Code                                               
AAS        2016-05-04 2025-10-20   2237  3457        64.7
AAV        2017-06-20 2025-10-20   2085  3045        68.5
ABFG       2016-05-04 2022-11-23   1341  2395        56.0
ABFM       2016-05-04 2022-11-23   1356  2395        56.6
ABI        2014-08-27 2025-10-20   2785  4073        68.4
ABW        2016-05-04 2025-10-20   1955  3457        56.6
ACBC       2016-08-22 2016-12-05     75   106        70.8
ACBS       2014-08-27 2016-12-05    568   832       

  nghanh_freefloat = df_long.groupby('Date').apply(


✅ Saved to data\processed\ty_le_freefloat_taichinh.xlsx

✅ Processing completed for taichinh. Results saved.

Reading D:\nckh\data\nganhang\ngânhang.xlsx...
Dropped 11 rows with missing Stock_Code, STT, and Date.

=== First 5 rows ===
   Index  Stock_Code        Company_Name       Date    Free_Float  \
0      1         ABB   Ngân hàng An Bình 2025-10-01  5.175184e+08   
1      2         ACB                 ACB 2025-10-01  4.366158e+09   
2      3        AGRB            Agribank 2025-10-01  0.000000e+00   
3      4         BAB     Ngân hàng Bắc Á 2025-10-01  7.524143e+08   
4      5  BAOVIETBAN  Ngân hàng Bảo Việt 2025-10-01  0.000000e+00   

       Luu_Hanh      Niem_Yet    Market_Cap    Source_File  
0  1.035037e+09  1.035037e+09  1.278788e+13  ngânhang.xlsx  
1  5.136657e+09  5.136657e+09  1.325257e+14  ngânhang.xlsx  
2  4.096292e+09  4.096292e+09           NaN  ngânhang.xlsx  
3  1.003219e+09  9.580218e+08  1.344314e+13  ngânhang.xlsx  
4  3.150000e+08  3.150000e+08           NaN  

  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Luu_Hanh within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Market_Cap within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(



=== Stocks with Missing Free_Float ===
Stock_Code
PVcomBank    1479
STB          1404
ACB          1404
SHB          1344
VCB          1296
Name: count, dtype: int64

=== Dates with Missing Free_Float ===
Date
2008-12-23     3
2008-12-24     3
2008-12-25     3
2008-12-27     3
2008-12-31    33
Name: count, dtype: int64
Dropping 49044 rows with any missing key values.
Dropped 6933 rows with Luu_Hanh <= 0.

=== Missing-Date Coverage Summary ===
                  min        max  count  days  coverage_%
Stock_Code                                               
ABB        2014-08-27 2025-10-01   2698  4054        66.6
ACB        2014-08-27 2025-10-01   2772  4054        68.4
AGRB       2014-08-27 2022-11-23   1591  3011        52.8
ANZVL      2016-08-23 2016-12-05     74   105        70.5
BAB        2014-08-27 2025-10-01   2768  4054        68.3
BAOVIETBAN 2014-08-27 2022-11-23   1849  3011        61.4
BID        2014-08-27 2025-10-01   2772  4054        68.4
BIDCHCM    2016-05-04 2022-11-

  nghanh_freefloat = df_long.groupby('Date').apply(


✅ Saved to data\processed\ty_le_freefloat_nganhang.xlsx

✅ Processing completed for nganhang. Results saved.

Reading D:\nckh\data\congnghiep\congnghiep10.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep11.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep12.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep13.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep14.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep15.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep6.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep8.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep9.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep_1.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep_2.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep_3.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep_4.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep_5.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep_7_1.xlsx...
Reading D:\nckh\data\congnghiep\congnghiep_7_2.xlsx...
Dropped 1984 rows with missing Stock_Code

  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Luu_Hanh within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Market_Cap within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(



=== Stocks with Missing Free_Float ===
Stock_Code
CIC    1479
ALP    1479
SDS    1479
BTC    1473
BHV    1457
Name: count, dtype: int64

=== Dates with Missing Free_Float ===
Date
2008-12-23    138
2008-12-24    138
2008-12-25    139
2008-12-27    139
2008-12-31    359
Name: count, dtype: int64
Dropping 707197 rows with any missing key values.
Dropped 195518 rows with Luu_Hanh <= 0.

=== Missing-Date Coverage Summary ===
                  min        max  count  days  coverage_%
Stock_Code                                               
0302602811 2016-05-04 2022-11-23   1356  2395        56.6
11TL       2016-05-04 2022-11-23   1356  2395        56.6
ABR        2016-05-04 2025-10-01   2196  3438        63.9
ACC        2014-08-27 2025-10-01   2772  4054        68.4
ACC224     2016-05-04 2022-11-23   1459  2395        60.9
ACE        2014-08-27 2025-10-01   2772  4054        68.4
ACS        2016-05-04 2025-10-01   2279  3438        66.3
ACV        2016-05-04 2025-10-01   2279  3438       

  nghanh_freefloat = df_long.groupby('Date').apply(


✅ Saved to data\processed\ty_le_freefloat_congnghiep.xlsx

✅ Processing completed for congnghiep. Results saved.

Reading D:\nckh\data\dichvutieudung\dichvutieudung2.xlsx...
Reading D:\nckh\data\dichvutieudung\dichvutieudung3.xlsx...
Reading D:\nckh\data\dichvutieudung\dichvutieudung4.xlsx...
Reading D:\nckh\data\dichvutieudung\dichvutieudung5.xlsx...
Reading D:\nckh\data\dichvutieudung\dichvutiudung1.xlsx...
Dropped 55 rows with missing Stock_Code, STT, and Date.

=== First 5 rows ===
   Index Stock_Code               Company_Name       Date     Niem_Yet  \
0      1        HNT             Xe điện Hà Nội 2025-10-01    5500000.0   
1      2        HOT        Du lịch - DV Hội An 2025-10-01    8000000.0   
2      3        HTM  Thương mại Hà Nội - Hapro 2025-10-01  220000000.0   
3      4        HTP            In SGK Hòa Phát 2025-10-01   91804980.0   
4      5        HTT          Thương mại Hà Tây 2025-10-01   20000000.0   

   Free_Float     Luu_Hanh    Market_Cap           Source_File  

  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Luu_Hanh within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Market_Cap within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(



=== Stocks with Missing Free_Float ===
Stock_Code
HBE    1457
VPL    1431
DAE    1404
CTC    1404
COM    1404
Name: count, dtype: int64

=== Dates with Missing Free_Float ===
Date
2008-12-23    24
2008-12-24    24
2008-12-25    24
2008-12-27    24
2008-12-31    77
Name: count, dtype: int64
Dropping 213774 rows with any missing key values.
Dropped 116132 rows with Luu_Hanh <= 0.

=== Missing-Date Coverage Summary ===
                  min        max  count  days  coverage_%
Stock_Code                                               
ACSVN      2016-05-04 2022-11-23   1356  2395        56.6
ADC        2014-08-27 2025-10-01   2772  4054        68.4
ADG        2018-08-14 2025-10-01   1452  2606        55.7
AFX        2016-05-04 2025-10-01   2279  3438        66.3
AGX        2015-08-27 2025-10-01   2525  3689        68.4
AIRSE      2016-05-04 2022-11-23   1356  2395        56.6
ALT        2014-08-27 2025-10-01   2772  4054        68.4
AST        2017-12-25 2025-10-01   1939  2838        68.3

  nghanh_freefloat = df_long.groupby('Date').apply(


✅ Saved to data\processed\ty_le_freefloat_dichvutieudung.xlsx

✅ Processing completed for dichvutieudung. Results saved.

Reading D:\nckh\data\tienichcd\tiemichcc1.xlsx...
Reading D:\nckh\data\tienichcd\tienichcc2.xlsx...
Reading D:\nckh\data\tienichcd\tienichcc3.xlsx...
Reading D:\nckh\data\tienichcd\tienichcc4.xlsx...
Dropped 44 rows with missing Stock_Code, STT, and Date.

=== First 5 rows ===
   Index Stock_Code              Company_Name       Date   Free_Float  \
0      1        ASP            Dầu khí An Pha 2025-10-20   16802793.0   
1      2        AVC         Thủy điện A Vương 2025-10-20    7505205.0   
2      3        BDW  Cấp thoát nước Bình Định 2025-10-20    2482160.0   
3      4        BGE                BCG Energy 2025-10-20  730000000.0   
4      5        BGW       Nước sạch Bắc Giang 2025-10-20    3629889.0   

      Luu_Hanh     Niem_Yet  Market_Cap      Source_File  
0   37339542.0   37339929.0         NaN  tiemichcc1.xlsx  
1   75052052.0   75052052.0         NaN  ti

  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Luu_Hanh within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Market_Cap within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(



=== Stocks with Missing Free_Float ===
Stock_Code
ASP    1404
KHP    1404
HJS    1404
SJD    1404
SFC    1404
Name: count, dtype: int64

=== Dates with Missing Free_Float ===
Date
2008-12-23    14
2008-12-24    14
2008-12-25    14
2008-12-27    14
2008-12-31    54
Name: count, dtype: int64
Dropping 112236 rows with any missing key values.
Dropped 59626 rows with Luu_Hanh <= 0.

=== Missing-Date Coverage Summary ===
                  min        max  count  days  coverage_%
Stock_Code                                               
ASP        2014-08-27 2025-10-20   2785  4073        68.4
AVC        2016-05-04 2025-10-20   2288  3457        66.2
BDW        2015-10-06 2025-10-20   2511  3668        68.5
BGE        2024-09-12 2025-10-20    276   404        68.3
BGW        2016-05-04 2025-10-20   2288  3457        66.2
BHA        2017-07-11 2025-10-20   2070  3024        68.5
BLW        2018-08-14 2024-02-23   1361  2020        67.4
BMF        2018-03-27 2025-10-20   1892  2765        68.4


  nghanh_freefloat = df_long.groupby('Date').apply(


✅ Saved to data\processed\ty_le_freefloat_tienichcd.xlsx

✅ Processing completed for tienichcd. Results saved.

Reading D:\nckh\data\nguyenvatlieu\nguyenvatlieu1.xlsx...
Reading D:\nckh\data\nguyenvatlieu\nguyenvatlieu2.xlsx...
Reading D:\nckh\data\nguyenvatlieu\nguyenvatlieu3.xlsx...
Reading D:\nckh\data\nguyenvatlieu\nguyenvatlieu4.xlsx...
Reading D:\nckh\data\nguyenvatlieu\nguyenvatlieu5.xlsx...
Dropped 55 rows with missing Stock_Code, STT, and Date.

=== First 5 rows ===
   Index Stock_Code                 Company_Name       Date   Free_Float  \
0      1        AAA          An Phát Bioplastics 2025-10-20  196871365.0   
1      2        AAH                Than Hợp Nhất 2025-10-20   88425000.0   
2      3        ABS    DV Nông nghiệp Bình Thuận 2025-10-20   72000000.0   
3      4        ACG                  Gỗ An Cường 2025-10-20   12063035.0   
4      5        ACM  Tập đoàn Khoáng sản Á Cường 2025-10-20   51000000.0   

      Luu_Hanh     Niem_Yet  Market_Cap          Source_File  


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Luu_Hanh within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Market_Cap within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(



=== Stocks with Missing Free_Float ===
Stock_Code
NVC    1479
VIS    1457
HPG    1404
HSI    1404
HSG    1404
Name: count, dtype: int64

=== Dates with Missing Free_Float ===
Date
2008-12-23    34
2008-12-24    34
2008-12-25    34
2008-12-27    34
2008-12-31    99
Name: count, dtype: int64
Dropping 191723 rows with any missing key values.
Dropped 67690 rows with Luu_Hanh <= 0.

=== Missing-Date Coverage Summary ===
                  min        max  count  days  coverage_%
Stock_Code                                               
AAA        2014-08-27 2025-10-20   2785  4073        68.4
AAH        2024-04-03 2025-10-20    386   566        68.2
ABS        2018-08-14 2025-10-20   1637  2625        62.4
ACG        2021-07-28 2025-10-20   1056  1546        68.3
ACM        2015-07-17 2025-10-20   2567  3749        68.5
AMC        2014-08-27 2025-10-20   2785  4073        68.4
APP        2014-08-27 2025-10-20   2785  4073        68.4
ASIAC      2016-05-04 2022-11-23   1356  2395        56.6


  nghanh_freefloat = df_long.groupby('Date').apply(


✅ Saved to data\processed\ty_le_freefloat_nguyenvatlieu.xlsx

✅ Processing completed for nguyenvatlieu. Results saved.

Reading D:\nckh\data\vienthong\vienthong.xlsx...
Dropped 11 rows with missing Stock_Code, STT, and Date.

=== First 5 rows ===
   Index Stock_Code                            Company_Name       Date  \
0      1        ABC                        Truyền thông VMG 2025-10-01   
1      2        FOX                             FPT Telecom 2025-10-01   
2      3        MFS                        Mobifone Service 2025-10-01   
3      4        PAI  CNTT, VT và Tự động hóa Dầu khí - PAIC 2025-10-01   
4      5        PIA           Tin học Viễn thông Petrolimex 2025-10-01   

   Free_Float     Luu_Hanh     Niem_Yet    Market_Cap     Source_File  
0  12233400.0   20389000.0   20393000.0  2.192225e+11  vienthong.xlsx  
1  30802610.0  738763463.0  738763463.0  4.755568e+13  vienthong.xlsx  
2   3884638.0    7062979.0    7062979.0  2.792561e+11  vienthong.xlsx  
3     82650.0    423

  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Luu_Hanh within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Market_Cap within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(



=== Stocks with Missing Free_Float ===
Stock_Code
PTP        1174
VGI         428
SPTC         79
VTT          76
VIETTEL      75
Name: count, dtype: int64

=== Dates with Missing Free_Float ===
Date
2008-12-31    3
2009-12-03    1
2009-12-04    1
2009-12-07    1
2009-12-08    1
Name: count, dtype: int64
Dropping 8280 rows with any missing key values.
Dropped 4591 rows with Luu_Hanh <= 0.

=== Missing-Date Coverage Summary ===
                  min        max  count  days  coverage_%
Stock_Code                                               
ABC        2014-08-27 2025-10-01   2772  4054        68.4
CTE        2016-05-04 2022-11-23   1356  2395        56.6
FOX        2015-05-28 2025-10-01   2590  3780        68.5
MFS        2016-05-04 2025-10-01   2289  3438        66.6
MOBIF      2016-05-04 2022-11-23   1355  2395        56.6
PAI        2016-05-04 2025-10-01   2185  3438        63.6
PIA        2016-05-04 2025-10-01   2199  3438        64.0
PTP        2014-08-27 2025-10-01   2772  4054 

  nghanh_freefloat = df_long.groupby('Date').apply(


✅ Saved to data\processed\ty_le_freefloat_vienthong.xlsx

✅ Processing completed for vienthong. Results saved.

Reading D:\nckh\data\cntt\CNTT.xlsx...
Dropped 11 rows with missing Stock_Code, STT, and Date.

=== First 5 rows ===
   Index Stock_Code             Company_Name       Date    Free_Float  \
0      1        CKV                 CokyVina 2025-10-01  1.604800e+06   
1      2        CMG   Tập đoàn Công nghệ CMC 2025-10-01  8.469804e+07   
2      3        CMT  CN mạng và Truyền thông 2025-10-01  2.911112e+06   
3      4        ELC                    ELCOM 2025-10-01  7.339320e+07   
4      5        FPT                 FPT Corp 2025-10-01  1.447981e+09   

       Luu_Hanh      Niem_Yet    Market_Cap Source_File  
0  4.012000e+06  4.050000e+06  4.734160e+10   CNTT.xlsx  
1  2.117451e+08  2.117839e+08  8.289821e+12   CNTT.xlsx  
2  7.277780e+06  8.000000e+06  1.001568e+11   CNTT.xlsx  
3  1.048474e+08  1.048474e+08  2.369552e+12   CNTT.xlsx  
4  1.703507e+09  1.703507e+09  1.591076e+1

  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Luu_Hanh within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(


⏩ Conditionally filling Market_Cap within Stock_Code (max 5-day gap).


  df[col] = df.groupby('Stock_Code', group_keys=False).apply(



=== Stocks with Missing Free_Float ===
Stock_Code
TLC    1479
SRA    1404
FPT    1404
SAM    1404
POT    1404
Name: count, dtype: int64

=== Dates with Missing Free_Float ===
Date
2008-12-23    13
2008-12-24    13
2008-12-25    13
2008-12-27    13
2008-12-31    28
Name: count, dtype: int64
Dropping 39611 rows with any missing key values.
Dropped 2382 rows with Luu_Hanh <= 0.

=== Missing-Date Coverage Summary ===
                  min        max  count  days  coverage_%
Stock_Code                                               
CKV        2014-08-27 2025-10-01   2772  4054        68.4
CMG        2014-08-27 2025-10-01   2772  4054        68.4
CMT        2014-08-27 2025-10-01   2772  4054        68.4
ELC        2014-08-27 2025-10-01   2772  4054        68.4
FISC       2016-05-04 2022-11-23   1356  2395        56.6
FPT        2014-08-27 2025-10-01   2772  4054        68.4
HIG        2014-08-27 2025-08-26   2748  4018        68.4
HPT        2014-08-27 2025-10-01   2772  4054        68.4
IC

  nghanh_freefloat = df_long.groupby('Date').apply(


✅ Saved to data\processed\ty_le_freefloat_cntt.xlsx

✅ Processing completed for cntt. Results saved.



In [93]:
import pandas as pd
from datetime import timedelta
import os
import sys

# =====================================
# CONFIG
# =====================================
DAUKHI_PATH = r'data\raw\daukhi\daukhi.xlsx'
OUTPUT_PATH = r'data\processed\ty_le_freefloat_daukhi.xlsx'
EXTERNAL_MARKET_CAP_FILE = None  # Optional external data
MAX_GAP_DAYS = 5  # Maximum gap allowed for ffill (in days)

# =====================================
# READ DATA
# =====================================
print("=== Loading Excel ===")
try:
    df_daukhi = pd.read_excel(DAUKHI_PATH, header=7)
except Exception as e:
    print(f"❌ Error loading file: {e}")
    sys.exit(1)

df_daukhi = pd.DataFrame(df_daukhi)
print("=== First 5 rows ===")
print(df_daukhi.head())
print("\n=== Column names ===")
print(df_daukhi.columns.tolist())

# =====================================
# FIND KEY COLUMNS
# =====================================
columns = df_daukhi.columns.tolist()
stt_col = next((col for col in columns if 'STT' in str(col).upper()), None)
date_col = next((col for col in columns if 'Ngày' in str(col)), None)

if not stt_col or not date_col:
    print(f"❌ Could not find STT or Ngày columns. Available columns: {columns}")
    sys.exit(1)

print(f"✅ Using STT column: {stt_col}")
print(f"✅ Using Date column: {date_col}")

# =====================================
# EXTRACT COMPANY DATA
# =====================================
companies = [
    'BSR', 'OIL', 'PEQ', 'PLX', 'POS', 'PTV', 'PVB', 'PVC',
    'PVD', 'PVE', 'PVS', 'TOS', 'PETROVN', 'BTPETROCHE'
]

columns_to_keep = [stt_col, date_col]
melted_dfs = []

for company in companies:
    company_cols = [col for col in columns if company in str(col)]
    if company_cols and len(company_cols) >= 4:
        temp_df = df_daukhi[columns_to_keep + company_cols].copy()
        rename_dict = {
            company_cols[0]: 'Free_Float',
            company_cols[1]: 'Luu_Hanh',
            company_cols[2]: 'Niem_Yet',
            company_cols[3]: 'Market_Cap'
        }
        temp_df = temp_df.rename(columns=rename_dict)
        print(f"\n📊 Company: {company}, Renamed columns: {list(rename_dict.values())}")
        temp_df['Stock_Code'] = company
        temp_df = temp_df[
            [stt_col, date_col, 'Stock_Code', 'Free_Float', 'Luu_Hanh', 'Niem_Yet', 'Market_Cap']
        ]
        melted_dfs.append(temp_df)
    else:
        print(f"⚠️ Skipping {company} — insufficient columns ({len(company_cols)} found).")

if not melted_dfs:
    print("❌ No valid company data found. Check Excel column headers.")
    sys.exit(1)

# =====================================
# COMBINE
# =====================================
df = pd.concat(melted_dfs, ignore_index=True)
df = df.rename(columns={stt_col: 'Index', date_col: 'Date'})
print("\n=== Long Format Data (first 5 rows) ===")
print(df.head())

initial_rows = len(df)

# =====================================
# CLEANUP
# =====================================
df = df.dropna(subset=['Index', 'Stock_Code', 'Date'], how='any')
print(f"Dropped {initial_rows - len(df)} rows with missing Index, Stock_Code, or Date.")

# Standardize columns
df = df.rename(columns={
    'Số CP Free Float\nĐơn vị: Cổ phiếu': 'Free_Float',
    'Số CP lưu hành hiện thời\nĐơn vị: Cổ phiếu': 'Luu_Hanh',
    'Số CP niêm yết\nĐơn vị: Cổ phiếu': 'Niem_Yet',
    'Ngày': 'Date',
    'Mã': 'Stock_Code',
    'Vốn hóa thị trường\nĐơn vị: VND': 'Market_Cap',
    'STT': 'Index'
})

# =====================================
# CONVERT TYPES
# =====================================
for col in ['Index', 'Free_Float', 'Luu_Hanh', 'Niem_Yet', 'Market_Cap']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# =====================================
# MERGE EXTERNAL MARKET CAP (OPTIONAL)
# =====================================
if EXTERNAL_MARKET_CAP_FILE and os.path.exists(EXTERNAL_MARKET_CAP_FILE):
    print(f"📂 Loading external Market Cap from {EXTERNAL_MARKET_CAP_FILE}")
    ext = pd.read_csv(EXTERNAL_MARKET_CAP_FILE)
    ext['Date'] = pd.to_datetime(ext['Date'], errors='coerce')
    ext = ext[['Stock_Code', 'Date', 'Market_Cap']]
    df = df.drop(columns=['Market_Cap'], errors='ignore')
    df = df.merge(ext, on=['Stock_Code', 'Date'], how='left')

# =====================================
# CHECK MISSING
# =====================================
print("\n=== Missing Data Summary ===")
for col in df.columns:
    n_missing = df[col].isna().sum()
    if n_missing:
        print(f"Column {col}: {n_missing} missing values")

# =====================================
# CONDITIONAL FILL LOGIC
# =====================================
df = df.sort_values(['Stock_Code', 'Date'])
print(f"\n⏳ Conditional forward-fill (max gap {MAX_GAP_DAYS} days)...")

def conditional_ffill(group, col):
    group = group.sort_values('Date').copy()
    if group[col].isna().sum() == 0:
        return group

    last_valid_val = None
    last_valid_date = None
    for i in range(len(group)):
        val = group.iloc[i][col]
        if pd.notna(val):
            last_valid_val = val
            last_valid_date = group.iloc[i]['Date']
        elif last_valid_val is not None and last_valid_date is not None:
            gap = (group.iloc[i]['Date'] - last_valid_date).days
            if 0 < gap <= MAX_GAP_DAYS:
                group.at[group.index[i], col] = last_valid_val
    return group

for col in ['Free_Float', 'Luu_Hanh', 'Market_Cap']:
    if col in df.columns:
        print(f"➡️ Filling missing {col} for each stock (≤ {MAX_GAP_DAYS} days)...")
        df = df.groupby('Stock_Code', group_keys=False).apply(lambda g: conditional_ffill(g, col))

# =====================================
# DROP INVALID DATA
# =====================================
df = df.dropna(subset=['Free_Float', 'Luu_Hanh', 'Market_Cap', 'Date'])
df = df[df['Luu_Hanh'] > 0]
df = df[df['Free_Float'] >= 0]
df = df[df['Free_Float'] <= df['Luu_Hanh']]

print(f"\n✅ Remaining rows after filtering: {len(df)}")
print(f"Unique stocks: {df['Stock_Code'].nunique()}")
print(f"Unique dates: {df['Date'].nunique()}")

# =====================================
# INDUSTRY FREE FLOAT RATIO
# =====================================
nghanh_ff = (
    df.groupby('Date')
    .apply(lambda x: x['Free_Float'].sum() / x['Luu_Hanh'].sum() if x['Luu_Hanh'].sum() else 0)
    .reset_index(name='Tỷ lệ FF')
)

# =====================================
# EXPORT
# =====================================
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
nghanh_ff.to_excel(OUTPUT_PATH, index=False)

print(f"\n✅ Saved industry Free Float ratio to: {OUTPUT_PATH}")
print("\n=== Sample Industry Free Float Ratio ===")
print(nghanh_ff.head(10))


=== Loading Excel ===
=== First 5 rows ===
   STT       Ngày                                 BSR  \
0  NaN        NaT  Số CP Free Float\nĐơn vị: Cổ phiếu   
1    1 2025-10-01                           310049961   
2    2 2025-09-30                           310049961   
3    3 2025-09-29                           310049961   
4    4 2025-09-26                           310049961   

                                        BSR.1  \
0  Số CP lưu hành hiện thời\nĐơn vị: Cổ phiếu   
1                                  3100499616   
2                                  3100499616   
3                                  3100499616   
4                                  3100499616   

                             BSR.2                             BSR.3  \
0  Vốn hóa thị trường\nĐơn vị: VND  Số CP niêm yết\nĐơn vị: Cổ phiếu   
1                   82318264804800                        3100499616   
2                   81233089939200                        3100499616   
3                   83403439670

  df = df.groupby('Stock_Code', group_keys=False).apply(lambda g: conditional_ffill(g, col))


➡️ Filling missing Luu_Hanh for each stock (≤ 5 days)...


  df = df.groupby('Stock_Code', group_keys=False).apply(lambda g: conditional_ffill(g, col))


➡️ Filling missing Market_Cap for each stock (≤ 5 days)...


  df = df.groupby('Stock_Code', group_keys=False).apply(lambda g: conditional_ffill(g, col))



✅ Remaining rows after filtering: 30190
Unique stocks: 13
Unique dates: 2772

✅ Saved industry Free Float ratio to: data\processed\ty_le_freefloat_daukhi.xlsx

=== Sample Industry Free Float Ratio ===
        Date  Tỷ lệ FF
0 2014-08-27  0.443419
1 2014-08-28  0.443419
2 2014-08-29  0.443419
3 2014-09-03  0.443419
4 2014-09-04  0.443419
5 2014-09-05  0.443419
6 2014-09-08  0.443419
7 2014-09-09  0.443419
8 2014-09-10  0.443419
9 2014-09-11  0.443419


  .apply(lambda x: x['Free_Float'].sum() / x['Luu_Hanh'].sum() if x['Luu_Hanh'].sum() else 0)


In [None]:
import pandas as pd
import glob
import os
import numpy as np
import matplotlib.pyplot as plt

folder_path = f"data\\processed"
excel_files = glob.glob(os.path.join(folder_path, "*.xlsx"))

if not excel_files:
    print(f"No .xlsx files found in {folder_path}")

dataframes = []

for file in excel_files:
    print(f"Reading {file}...")
    try:
        df = pd.read_excel(file, header=0)  
        df['SourceFile'] = os.path.basename(file)
        dataframes.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")
        continue

if dataframes:
    print(f"No valid DataFrames created from files in {folder_path}")
    if dataframes:
        df_all = pd.concat(dataframes, ignore_index=True)
        print(f"✅ Tổng số dòng sau khi gộp: {len(df_all):,}")
else:
    print(f"⚠️ Không có file hợp lệ trong {folder_path}")
df_all.head()

Reading data\processed\ty_le_freefloat_cntt.xlsx...
Reading data\processed\ty_le_freefloat_congnghiep.xlsx...
Reading data\processed\ty_le_freefloat_daukhi.xlsx...
Reading data\processed\ty_le_freefloat_dichvutieudung.xlsx...
Reading data\processed\ty_le_freefloat_duocphamyte.xlsx...
Reading data\processed\ty_le_freefloat_hangtieudung.xlsx...
Reading data\processed\ty_le_freefloat_nganhang.xlsx...
Reading data\processed\ty_le_freefloat_nguyenvatlieu.xlsx...
Reading data\processed\ty_le_freefloat_taichinh.xlsx...
Reading data\processed\ty_le_freefloat_tienichcd.xlsx...
Reading data\processed\ty_le_freefloat_vienthong.xlsx...
No valid DataFrames created from files in data\processed
✅ Tổng số dòng sau khi gộp: 30,534


Unnamed: 0,Date,Tỷ lệ FF,SourceFile
0,2014-08-27,0.622417,ty_le_freefloat_cntt.xlsx
1,2014-08-28,0.622417,ty_le_freefloat_cntt.xlsx
2,2014-08-29,0.622417,ty_le_freefloat_cntt.xlsx
3,2014-09-03,0.622417,ty_le_freefloat_cntt.xlsx
4,2014-09-04,0.622417,ty_le_freefloat_cntt.xlsx


In [97]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30534 entries, 0 to 30533
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        30534 non-null  datetime64[ns]
 1   Tỷ lệ FF    30534 non-null  float64       
 2   SourceFile  30534 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 715.8+ KB


In [98]:
import pandas as pd

# Giả sử df là dataframe của bạn
df_all["Ngành"] = (
    df_all["SourceFile"]
    .str.replace("ty_le_freefloat_", "", regex=False)
    .str.replace(".xlsx", "", regex=False)
)
df_all['Date'] = pd.to_datetime(df_all['Date'])
df_all.drop(columns=['SourceFile'],inplace=True)

In [99]:
df_all.head()

Unnamed: 0,Date,Tỷ lệ FF,Ngành
0,2014-08-27,0.622417,cntt
1,2014-08-28,0.622417,cntt
2,2014-08-29,0.622417,cntt
3,2014-09-03,0.622417,cntt
4,2014-09-04,0.622417,cntt


In [100]:
df_all['Date'].nunique()

2788

In [101]:
df_min = df_all['Date'].min()
df_max = df_all['Date'].max()
print(df_min,df_max)

2012-12-28 00:00:00 2025-10-20 00:00:00


In [102]:
import pandas as pd

path = r"data\raw\feature_chinh_cac_nganh.xlsx"
df_von_hoa = pd.read_excel(path, header= 0,sheet_name= 'vonhoa')
# Chuyển từ wide -> long
df_melted = df_von_hoa.melt(
    id_vars=["Ngày"],  # Cột giữ nguyên
    var_name="Ngành",  # Tên cột mới cho tên ngành
    value_name="Vốn hóa thị trường"  # Tên cột mới cho giá trị
)

# Làm sạch tên ngành (loại bỏ phần "Vốn hóa thị trường ")
df_melted["Ngành"] = df_melted["Ngành"].str.replace("Vốn hóa thị trường ", "", regex=False)

df_melted['Date'] = pd.to_datetime(df_melted['Ngày'])
df_melted.drop(columns=['Ngày'], inplace= True)

print(df_melted.head())
df_melted["Vốn hóa thị trường"] = (
    df_melted.groupby("Ngành")["Vốn hóa thị trường"].transform(lambda x: x.interpolate())
)
df_melted["Vốn hóa thị trường"] = (
    df_melted.groupby("Ngành")["Vốn hóa thị trường"].transform(lambda x: x.interpolate().ffill().bfill())
)


    Ngành  Vốn hóa thị trường       Date
0  daukhi        1.730623e+14 2025-10-01
1  daukhi        1.721406e+14 2025-09-30
2  daukhi        1.760597e+14 2025-09-29
3  daukhi        1.782136e+14 2025-09-26
4  daukhi        1.804920e+14 2025-09-25


In [103]:
# Giả sử cột ngành tên là "Ngành"
# Làm sạch mềm, an toàn
df_melted['Ngành'] = (
    df_melted['Ngành']
    .astype(str)
    .str.replace('\n', '', regex=False)  # Xoá ký tự xuống dòng
    .str.replace('\t', '', regex=False)  # Xoá tab
    .str.strip()                         # Xoá khoảng trắng đầu/cuối
)


df_all['Ngành'] = (
    df_all['Ngành']
    .astype(str)
    .str.replace('\n', '', regex=False)  # Xoá ký tự xuống dòng
    .str.replace('\t', '', regex=False)  # Xoá tab
    .str.strip()                         # Xoá khoảng trắng đầu/cuối
)

In [104]:
df_all = df_all[(df_all['Date'] >= '2015-01-01') & (df_all['Date'] <= '2025-10-01')]

In [105]:
df_melted = df_melted[(df_melted['Date'] >= '2015-01-01') & (df_melted['Date'] <= '2025-10-01')]

In [106]:
df_merged = pd.merge(
    df_melted,
    df_all[["Date", "Ngành", "Tỷ lệ FF"]],
    on=["Date", "Ngành"],
    how="left"
)

In [107]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29513 entries, 0 to 29512
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Ngành               29513 non-null  object        
 1   Vốn hóa thị trường  29513 non-null  float64       
 2   Date                29513 non-null  datetime64[ns]
 3   Tỷ lệ FF            29513 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 922.4+ KB


In [108]:
df_merged["Vốn hóa FF Adjusted"] = df_merged["Vốn hóa thị trường"] * df_merged["Tỷ lệ FF"]

In [109]:
df_merged.isnull().sum()


Ngành                  0
Vốn hóa thị trường     0
Date                   0
Tỷ lệ FF               0
Vốn hóa FF Adjusted    0
dtype: int64

In [114]:
df_merged.to_excel('data\\cal_sector_index\\adjusted_von_hoa.xlsx', index= False)