In [1]:
import pandas as pd
import numpy as np
import re
from src.config import BLD, RAW

In [14]:
# Map the raw balance-sheet field codes to the readable column names used in
# the rest of the notebook. Only these fields are read from the CSV.
columns = dict(
    Stkcd='stock_id',
    Accper='year',  # holds date strings at this point; reduced to a calendar year later
    A001101000='cash',  # monetary funds
    A002101000='short_term_debt',  # short-term borrowings
    A002125000='current_portion_long_debt',  # long-term borrowings due within one year
    A002201000='long_term_debt',  # long-term borrowings
    A002203000='bonds_payable',  # bonds payable
    A002211000='lease_liabilities',  # presumably lease liabilities, per the chosen name - confirm against the field dictionary
)

balance_sheet_path = RAW / 'csmar/基本信息/资产负债表/FS_Combas.csv'

# Read the stock code as text so that leading zeros (e.g. '000001') survive.
df1 = (
    pd.read_csv(balance_sheet_path, usecols=list(columns), dtype={'Stkcd': str})
    .rename(columns=columns)
)
df1


Unnamed: 0,stock_id,year,cash,short_term_debt,current_portion_long_debt,long_term_debt,bonds_payable,lease_liabilities
0,000001,2013-12-31,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,8.102000e+09,
1,000001,2014-12-31,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,4.175000e+10,
2,000001,2015-12-31,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.129630e+11,
3,000001,2016-12-31,0.000000e+00,0.000000e+00,,,2.634640e+11,
4,000001,2017-12-31,,,,,3.424920e+11,
...,...,...,...,...,...,...,...,...
28452,920819,2016-12-31,3.091401e+09,3.589184e+09,1.040550e+09,,,
28453,920819,2017-12-31,3.580529e+09,2.896739e+09,9.801300e+07,1.374927e+09,1.189154e+09,
28454,920819,2018-12-31,1.841292e+09,2.117293e+09,,1.428159e+09,1.191109e+09,
28455,920819,2019-12-31,1.493290e+09,2.428895e+09,1.344661e+09,,1.193200e+09,


In [31]:

# Map the raw market-value field names to the readable column names used in
# the rest of the notebook. Only these fields are read from the CSV.
columns = dict(
    Symbol='stock_id',
    EndDate='year',  # holds date strings at this point; reduced to a calendar year later
    MarketValue='market_cap',
)

market_value_path = RAW / 'csmar/基本信息/上市公司资本结构表/EVA_Structure.csv'

# Read the stock symbol as text so that leading zeros (e.g. '000001') survive.
df2 = (
    pd.read_csv(market_value_path, usecols=list(columns), dtype={'Symbol': str})
    .rename(columns=columns)
)

df2


Unnamed: 0,stock_id,year,market_cap
0,000001,2014-12-31,1.809703e+11
1,000001,2015-12-31,1.715610e+11
2,000001,2016-12-31,1.562507e+11
3,000001,2017-12-31,2.283665e+11
4,000001,2018-12-31,1.610585e+11
...,...,...,...
23387,920445,2020-12-31,9.456940e+08
23388,920489,2020-12-31,7.044706e+08
23389,920682,2020-12-31,1.153600e+09
23390,920799,2020-12-31,1.713396e+09


In [36]:

# Enterprise value (EV) per stock and year:
#     ev = market_cap + interest_bearing_debt - cash
# Inputs: `df1` (balance-sheet items) and `df2` (market capitalisation).
# Output: `df_merged`, one row per (stock_id, year) present in both inputs.

# Copy the balance-sheet frame and reduce the report date to its calendar year.
df1_ev = df1.copy()
df1_ev['year'] = pd.to_datetime(df1_ev['year']).dt.year

# Interest-bearing debt is the row-wise sum of the debt items below. A missing
# item counts as zero, so a row with every item missing gets 0 rather than NaN.
debt_columns = [
    'short_term_debt',
    'current_portion_long_debt',
    'long_term_debt',
    'bonds_payable',
    'lease_liabilities',
]
df1_ev['interest_bearing_debt'] = df1_ev[debt_columns].fillna(0).sum(axis=1)

# Copy the market-value frame and reduce its date to the calendar year as well.
df2_ev = df2.copy()
df2_ev['year'] = pd.to_datetime(df2_ev['year']).dt.year

# Inner-join the two tables: keep only stock-years that appear in both.
merge_keys = ['stock_id', 'year']
df_merged = pd.merge(
    df1_ev[merge_keys + ['interest_bearing_debt', 'cash']],
    df2_ev[merge_keys + ['market_cap']],
    on=merge_keys,
    how='inner',
)

# Both sides were collapsed from a full date to a year, so a source holding
# more than one record per stock-year would silently multiply rows in the
# join. Fail loudly instead of producing duplicated EV rows.
assert not df_merged.duplicated(subset=merge_keys).any(), (
    'duplicate (stock_id, year) rows after merge; '
    'deduplicate the inputs before computing EV'
)

# Compute EV. Unlike the debt items, `cash` is not zero-filled, so a missing
# cash value propagates to a missing `ev`.
# NOTE(review): confirm that leaving `ev` as NaN in that case is intended.
df_merged['ev'] = (
    df_merged['market_cap']
    + df_merged['interest_bearing_debt']
    - df_merged['cash']
)


In [37]:
# Display the merged frame (interest-bearing debt, cash, market cap and the
# derived `ev` column) for visual inspection.
df_merged

Unnamed: 0,stock_id,year,interest_bearing_debt,cash,market_cap,ev
0,000001,2014,4.175000e+10,0.000000e+00,1.809703e+11,2.227203e+11
1,000001,2015,2.129630e+11,0.000000e+00,1.715610e+11,3.845240e+11
2,000001,2016,2.634640e+11,0.000000e+00,1.562507e+11,4.197147e+11
3,000001,2017,3.424920e+11,,2.283665e+11,
4,000001,2018,3.818840e+11,,1.610585e+11,
...,...,...,...,...,...,...
23387,920445,2020,0.000000e+00,1.613909e+08,9.456940e+08,7.843031e+08
23388,920489,2020,1.425946e+08,2.141773e+08,7.044706e+08,6.328879e+08
23389,920682,2020,5.226783e+08,3.024732e+08,1.153600e+09,1.373805e+09
23390,920799,2020,1.302803e+07,2.180899e+08,1.713396e+09,1.508335e+09


In [38]:
# Persist the EV table as a Parquet file under `BLD`, without the row index.
output_path = BLD / 'ev.parquet'
# Create the target directory first so the write also succeeds when it does
# not exist yet (e.g. on a fresh checkout).
output_path.parent.mkdir(parents=True, exist_ok=True)
df_merged.to_parquet(output_path, index=False)