In [1]:
import pandas as pd
import numpy as np
from src.config import BLD, RAW, YEAR_RANGE
import pdfplumber
import re

df_basic_file = RAW/'csmar'/'基本信息'/'上市公司基本信息年度表215937759'/'STK_LISTEDCOINFOANL.csv'
df_basic = pd.read_csv(df_basic_file, dtype={'Symbol': str})
df_basic = df_basic.fillna(0) 
df_basic["year"] = pd.to_datetime(df_basic["EndDate"]).dt.year.astype(int)
df_basic.rename(columns={'Symbol': 'Stkcd'}, inplace=True)
df_basic = df_basic[(df_basic['Stkcd'] >= "000001") & (df_basic['Stkcd'] <= "679999")]
df_basic['Industry_Main'] = df_basic['IndustryCode'].str[0]
industry_changes = df_basic[df_basic['year'].between(2014, 2019)].groupby('Stkcd')['Industry_Main'].apply(set)

# 筛选出行业代码集合长度大于1的公司，表示发生过行业代码变化
companies_with_changes = industry_changes[industry_changes.apply(len) > 1].index

df_basic = df_basic[~df_basic['Stkcd'].isin(companies_with_changes)]
df_basic = df_basic[df_basic['Industry_Main'] != 'G']
df_basic = df_basic[df_basic['Industry_Main'] != 'K']
df_basic

Unnamed: 0,Stkcd,ShortName,EndDate,IndustryCode,FullName,year,Industry_Main
0,000001,平安银行,2012-12-31,J66,平安银行股份有限公司,2012,J
1,000001,平安银行,2013-12-31,J66,平安银行股份有限公司,2013,J
2,000001,平安银行,2014-12-31,J66,平安银行股份有限公司,2014,J
3,000001,平安银行,2015-12-31,J66,平安银行股份有限公司,2015,J
4,000001,平安银行,2016-12-31,J66,平安银行股份有限公司,2016,J
...,...,...,...,...,...,...,...
41815,605598,上海港湾,2022-12-31,E48,上海港湾基础建设(集团)股份有限公司,2022,E
41816,605598,上海港湾,2023-12-31,E48,上海港湾基础建设(集团)股份有限公司,2023,E
41817,605599,菜百股份,2021-12-31,F52,北京菜市口百货股份有限公司,2021,F
41818,605599,菜百股份,2022-12-31,F52,北京菜市口百货股份有限公司,2022,F


In [2]:
df_subsidiary_file = RAW/'csmar'/'基本信息'/'上市公司子公司情况表220415924'/'FN_Fn061.csv'
df_subsidiary = pd.read_csv(df_subsidiary_file, dtype={'Stkcd': str})
df_subsidiary["year"] = pd.to_datetime(df_subsidiary["EndDate"]).dt.year.astype(int)
df_subsidiary = df_subsidiary[(df_subsidiary['Stkcd'] >= "000001") & (df_subsidiary['Stkcd'] <= "679999")]

industry_changes = df_basic[df_basic['year'].between(2014, 2019)].groupby('Stkcd')['Industry_Main'].apply(set)
companies_with_changes = industry_changes[industry_changes.apply(len) > 1].index
df_subsidiary = df_subsidiary[~df_subsidiary['Stkcd'].isin(companies_with_changes)]
df_basic['company'] = df_basic['FullName']  # 母公司名称列
df_subsidiary['company'] = df_subsidiary['FN_Fn06101']
merged_df_list = pd.concat(
    [
        df_basic[['Stkcd', 'year', 'company']], 
        df_subsidiary[['Stkcd', 'year', 'company']],
    ],
    axis=0,
    ignore_index=True
)
merged_df_list.sort_values(by=['Stkcd', 'year'], inplace=True)
merged_df_list

Unnamed: 0,Stkcd,year,company
0,000001,2012,平安银行股份有限公司
36069,000001,2012,平安银行股份有限公司
1,000001,2013,平安银行股份有限公司
2,000001,2014,平安银行股份有限公司
3,000001,2015,平安银行股份有限公司
...,...,...,...
36064,605598,2022,上海港湾基础建设(集团)股份有限公司
36065,605598,2023,上海港湾基础建设(集团)股份有限公司
36066,605599,2021,北京菜市口百货股份有限公司
36067,605599,2022,北京菜市口百货股份有限公司


In [3]:
df_export_file = RAW / 'cn_custom_data'/'2014_2016_export_data.parquet'
df_export = pd.read_parquet(df_export_file)
result_df = pd.merge(
    merged_df_list,  # 包含母公司和子公司名称的表
    df_export[['Company_Name', 'Year', 'Country_Name', 'Export_Amount', 'Product_Code']],
    left_on=['company', 'year'],
    right_on=['Company_Name', 'Year'],
    how='inner'
)
result_df['hs_code'] = result_df['Product_Code'].astype(str).str[:6].astype(int)
# 筛选出出口美国的数据
df_usa = result_df[result_df['Country_Name'] == '美国']
exchange_rates = {2014: 6.128333, 2015: 6.205000, 2016: 6.614167}
df_usa = df_usa.copy()

# 计算转换后的金额
df_usa['export'] = df_usa.apply(lambda row: row['Export_Amount'] * exchange_rates[row['year']], axis=1)

# 按HS代码、公司代码、年份分类
grouped_df = df_usa.groupby(['Stkcd', 'year', 'hs_code'], as_index=False).agg({
    'Export_Amount': 'sum',  # 汇总出口金额
    'export': 'sum'  # 汇总转换后的金额
})

# 按公司代码（Stkcd）和年份（year）排序
df_usa = grouped_df.sort_values(by=['Stkcd', 'year']).reset_index(drop=True)
# 筛选出至少有2年数据的公司代码
company_year_counts = df_usa.groupby('Stkcd')['year'].nunique()
valid_companies = company_year_counts[company_year_counts >= 2].index

# 筛选出这些公司代码对应的行
df_usa = df_usa[df_usa['Stkcd'].isin(valid_companies)]
# 计算2014-2016年的公司-商品组合的平均出口金额
df_usa_avg = df_usa.groupby(['Stkcd', 'hs_code'])['export'].mean().reset_index()

# 重命名列
df_usa_avg.rename(columns={'export': 'avg_export_amount'}, inplace=True)
df_usa_avg

Unnamed: 0,Stkcd,hs_code,avg_export_amount
0,000009,250410,3.045076e+05
1,000009,284290,2.976375e+03
2,000009,380110,2.327422e+07
3,000009,381590,3.442277e+05
4,000009,392350,1.448503e+04
...,...,...,...
25344,603997,392690,1.804181e+04
25345,603997,600632,1.218006e+05
25346,603997,731814,2.830625e+01
25347,603997,940190,9.090358e+06


In [4]:
pdf_path = RAW/'us_tariff'/'FRN301.pdf'
all_text = ""
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        all_text += page.extract_text() + " "

# Continue with your regex extraction as before
hts_codes = re.findall(r'\b\d{8}\b', all_text)
hts_prefixes = {code[:6] for code in hts_codes}
hts_list_1 = sorted(hts_prefixes)


pdf_path = RAW/'us_tariff'/'list_2.pdf'  # 替换为您的PDF路径

all_text = ""
with pdfplumber.open(pdf_path) as pdf:
    for page_number in range(4, 9):  # 页码从 0 开始，第 5 页是索引 4
        page = pdf.pages[page_number]
        all_text += page.extract_text() + " "

# 使用正则表达式提取8位 HTSUS 代码

hts_codes = re.findall(r'\b\d{4}\.\d{2}\.\d{2}\b', all_text)

# 格式化为6位代码（去掉圆点和后两位）
hts_prefixes = {code.replace(".", "")[:6] for code in hts_codes}
hts_list_2 = sorted(hts_prefixes)


pdf_path = RAW/'us_tariff'/'list_3.pdf'  # 替换为您的PDF路径

# 合并所有页面的文本
all_text = ""
with pdfplumber.open(pdf_path) as pdf:
    for page_number in range(3, 28): 
        page = pdf.pages[page_number]
        all_text += page.extract_text() + " "

# 使用正则提取所有符合8位格式的代码
hts_codes = re.findall(r'\b\d{4}\.\d{2}\.\d{2}\b', all_text)

# 格式化为6位代码（去掉圆点和后两位）
hts_prefixes = {code.replace(".", "")[:6] for code in hts_codes}

# 排序结果
hts_list_3 = sorted(hts_prefixes)


# 文件路径
pdf_path = RAW/'us_tariff'/'list_4.pdf'  # 替换为您的PDF路径

# 合并所有页面的文本
all_text = ""
with pdfplumber.open(pdf_path) as pdf:
    for page_number in range(3, 25): 
        page = pdf.pages[page_number]
        all_text += page.extract_text() + " "

# 使用正则提取所有符合8位格式的代码
hts_codes = re.findall(r'\b\d{4}\.\d{2}\.\d{2}\b', all_text)

# 格式化为6位代码（去掉圆点和后两位）
hts_prefixes = {code.replace(".", "")[:6] for code in hts_codes}

# 排序结果
hts_list_4 = sorted(hts_prefixes)

set1 = set(hts_list_1)
set2 = set(hts_list_2)
set3 = set(hts_list_3)
set4 = set(hts_list_4)
set5 = set1 | set2 | set3 
set6 = (set4 - set5)
hts_list_1_3 = sorted([int(item) for item in set5])
hts_list_4 = sorted([int(item) for item in set6])

df_usa_avg['affected_1_3'] = df_usa_avg['hs_code'].isin(hts_list_1_3)
df_usa_avg['affected_4'] = df_usa_avg['hs_code'].isin(hts_list_4)

# 按公司代码计算总金额
df_total = df_usa_avg.groupby('Stkcd')['avg_export_amount'].sum().reset_index()
df_total.rename(columns={'avg_export_amount': 'total_export_amount'}, inplace=True)

# 按公司代码计算受影响金额
df_1_3 = df_usa_avg[df_usa_avg['affected_1_3']].groupby('Stkcd')['avg_export_amount'].sum().reset_index()
df_1_3.rename(columns={'avg_export_amount': 'amount_affected_1_3'}, inplace=True)

df_4 = df_usa_avg[df_usa_avg['affected_4']].groupby('Stkcd')['avg_export_amount'].sum().reset_index()
df_4.rename(columns={'avg_export_amount': 'amount_affected_4'}, inplace=True)

# 合并结果
result = pd.merge(df_total, df_1_3, on='Stkcd', how='left')
result = pd.merge(result, df_4, on='Stkcd', how='left')

result.fillna(0, inplace=True)
result

Unnamed: 0,Stkcd,total_export_amount,amount_affected_1_3,amount_affected_4
0,000009,3.130058e+07,3.099408e+07,0.000000e+00
1,000012,8.859766e+06,8.859766e+06,0.000000e+00
2,000016,8.749128e+08,8.027679e+08,5.054625e+05
3,000020,5.433077e+06,9.845915e+05,0.000000e+00
4,000021,1.762177e+08,9.331029e+07,8.093301e+07
...,...,...,...,...
1028,603968,1.824172e+08,1.815179e+08,0.000000e+00
1029,603969,7.636192e+06,7.636192e+06,0.000000e+00
1030,603989,6.877404e+06,6.877404e+06,0.000000e+00
1031,603996,7.367749e+08,6.859205e+08,3.629002e+06


In [5]:
income_columns = {"Stkcd": "Stkcd","Accper": "year", "B001100000": "revenue"}
df_income_file = RAW/'csmar'/'基本信息'/'利润表000222262'/'FS_Comins.csv'
df_income = pd.read_csv(df_income_file, usecols=income_columns.keys(), dtype={'Stkcd': str}).rename(columns=income_columns)
df_income.loc[:, "year"] = pd.to_datetime(df_income["year"]).dt.year
df_income = df_income[(df_income['Stkcd'] >= "000001") & (df_income['Stkcd'] <= "679999")]
merged_df = pd.merge(df_income, df_basic[['Stkcd', 'year', 'Industry_Main']], on=['Stkcd', 'year'], how='left')
merged_df = merged_df.dropna()  # 去掉包含 NaN 的行

complete_years = set(range(2013, 2019)) 
company_year_counts = merged_df.groupby("Stkcd")["year"].apply(set)
valid_companies = company_year_counts[company_year_counts.apply(lambda x: complete_years.issubset(x))].index

average_revenue_df = (
    merged_df[(merged_df['year'] >= 2014) & (merged_df['year'] <= 2016)]
    .groupby('Stkcd')['revenue']
    .mean()
    .reset_index()
    .rename(columns={'revenue': 'average_revenue'})
)
merged_df = pd.merge(average_revenue_df, result, on='Stkcd', how='left').fillna(0)


merged_df['ratio_affected_1_3'] = merged_df['amount_affected_1_3'] / merged_df['average_revenue']
merged_df['ratio_affected_4'] = merged_df['amount_affected_4'] / merged_df['average_revenue']
merged_df = merged_df[merged_df["total_export_amount"] > 0]
merged_df

Unnamed: 0,Stkcd,average_revenue,total_export_amount,amount_affected_1_3,amount_affected_4,ratio_affected_1_3,ratio_affected_4
0,000009,5.218518e+09,3.130058e+07,3.099408e+07,0.000000e+00,0.005939,0.000000
1,000012,7.816492e+09,8.859766e+06,8.859766e+06,0.000000e+00,0.001133,0.000000
2,000016,1.937267e+10,8.749128e+08,8.027679e+08,5.054625e+05,0.041438,0.000026
4,000020,6.044878e+08,5.433077e+06,9.845915e+05,0.000000e+00,0.001629,0.000000
5,000021,1.562505e+10,1.762177e+08,9.331029e+07,8.093301e+07,0.005972,0.005180
...,...,...,...,...,...,...,...
2449,603968,1.285999e+09,1.824172e+08,1.815179e+08,0.000000e+00,0.141149,0.000000
2450,603969,1.621050e+09,7.636192e+06,7.636192e+06,0.000000e+00,0.004711,0.000000
2456,603989,1.431074e+09,6.877404e+06,6.877404e+06,0.000000e+00,0.004806,0.000000
2459,603996,3.825821e+09,7.367749e+08,6.859205e+08,3.629002e+06,0.179287,0.000949


In [6]:
merged_df.describe(include='all').transpose()



Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Stkcd,940.0,940.0,9.0,1.0,,,,,,,
average_revenue,940.0,,,,14359478550.178566,102720233004.17366,80274150.52,950851195.038333,2151115205.155,5622792486.115,2258569333333.333
total_export_amount,940.0,,,,163984145.76287,584187323.892677,17614.511227,4136818.777289,25770328.745239,107159221.298105,9314945778.835236
amount_affected_1_3,940.0,,,,96216685.736717,288622164.124371,0.0,1617302.974641,12443767.983868,70261802.763279,5374149070.375442
amount_affected_4,940.0,,,,29842742.079681,207623949.201845,0.0,0.0,0.0,363735.904929,5332455878.091283
ratio_affected_1_3,940.0,,,,0.030452,0.066636,0.0,0.000689,0.005051,0.026063,0.643128
ratio_affected_4,940.0,,,,0.006638,0.03253,0.0,0.0,0.0,0.000113,0.430732


In [14]:
merged_df.nlargest(80, 'ratio_affected_1_3')


Unnamed: 0,Stkcd,average_revenue,total_export_amount,amount_affected_1_3,amount_affected_4,ratio_affected_1_3,ratio_affected_4,impact_category
1346,300403,7.012165e+08,4.509718e+08,4.509718e+08,0.000000e+00,0.643128,0.000000e+00,1
670,002444,3.215292e+09,2.246736e+09,1.868104e+09,1.217765e+08,0.581006,3.787418e-02,1
1151,300179,1.864026e+08,9.313472e+07,9.309539e+07,3.498894e+04,0.499432,1.877063e-04,1
2394,603703,3.931661e+08,1.835168e+08,1.817354e+08,0.000000e+00,0.462236,0.000000e+00,1
2091,600962,9.798564e+08,4.483933e+08,4.464387e+08,1.954632e+06,0.455616,1.994815e-03,1
...,...,...,...,...,...,...,...,...
610,002376,1.279445e+09,1.370272e+08,1.367235e+08,4.696059e+02,0.106862,3.670386e-07,1
2392,603699,2.309872e+09,2.450267e+08,2.450108e+08,7.350709e+03,0.106071,3.182301e-06,1
810,002613,8.732364e+08,1.053555e+08,9.220220e+07,1.329448e+03,0.105587,1.522437e-06,1
512,002250,3.699121e+09,7.389691e+08,3.799660e+08,1.820167e+06,0.102718,4.920539e-04,1


In [13]:
merged_df.nlargest(20, 'ratio_affected_4')


Unnamed: 0,Stkcd,average_revenue,total_export_amount,amount_affected_1_3,amount_affected_4,ratio_affected_1_3,ratio_affected_4,impact_category
812,2615,948887000.0,434678400.0,871564.9,408715600.0,0.000919,0.430732,2
383,2091,14360450000.0,5471505000.0,58682340.0,5332456000.0,0.004086,0.371329,2
827,2634,377228600.0,139312900.0,0.0,139312900.0,0.0,0.369306,2
375,2083,4377938000.0,1472208000.0,7908850.0,1444835000.0,0.001807,0.330026,2
1975,600735,1319507000.0,366154800.0,6987583.0,271463700.0,0.005296,0.205731,2
635,2404,2091144000.0,422279900.0,20115930.0,400741600.0,0.00962,0.191638,2
1920,600626,7799814000.0,2115738000.0,613619800.0,1456536000.0,0.078671,0.18674,2
441,2166,574504600.0,169917800.0,480155.5,96567790.0,0.000836,0.168089,2
634,2403,2332030000.0,526622400.0,214003.4,378107100.0,9.2e-05,0.162136,2
341,2045,2271977000.0,375525500.0,5234416.0,365175800.0,0.002304,0.16073,2


In [11]:
threshold_1_3 = 0.1
threshold_4 = 0.1

def classify_impact(row):
    if row['ratio_affected_1_3'] > threshold_1_3 and row['ratio_affected_4'] > threshold_4:
        return 3  # both impacted
    elif row['ratio_affected_1_3'] > threshold_1_3:
        return 1  # list1-3
    elif row['ratio_affected_4'] > threshold_4:
        return 2  # list4
    else:
        return 4  # unaffected

# 新增分类列
merged_df['impact_category'] = merged_df.apply(classify_impact, axis=1)

# 统计各类别数量
impact_summary = merged_df['impact_category'].value_counts().rename(index={1: 'list1-3', 2: 'list4', 3: 'both', 4: 'unaffected'})


In [12]:
impact_summary

impact_category
unaffected    842
list1-3        80
list4          18
Name: count, dtype: int64

In [17]:
result[result['Stkcd'] == '000088']

Unnamed: 0,Stkcd,total_export_amount,amount_affected_1_3,amount_affected_4
15,88,5445563000.0,1378883000.0,2362642000.0


In [22]:
df_basic

Unnamed: 0,Stkcd,ShortName,EndDate,IndustryCode,FullName,year,Industry_Main,company
0,000001,平安银行,2012-12-31,J66,平安银行股份有限公司,2012,J,平安银行股份有限公司
1,000001,平安银行,2013-12-31,J66,平安银行股份有限公司,2013,J,平安银行股份有限公司
2,000001,平安银行,2014-12-31,J66,平安银行股份有限公司,2014,J,平安银行股份有限公司
3,000001,平安银行,2015-12-31,J66,平安银行股份有限公司,2015,J,平安银行股份有限公司
4,000001,平安银行,2016-12-31,J66,平安银行股份有限公司,2016,J,平安银行股份有限公司
...,...,...,...,...,...,...,...,...
41815,605598,上海港湾,2022-12-31,E48,上海港湾基础建设(集团)股份有限公司,2022,E,上海港湾基础建设(集团)股份有限公司
41816,605598,上海港湾,2023-12-31,E48,上海港湾基础建设(集团)股份有限公司,2023,E,上海港湾基础建设(集团)股份有限公司
41817,605599,菜百股份,2021-12-31,F52,北京菜市口百货股份有限公司,2021,F,北京菜市口百货股份有限公司
41818,605599,菜百股份,2022-12-31,F52,北京菜市口百货股份有限公司,2022,F,北京菜市口百货股份有限公司


In [25]:
unique_fullnames = df_basic[df_basic['Stkcd'].isin(merged_df['Stkcd'])].drop_duplicates(subset=['Stkcd', 'FullName'])
unique_fullnames_latest = unique_fullnames.groupby("Stkcd", as_index=False).last()



In [None]:
# 分批打印，每批 20 条记录
batch_size = 20
for i in range(0, len(unique_fullnames_latest), batch_size):
    batch = unique_fullnames_latest.iloc[i:i+batch_size]
    for index, row in batch.iterrows():
        print(f"Stkcd: {row['Stkcd']}, FullName: {row['FullName']}")
    input("按回车键继续显示下一批...")

Stkcd: 000009, FullName: 中国宝安集团股份有限公司
Stkcd: 000012, FullName: 中国南玻集团股份有限公司
Stkcd: 000016, FullName: 康佳集团股份有限公司
Stkcd: 000020, FullName: 深圳中恒华发股份有限公司
Stkcd: 000021, FullName: 深圳长城开发科技股份有限公司
Stkcd: 000026, FullName: 飞亚达精密科技股份有限公司
Stkcd: 000030, FullName: 富奥汽车零部件股份有限公司
Stkcd: 000039, FullName: 中国国际海运集装箱(集团)股份有限公司
Stkcd: 000049, FullName: 深圳市德赛电池科技股份有限公司
Stkcd: 000050, FullName: 天马微电子股份有限公司
Stkcd: 000058, FullName: 深圳赛格股份有限公司
Stkcd: 000063, FullName: 中兴通讯股份有限公司
Stkcd: 000066, FullName: 中国长城科技集团股份有限公司
Stkcd: 000100, FullName: TCL科技集团股份有限公司
Stkcd: 000153, FullName: 安徽丰原药业股份有限公司
Stkcd: 000157, FullName: 中联重科股份有限公司
Stkcd: 000333, FullName: 美的集团股份有限公司
Stkcd: 000338, FullName: 潍柴动力股份有限公司
Stkcd: 000404, FullName: 长虹华意压缩机股份有限公司
Stkcd: 000410, FullName: 沈阳机床股份有限公司


In [None]:
result_df[
    (result_df['Stkcd'] == '300425') & 
    (result_df['year'] == 2015) & 
    (result_df['Company_Name'].str.contains('', na=False))
]

In [39]:
df_basic[df_basic['Stkcd'] == '002450']

Unnamed: 0,Stkcd,ShortName,EndDate,IndustryCode,FullName,year,Industry_Main,company
11042,2450,康得新,2012-12-31,C29,北京康得新复合材料股份有限公司,2012,C,北京康得新复合材料股份有限公司
11043,2450,康得新,2013-12-31,C29,北京康得新复合材料股份有限公司,2013,C,北京康得新复合材料股份有限公司
11044,2450,康得新,2014-12-31,C29,江苏康得新复合材料股份有限公司,2014,C,江苏康得新复合材料股份有限公司
11045,2450,康得新,2015-12-31,C29,江苏康得新复合材料股份有限公司,2015,C,江苏康得新复合材料股份有限公司
11046,2450,康得新,2016-12-31,C29,康得新复合材料集团股份有限公司,2016,C,康得新复合材料集团股份有限公司
11047,2450,康得新,2017-12-31,C29,康得新复合材料集团股份有限公司,2017,C,康得新复合材料集团股份有限公司
11048,2450,ST 康得新,2018-12-31,C29,康得新复合材料集团股份有限公司,2018,C,康得新复合材料集团股份有限公司
11049,2450,*ST 康得,2019-12-31,C29,康得新复合材料集团股份有限公司,2019,C,康得新复合材料集团股份有限公司
11050,2450,*ST 康得,2020-12-31,C29,康得新复合材料集团股份有限公司,2020,C,康得新复合材料集团股份有限公司
