In [13]:
import pandas as pd
from scipy.stats import linregress

# ----------------------------------------
# 1. Load data
# ----------------------------------------
def _read_with_month_key(csv_path):
    """Read a CSV, parse its `date` column, and add a year-month `ym` Period key."""
    frame = pd.read_csv(csv_path)
    frame['date'] = pd.to_datetime(frame['date'])
    frame['ym'] = frame['date'].dt.to_period('M')
    return frame


# Export data with surprise metrics, and the table that carries return_before.
df_exp = _read_with_month_key("/content/export_with_surprise.csv")
df_ret = _read_with_month_key("/content/part3.csv")

# ----------------------------------------
# 2. Merge exports with returns (keyed on symbol + ym)
# ----------------------------------------
# Only return_before is pulled from part3; its z-score columns are not used.
df_merged = df_exp.merge(
    df_ret.loc[:, ['symbol', 'ym', 'return_before']],
    how='inner',
    on=['symbol', 'ym'],
)

print("병합 후 행 개수:", df_merged.shape[0])
print(df_merged.head())

# Persist the merged frame so later steps can be inspected or resumed from disk.
df_merged.to_csv("merged_export_return.csv", index=False)
print("병합 완료: merged_export_return.csv 생성됨")



병합 후 행 개수: 7056
        date  symbol  export_value  export_log       YoY       QoQ       MoM  \
0 2021-12-31  AEGQRD        192976   12.170326 -0.033667  0.022243  0.040129   
1 2022-04-30  AEGQRD        818181   13.614840  0.092376 -0.016454  0.098517   
2 2022-05-31  AEGQRD        230202   12.346717  0.009301 -0.016454 -0.093143   
3 2022-07-31  AEGQRD        331010   12.709907  0.049963  0.029592 -0.031341   
4 2022-08-31  AEGQRD       1765165   14.383755  0.225684  0.029592  0.131696   

   rolling_zscore_yoy  rolling_zscore_qoq  rolling_zscore_mom  \
0                 NaN            0.179718            0.991592   
1                 NaN           -0.622387            1.409158   
2                 NaN           -0.584011           -1.392543   
3                 NaN            0.336128           -0.532602   
4                 NaN            0.252498            1.432718   

   pred_export_log  pred_export       ym  return_before  
0              NaN          NaN  2021-12       0.03213

In [14]:
# ----------------------------------------
# 3. Map each symbol to its GICS industry_group
# ----------------------------------------
# The mapping file must provide the columns `symbol` and `value`.
df_gics = pd.read_csv("/content/industry_group.csv")

# Left join: every merged row is kept, whether or not its symbol has a mapping.
df_final = df_merged.merge(df_gics, how="left", on="symbol")
print("GICS 매핑 후 행 개수:", df_final.shape[0])
print(df_final.loc[:, ['symbol', 'value']].head())

df_final.to_csv("merged_with_gics.csv", index=False)
print("GICS 매핑 완료: merged_with_gics.csv 생성됨")



GICS 매핑 후 행 개수: 7056
   symbol   value
0  AEGQRD  3510.0
1  AEGQRD  3510.0
2  AEGQRD  3510.0
3  AEGQRD  3510.0
4  AEGQRD  3510.0
GICS 매핑 완료: merged_with_gics.csv 생성됨


In [15]:
# ----------------------------------------
# 4. Keep only industry groups with at least 5 rows
# ----------------------------------------
group_col = "value"

# Row count per group; value_counts() ignores rows whose group is missing.
counts = df_final[group_col].value_counts()
valid_groups = counts.loc[counts.ge(5)].index

# Rows whose group is not in valid_groups (including missing groups) are dropped.
df_filtered = df_final.loc[df_final[group_col].isin(valid_groups)]
print("필터링 후 행 개수:", df_filtered.shape[0])

df_filtered.to_csv("filtered_groups.csv", index=False)
print("필터링 완료: filtered_groups.csv 생성됨")


필터링 후 행 개수: 6926
필터링 완료: filtered_groups.csv 생성됨


In [16]:

# ----------------------------------------
# 5. Surprise -> return regression analysis
# ----------------------------------------
# For every (industry group, surprise metric) pair, fit a simple linear
# regression of return_before on the surprise metric and collect the fit
# statistics into a single summary table, which is also written to CSV.
df = df_filtered.copy()

surprise_metrics = [
    "rolling_zscore_yoy",
    "rolling_zscore_qoq",
    "rolling_zscore_mom"
]

y_col = "return_before"   # stock return (dependent variable)

# Minimum number of complete (surprise, return) pairs required to fit a pair.
MIN_SAMPLES = 5

# Explicit column order, so the summary and its CSV keep a header row even
# when no (group, metric) pair qualifies and `results` stays empty.
summary_columns = [
    "industry_group",
    "surprise_metric",
    "sample_size",
    "slope",
    "intercept",
    "R",
    "R2",
    "p_value",
    "std_err",
]

results = []

for group, subdf in df.groupby(group_col):

    for metric in surprise_metrics:

        # Use only rows where both the regressor and the return are present.
        clean = subdf[[metric, y_col]].dropna()

        if len(clean) < MIN_SAMPLES:
            continue

        # linregress raises ValueError when every x value is identical (the
        # slope is undefined). Skip such pairs instead of aborting the loop
        # and losing the results already collected for other groups.
        if clean[metric].nunique() < 2:
            continue

        fit = linregress(clean[metric], clean[y_col])

        results.append({
            "industry_group": group,
            "surprise_metric": metric,
            "sample_size": len(clean),
            "slope": fit.slope,
            "intercept": fit.intercept,
            "R": fit.rvalue,
            "R2": fit.rvalue**2,
            "p_value": fit.pvalue,
            "std_err": fit.stderr   # standard error of the slope estimate
        })

summary = pd.DataFrame(results, columns=summary_columns)
summary.to_csv("industry_group_regression_summary_final.csv", index=False)

print("완료: industry_group_regression_summary_final.csv 생성됨")
print(summary.head())


완료: industry_group_regression_summary_final.csv 생성됨
   industry_group     surprise_metric  sample_size     slope  intercept  \
0          1010.0  rolling_zscore_yoy           17 -0.003515   0.002469   
1          1010.0  rolling_zscore_qoq           24  0.001344   0.000716   
2          1010.0  rolling_zscore_mom           24  0.000212   0.000338   
3          1510.0  rolling_zscore_yoy          808 -0.000050   0.000995   
4          1510.0  rolling_zscore_qoq         1158  0.000199   0.001201   

          R        R2   p_value   std_err  
0 -0.344596  0.118747  0.175574  0.002473  
1  0.090295  0.008153  0.674779  0.003162  
2  0.014387  0.000207  0.946803  0.003136  
3 -0.002791  0.000008  0.936863  0.000627  
4  0.011498  0.000132  0.695901  0.000510  
