# In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# 1. Load the data
path = r"C:\Users\제조업통합.xlsx"
df_raw = pd.read_excel(path)

# 2. Covariates used by the propensity model, plus the treatment indicator
covariate_cols = ['매출액', '부채', '업력', '영업이익', '자본', '자산', '종업원수']
treatment_col = '공시참여'

# 3. Drop incomplete rows - a row is kept only if every covariate AND the
#    treatment indicator are present. A missing treatment value would make
#    the logistic fit fail on NaN labels, and such a row would belong to
#    neither the treated nor the control group further down.
complete_rows = df_raw[covariate_cols + [treatment_col]].dropna().index
df = df_raw.loc[complete_rows].reset_index(drop=True)
X = df[covariate_cols].copy()

# 4. Treatment indicator, row-aligned with X and df (shared 0..n-1 index)
treated = df[treatment_col].copy()

# 5. Propensity score: standardise the covariates, fit a logistic model and
#    keep the predicted probability of the second class in logit.classes_.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# NOTE(review): LogisticRegression() applies L2 regularisation by default,
# so the scores differ from an unpenalised logit fit - confirm this is intended.
logit = LogisticRegression()
logit.fit(X_scaled, treated)
propensity_scores = logit.predict_proba(X_scaled)[:, 1]
df['pscore'] = propensity_scores

# 6. Split into treated and control groups
treated_df = df[treated == 1].copy()
control_df = df[treated == 0].copy()

# 7. Greedy 1:1 nearest-neighbour matching on the propensity score,
#    without replacement: each control is used for at most one treated unit.
# NOTE(review): treated units are processed in data order; R's MatchIt is
# believed to default to descending propensity-score order - confirm which
# ordering is wanted, since greedy matching is order-dependent.
matched_pairs = []
used_controls = set()

# Only the scores are needed to pick a partner, so keep a shrinking Series of
# still-unmatched control scores instead of re-filtering the whole control
# frame on every iteration.
remaining_scores = control_df['pscore'].copy()

for idx, t_score in treated_df['pscore'].items():
    if remaining_scores.empty:
        # Controls exhausted: the remaining treated units stay unmatched.
        break
    # idxmin returns the first label at the minimum, so ties resolve to the
    # earliest remaining control in data order.
    best_match_idx = (remaining_scores - t_score).abs().idxmin()
    matched_pairs.append((idx, best_match_idx))
    used_controls.add(best_match_idx)
    remaining_scores = remaining_scores.drop(best_match_idx)

# 8. Assemble the matched table: one row per matched treated unit, extended
#    with the index label, company name and score of its control partner.
matched_treated_idx = []
matched_control_idx = []
for treated_label, control_label in matched_pairs:
    matched_treated_idx.append(treated_label)
    matched_control_idx.append(control_label)

# Look the partner rows up once; they come back in pair order, so the
# positional assignments below line up with the treated rows.
partner_rows = df.loc[matched_control_idx, ['회사명', 'pscore']]

matched_df = treated_df.loc[matched_treated_idx].copy()
matched_df['matched_id'] = matched_control_idx
matched_df['matched_company'] = partner_rows['회사명'].values
matched_df['matched_score'] = partner_rows['pscore'].values

# 9. Save the result as a CP949-encoded CSV without the index column
save_path = r"C:\Users\matched_psm_result.csv"
matched_df.to_csv(save_path, index=False, encoding='cp949')
