# 7장. 멀티 팩터 전략

In [1]:
from matplotlib import rc
from collections import defaultdict
from typing import Optional, Dict

import FinanceDataReader as fdr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

from data.data_loader import PykrxDataLoader

## 1. 거시 경기 데이터

In [2]:
# 한글 폰트 설정
import platform
os_system = platform.system()
# 시스템에 딸 폰트 설정
if os_system == 'Windows':
    font_name = 'Malgun Gothic'
elif os_system == 'Darwin':
    font_name = 'AppleGothic'
else:
    font_name = 'Arial'
rc('font', family=font_name)

plt.rcParams['axes.unicode_minus'] = False
eng_to_kor = {'relative': '모멘텀', 'per': 'PER', 'pbr': 'PBR', 'dividend': '배당',
              'small': '소형주', 'lowvol': '로우볼', 'individual': '개인 수급 주체',
              'institutional': '기관 수급 주체', 'foreign': '외국인 수급 주체',
              'cluster': '군집', 'factor': '전략', 'date': '날짜', 'real': '실제',
              'pred': '예측', 'accuracy': '정확도'}

### 1.1. 거시 경기 데이터 불러오기

In [19]:
fromdate = '2012-11-01'
todate = '2021-12-30'
macro_name = [
    # 주요 지수
    'KS200',    # 코스피 200
    'US500',    # S&P 500 지수
    'SSEC',     # 상해 종합
    'VIX',      # 공포지수

    # 상품 선물
    'CL',   # WTI유 선물 Crude Oil (NYMEX)
    'GC',   # 금 선물 (COMEX)
    'HG=F', # 구리 선물 (COMEX)

    # 환율
    'KRW/USD',  # 달러 원화
    'KRW/CNY',  # 달러 위엔화

    # 채권
    'US5YT',    # 5년 만기 미국구채 수익률
    'US30YT',   # 30년 만기 미국국채 수익률
    'FRED:T10Y3M',  # 미국 창단기금리차(10Y-3M) : 연준에서 중시하는 10년-3개월 금리차

    # 경기 지표(미국)
    'FRED:M1SL',    # M1 통화량
    'FRED:M2',      # M2 통화량
    'FRED:HSN1F',   # HSNIF 주택판매지수
    'FRED:T5YIFR',  # 5년 기애인플레이션
    'FRED:UNRATE',  # 미국 실업률

    # 경기 지표(한국)
    'FRED:MANMM101KRM189S',     # 대한민국 M1 통화량
    'FRED:MYAGM2KRM189S',       # 대한민국 M2 통화량
    'FRED:KORCPIALLMINMEI',     # 한국 소비자물가지수: 모든 항목
    'FRED:KORLOLITONOSTSAM',    # OECD 선행지수: 대한민국용 정규화된 선행지수
    'FRED:XTEXVA01KRM664S',     # 대한민국 수출: 상품 가치
    'FRED:XTIMVA01KRM667S',     # 대한민국 수입: 상품가치
]

In [17]:
def macro_data_loader(fromdate: str, todate: str,
                      data_list: list) -> pd.DataFrame:
    df = pd.DataFrame({'DATE': pd.date_range(start=fromdate, end=todate)})
    for data_name in data_list:
        print(f'data_name: {data_name}, fromdate: {fromdate}, todate: {todate}')
        # 데이트 로드하기
        df_sub = fdr.DataReader(symbol=data_name, start=fromdate, end=todate)
        # OHLCV 데이터면 Close만 사용
        if 'Close' in df_sub.columns:
            df_sub = df_sub[['Close']]
            df_sub.rename(columns={'Close': data_name}, inplace=True)
        df = df.merge(df_sub, how='left', left_on='DATE', right_index=True)

    return df.rename(columns={'DATE': 'date'})

### 1.2. 거시 경기 데이터 전처리

In [5]:
def macro_preprocess(df: pd.DataFrame, fromdate: str,
                     todate:str) -> pd.DataFrame:
    # 업무일 데이터로 ffill하기
    business_day_list = pd.to_datetime(PykrxDataLoader(fromdate=fromdate, todate=todate).get_business_days())
    df = df[df['date'].isin(business_day_list)]

    return df.ffill().dropna()

### 1.3. 거시 경기 데이터 증강

In [6]:
def macro_direction(df: pd.DataFrame, days: int) -> pd.DataFrame:
    def _feature_direction(df: pd.DataFrame):
        # 선형 회귀 기울기를 구하는 함수 정의
        line_fitter = LinearRegression()
        fit_result = line_fitter.fit(X=np.arange(len(df)).reshape(-1,1), y=df)

        return fit_result.coef_ / abs(df).mean()

    valid_columns = df.columns.drop('date')
    # 선형 회귀 계산 윈도우
    feature_direction_df = df[valid_columns].rolling(days).apply(_feature_direction)

    return feature_direction_df.add_suffix(f'_{days}').ffill()

### 1.4. 거시 경기 데이터 준비 실행

In [22]:
# 데이터 로드
macro_original = macro_data_loader(fromdate=fromdate, todate=todate, data_list=macro_name)

# 데이터 전처리
macro_processed = macro_preprocess(df=macro_original, fromdate=fromdate, todate=todate)

# 데이터 증강
macro_20 = macro_direction(df=macro_processed, days=20)
macro_60 = macro_direction(df=macro_processed, days=60)

# 증강된 데이터 결합
macro = pd.concat([macro_processed, macro_20, macro_60], axis=1)
macro.dropna(inplace=True)
macro

data_name: KS200, fromdate: 2012-11-01, todate: 2021-12-30
data_name: US500, fromdate: 2012-11-01, todate: 2021-12-30
data_name: SSEC, fromdate: 2012-11-01, todate: 2021-12-30
data_name: VIX, fromdate: 2012-11-01, todate: 2021-12-30
data_name: CL, fromdate: 2012-11-01, todate: 2021-12-30
data_name: GC, fromdate: 2012-11-01, todate: 2021-12-30
data_name: HG=F, fromdate: 2012-11-01, todate: 2021-12-30
data_name: KRW/USD, fromdate: 2012-11-01, todate: 2021-12-30
data_name: KRW/CNY, fromdate: 2012-11-01, todate: 2021-12-30
data_name: US5YT, fromdate: 2012-11-01, todate: 2021-12-30
data_name: US30YT, fromdate: 2012-11-01, todate: 2021-12-30
data_name: FRED:T10Y3M, fromdate: 2012-11-01, todate: 2021-12-30
data_name: FRED:M1SL, fromdate: 2012-11-01, todate: 2021-12-30
data_name: FRED:M2, fromdate: 2012-11-01, todate: 2021-12-30
data_name: FRED:HSN1F, fromdate: 2012-11-01, todate: 2021-12-30
data_name: FRED:T5YIFR, fromdate: 2012-11-01, todate: 2021-12-30
data_name: FRED:UNRATE, fromdate: 2012

Unnamed: 0,date,KS200,US500,SSEC,VIX,CL,GC,HG=F,KRW/USD,KRW/CNY,...,M2_60,HSN1F_60,T5YIFR_60,UNRATE_60,MANMM101KRM189S_60,MYAGM2KRM189S_60,KORCPIALLMINMEI_60,KORLOLITONOSTSAM_60,XTEXVA01KRM664S_60,XTIMVA01KRM667S_60
91,2013-01-31,258.07,1498.109985,2385.422119,14.280000,53.685001,4.84,3.7235,0.000921,0.571950,...,3.515484e-04,0.000000,0.000377,0.000000,0.000000,0.000000,4.850227e-34,0.000000,0.000000,0.000000
92,2013-02-01,257.64,1513.170044,2419.020020,12.900000,54.919998,4.84,3.7755,0.000920,0.570790,...,3.500886e-04,0.000229,0.000397,0.000000,0.000053,0.000025,1.829448e-05,0.000004,0.000014,0.000034
95,2013-02-04,256.89,1495.709961,2428.154053,14.670000,54.365002,4.82,3.7585,0.000935,0.556970,...,3.475912e-04,0.000450,0.000436,0.000000,0.000103,0.000049,3.596212e-05,0.000008,0.000027,0.000067
96,2013-02-05,254.82,1511.290039,2433.129883,13.720000,54.730000,4.82,3.7615,0.000919,0.571240,...,3.444289e-04,0.000662,0.000475,0.000000,0.000152,0.000072,5.300327e-05,0.000011,0.000040,0.000099
97,2013-02-06,254.31,1512.119995,2434.477051,13.410000,54.904999,4.80,3.7335,0.000921,0.572310,...,3.406023e-04,0.000865,0.000497,0.000000,0.000199,0.000094,6.941828e-05,0.000015,0.000053,0.000130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3340,2021-12-24,400.53,4725.790039,3618.053955,17.959999,82.790001,33.75,4.3870,0.000844,0.005366,...,1.243937e-33,0.003942,-0.001684,-0.003213,0.000449,0.000000,1.486808e-04,-0.000086,0.001230,0.001163
3343,2021-12-27,398.61,4791.189941,3615.969971,17.680000,83.910004,33.75,4.4650,0.000843,0.005357,...,1.243937e-33,0.003978,-0.001690,-0.003119,0.000433,0.000000,1.439389e-04,-0.000082,0.001132,0.001074
3344,2021-12-28,401.21,4786.350098,3630.112061,17.540001,84.459999,33.75,4.4200,0.000843,0.005363,...,1.243937e-33,0.003906,-0.001704,-0.003093,0.000424,0.000000,1.416765e-04,-0.000082,0.001084,0.001061
3345,2021-12-29,396.72,4793.060059,3597.000000,16.950001,84.910004,33.75,4.4020,0.000842,0.005353,...,1.243937e-33,0.003824,-0.001666,-0.003059,0.000413,0.000000,1.390451e-04,-0.000081,0.001034,0.001045


## 2. 팩터로 구하는 국면

In [26]:
target = pd.read_csv('factor/factor_asset.csv', index_col=0)