In [1]:
from tqdm import tqdm
import datetime
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pandas_datareader import data
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import seaborn as sns

import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from xgboost import XGBClassifier

In [2]:
# stock_code: load the ticker table (only the first two columns are kept)
stock_code = pd.read_csv('../Data/KOSPI_200.csv', dtype={'종목코드':str, '종목명':str}).iloc[:, :2]
# Leave out four companies by name
stock_code = stock_code[~stock_code['종목명'].isin(['DB하이텍', 'KB금융', 'DL', '두산인프라코어'])]

# codes_dic: lookup built from the table rows (second column -> first column);
# it is queried with the company name typed in below
codes = stock_code.values
codes_dic = {}
# Supported prediction horizons and the moving-average column used as target for each
target_dic = {5: 'ma5', 20:'ma20', 60:'ma60', 120:'ma120'}

for code, name in codes:
    codes_dic[name] = code

# Request the 10,000 calendar days ending today
today = datetime.date.today()
diff_day = datetime.timedelta(days=10000)

start_date = str(today - diff_day)
finish_date = str(today)

NAME = input('기업 이름:')
PERIOD = int(input('예측 기간:'))
TARGET = target_dic[PERIOD]  # KeyError unless PERIOD is 5, 20, 60 or 120

code = codes_dic[NAME]  # KeyError if NAME is not in the table

# Download the price history for the chosen ticker code
try:
    finance_data = data.DataReader(f'{code}.KS','yahoo', start_date, finish_date)
except Exception:
    print(f'=====ERROR: {NAME}=====')
    # Re-raise so the real cause is visible; swallowing it would leave
    # finance_data undefined and make later cells fail with a NameError.
    raise

기업 이름:삼성전자
예측 기간:20


In [3]:
# Simple moving average
def SMA(data, period=30, column='Close'):
    """Return the rolling mean of ``data[column]`` over ``period`` rows.

    Positions where the window is not yet full come back as NaN
    (pandas' default ``min_periods`` for a fixed-size window).
    """
    series = data[column]
    window = series.rolling(window=period)
    return window.mean()

# Exponential moving average
def EMA(data, period=20, column='Close'):
    """Return the exponentially weighted mean of ``data[column]``.

    Uses ``span=period`` with ``adjust=False``, i.e. the recursive form
    that starts from the first observation.
    """
    smoother = data[column].ewm(span=period, adjust=False)
    return smoother.mean()

# MACD
def MACD(data, period_long=26, period_short=12, period_signal=9, column='Close'):
    """Add MACD, its signal line and the oscillator to ``data``.

    Writes three columns into the frame that was passed in and returns it:
    ``MACD`` (short EMA minus long EMA of ``column``), ``MACD_Signal``
    (EMA of ``MACD`` over ``period_signal``) and ``MACD_Oscillator``
    (``MACD`` minus ``MACD_Signal``).
    """
    fast_ema = EMA(data, period_short, column=column)  # short-term EMA
    slow_ema = EMA(data, period_long, column=column)   # long-term EMA

    data['MACD'] = fast_ema - slow_ema
    # The signal line is computed from the MACD column written just above
    signal_line = EMA(data, period_signal, column='MACD')
    data['MACD_Signal'] = signal_line
    data['MACD_Oscillator'] = data['MACD'] - data['MACD_Signal']

    return data

# RSI
def RSI(data, period=14, column='Close'):
    """Add an SMA-based RSI and its 9-row signal line to ``data``.

    Writes four columns into the frame that was passed in and returns it:
    the helper columns ``up`` / ``down`` (positive and negative parts of the
    row-to-row change of ``column``), ``RSI`` and ``RSI_Signal``.
    """
    # Row-to-row change without the leading NaN; the first row of
    # 'up'/'down' is therefore left missing after index alignment.
    price_change = data[column].diff(1).iloc[1:]

    gains = price_change.clip(lower=0)    # negative moves become 0
    losses = price_change.clip(upper=0)   # positive moves become 0

    data['up'] = gains
    data['down'] = losses

    avg_gain = SMA(data, period, column='up')
    avg_loss = SMA(data, period, column='down').abs()
    relative_strength = avg_gain / avg_loss

    data['RSI'] = 100.0 - (100.0 / (1.0 + relative_strength))
    data['RSI_Signal'] = SMA(data, period=9, column='RSI')

    return data

# Stochastic oscillator
def Stochastic(data, n=10, m=5, t=5):
    """Add stochastic oscillator columns to ``data`` and return it.

    ``kdj_k`` is the fast %K over an ``n``-row high/low window, ``kdj_d`` is
    an EMA of ``kdj_k`` over ``m`` rows (fast %D / slow %K) and ``kdj_j`` is
    an EMA of ``kdj_d`` over ``t`` rows (slow %D).
    """
    # min_periods=1 lets the window grow from the first row instead of yielding NaN
    highest_high = data['High'].rolling(window=n, min_periods=1).max()  # highest high in n rows
    lowest_low = data['Low'].rolling(window=n, min_periods=1).min()     # lowest low in n rows
    price_range = highest_high - lowest_low

    data['kdj_k'] = ((data['Close'] - lowest_low) / price_range) * 100  # fast %K
    data['kdj_d'] = EMA(data, period=m, column='kdj_k')                 # fast %D (= slow %K)
    data['kdj_j'] = EMA(data, period=t, column='kdj_d')                 # slow %D

    return data

# CCI
def CCI(data, period=20):
    """Add a ``CCI`` column to ``data`` and return it.

    The typical price is the mean of High, Low and Close. Its deviation from
    its own ``period``-row rolling mean is divided by 0.015 times the
    ``period``-row rolling mean of the absolute deviation.
    """
    typical_price = (data['High'] + data['Low'] + data['Close']) / 3
    moving_avg = typical_price.rolling(window=period).mean()
    deviation = typical_price - moving_avg
    mean_abs_dev = deviation.abs().rolling(window=period).mean()

    data['CCI'] = deviation / (mean_abs_dev * 0.015)

    return data

# DMI
def DMI(data, n=14, n_ADX=14):
    """Return a copy of ``data`` with ``PDI``, ``MDI`` and ``ADX`` columns added.

    Parameters
    ----------
    data : DataFrame with ``High``, ``Low`` and ``Close`` columns whose index
        is named ``Date`` (the index is reset and then restored by that name).
    n : span of the exponential averages used for the directional movements
        and the true range.
    n_ADX : span of the exponential average applied to the directional index.

    Returns
    -------
    A new DataFrame indexed by ``Date``; the input frame is not modified.
    ``PDI`` / ``MDI`` are the smoothed directional movements divided by the
    smoothed true range, and ``ADX`` is the smoothed
    ``|PDI - MDI| / (PDI + MDI)``. None of them is scaled by 100.

    The computation is vectorized, replacing the earlier row-by-row loops,
    and an empty frame yields an empty result instead of raising.
    """
    data = data.reset_index()

    high = data['High']
    low = data['Low']
    prev_close = data['Close'].shift(1)

    up_move = high.diff()              # High[t] - High[t-1]
    down_move = low.shift(1) - low     # Low[t-1] - Low[t]

    # Only the larger, strictly positive move counts; everything else is 0.
    # The first row has no predecessor (NaN comparisons are False), so it is 0.
    plus_dm = up_move.where((up_move > down_move) & (up_move > 0), 0.0)
    minus_dm = down_move.where((down_move > up_move) & (down_move > 0), 0.0)

    # True range: max(High, previous Close) - min(Low, previous Close)
    upper = prev_close.where(prev_close > high, high)
    lower = prev_close.where(prev_close < low, low)
    true_range = upper - lower
    true_range.iloc[:1] = 0.0          # first row has no previous close

    ATR = true_range.ewm(span=n, min_periods=n).mean()
    PosDI = plus_dm.ewm(span=n, min_periods=n).mean() / ATR
    NegDI = minus_dm.ewm(span=n, min_periods=n).mean() / ATR
    ADX = (abs(PosDI - NegDI) / (PosDI + NegDI)).ewm(span=n_ADX, min_periods=n_ADX).mean()

    data['PDI'] = PosDI.values
    data['MDI'] = NegDI.values
    data['ADX'] = ADX.values

    # Restore the date index that reset_index() turned into a column
    data = data.set_index('Date')

    return data

# OBV
def OBV(data):
    """Add on-balance volume (``OBV``) and its 20-row EMA (``OBV_EMA``) to ``data``.

    The running total starts at 0 on the first row. Each later row adds the
    row's ``Volume`` when ``Close`` rose, subtracts it when ``Close`` fell and
    leaves the total unchanged otherwise. The columns are written into the
    frame that was passed in, which is also returned.

    The total is computed with array operations rather than ``Series[i]``
    integer lookups, which are label-based on integer indexes and a
    deprecated positional fallback on other index types.
    """
    close_change = data['Close'].diff()   # NaN on the first row -> contributes 0
    volume = data['Volume']

    signed_volume = np.select(
        [close_change > 0, close_change < 0],
        [volume, -volume],
        default=0,
    )

    data['OBV'] = np.cumsum(signed_volume)
    data['OBV_EMA'] = EMA(data, period=20, column='OBV')

    return data

In [4]:
# Build the auxiliary technical indicators
def sub_features(data):
    """Add moving averages and technical indicators, then drop incomplete rows.

    Adds ``ma5``/``ma20``/``ma60``/``ma120`` (written into the frame that was
    passed in) and then applies MACD, RSI, Stochastic, CCI, DMI and OBV in
    that order. Rows containing any NaN are removed and the index is reset,
    so the former index becomes a regular column of the returned frame.
    """
    for window in (5, 20, 60, 120):
        data[f'ma{window}'] = SMA(data, window)

    # Each indicator function takes the frame and returns the frame to continue with
    for add_indicator in (MACD, RSI, Stochastic, CCI, DMI, OBV):
        data = add_indicator(data)

    complete_rows = data.dropna()
    return complete_rows.reset_index()

In [5]:
# Generate buy/sell signals from the indicators, then build the target
def data_sign(data, period, target):
    """Derive signal features and a direction label from an indicator frame.

    Parameters
    ----------
    data : DataFrame holding a ``Date`` column, the price columns, the moving
        averages and the indicator columns listed in ``columns`` below.
        It is copied first, so the caller's frame is left untouched and
        calling this function repeatedly gives the same result.
    period : int, used both as the EMA span that smooths every indicator and
        as the look-ahead (in rows) for the target.
    target : name of the column whose future change defines the label.

    Returns
    -------
    Four DataFrames indexed by ``Date``:
      1. all indicators, intermediate columns and signals,
      2. signal columns only,
      3. signals that passed the chi-square test plus their source indicators,
      4. signals that passed the chi-square test only.
    Every frame contains ``target``; signal and target columns are integers.

    Side effects: prints the contingency table and test statistics per signal.
    """
    # Work on a copy: the smoothing below overwrites indicator columns and
    # would otherwise be applied again each time the calling cell is re-run.
    data = data.copy()

    columns = ['MACD', 'MACD_Signal', 'MACD_Oscillator', 'RSI', 'RSI_Signal', 
               'kdj_k', 'kdj_d', 'kdj_j', 'CCI', 'PDI', 'MDI', 'ADX', 'OBV', 'OBV_EMA']

    # Signal column -> indicator column(s) it is derived from
    column_dic = {'MACD_sign':'MACD', 'MACD_Oscillator_sign':'MACD_Oscillator', 'MACD_diff_sign': ['MACD', 'MACD_Signal'],
              'RSI_sign':'RSI', 'Stochastic_K_sign':'kdj_d', 'Stochastic_KD_sign':['kdj_d', 'kdj_j'],
              'CCI_sign':'CCI', 'CCI_shift_sign':'CCI', 'DMI_sign':['PDI', 'MDI', 'ADX'], 'OBV_sign': ['OBV', 'OBV_EMA']}

    # Smooth every indicator with an EMA over the requested period
    for col in columns:
        data[col] = data[col].ewm(span=period, adjust=False).mean()

    # Signals: buy = 1, sell = -1, neutral = 0
    # Helper flags: positive = 1, otherwise 0

    # MACD signals:
    # sign of MACD, zero-crossings of the oscillator, MACD / signal-line crossings
    data['MACD_sign'] = data['MACD'].apply(lambda x: 1 if x >0 else -1)
    data['MACD_Oscillator_pm'] = data['MACD_Oscillator'].apply(lambda x: 1 if x > 0 else 0)
    data['MACD_Oscillator_sign'] = data['MACD_Oscillator_pm'] - data['MACD_Oscillator_pm'].shift(1)
    data['MACD_diff'] = (data['MACD'] - data['MACD_Signal']).apply(lambda x: 1 if x > 0 else 0)
    data['MACD_diff_sign'] = data['MACD_diff'] - data['MACD_diff'].shift(1)

    # RSI signal: 1 at 70 or above, -1 at 30 or below
    data['RSI_sign'] = data['RSI'].apply(lambda x: 1 if x >= 70 else (-1 if x <= 30 else 0))

    # Stochastic signals:
    # leaving the <=20 zone upwards (1) / leaving the >=80 zone downwards (-1),
    # and crossings between kdj_d and kdj_j
    data['kdj_d_pm'] = data['kdj_d'].apply(lambda x: 1 if x >= 80 else(-1 if x <= 20 else 0))
    data['kdj_d_pm_shift'] = data['kdj_d_pm'] - data['kdj_d_pm'].shift(1)
    inside_band = (data['kdj_d'] < 80) & (data['kdj_d'] > 20)
    data['Stochastic_K_sign'] = 0.0   # neutral unless one of the rules below fires
    data.loc[inside_band & (data['kdj_d_pm_shift'] == 1), 'Stochastic_K_sign'] =  1
    data.loc[inside_band & (data['kdj_d_pm_shift'] == -1), 'Stochastic_K_sign'] =  -1

    data['kdj_pm'] = (data['kdj_d'] - data['kdj_j']).apply(lambda x: 1 if x > 0 else 0)
    data['Stochastic_KD_sign'] = data['kdj_pm'] - data['kdj_pm'].shift(1)

    # CCI signals:
    # zero-crossings, plus rising while below -100 (1) / falling while above 100 (-1)
    data['CCI_pm'] = data['CCI'].apply(lambda x: 1 if x > 0 else 0)
    data['CCI_sign'] = data['CCI_pm'] - data['CCI_pm'].shift(1)
    data['CCI_range'] = data['CCI'].apply(lambda x: -1 if x < -100 else (1 if x > 100 else 0))
    data['CCI_shift'] = data['CCI'] - data['CCI'].shift(1)

    data['CCI_shift_sign'] = 0.0
    data.loc[(data['CCI_range'] == -1) & (data['CCI_shift'] > 0), 'CCI_shift_sign'] = 1
    data.loc[(data['CCI_range'] == 1) & (data['CCI_shift'] < 0), 'CCI_shift_sign'] = -1

    # DMI signal:
    # PDI and ADX above MDI with ADX rising (1) / MDI and ADX above PDI with ADX rising (-1)
    data['ADX_shift'] = (data['ADX'] - data['ADX'].shift(1)).rolling(5).mean()
    data['DMI_sign'] = 0.0
    data.loc[(data['PDI'] > data['MDI']) & (data['ADX'] > data['MDI']) & (data['ADX_shift'] > 0.01), 'DMI_sign'] = 1
    data.loc[(data['MDI'] > data['PDI']) & (data['ADX'] > data['PDI']) & (data['ADX_shift'] > 0.01), 'DMI_sign'] = -1

    # OBV signal: crossings between OBV and its EMA
    data['OBV_diff'] = (data['OBV'] - data['OBV_EMA']).apply(lambda x: 1 if x > 0 else 0)
    data['OBV_sign'] = data['OBV_diff'] - data['OBV_diff'].shift(1)

    # target: 1 if the column is higher `period` rows later, -1 if lower, 0 if unchanged.
    # The last `period` rows have no future value (NaN difference) and also get 0.
    data['target'] = (data[target].shift(-period) - data[target]).apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))

    # all_data: every indicator, helper column and signal
    all_data = data.drop(['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close', 'up', 'down',
                          'ma5', 'ma20', 'ma60', 'ma120'], axis=1)
    all_data = all_data.dropna().set_index('Date')

    # sign_data: signal columns only
    sign_data = data[['Date',
                      'MACD_sign', 'MACD_Oscillator_sign', 'MACD_diff_sign', 
                      'RSI_sign', 'Stochastic_K_sign', 'Stochastic_KD_sign', 
                      'CCI_sign', 'CCI_shift_sign', 'DMI_sign', 'OBV_sign', 'target']]

    sign_data = sign_data.dropna().set_index('Date')

    lst = [1, -1]

    column_lst = ['Date', 'target']
    all_column_lst = ['Date', 'target']

    # Chi-square test of independence between each signal and the target,
    # using only the rows where both are 1 or -1
    for c in sign_data.columns[:-1]:
        counts = [[len(sign_data[(sign_data[c] == i) & (sign_data['target'] == j)])
                   for j in lst]
                  for i in lst]
        chisq_data = pd.DataFrame(counts,
                                  index=[f'{c}_{i}' for i in lst],
                                  columns=[f'target_{j}' for j in lst],
                                  dtype=float)

        try:
            result = stats.chi2_contingency(observed=chisq_data)
        except ValueError as exc:
            # Raised when the table is degenerate (e.g. an expected frequency of zero);
            # the signal is skipped and the reason is reported.
            print(f'Column: {c} Period: {period} ERROR!!!*****\n')
            print(f'cause: {exc}')
            continue

        print(f'''
            ======Column: {c} Period: {period}======
            
            {chisq_data}
            
            chi_square: {result[0]:.4f}
            p-value: {result[1]:.4f}
            ''')

        # Keep only the significant signals
        if result[1] <= 0.05:
            column_lst.append(c)
            all_column_lst.append(c)
            all_c = column_dic[c]

            if type(all_c) == list:
                for col in all_c:
                    all_column_lst.append(col)
            else:
                all_column_lst.append(all_c)

    # Significant signals plus their source indicators.
    # De-duplicate while keeping insertion order: a set would give a
    # hash-dependent column order and is rejected as an indexer by pandas >= 2.0.
    all_column_lst = list(dict.fromkeys(all_column_lst))
    after_all_data = data[all_column_lst]
    after_all_data = after_all_data.dropna().set_index('Date')

    # Significant signals only
    after_sign_data = data[column_lst]
    after_sign_data = after_sign_data.dropna().set_index('Date')

    # Cast signal and target columns to integers
    for c in sign_data.columns:
        all_data[c] = all_data[c].astype('int')
        sign_data[c] = sign_data[c].astype('int')

    for c in after_sign_data.columns:    
        after_all_data[c] = after_all_data[c].astype('int')
        after_sign_data[c] = after_sign_data[c].astype('int')

    # Four frames are returned:
    # 1. all indicators
    # 2. all signals only
    # 3. significant signals with their indicators
    # 4. significant signals only

    return all_data, sign_data, after_all_data, after_sign_data

In [6]:
def modeling(data, period):
    """Fit a random forest on the older rows and report validation accuracy.

    Parameters
    ----------
    data : DataFrame with a ``target`` column; all other columns are features.
    period : number of trailing rows set aside as the test portion.

    Returns
    -------
    ``(y_val, pred, X_test, y_test, model, scaler)`` where ``y_val`` and
    ``y_test`` are single-column DataFrames, ``pred`` holds the predictions
    for the validation rows, ``X_test`` is the scaled test feature array and
    ``model`` / ``scaler`` are the fitted estimator and MinMaxScaler.

    Side effects: prints the validation accuracy.
    """
    # Split off the last `period` rows as the test portion
    train = data.iloc[:-period, :]
    test = data.iloc[-period:, :]

    X = train.drop('target', axis=1)
    y = train[['target']]

    # Order-preserving split (shuffle=False): first 70% for training,
    # the following 30% for validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, shuffle=False)
    X_test = test.drop('target', axis=1)
    y_test = test[['target']]

    # Fit the scaler on the training rows only, then apply it everywhere
    scaler = MinMaxScaler()
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    model = RandomForestClassifier(random_state=0)
    # Pass the labels as a 1-D array; a column-vector y triggers a
    # DataConversionWarning in scikit-learn
    model.fit(X_train, y_train.values.ravel())
    pred = model.predict(X_val)
    # accuracy_score expects (y_true, y_pred)
    acc = accuracy_score(y_val, pred)

    print(f'\nAccuracy: {acc:.4f}')

    return y_val, pred, X_test, y_test, model, scaler

In [7]:
# Add the moving averages and technical indicators to the downloaded prices.
# NOTE(review): this rebinds finance_data to the processed frame, so re-running
# this cell feeds already-processed data back into sub_features.
finance_data = sub_features(finance_data)

In [8]:
# Build the four feature frames for the chosen horizon and target column
try:
    al, sign, af_al, af_al_sign = data_sign(finance_data, 
                                            PERIOD, TARGET)

except Exception:
    print(f'=====ERROR CODE: {code}=====')
    # Re-raise so the cause is shown; swallowing it would leave
    # al / sign / af_al / af_al_sign undefined for the cells below.
    raise


            
                          target_1  target_-1
MACD_sign_1    1973.00    1033.00
MACD_sign_-1   1006.00    1264.00
            
            chi_square: 238.2392
            p-value: 0.0000
            

            
                                     target_1  target_-1
MACD_Oscillator_sign_1      85.00      30.00
MACD_Oscillator_sign_-1     46.00      69.00
            
            chi_square: 25.6088
            p-value: 0.0000
            

            
                               target_1  target_-1
MACD_diff_sign_1      85.00      30.00
MACD_diff_sign_-1     46.00      69.00
            
            chi_square: 25.6088
            p-value: 0.0000
            

            
                         target_1  target_-1
RSI_sign_1     207.00      15.00
RSI_sign_-1     16.00      53.00
            
            chi_square: 140.3783
            p-value: 0.0000
            

            
                                  target_1  target_-1
Stochastic_K_sign_1       1.0

In [9]:
# al: use the frame that contains all auxiliary indicators
y_val, pred, X_test, y_test, model, scaler = modeling(al, PERIOD)


Accuracy: 0.6462


In [10]:
# print(pd.Series(pred).value_counts())
# print(y_val.value_counts())

 1    591
-1    464
dtype: int64

In [12]:
# pred = model.predict(X_test)
# print(pred)
# print(pred.mean())

In [15]:
# pd.concat([finance_data.iloc[-PERIOD-1:, :][['Date', TARGET, 'Close']].reset_index(drop=True), pd.DataFrame(np.insert(pred, 0, 0), columns=['Predict'])], axis=1)

Unnamed: 0,Date,ma20,Close,Predict
0,2021-07-02,80900.0,80000.0,0
1,2021-07-05,80825.0,80400.0,-1
2,2021-07-06,80790.0,81200.0,-1
3,2021-07-07,80775.0,80800.0,-1
4,2021-07-08,80720.0,79900.0,-1
5,2021-07-09,80640.0,79400.0,-1
6,2021-07-12,80600.0,79700.0,-1
7,2021-07-13,80545.0,79800.0,-1
8,2021-07-14,80430.0,79500.0,-1
9,2021-07-15,80415.0,80600.0,-1
