In [4]:
import pandas as pd
from tqdm import tqdm

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas warnings (e.g. about
# chained assignment) that later cells may trigger; consider narrowing it.
warnings.filterwarnings('ignore')

In [5]:
# Load the KRX300 constituent list.
# Constituents get delisted, so refresh this CSV periodically
# (KRX Information Data System -> Index -> Stock index -> Index constituents).
constituents_raw = pd.read_csv('KRX300_20250914.csv', encoding='cp949')

# Only the first two columns are kept (ticker code and company name in the
# output shown below).
df_code = constituents_raw.iloc[:, :2]

# Render every ticker with the '06' format spec so codes are six characters
# wide (e.g. '005930' in the output).
df_code['종목코드'] = df_code['종목코드'].map(lambda ticker: format(ticker, '06'))
df_code

Unnamed: 0,종목코드,종목명
0,005930,삼성전자
1,000660,SK하이닉스
2,373220,LG에너지솔루션
3,207940,삼성바이오로직스
4,012450,한화에어로스페이스
...,...,...
295,336370,솔루스첨단소재
296,005420,코스모화학
297,145720,덴티움
298,002710,TCC스틸


In [6]:
# 분석 기간 설정 후 데이터 불러오기
import FinanceDataReader as fdr
import numpy as np

code = df_code.종목코드
name = df_code.종목명

# Download the daily price table of every constituent for the analysis window
# (about five months back from the end date), tagging each row with its ticker
# code and company name.
# The per-ticker frames are collected in a list and concatenated once:
# calling pd.concat inside the loop re-copies all rows gathered so far on
# every iteration.
price_frames = []

# total= lets tqdm show a percentage; a bare zip() has no length.
for i, j in tqdm(zip(code, name), total=len(code)):
    df1 = fdr.DataReader(i, '2025-04-14', '2025-09-14')
    df1['종목코드'] = i
    df1['종목명'] = j
    price_frames.append(df1)

df = pd.concat(price_frames)

# "Date" is resolved by sort_values as the index level name here.
df = df.sort_values("Date")

300it [01:50,  2.72it/s]


In [7]:
# Log-transformed change: log(1 + Change).
# assumes `Change` is the fractional daily price change returned by the data reader — TODO confirm
df['log_Change'] = np.log(1 + df['Change'])

In [8]:
# Count, per company name, the rows whose Volume is exactly 0.
df.loc[df['Volume'] == 0, '종목명'].value_counts()

종목명
KG모빌리티    16
영풍         9
리노공업       9
카카오페이      2
Name: count, dtype: int64

In [9]:
# Split the long price table into one DataFrame per company.
# The list follows the order of `code`, so position k belongs to code[k].
df_ls = [df[df.종목코드 == ticker] for ticker in tqdm(code)]


100%|██████████| 300/300 [00:00<00:00, 1088.38it/s]


In [10]:
# Picks out the companies that had a zero-volume row within the most recent
# `window` rows (100 by default).
# Author's note: building the features needs about 100 days of normal trading
# data, so the companies picked here are not traded and are meant to be placed
# in the middle of the ranking.
def Abnormal_trading(df, code, window=100):
    """Return the codes whose recent price history contains a zero-volume row.

    Args:
        df: iterable of per-company DataFrames, each with a ``Volume`` column.
            Paired with ``code`` by position.
        code: iterable of company codes, in the same order as ``df``.
        window: number of trailing rows to inspect in each frame (default 100).
            Frames shorter than ``window`` are inspected in full.

    Returns:
        list: the entries of ``code`` whose frame has at least one row with
        ``Volume == 0`` among its last ``window`` rows, in input order.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        # iloc[-0:] would select the whole frame rather than nothing.
        raise ValueError("window must be a positive integer")

    stop_trading = []
    for stock_code, frame in zip(code, df):
        recent_volume = frame['Volume'].iloc[-window:]
        # .any() is False for an empty selection, so empty frames are never flagged.
        if (recent_volume == 0).any():
            stop_trading.append(stock_code)

    return stop_trading

# Returns the full list of codes with the picked-out companies removed.
def Make_trading_code(code, stop):
    """Return a new list holding ``code`` minus the entries listed in ``stop``.

    Each entry of ``stop`` removes its first occurrence from the copy; the
    input ``code`` is left untouched.

    Args:
        code: iterable of all company codes.
        stop: iterable of codes to take out.

    Returns:
        list: the remaining codes, in their original order.

    Raises:
        ValueError: if an entry of ``stop`` is not present in the list.
    """
    remaining = list(code)
    for halted in stop:
        # index() finds the first match and raises ValueError when there is none.
        del remaining[remaining.index(halted)]

    return remaining

In [11]:
# Codes flagged by Abnormal_trading, and the remaining codes that go on to
# feature building.
stop_trading = Abnormal_trading(df_ls, code)
trading_code = Make_trading_code(code, stop_trading)

# Displays (number of flagged codes, number of remaining codes).
len(stop_trading), len(trading_code)

100%|██████████| 300/300 [00:00<00:00, 12236.97it/s]


(4, 296)

In [12]:
# Extract only the most recent 100 rows for each remaining company.
# .copy() detaches every slice from `df`: later cells add indicator columns to
# these frames, and writing into a slice of another frame is chained
# assignment (any resulting pandas warning is hidden by the global filter).
df_lst = []

for i in tqdm(trading_code):
    df_lst.append(df[df.종목코드 == i].iloc[-100:].copy())

# The per-company list over all codes is not used in the cells below.
del df_ls

100%|██████████| 296/296 [00:00<00:00, 1133.54it/s]


In [13]:
# RSI, StochasticOscillator, MACD, BollingerBands 추가
from ta.momentum import RSIIndicator
from ta.momentum import StochasticOscillator
from ta.trend import MACD
from ta.volatility import BollingerBands

for frame in tqdm(df_lst):
    close = frame['Close']

    # RSI over a 14-row window.
    frame['RSI'] = RSIIndicator(close, window=14, fillna=False).rsi()

    # Stochastic oscillator (14-row window, 3-row smoothing), plus its distance
    # from the oscillator's signal series.
    stochastic = StochasticOscillator(frame['High'], frame['Low'], close,
                                      window=14, smooth_window=3, fillna=False)
    frame['stoch'] = stochastic.stoch()
    frame['stoch_diff'] = frame['stoch'] - stochastic.stoch_signal()

    # MACD (26/12/9) and the value returned by macd_diff().
    macd = MACD(close, window_slow=26, window_fast=12,
                window_sign=9, fillna=False)
    frame['MACD'] = macd.macd()
    frame['MACD_diff'] = macd.macd_diff()

    # Bollinger Bands (20-row window, 2 deviations): upper/lower band
    # indicator columns and the pband value.
    bands = BollingerBands(close, window=20, window_dev=2, fillna=False)
    frame['Boll_h_i'] = bands.bollinger_hband_indicator()
    frame['Boll_l_i'] = bands.bollinger_lband_indicator()

    frame['Boll_p'] = bands.bollinger_pband()

100%|██████████| 296/296 [00:00<00:00, 593.88it/s]


In [14]:
# Trailing 40-row sum of `Change` for every company.
sum_change = []

for frame in tqdm(df_lst):
    # The first 39 rows have no complete 40-row window and are filled with 0.
    trailing_sums = [0] * 39
    # Row k (k >= 39) gets the sum of Change over rows k-39 .. k.
    trailing_sums.extend(
        frame[start:start + 40].Change.sum()
        for start in range(len(frame) - 39)
    )
    sum_change.append(trailing_sums)

# Attach the trailing sums as a new column on each company's frame.
for frame, trailing_sums in zip(df_lst, sum_change):
    frame['sum_change'] = trailing_sums

100%|██████████| 296/296 [00:00<00:00, 413.60it/s]


In [15]:
# Keep only the columns needed for X: skip the first four columns by position,
# then drop the ticker code, the company name and the raw Change column.
# NOTE(review): iloc[:, 4:] depends on the column order of the downloaded
# price table — confirm which four columns are being skipped.
df_lst2 = [
    frame.iloc[:, 4:].drop(columns=['종목코드', '종목명', 'Change'])
    for frame in tqdm(df_lst)
]

100%|██████████| 296/296 [00:00<00:00, 3212.58it/s]


In [16]:
# 정규화
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaled_df_lst2 = []

# Columns scaled to the [0, 1] range (MinMaxScaler) ...
mm = ['RSI', 'stoch', 'Boll_h_i', 'Boll_l_i']
# ... and columns standardised to zero mean / unit variance (StandardScaler).
st = ['Volume', 'log_Change', 'sum_change', 'Boll_p', 'stoch_diff', 'MACD', 'MACD_diff']

for frame in tqdm(df_lst2):
    # Normalise X. Fresh scalers are fitted on each company's own rows, so
    # every company is scaled independently of the others.
    standardized = pd.DataFrame(StandardScaler().fit_transform(frame[st]), columns=st)
    min_maxed = pd.DataFrame(MinMaxScaler().fit_transform(frame[mm]), columns=mm)

    # Both parts carry a fresh 0..n-1 index, so they align row by row.
    # Resulting column order: `st` columns first, then `mm` columns.
    scaled_df_lst2.append(pd.concat([standardized, min_maxed], axis=1))

del df_lst2

100%|██████████| 296/296 [00:00<00:00, 705.54it/s]


In [17]:
# Once all indicators have been built, only the most recent 40 rows of each
# company are needed.
scaled_df_lst3 = [frame.iloc[-40:] for frame in scaled_df_lst2]

del scaled_df_lst2

In [18]:
# Turn the list of per-company frames into a single array.
# The shape printed below is (296, 40, 11): one 40-row, 11-column block per company.
X_test = np.array(scaled_df_lst3)
print('X_test Shape :', X_test.shape)

X_test Shape : (296, 40, 11)


In [19]:
# 데이터셋 형태, 크기 조정
import torch
from torch.autograd import Variable

# Convert the feature array to a float32 tensor (the dtype torch.Tensor(...)
# produced before). torch.autograd.Variable is deprecated: tensors can be
# passed to the model directly, so the wrapper is dropped.
X_test_tensors = torch.as_tensor(X_test, dtype=torch.float32)

print('X_test_tensors Shape :', X_test_tensors.shape)

X_test_tensors Shape : torch.Size([296, 40, 11])


In [20]:
# Drop the NumPy array; the cells shown below only use X_test_tensors.
del X_test

### prediction

In [28]:
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

#device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [29]:
# Model hyperparameters, passed to the LSTM constructor when the model is built.
# NOTE(review): presumably these must match the values used to train the
# saved checkpoint — confirm.

input_size = 11   # number of input features per time step
hidden_size = 5   # number of features in the LSTM hidden state
num_layers = 2    # number of stacked LSTM layers

num_classes = 1   # number of output values per sequence

In [30]:
class LSTM(nn.Module):
    """Stacked LSTM followed by a three-layer fully connected head.

    The head is applied to the final hidden state of the last LSTM layer, so
    the model returns one output row per input sequence.

    Args:
        num_classes: width of the final linear layer (outputs per sequence).
        input_size: number of features per time step.
        hidden_size: number of features in the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        seq_length: stored on the instance; not used by ``forward``.
    """

    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_length):
        super().__init__()
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length

        # Submodule names are part of the checkpoint's state_dict keys; keep them.
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc_1 = nn.Linear(hidden_size, 128)
        self.fc_2 = nn.Linear(128, 64)
        self.fc = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run a batch of sequences through the network.

        Args:
            x: tensor of shape (batch, seq_len, input_size); the LSTM is built
                with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # Zero initial states, created with the dtype/device of the input so
        # the model also works after being moved off the CPU.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h_0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)
        c_0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)

        _, (hn, _) = self.lstm(x, (h_0, c_0))

        # hn is (num_layers, batch, hidden_size). Only the last layer's state
        # feeds the head. The previous code flattened all layers, ran the head
        # on every row and kept the trailing batch-sized slice, using the
        # *global* num_layers to size it; indexing hn[-1] selects the same
        # rows without depending on a module-level variable.
        last_hidden = hn[-1]

        out = self.relu(last_hidden)
        # Author's note: added because NaN losses appeared; this resolved it.
        out = torch.nan_to_num(out)
        out = self.fc_1(out)
        out = self.relu(out)
        out = self.fc_2(out)
        out = self.relu(out)
        out = self.fc(out)

        return out

In [31]:
# Load the trained model weights.
PATH = "lstm_7_23_jinsim_h5.pth"

model = LSTM(num_classes, input_size, hidden_size, num_layers, 40)

# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
state_dict = torch.load(PATH, map_location=torch.device('cpu'))

# strict=False ignores keys that do not line up between the checkpoint and the
# model, so a mismatch would silently leave layers at their random
# initialisation. Surface any mismatch instead of hiding it.
load_report = model.load_state_dict(state_dict, strict=False)
if load_report.missing_keys or load_report.unexpected_keys:
    print('state_dict mismatch -',
          'missing:', load_report.missing_keys,
          'unexpected:', load_report.unexpected_keys)

model.eval()

LSTM(
  (lstm): LSTM(11, 5, num_layers=2, batch_first=True)
  (fc_1): Linear(in_features=5, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=64, bias=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
)

In [32]:
# Prediction: run the model on all sequences with gradient tracking disabled.
with torch.no_grad(): 
    # Device-aware variant, disabled together with the commented-out `device` definition:
    #predicted22 = model(X_test_tensors.to(device))
    predicted22 = model(X_test_tensors)

In [33]:
# Combine the predicted values with their ticker codes.
# The model returns one row per company; column 0 is read straight from the
# tensor as floats. The previous version stored one-element lists and unpacked
# them with result['pred'][i] = ..., which is chained assignment and does not
# write back to `result` under pandas Copy-on-Write.
pred = pd.Series(predicted22[:, 0].tolist(), dtype='float')
trading_code = pd.Series(trading_code)

# pd.concat would pad a length mismatch with NaN instead of failing.
if len(pred) != len(trading_code):
    raise ValueError(
        f"got {len(pred)} predictions for {len(trading_code)} ticker codes"
    )

result = pd.concat([pred, trading_code], axis=1)
result.columns = ['pred', '종목코드']

In [37]:
# Rank by predicted value, highest first; method='first' breaks ties by order
# of appearance so that every rank is unique.
result['순위'] = result['pred'].rank(method='first', ascending=False).astype('int')

# Order the table by rank and renumber the rows from 0.
result = result.sort_values('순위').reset_index(drop=True)

# Attach the company name by joining on the ticker code.
merge_inner = result.merge(df_code, on='종목코드')

In [38]:
# Display the first 20 rows of the rank-ordered table.
merge_inner.head(20)

Unnamed: 0,pred,종목코드,순위,종목명
0,0.13681,462870,1,시프트업
1,0.122589,34020,2,두산에너빌리티
2,0.12071,30200,3,KT
3,0.113046,35250,4,강원랜드
4,0.099477,114090,5,GKL
5,0.097339,1440,6,대한전선
6,0.096282,15760,7,한국전력
7,0.095685,82270,8,젬백스
8,0.092544,1430,9,세아베스틸지주
9,0.09236,96530,10,씨젠
