In [34]:
import pandas as pd
import numpy as np

#visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [35]:
import ast
import requests
from datetime import date, timedelta
list = []
class Krx:

    isin_code = 'KR7005930003'
    start_date = '2019/01/20'
    end_date = '2019/06/20'
    


    def main(self):

        start = date(int(self.start_date.split("/")[0]), int(self.start_date.split("/")[1]), int(self.start_date.split("/")[2]))
        end = date(int(self.end_date.split("/")[0]), int(self.end_date.split("/")[1]), int(self.end_date.split("/")[2]))

        delta = end - start

        day_price_data = self.get_day_price()
        short_stock_selling_data = self.get_short_stock_selling()

        for day in range(delta.days+1):
            d = start + timedelta(days=day)
            key = str(d).replace("-", "")
            if key in day_price_data:
                list.append([str(d).replace("-", "/"), day_price_data[key][0], day_price_data[key][1], day_price_data[key][2], 
                             day_price_data[key][3], day_price_data[key][4], short_stock_selling_data[key][2], short_stock_selling_data[key][3]])
       

    def get_day_price(self):

        otp = requests.get('http://marketdata.krx.co.kr/contents/COM/GenerateOTP.jspx?bld=MKD/04/0402/04020100/mkd04020100t3_02&name=chart')

        parameters = {
            'isu_cd': self.isin_code,
            'fromdate': self.start_date.replace("/", ""),
            'todate': self.end_date.replace("/", ""),
            'pagePath': '/contents/MKD/04/0402/04020100/MKD04020100T3T2.jsp',
            'code': otp.content
        }

        res = requests.post('http://marketdata.krx.co.kr/contents/MKD/99/MKD99000001.jspx', parameters)

        data = ast.literal_eval(res.text)['block1']

        result = {}

        for item in data:
            # tdd_clsprc : 종가
            # acc_trdval : 거래대금
            # tdd_opnprc : 시가
            # tdd_hgprc : 고가
            # tdd_lwprc : 저가
            result[item['trd_dd'].replace("/", "")] = (item['tdd_clsprc'], item['tdd_opnprc'], item['tdd_hgprc'], item['tdd_lwprc'], item['acc_trdval'])

        return result

    def get_short_stock_selling(self):

        # reverse engineered from the source at here https://finance.naver.com/item/short_trade.nhn?code=005930
        otp = requests.get('https://short.krx.co.kr/contents/COM/GenerateOTP.jspx?bld=SRT/02/02010100/srt02010100X&name=form')

        parameters = {
            'isu_cd': self.isin_code,
            'strt_dd': self.start_date.replace("/", ""),
            'end_dd': self.end_date.replace("/", ""),
            'pagePath': '/contents/SRT/02/02010100/SRT02010100X.jsp',
            'code': otp.content
        }

        res = requests.post('https://short.krx.co.kr/contents/SRT/99/SRT99000001.jspx', parameters)

        data = ast.literal_eval(res.text)['block1']

        result = {}

        for item in data:
            # cvsrtsell_trdvol : 공매도 거래량
            # str_const_val1 : 공매도 잔고량
            # cvsrtsell_trdval : 공매도 거래대금
            # str_const_val2 : 공매도 잔고금액
            result[item['trd_dd'].replace("/", "")] = (item['cvsrtsell_trdvol'], item['str_const_val1'], item['cvsrtsell_trdval'], item['str_const_val2'])

        return result


if __name__ == "__main__":
    Krx().main()


In [36]:
# list는 webscraping의 결과물
df = pd.DataFrame(list, columns=["년/월/일", "종가", "시가", "고가", "저가", "거래대금", "공매도거래대금", "공매도잔고금액"])
df.head()

Unnamed: 0,년/월/일,종가,시가,고가,저가,거래대금,공매도거래대금,공매도잔고금액
0,2019/01/21,42750,42700,42750,41900,482175851350,23237367100,187014107250
1,2019/01/22,42150,42750,42850,41850,420046904750,16032791400,183901503750
2,2019/01/23,42000,41350,42250,41350,463297064600,22384438100,194500530000
3,2019/01/24,43050,43050,43100,42350,632002060800,19768624250,166935243300
4,2019/01/25,44750,44300,44750,43750,1009867914800,42827301900,203515213500


In [37]:
df.dtypes

년/월/일      object
종가         object
시가         object
고가         object
저가         object
거래대금       object
공매도거래대금    object
공매도잔고금액    object
dtype: object

In [38]:
# REMOVE COMMA
columns_values = [df.columns.values]
for i in range(len(columns_values[0])):
    df[columns_values[0][i]] = df[columns_values[0][i]].str.replace(',','')
df.head()

Unnamed: 0,년/월/일,종가,시가,고가,저가,거래대금,공매도거래대금,공매도잔고금액
0,2019/01/21,42750,42700,42750,41900,482175851350,23237367100,187014107250
1,2019/01/22,42150,42750,42850,41850,420046904750,16032791400,183901503750
2,2019/01/23,42000,41350,42250,41350,463297064600,22384438100,194500530000
3,2019/01/24,43050,43050,43100,42350,632002060800,19768624250,166935243300
4,2019/01/25,44750,44300,44750,43750,1009867914800,42827301900,203515213500


In [39]:
# 전일대비등락률 (종가, 거래대금, 공매도, 시가/종가)
Close = ['']
Volumes = ['']
ShortSelling = ['']
Open_Close = ['']

for k in range(1, len(df)-1): #처음 가격은 계산 안 되므로 1, 마지막 가격에는 값이 없는 row가 있으므로 -1
    Close.append(int((int(df['종가'][k])-int(df['종가'][k-1]))*100/int(df['종가'][k-1])))
    Volumes.append(int(df['거래대금'][k]) / int(df['거래대금'][k-1]))
    ShortSelling.append(int(df['공매도잔고금액'][k]) / int(df['공매도잔고금액'][k-1]))
    Open_Close.append(int(df['시가'][k+1]) / int(df['종가'][k])) # 다음날 시가갭이면 1보다 큼
Close.append('')
Volumes.append('')
ShortSelling.append('')
Open_Close.append('')
# Column 삽입
# https://www.geeksforgeeks.org/adding-new-column-to-existing-dataframe-in-pandas/
df.insert(8, "Close", Close)
df.insert(9, "Volumes", Volumes)
df.insert(10, "ShortSelling", ShortSelling)
df.insert(11, "Open_Close", Open_Close)
df.head()

# 날짜 헷갈리면 안 됨 feature들은 전일대비당일가격, Open_Close는 당일대비다음날시가(label이기 때문이다.)

Unnamed: 0,년/월/일,종가,시가,고가,저가,거래대금,공매도거래대금,공매도잔고금액,Close,Volumes,ShortSelling,Open_Close
0,2019/01/21,42750,42700,42750,41900,482175851350,23237367100,187014107250,,,,
1,2019/01/22,42150,42750,42850,41850,420046904750,16032791400,183901503750,-1,0.871149,0.983356,0.98102
2,2019/01/23,42000,41350,42250,41350,463297064600,22384438100,194500530000,0,1.10297,1.05763,1.025
3,2019/01/24,43050,43050,43100,42350,632002060800,19768624250,166935243300,2,1.36414,0.858277,1.02904
4,2019/01/25,44750,44300,44750,43750,1009867914800,42827301900,203515213500,3,1.59789,1.21913,1.00559
5,2019/01/28,45050,45000,45500,44600,812165597086,57134921800,219482879200,0,0.80423,1.07846,1
6,2019/01/29,45500,45050,45500,44350,729601104797,63726634500,230713847000,0,0.89834,1.05117,0.984615
7,2019/01/30,46400,44800,46400,44800,802319669700,57130043050,233805192000,1,1.09967,1.0134,1.00539
8,2019/01/31,46150,46650,47050,46150,1007418016957,56475246850,231867106900,0,1.25563,0.991711,1.01083
9,2019/02/01,46350,46650,46950,46250,644179617419,35406219100,251513222850,0,0.639436,1.08473,1.00971


In [7]:
# row 첫번째와 마지막 삭제
# https://thispointer.com/python-pandas-how-to-drop-rows-in-dataframe-by-index-labels/
df = df.drop([df.index[0], df.index[-1]])
df.head()

Unnamed: 0,년/월/일,종가,시가,고가,저가,거래대금,공매도거래대금,공매도잔고금액,Close,Volumes,ShortSelling,Open_Close
1,2019/01/22,42150,42750,42850,41850,420046904750,16032791400,183901503750,0.985965,0.871149,0.983356,0.98102
2,2019/01/23,42000,41350,42250,41350,463297064600,22384438100,194500530000,0.996441,1.10297,1.05763,1.025
3,2019/01/24,43050,43050,43100,42350,632002060800,19768624250,166935243300,1.025,1.36414,0.858277,1.02904
4,2019/01/25,44750,44300,44750,43750,1009867914800,42827301900,203515213500,1.03949,1.59789,1.21913,1.00559
5,2019/01/28,45050,45000,45500,44600,812165597086,57134921800,219482879200,1.0067,0.80423,1.07846,1.0


In [8]:
# label값을 True와 False로 변환
df['Open_Close'] = df['Open_Close'] >= 1
df.head()

Unnamed: 0,년/월/일,종가,시가,고가,저가,거래대금,공매도거래대금,공매도잔고금액,Close,Volumes,ShortSelling,Open_Close
1,2019/01/22,42150,42750,42850,41850,420046904750,16032791400,183901503750,0.985965,0.871149,0.983356,False
2,2019/01/23,42000,41350,42250,41350,463297064600,22384438100,194500530000,0.996441,1.10297,1.05763,True
3,2019/01/24,43050,43050,43100,42350,632002060800,19768624250,166935243300,1.025,1.36414,0.858277,True
4,2019/01/25,44750,44300,44750,43750,1009867914800,42827301900,203515213500,1.03949,1.59789,1.21913,True
5,2019/01/28,45050,45000,45500,44600,812165597086,57134921800,219482879200,1.0067,0.80423,1.07846,True


In [42]:
# LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X = df[['Close', 'Volumes', 'ShortSelling']]
y = df['Open_Close']
X_train, X_test, y_train, y_test = train_test_split(X, y)
logreg = LogisticRegression().fit(X_train, y_train)
logreg.score(X_test, y_test)



ValueError: could not convert string to float: 