In [156]:
# import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlopen
from bs4 import BeautifulSoup
from google.colab import files
import time
import asyncio

from sklearn.linear_model import LinearRegression

In [109]:
def get_company_code():

    df = pd.read_html('http://kind.krx.co.kr/corpgeneral/corpList.do?method=download', header=0)[0]
    company_code_df = df[['회사명','종목코드']]

    return company_code_df

def make_soup(company_code):
    # company_code = '005930' # test company code
    company_code = str(company_code).zfill(6)
    url = f'https://finance.naver.com/item/sise.nhn?code={company_code}'

    source = urlopen(url)
    soup = BeautifulSoup(source, 'lxml')
    
    return soup

def call_element(class_name):
    features = []

    for element in class_name:
        features.append(element.text.strip().replace(',',''))

    return features


# collecting features has class name as p11
def make_features(soup):
    p11 = soup.find_all(class_='p11')
    features = call_element(p11)
    features = features[:24]
    features[4] = features[4][:-1]
    features[10] = features[10][:-1]
    features[20] = features[20][:-2]
    features[22] = features[22][:-2]
    features[23] = features[23][:-2]

    return features

def make_stock_data(company_name, company_code, features):
    stock_dict = {'기업명':[company_name],
                  '기업코드':[company_code],
                '현재가':[features[0]],
                '매도호가':[features[1]],
                '전일대비':[features[2]],
                '매수호가':[features[3]],
                '등락률':[features[4]],
                '전일가':[features[5]],
                '거래량':[features[6]],
                '시가':[features[7]],
                '거래대금':[features[8]],
                '고가':[features[9]],
                '액면가':[features[10]],
                '저가':[features[11]],
                '상한가':[features[12]],
                '전일상한':[features[13]],
                '하한가':[features[14]],
                '전일하한':[features[15]],
                'PER':[features[16]],
                'EPS':[features[17]],
                '52주 최고':[features[18]],
                '52주 최저':[features[19]],
                '시가총액':[features[20]],
                '상장주식수':[features[21]],
                '외국인현재':[features[22]],
                '자본금':[features[23]]}

    stock_data = pd.DataFrame(stock_dict)
    
    return stock_data


In [110]:
# ATTENTION! THIS SHELL TAKES TOO LONG TIME!! IT TAKES ABOUT 1HOURS AND 17MINITUES
# load company code
company_code_df = get_company_code()

# initiallize
stock_data = pd.DataFrame()

for index, company in company_code_df.iterrows():

    company_name = company['회사명']
    company_code =  company['종목코드']

    soup = make_soup(company_code)
    features = make_features(soup)
    stock_data = pd.concat([stock_data, (make_stock_data(company_name, company_code, features))],ignore_index=True, axis=0)

# save data every 200 index
    if index % 200 == 0:
        stock_data.to_csv("stock_data.csv")
        print(f"DataFrameSaved at {index}rows")

# finally save all data and download
stock_data.to_csv("stock_data.csv")
files.download("stock_data.csv")

DataFrameSaved at 0rows
DataFrameSaved at 200rows
DataFrameSaved at 400rows
DataFrameSaved at 600rows
DataFrameSaved at 800rows
DataFrameSaved at 1000rows
DataFrameSaved at 1200rows
DataFrameSaved at 1400rows
DataFrameSaved at 1600rows
DataFrameSaved at 1800rows
DataFrameSaved at 2000rows
DataFrameSaved at 2200rows
DataFrameSaved at 2400rows


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [229]:
stock_data = pd.read_csv('stock_data.csv')
stock_data.head()

Unnamed: 0.1,Unnamed: 0,기업명,기업코드,현재가,매도호가,전일대비,매수호가,등락률,전일가,거래량,시가,거래대금,고가,액면가,저가,상한가,전일상한,하한가,전일하한,PER,EPS,52주 최고,52주 최저,시가총액,상장주식수,외국인현재,자본금
0,0,DRB동일,4840,7370,7370,20,7360,0.27,7350,15014,7310,110,7380,500,7310,9550,9540,5150,5140.0,48.17,153.0,10650,6080,1469,19930000,441,9965
1,1,DSR,155660,6330,6340,210,6330,-3.21,6540,136824,6540,870,6580,500,6300,8500,8430,4580,4550.0,9.65,656.0,8800,3615,1013,16000000,241,8000
2,2,GS글로벌,1250,2710,2710,65,2705,-2.34,2775,437858,2775,1191,2775,2500,2700,3605,3605,1945,1945.0,-2.62,-1036.0,3350,1520,2237,82533764,3200,206334
3,3,HDC현대산업개발,294870,31050,31050,100,31000,-0.32,31150,265921,31200,8251,31200,5000,30850,40450,40150,21850,21650.0,9.89,3140.0,33400,19700,20464,65907330,7530,329536
4,4,KEC,92220,2330,2335,5,2330,0.22,2325,1084276,2340,2550,2395,500,2310,3020,3000,1630,1620.0,-8.89,-262.0,3545,737,2719,116706940,4378,58353


In [194]:
stock_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2450 entries, 0 to 2449
Data columns (total 27 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2450 non-null   int64  
 1   기업명         2450 non-null   object 
 2   기업코드        2450 non-null   int64  
 3   현재가         2450 non-null   int64  
 4   매도호가        2450 non-null   int64  
 5   전일대비        2450 non-null   int64  
 6   매수호가        2450 non-null   int64  
 7   등락률         2450 non-null   float64
 8   전일가         2450 non-null   int64  
 9   거래량         2450 non-null   int64  
 10  시가          2450 non-null   int64  
 11  거래대금        2450 non-null   int64  
 12  고가          2450 non-null   int64  
 13  액면가         2449 non-null   object 
 14  저가          2450 non-null   int64  
 15  상한가         2450 non-null   int64  
 16  전일상한        2450 non-null   int64  
 17  하한가         2450 non-null   int64  
 18  전일하한        2440 non-null   float64
 19  PER         2281 non-null  

In [195]:
stock_data.isnull().sum()

Unnamed: 0      0
기업명             0
기업코드            0
현재가             0
매도호가            0
전일대비            0
매수호가            0
등락률             0
전일가             0
거래량             0
시가              0
거래대금            0
고가              0
액면가             1
저가              0
상한가             0
전일상한            0
하한가             0
전일하한           10
PER           169
EPS           154
52주 최고          0
52주 최저          0
시가총액            0
상장주식수           0
외국인현재           0
자본금             0
dtype: int64

In [230]:
stock_data.drop(stock_data[stock_data['자본금'] == 5].index, inplace=True)

for index,data in stock_data.iterrows():
    try:
        float(data['액면가'])
    except ValueError:
        stock_data.drop(index, inplace=True)

stock_data['액면가'].fillna(0, inplace=True)
stock_data.drop(stock_data[stock_data['EPS'].isnull()].index, inplace=True)
# stock_data.drop(stock_data[stock_data['EPS'] < 0].index, inplace=True)
# stock_data['PER'].fillna((stock_data['현재가']/stock_data['EPS']), inplace=True)
stock_data.drop(stock_data[stock_data['PER'].isnull()].index, inplace=True)
stock_data['전일하한'].fillna(stock_data['하한가'], inplace=True)


stock_data.isnull().sum()
stock_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2247 entries, 0 to 2431
Data columns (total 27 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2247 non-null   int64  
 1   기업명         2247 non-null   object 
 2   기업코드        2247 non-null   int64  
 3   현재가         2247 non-null   int64  
 4   매도호가        2247 non-null   int64  
 5   전일대비        2247 non-null   int64  
 6   매수호가        2247 non-null   int64  
 7   등락률         2247 non-null   float64
 8   전일가         2247 non-null   int64  
 9   거래량         2247 non-null   int64  
 10  시가          2247 non-null   int64  
 11  거래대금        2247 non-null   int64  
 12  고가          2247 non-null   int64  
 13  액면가         2247 non-null   object 
 14  저가          2247 non-null   int64  
 15  상한가         2247 non-null   int64  
 16  전일상한        2247 non-null   int64  
 17  하한가         2247 non-null   int64  
 18  전일하한        2247 non-null   float64
 19  PER         2247 non-null  

In [244]:
X_train = stock_data.drop(["기업명", "기업코드", "현재가"], axis=1)
X_test = X_train.iloc[-100:]
X_train = X_train.iloc[:-100]
Y_train = stock_data["현재가"].iloc[:-100]

Y_answer = stock_data["현재가"].iloc[-100:]



In [245]:
linear = LinearRegression()
linear.fit(X_train, Y_train)

Y_pred = linear.predict(X_test)

linear.score(X_train,Y_train)

0.9999720077183458

In [259]:
pd.DataFrame({"Y_answer" : Y_answer.values, "Y_pred" : Y_pred})

Unnamed: 0,Y_answer,Y_pred
0,12950,13007.906802
1,10750,10862.936038
2,2465,2324.416976
3,639,527.296850
4,40600,40628.653093
...,...,...
95,16100,16007.756163
96,795,782.193484
97,18850,18595.336624
98,1220,1244.374806
