# In [None]:  (notebook cell marker left over from the Jupyter export)
### Two in one
##

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import json
import csv
import time

from datetime import date

import requests

class CrawlerController(object):
    '''Fan a list of targets out over several Crawler objects.

    Every Crawler receives at most ``max_stock_per_crawler`` targets, which
    keeps each request URL from growing too long.
    '''

    def __init__(self, targets, max_stock_per_crawler=50):
        # One Crawler per consecutive slice of the target list.
        batch_starts = range(0, len(targets), max_stock_per_crawler)
        self.crawlers = [
            Crawler(targets[start:start + max_stock_per_crawler])
            for start in batch_starts
        ]

    def run(self):
        '''Return the rows of all crawlers, concatenated in crawler order.'''
        return [row for worker in self.crawlers for row in worker.get_data()]

class Crawler(object):
    '''Request real-time quotes from the TWSE Market Information System.

    The query URL is built once, at construction time, from the given
    targets. Each target must already carry its market prefix (``tse_`` or
    ``otc_``), because only the ``.tw`` suffix is appended here.
    '''

    ENDPOINT = 'http://mis.twse.com.tw/stock/api/getStockInfo.jsp'
    INDEX_PAGE = 'http://mis.twse.com.tw/stock/index.jsp'
    # Seconds to wait on each HTTP request; without a timeout a stalled
    # server would block the caller forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, targets):
        # Push the timestamp 1000 seconds (1,000,000 ms) ahead to guard
        # against clock inaccuracy.
        timestamp = int(time.time() * 1000 + 1000000)
        channels = '|'.join('{}.tw'.format(target) for target in targets)
        self.query_url = '{}?_={}&ex_ch={}'.format(
            self.ENDPOINT, timestamp, channels)

    def get_data(self):
        '''Fetch the quotes and return the list stored under ``msgArray``.

        Any failure (network error, timeout, invalid JSON, unexpected
        payload) is printed and reported as an empty list.
        '''
        try:
            # The context manager closes the session's connections when done.
            with requests.session() as session:
                # Visit the landing page first so the session gets its cookies.
                session.get(self.INDEX_PAGE,
                            headers={'Accept-Language': 'zh-TW'},
                            timeout=self.REQUEST_TIMEOUT)
                response = session.get(self.query_url,
                                       timeout=self.REQUEST_TIMEOUT)
                content = json.loads(response.text)
        except Exception as err:
            print(err)
            return []

        return self._extract_rows(content)

    @staticmethod
    def _extract_rows(content):
        '''Return ``content['msgArray']``, or ``[]`` when it is not there.'''
        rows = content.get('msgArray') if isinstance(content, dict) else None
        if rows is None:
            print('No msgArray in response')
            return []
        return rows

class Recorder(object):
    '''Append quote rows to per-stock CSV files inside a dated folder.

    The folder is ``<path>/<YYYYMMDD>`` for the day the Recorder is created;
    each stock gets its own ``<code>.csv`` file in it.
    '''

    # Keys copied from every quote row, in output column order.
    FIELDS = (
        't',   # time of the data
        'z',   # latest trade price
        'tv',  # volume of the current match
        'v',   # cumulative volume for the day
        'a',   # best five ask prices
        'f',   # best five ask quantities
        'b',   # best five bid prices
        'g',   # best five bid quantities
        'n',   # name
        'o',   # open
        'l',   # low
        'h',   # high
        'y',   # not described in the original; RegenController names this column 'last'
    )

    def __init__(self, path='data'):
        self.folder_path = '{}/{}'.format(path, date.today().strftime('%Y%m%d'))
        # makedirs also creates a missing parent folder and tolerates the
        # folder already existing.
        os.makedirs(self.folder_path, exist_ok=True)

    def record_to_csv(self, data):
        '''Append one CSV line per row of ``data`` to that row's stock file.

        ``data`` is an iterable of dicts keyed by ``'c'`` (used as the file
        name) plus every key in ``FIELDS``. A row that cannot be written is
        reported with a ``Fatal:`` message and skipped.
        '''
        for row in data:
            try:
                file_path = '{}/{}.csv'.format(self.folder_path, row['c'])
                print ("File :",file_path)
                # Collect the values before opening the file, so a row with
                # missing keys does not leave an empty file behind.
                values = [row[key] for key in self.FIELDS]
                # newline='' is what the csv module expects of its file object.
                with open(file_path, 'a', newline='') as output_file:
                    writer = csv.writer(output_file, delimiter=',', quotechar='"')
                    writer.writerow(values)
            except Exception as err:
                print('Fatal: ',err)

import datetime
import requests
import sched
import time as tm
import json
from time import gmtime, strftime                
                
# Module-level scheduler on which the periodic jobs below queue themselves via s.enter(...).
s = sched.scheduler(tm.time, tm.sleep)

from Stock import stock_mrk 

def main_crawler():
    '''Run one crawl cycle, then queue the next one on the scheduler ``s``.

    Stock ids are read from ``stocknumber.csv`` (one per line) and passed
    through ``stock_mrk``. When the current local time lies between 09:00
    and 22:31, the quotes are fetched with CrawlerController and appended
    to disk with Recorder. In every case the function re-enters itself on
    ``s`` with a 1-second delay.
    '''
    # Read the list once and close the file right away.
    with open('stocknumber.csv', 'r') as stock_file:
        tar = [line.strip() for line in stock_file]

    # stock_mrk adds the TSE/OTC market marker to each stock id.
    targets = [stock_mrk(code) for code in tar]

    print (tar, " + ", targets)

    now = datetime.datetime.now()
    stamp = '{}:{}:{}:{}'.format(now.date(), now.hour, now.minute, now.second)
    print("開始更新時間:" + stamp)

    start_time = now.replace(hour=9, minute=0, second=0, microsecond=0)
    end_time = now.replace(hour=22, minute=31, second=0, microsecond=0)

    # Short pause before each request, so the exchange's server is less
    # likely to rate-limit or block this IP.
    sleeptimer = 0.5
    if start_time <= now <= end_time:
        tm.sleep(sleeptimer)
        try:
            data = CrawlerController(targets).run()
            Recorder().record_to_csv(data)
        except Exception as err:
            # Catch Exception only, so Ctrl-C can still stop the loop, and
            # show what actually went wrong.
            print(err)
            msg = "證交所網路忙碌！"
        else:
            msg = "資料擷取時間:" + stamp
        print("Done.", msg)
    else:
        print ('非營業時間，不提供連續資料。')
        print ('繼續等待交易時間。。。')

    # Queue the next cycle.
    s.enter(1, 0, main_crawler, argument=())
                

### 
#   Regenerate purged price file
### 


# 讀取 CSV File

def RegenController(targets):
    '''Rebuild a cleaned ``._csv`` file for every stock crawled today.

    Each ``data/<YYYYMMDD>/<stock>.csv`` is read, cleaned by
    ``_purge_price_frame`` and written, without a header line, to
    ``<stock>._csv`` in the same folder. Files without any data are
    reported and skipped. ``targets`` is only printed.

    Raises whatever ``os.listdir`` raises when today's folder is missing.
    '''
    import pandas as pd  # imported here, as in the original, so the module loads without pandas

    from os import listdir
    from os.path import join
    from datetime import date

    print ('股票代碼: ',targets)
    # Same folder naming as Recorder; date.today() is read only once.
    today = date.today().strftime('%Y%m%d')
    folder = join('data', today)

    # Stocks captured today (file names without the '.csv' suffix).
    index_list = [f[:-4] for f in listdir(folder) if f.endswith('.csv')]

    colnames = ['時間', '即時價位', '即時量', '總量', '5檔賣價', '5檔賣量', '5檔買價', '5檔買量', 'name', 'open', 'low', 'high', 'last']

    for stock_id in index_list:
        source_path = join(folder, stock_id + '.csv')
        try:
            # The with-block closes the input file handle.
            with open(source_path, 'rt') as f:
                df = pd.read_csv(f, names=colnames, header=None)
        except pd.errors.EmptyDataError:
            df = None

        # A file without rows has nothing to regenerate; skip it instead of
        # aborting the remaining stocks.
        if df is None or df.empty:
            print('Skip empty file: ', source_path)
            continue

        result_df = _purge_price_frame(df)

        # Temporary '._csv' output next to the raw file.
        result_df.to_csv(join(folder, stock_id + '._csv'), header=False, index=False)

    print ('All Done! ')


def _purge_price_frame(df):
    '''Return the de-duplicated, gap-filled version of one stock's raw rows.

    * rows repeating an earlier (時間, 總量) pair are dropped;
    * the four best-five columns are split on '_' into the list columns
      Sale5 / Sale5V / Buy5 / Buy5V;
    * rows whose cumulative volume does not exceed the last kept row's are
      dropped;
    * a '-' price is replaced by the last seen price (initially the first
      ask price of the first row), and a '-' volume by the increase in
      cumulative volume.

    The two ``reset_index`` calls keep the previous indexes as leading
    columns, exactly as the original output did.
    '''
    result_df = df.drop_duplicates(subset=['時間', '總量'], keep='first').copy()

    result_df['Sale5']  = df['5檔賣價'].str.split('_')
    result_df['Sale5V'] = df['5檔賣量'].str.split('_')
    result_df['Buy5']   = df['5檔買價'].str.split('_')
    result_df['Buy5V']  = df['5檔買量'].str.split('_')

    result_df = result_df.drop(columns=['5檔賣價', '5檔賣量', '5檔買價', '5檔買量'])
    result_df = result_df.reset_index()

    last_v = 0
    last_p = result_df.loc[0, 'Sale5'][0]
    stale = []  # labels of rows whose cumulative volume did not grow

    for i in result_df.index:
        total = int(result_df.loc[i, '總量'])
        if total <= last_v:
            stale.append(i)
            continue

        if str(result_df.loc[i, '即時價位']) == '-':
            result_df.loc[i, '即時價位'] = last_p
        else:
            last_p = float(result_df.loc[i, '即時價位'])

        if str(result_df.loc[i, '即時量']) == '-':
            result_df.loc[i, '即時量'] = total - last_v

        # Keep the running total as an int so the next comparison is numeric
        # even when the column was read as text.
        last_v = total

    # Drop all stale rows in one go rather than once per row.
    return result_df.drop(stale).reset_index()
# 


#
#  Main loop 
#
import datetime
import requests
import sched
import time as tm
import json
from time import gmtime, strftime                


def Regen_loop():
    '''Run one regeneration cycle, then queue the next one on ``s``.

    Stock ids are read from ``stocknumber.csv`` (one per line). When the
    current local time lies between 09:00 and 22:31, RegenController is
    called after a short pause. In every case the function re-enters
    itself on the scheduler ``s`` with a 5-second delay.
    '''
    # Read the list once and close the file right away.
    with open('stocknumber.csv', 'r') as stock_file:
        targets = [line.strip() for line in stock_file]

    now = datetime.datetime.now()
    stamp = '{}:{}:{}:{}'.format(now.date(), now.hour, now.minute, now.second)
    print("開始統整時間:" + stamp)

    start_time = now.replace(hour=9, minute=0, second=0, microsecond=0)
    end_time = now.replace(hour=22, minute=31, second=0, microsecond=0)

    # Pause before each regeneration pass.
    sleeptimer = 5.5
    if start_time <= now <= end_time:
        tm.sleep(sleeptimer)
        try:
            RegenController(targets)
        except Exception as err:
            # Catch Exception only, so Ctrl-C can still stop the loop, and
            # show what actually went wrong.
            print(err)
            msg = "證交所網路忙碌！"
        else:
            msg = "_csv 更新時間:" + stamp
        print("Done...", msg)
    else:
        print ('非營業時間，不須統整。')
        print ('繼續等待交易時間。。。')

    # Queue the next cycle.
    s.enter(5, 0, Regen_loop, argument=())
            
            
# Scheduler shared by both periodic jobs; each job re-queues itself on it.
s = sched.scheduler(timefunc=tm.time, delayfunc=tm.sleep)

if __name__ == '__main__':
    # Queue the first run of each job (crawler after 1 s, regeneration
    # after 5 s), then block inside the scheduler loop.
    s.enter(delay=1, priority=0, action=main_crawler)
    s.enter(delay=5, priority=0, action=Regen_loop)
    s.run()
