In [None]:
import requests
import pandas as pd
import datetime
from io import StringIO

In [None]:
# 爬取資料
def crawl_legal_person(date, timeout=30):
    """Download and tidy one day's institutional-investor trading report.

    Fetches the T86 CSV report for ``date`` and converts it into a numeric
    DataFrame indexed by ``(stock_id, date)``.

    Args:
        date: Object with a ``strftime`` method (e.g. ``datetime.date``)
            identifying the day to download. The same object is stored
            verbatim in the ``date`` index level.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A DataFrame of numeric columns indexed by ``(stock_id, date)``, or
        ``None`` when the download fails, the server answers with an HTTP
        error, or the response holds no parsable report rows.
    """
    # Turn the date object into the compact string used in the URL: '20180102'.
    datestr = date.strftime('%Y%m%d')
    url = 'http://www.tse.com.tw/fund/T86?response=csv&date=' + datestr + '&selectType=ALLBUT0999'

    # Download the report. Only network/HTTP failures are turned into None,
    # so Ctrl-C and programming errors still surface.
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        return None

    # Build the DataFrame. The first line of the file is a title, so the
    # column names are on the second line (header=1). Columns that are
    # entirely empty and rows with any missing value are discarded.
    try:
        df = (
            pd.read_csv(StringIO(r.text), header=1)
            .dropna(how='all', axis=1)
            .dropna(how='any')
        )
    except ValueError:
        # pandas' EmptyDataError and ParserError both derive from ValueError.
        return None

    # No usable rows, or not the expected report layout: report "no data".
    if df.empty or '證券代號' not in df.columns:
        return None

    # Normalise for storage: strip thousands separators from every cell.
    df = df.astype(str).apply(lambda s: s.str.replace(',', '', regex=False))

    # Stock codes may be wrapped as ="00637L"; remove the '=' and '"'.
    df['stock_id'] = (
        df['證券代號']
        .str.replace('=', '', regex=False)
        .str.replace('"', '', regex=False)
    )

    # The raw code column is no longer needed once stock_id exists.
    df = df.drop(['證券代號'], axis=1)

    # Index every row by (stock_id, date).
    df['date'] = date
    df = df.set_index(['stock_id', 'date'])

    # Convert to numbers; columns that end up entirely non-numeric are dropped.
    return df.apply(lambda s: pd.to_numeric(s, errors='coerce')).dropna(how='all', axis=1)


# Try the crawler on a single known day (2020-06-24) and display the result.
df = crawl_legal_person(datetime.date(year=2020, month=6, day=24))
df


Unnamed: 0_level_0,Unnamed: 1_level_0,外陸資買進股數(不含外資自營商),外陸資賣出股數(不含外資自營商),外陸資買賣超股數(不含外資自營商),外資自營商買進股數,外資自營商賣出股數,外資自營商買賣超股數,投信買進股數,投信賣出股數,投信買賣超股數,自營商買賣超股數,自營商買進股數(自行買賣),自營商賣出股數(自行買賣),自營商買賣超股數(自行買賣),自營商買進股數(避險),自營商賣出股數(避險),自營商買賣超股數(避險),三大法人買賣超股數
stock_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2317,2020-06-24,42388717,10668552,31720165,0.0,0.0,0.0,264000,54000,210000,3376000,857000,252000,605000,3411000,640000,2771000,35306165
00637L,2020-06-24,18752000,4545000,14207000,0.0,0.0,0.0,0,0,0,17523000,200000,0,200000,22099000,4776000,17323000,31730000
2330,2020-06-24,35970485,11922760,24047725,0.0,0.0,0.0,208000,375000,-167000,716000,1416000,567000,849000,338000,471000,-133000,24596725
2885,2020-06-24,14730000,7143188,7586812,0.0,0.0,0.0,307000,36000,271000,2435000,1718000,0,1718000,824000,107000,717000,10292812
2449,2020-06-24,6088000,4567958,1520042,0.0,0.0,0.0,3006000,0,3006000,1053000,485000,8000,477000,604000,28000,576000,5579042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2344,2020-06-24,2425000,8799000,-6374000,0.0,0.0,0.0,0,7000,-7000,-340000,12000,0,12000,20000,372000,-352000,-6721000
2388,2020-06-24,253000,7764000,-7511000,0.0,0.0,0.0,0,0,0,-5000,0,5000,-5000,0,0,0,-7516000
2362,2020-06-24,126000,11033000,-10907000,0.0,0.0,0.0,0,0,0,-6000,8000,17000,-9000,3000,0,3000,-10913000
00632R,2020-06-24,26436000,16509000,9927000,0.0,0.0,0.0,0,0,0,-47391000,1251000,1000,1250000,16080000,64721000,-48641000,-37464000


In [None]:
# Move an already generated daily CSV from the working directory into the
# Google Drive archive folder.
# `shutil` is imported in this cell because the only other import of it lives
# in a later cell, so on a fresh kernel this cell failed with NameError.
import shutil

# To inspect the Drive folder first, run: !ls drive/My\ Drive/股票/
fileName = '20200624.csv'
# NOTE(review): this destination folder differs from the one used by the
# backfill loop further down — confirm which folder is intended.
# The last expression is the destination path returned by shutil.move, so the
# cell displays where the file ended up.
shutil.move(fileName, "/content/drive/My Drive/股票/三大法人歷史紀錄/" + fileName)



'/content/drive/My Drive/股票/三大法人歷史紀錄/20200624.csv'

In [None]:
import shutil
import time
import os

# Backfill loop: starting from today, walk backwards one calendar day at a
# time, download that day's report, save it as <YYYYMMDD>.csv and move the
# file into the Google Drive folder.
data = {}                          # successfully archived frames, keyed by date
n_days = 100000                    # upper bound on the number of archived days
date = datetime.datetime.now()     # cursor; decremented by one day per iteration
fail_count = 0                     # consecutive failures so far
allow_continuous_fail_count = 5    # abort after this many failures in a row

while len(data) < n_days:

    print('parsing', date)
    # Hand the crawler a plain date (no time-of-day) so the `date` index level
    # written to the CSV is the trading day, not the moment the loop started.
    day = date.date()
    fileName = day.strftime('%Y%m%d') + '.csv'
    try:
        daily_df = crawl_legal_person(day)
        if daily_df is None:
            # The crawler signals "no data" (e.g. a non-trading day) with None.
            raise ValueError('no data returned for ' + day.isoformat())
        daily_df.to_csv(fileName)
        shutil.move(fileName, "/content/drive/My Drive/股票/三大法人買賣超/" + fileName)
    except Exception:
        # `except Exception` (not a bare except) keeps Ctrl-C working.
        # Non-trading days cannot be crawled.
        print('fail! check the date is holiday')
        fail_count += 1
        if fail_count >= allow_continuous_fail_count:
            # Too many failures in a row: surface the last error and stop.
            raise
    else:
        # Record the day only once it has been saved, so failed days do not
        # count towards n_days and `data` never holds None.
        data[day] = daily_df
        print('success!')
        fail_count = 0

    # Step back one day and pause between requests.
    date -= datetime.timedelta(days=1)
    time.sleep(10)