In [1]:
import requests
from io import StringIO
import pandas as pd
import numpy as np
import sqlite3
import sys
import datetime
# Let sqlite3 bind numpy int64 values by converting them to plain Python ints.
sqlite3.register_adapter(np.int64, int)
# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None 
import locale
# Set the process-wide locale; locale.atof (used by atof() further down) parses numbers
# according to it. setlocale raises locale.Error if this locale is not installed.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# Widen the pandas display limits so large DataFrames are shown without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Shared database handle used by every cell below.
# isolation_level=None puts the sqlite3 connection in autocommit mode.
db = sqlite3.connect('../stock.db' , isolation_level=None)

In [2]:
# Load the whole stockIdName table; download() later joins quotes against it on `id`.
df_stock_id_name = pd.read_sql_query("select * from stockIdName",db)

In [3]:
# Keep the stock ids as a plain list and preview the first rows of the table.
ids = df_stock_id_name['id'].tolist()
df_stock_id_name.head()


Unnamed: 0,id,name,listingDate,market,industry,capital
0,1101,台泥,1962/02/09,上市,水泥工業,69.37
1,1102,亞泥,1962/06/08,上市,水泥工業,35.46
2,1103,嘉泥,1969/11/14,上市,水泥工業,7.75
3,1104,環泥,1971/02/01,上市,水泥工業,6.54
4,1108,幸福,1990/06/06,上市,水泥工業,4.05


In [4]:
def workdays(d, end, cformat="%Y%m%d"):
    """List every Monday-to-Friday date from ``d`` through ``end`` (inclusive) as formatted strings.

    Parameters
    ----------
    d : datetime.datetime
        First day to consider.
    end : datetime.datetime
        Last day to consider; only its date part is compared.
    cformat : str
        ``strftime`` pattern applied to each kept day.

    Returns
    -------
    list of str
        Formatted weekdays in chronological order; empty when ``d`` is after ``end``.
    """
    one_day = datetime.timedelta(days=1)
    last_day = end.date()
    formatted = []
    cursor = d
    while cursor.date() <= last_day:
        # isoweekday(): Monday is 1 ... Sunday is 7, so 6 and 7 are the weekend.
        if cursor.isoweekday() < 6:
            formatted.append(cursor.strftime(cformat))
        cursor = cursor + one_day
    return formatted

def download(date):
    """Download the TWSE daily quotes (MI_INDEX, type=ALL) for one date as a DataFrame.

    Parameters
    ----------
    date : str
        Trading date formatted as ``YYYYMMDD``; it is appended to the request URL as-is.

    Returns
    -------
    pandas.DataFrame
        The rows whose 證券代號 is a four-digit code, with every column coerced to numbers,
        left-joined with ``df_stock_id_name`` on ``證券代號 == id``.

    Raises
    ------
    requests.RequestException
        If the HTTP request fails or times out.
    ValueError
        If no line of the response contains "證券代號".
    """
    # Download the quotes. The timeout keeps a stalled connection from hanging the notebook.
    r = requests.post('https://www.twse.com.tw/exchangeReport/MI_INDEX?response=csv&date=' + date + '&type=ALL', timeout=30)
    text = r.text

    # Locate the header of the per-stock table: the first line mentioning "證券代號".
    # The -1 offset is kept from the original parsing (presumably it compensates for a
    # line pandas skips -- TODO confirm).
    header_line = ["證券代號" in l for l in text.split("\n")].index(True) - 1
    df_origin = pd.read_csv(StringIO(text.replace("=", "")), header=header_line)

    # Strip thousands separators, map a bare "+" / "-" cell to 1 / -1, then coerce every
    # column to numbers (anything non-numeric becomes NaN).
    df_origin = df_origin.apply(lambda s: pd.to_numeric(s.astype(str).str.replace(",", "").replace("+", "1").replace("-", "-1"), errors='coerce'))

    # Drop the columns that are not needed. Each is dropped independently, so a missing
    # 證券名稱 no longer prevents 'Unnamed: 16' from being removed.
    if '證券名稱' not in df_origin.columns:
        print('沒有"證券名稱"欄位')
    df_origin = df_origin.drop(columns=['證券名稱', 'Unnamed: 16'], errors='ignore')

    # Keep four-digit codes only (listed stocks). The pattern matches the float rendering
    # "xxxx.0"; the dot is escaped so it cannot match an arbitrary character.
    is_four_digit = df_origin["證券代號"].astype(str).str.match(r"^\d{4}\.0$")
    df = df_origin[is_four_digit].copy()  # explicit copy: the frame is modified below
    df['證券代號'] = df['證券代號'].astype('int64')

    # Attach name / market / industry information from the reference table.
    df = df.merge(df_stock_id_name, left_on="證券代號",right_on="id", how="left")

    return df

def download_otc(date):
    """Download the TPEx (OTC) daily quotes for one date as a DataFrame of raw strings.

    Parameters
    ----------
    date : str
        Date inserted into the ``d=`` query parameter as-is. The callers in this notebook
        pass an ROC-calendar date such as ``"112/08/01"``.

    Returns
    -------
    pandas.DataFrame
        One row per security whose 代號 is exactly four digits, with the 17 columns named below.

    Raises
    ------
    requests.RequestException
        If the HTTP request fails or times out.
    ValueError
        If the response carries no rows, or is not valid JSON.
    """
    # The timeout keeps a stalled connection from hanging the notebook.
    r = requests.get(f'https://www.tpex.org.tw/web/stock/aftertrading/otc_quotes_no1430/stk_wn1430_result.php?l=zh-tw&d={date}&se=AL', timeout=30)
    j = r.json()
    records = j["aaData"]
    if not records:
        # Without this check the column assignment below fails with a length-mismatch
        # ValueError that does not say what actually went wrong.
        raise ValueError(f"no OTC quotes returned for {date}")
    df = pd.DataFrame.from_dict(records)
    df.columns=['代號','名稱','收盤','漲跌','開盤','最高','最低','成交股數','成交金額(元)','成交筆數','最後買價','最後買量(千股)','最後賣價','最後賣量(千股)','發行股數','次日漲停價','次日跌停價']
    # Keep only the securities with a four-digit code.
    df = df[df["代號"].astype(str).str.match(r"^\d{4}$")]
    return df


In [5]:
# Today's date in the two formats the downloaders parse: YYYYMMDD (start_download_sii)
# and YYYY/MM/DD (start_download_otc). Both are computed once, when this cell runs, and
# are used as the default arguments of those functions.
today = datetime.date.today().strftime("%Y%m%d")
today_otc = datetime.date.today().strftime("%Y/%m/%d")

def checkDateIsInDB(date):
    """Return True when the ``daily`` table already holds at least one row for ``date``.

    Parameters
    ----------
    date : str or int
        Date value to look up, e.g. ``"20230801"``.

    The value is bound as a SQL parameter rather than formatted into the statement, so a
    value such as ``"2023/08/01"`` is compared as given instead of being evaluated as
    arithmetic. ``limit 1`` stops at the first match instead of loading every row.
    """
    # NOTE(review): the bound value keeps its Python type; SQLite applies the column's type
    # affinity when comparing -- confirm `daily.date` has a declared column type.
    found = db.execute("select 1 from daily where date=? limit 1", (date,)).fetchone()
    return found is not None

def start_download_sii(start=today, end=today):
    """Download TWSE daily quotes for every weekday in [start, end] and insert them into ``daily``.

    Parameters
    ----------
    start, end : str
        Inclusive range formatted as ``YYYYMMDD``. Both default to the value ``today`` had
        when this function was defined.

    Each date is fetched with ``download`` and written inside a single transaction, so a
    date that fails part-way leaves no partial rows behind. A failing date is reported
    together with the error and skipped. Dates already present in the table are not
    detected here (``checkDateIsInDB`` is not called).
    """
    startDate = datetime.datetime.strptime(start, "%Y%m%d")
    endDate = datetime.datetime.strptime(end, "%Y%m%d")

    sql_insert = ('insert into daily (date, id, name, tradeVolumn,[transaction], tradeValue,open,high,low,close,dir,change,bidPrice,bidVolumn,askPrice,askVolumn, pe) '
                  'values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)')
    # Source columns, in the same order as the table columns that follow `date` above.
    source_columns = ['證券代號', 'name', '成交股數', '成交筆數', '成交金額', '開盤價', '最高價', '最低價', '收盤價',
                      '漲跌(+/-)', '漲跌價差', '最後揭示買價', '最後揭示買量', '最後揭示賣價', '最後揭示賣量', '本益比']

    for date in workdays(startDate, endDate):
        print('download sii: ', date)
        try:
            df = download(date)
            rows = [(date, *(row[col] for col in source_columns)) for _, row in df.iterrows()]
            # One explicit transaction per date: all-or-nothing, and far fewer commits than
            # one autocommit per inserted row.
            db.execute('BEGIN')
            db.executemany(sql_insert, rows)
            db.execute('COMMIT')
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the loop.
            if db.in_transaction:
                db.execute('ROLLBACK')
            print('skip...', date, repr(err))
            continue

def atof(v):
    """Convert a number string that may use "," as thousands separator into a float.

    Parameters
    ----------
    v : str
        Text such as ``"1,234.50"`` or ``"185.5"``.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If the text is not a number once the commas are removed (e.g. ``"----"``).

    The parsing does not depend on the process locale, so it gives the same result whether
    or not ``en_US.UTF-8`` is the active locale.
    """
    return float(v.replace(",", ""))

def start_download_otc(start=today_otc, end=today_otc):
    """Download TPEx (OTC) daily quotes for every weekday in [start, end] and insert them into ``daily``.

    Parameters
    ----------
    start, end : str
        Inclusive range formatted as ``YYYY/MM/DD``. Both default to the value ``today_otc``
        had when this function was defined.

    For each date the year is converted to the ROC calendar (year - 1911) before calling
    ``download_otc``. Rows are stored with the Gregorian date as ``YYYYMMDD`` and ``pe``
    left empty. A row whose values cannot be parsed or inserted is skipped and counted; a
    date whose download fails is reported with the error and skipped.
    """
    startDate = datetime.datetime.strptime(start, "%Y/%m/%d")
    endDate = datetime.datetime.strptime(end, "%Y/%m/%d")

    sql_insert = ('insert into daily (date, id, name, tradeVolumn,[transaction], tradeValue,open,high,low,close,dir,change,bidPrice,bidVolumn,askPrice,askVolumn, pe) '
                  'values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)')

    for date in workdays(startDate, endDate, "%Y/%m/%d"):
        # Gregorian year -> ROC year, e.g. 2021 -> 110.
        yyyy, mm, dd = date.split("/")
        fdate = "/".join([str(int(yyyy) - 1911), mm, dd])
        print('download otc: ', fdate)

        skipped = 0
        try:
            df = download_otc(fdate)
            # One explicit transaction per date instead of one autocommit per row.
            db.execute('BEGIN')
            for _, row in df.iterrows():
                try:
                    change = row['漲跌']
                    # Direction comes from the sign character; the magnitude is stored unsigned.
                    if '-' in change:
                        direction = -1.0
                    elif '+' in change:
                        direction = 1.0
                    else:
                        direction = np.nan
                    try:
                        # An unsigned value such as "0.00" is kept as a number rather than
                        # being discarded.
                        change_value = atof(change.replace('+', '').replace('-', ''))
                    except ValueError:
                        change_value = np.nan

                    db.execute(sql_insert, (date.replace("/", ""), row['代號'], row['名稱'],
                                            atof(row['成交股數']), atof(row['成交筆數']), atof(row['成交金額(元)']),
                                            atof(row['開盤']), atof(row['最高']), atof(row['最低']), atof(row['收盤']),
                                            direction, change_value,
                                            atof(row['最後買價']), atof(row['最後買量(千股)']),
                                            atof(row['最後賣價']), atof(row['最後賣量(千股)']), np.nan))
                except Exception:
                    # Best effort, as before: a row that cannot be parsed or inserted is
                    # dropped, but it is now counted instead of vanishing silently.
                    skipped += 1
            db.execute('COMMIT')
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the loop.
            if db.in_transaction:
                db.execute('ROLLBACK')
            print('skip: >>>>>>>>>', fdate, repr(err))
            continue

        if skipped:
            print('rows skipped: ', skipped)


In [6]:
# Inclusive date range to fetch, written as YYYY/MM/DD (the format start_download_otc parses).
start = "2023/08/01" 
end = "2023/08/03"
# start_download_sii parses YYYYMMDD, so derive slash-free copies for it.
sii_start = start.replace("/","")
sii_end = end.replace("/","")

# Download the TWSE quotes first, then the OTC quotes, for the same range.
start_download_sii(sii_start, sii_end)
start_download_otc(start, end)

download sii:  20230801
download sii:  20230802
download sii:  20230803
skip... 20230803
download otc:  112/08/01
download otc:  112/08/02
download otc:  112/08/03
skip: >>>>>>>>>


In [7]:
# Show the last 10 rows (in date order) stored for stock id 5483.
pd.read_sql_query("select * from daily where id=5483 order by date", db).tail(10)

Unnamed: 0,date,id,name,tradeVolumn,transaction,tradeValue,open,high,low,close,dir,change,bidPrice,bidVolumn,askPrice,askVolumn,pe
783,20230719,5483,中美晶,13988000,8374,2557214500,185.5,187.0,179.0,179.0,-1.0,4.0,178.5,80,179.0,59,
784,20230720,5483,中美晶,7822000,4606,1422778000,180.5,183.5,179.5,181.5,1.0,2.5,181.5,111,182.0,95,
785,20230721,5483,中美晶,8810000,5421,1577743000,178.5,181.0,176.5,180.0,-1.0,1.5,179.5,82,180.0,346,
786,20230724,5483,中美晶,9774000,5806,1764377500,180.5,182.5,179.0,180.5,1.0,0.5,180.5,53,181.0,634,
787,20230725,5483,中美晶,8275000,5136,1462742000,176.0,179.0,175.0,175.5,,,175.5,242,176.0,14,
788,20230726,5483,中美晶,11324000,7415,1914675000,175.0,175.0,165.5,165.5,-1.0,10.0,165.5,222,166.0,18,
789,20230727,5483,中美晶,6255000,4179,1058965000,168.0,171.0,167.0,169.5,1.0,4.0,169.5,6,170.0,89,
790,20230728,5483,中美晶,6784000,4795,1156622000,170.0,173.0,169.0,171.0,1.0,1.5,171.0,24,171.5,155,
791,20230731,5483,中美晶,6596000,3931,1136053000,173.0,174.5,170.5,171.5,1.0,0.5,171.0,24,171.5,99,
792,20230801,5483,中美晶,4432000,2862,752185000,173.5,173.5,168.0,168.5,-1.0,3.0,168.5,123,169.0,74,


# 更新上櫃pe(爬蟲沒有上櫃pe)

In [8]:
# Financial statements: load the whole financialStatement table.
df_financial_statement = pd.read_sql_query("select * from financialStatement",db)
# Treat missing values as 0 ...
df_financial_statement = df_financial_statement.fillna(0)
# ... and likewise cells whose whole value is the placeholder '--'.
df_financial_statement.replace('--', 0, inplace=True)

In [9]:
# Every stored daily quote, ordered by date.
df = pd.read_sql_query("select * from daily", db).sort_values(by=['date'])

In [10]:
# For every stock and month, record the summed EPS of the four most recent quarters in a dict.

# Months grouped by calendar quarter: position 0 holds Q1 (months 1-3) ... position 3 holds Q4.
year = [[1,2,3],[4,5,6],[7,8,9],[10,11,12]]

def find_eps(id, quarters):
    """Sum the ``qeps`` values of stock ``id`` over the given quarter labels.

    Parameters
    ----------
    id : int
        Stock id matched against the ``id`` column of ``df_financial_statement``.
    quarters : list of str
        Labels matched against the ``date`` column, e.g. ``['2022q4', '2022q3']``.

    Returns
    -------
    The sum rounded to 2 decimals (0 when no row matches).
    """
    statements = df_financial_statement
    wanted = (statements['id'] == id) & statements['date'].isin(quarters)
    return round(statements.loc[wanted, 'qeps'].sum(), 2)

def find_in_list_of_list(mylist, char):
    """Return the 1-based position of the first sub-list of ``mylist`` that contains ``char``.

    Returns None when no sub-list contains it.
    """
    for position, sub_list in enumerate(mylist, start=1):
        if char in sub_list:
            return position
    return None
    
def getQuarters(yyyy, mm):
    """Return the labels of the four quarters preceding the quarter that contains month ``mm``.

    Labels look like ``"2022q4"`` and are listed newest first, e.g.
    ``getQuarters(2023, 1) -> ['2022q4', '2022q3', '2022q2', '2022q1']``.

    Parameters
    ----------
    yyyy : int
        Calendar year of the month.
    mm : int
        Month number; it is looked up in the module-level ``year`` table.
    """
    # Quarter number of `mm` (1..4) shifted by +3 so that stepping down and taking
    # modulo 4 walks backwards through the quarters; a remainder of 0 stands for Q4 of
    # the previous year.
    cursor = find_in_list_of_list(year, mm) + 3
    labels = []

    for _ in range(4):
        quarter = cursor % 4
        if quarter == 0:
            yyyy -= 1
            quarter = 4
        labels.append(f"{yyyy}q{quarter}")
        cursor -= 1

    return labels

# Lookup table: dict[stock id]["YYYYMM"] -> summed EPS of the four quarters preceding that month.
# NOTE: the name `dict` shadows the builtin; it is kept because later cells read it by this name.
dict = {}
# Last year / month to generate keys for. Later cells fall back to the newest key for
# months after this point.
currentYears = 2023
currentMonth = 5
for id in set(df['id'].tolist()):
    dict[id] = {}
    for yyyy in range(2017,currentYears + 1):
        # Past years get all 12 months; only the current year stops at currentMonth.
        # (Stopping every year at currentMonth left the later months of past years without
        # a key, so those dates picked up the newest EPS instead of their own.)
        lastMonth = currentMonth if yyyy == currentYears else 12
        for mm in range(1, lastMonth + 1):
            # The trailing quarters only change on the first month of a quarter, so the
            # EPS is computed once per quarter and reused for its other months.
            if (mm - 1) % 3 == 0:
                quarters = getQuarters(yyyy, mm)
                eps = find_eps(id, quarters)
            key = f"{yyyy}{str(mm).zfill(2)}"
            dict[id][key] = eps

In [14]:
# getQuarters(2023, 1) # ['2022q4', '2022q3', '2022q2', '2022q1'] 用前四季eps總和
# dict[5425]

In [11]:
# Rows of `daily` whose id is in the list below and whose pe is still NaN.
filter_ids = [5347, 6182, 8938, 3264, 5425, 5483, 3611, 6509, 8155, 6770, 1342, 2640, 6146, 6263, 8109, 5009, 6691, 3265]

update_df = df.loc[df['id'].isin(filter_ids) & df['pe'].isna()]
update_df

Unnamed: 0,date,id,name,tradeVolumn,transaction,tradeValue,open,high,low,close,dir,change,bidPrice,bidVolumn,askPrice,askVolumn,pe
1889136,20230626,6509,聚和,764000,414,33062900,43.7,43.7,43.15,43.3,-1.0,0.45,43.3,15.0,43.35,5.0,
1889034,20230626,6146,耕興,441000,368,109153500,252.0,252.0,245.0,249.5,-1.0,1.0,248.5,4.0,249.5,6.0,
1889094,20230626,6263,普萊德,47000,43,5827000,124.5,125.5,123.5,124.0,-1.0,1.0,124.0,5.0,124.5,3.0,
1889054,20230626,6182,合晶,3077000,1932,141965100,46.0,46.45,45.45,46.15,,,46.1,17.0,46.15,1.0,
1889337,20230626,8938,明安,267000,235,20729000,77.5,77.9,77.3,77.6,-1.0,0.3,77.5,74.0,77.7,2.0,
1889269,20230626,8109,博大,63000,50,5770000,91.5,92.0,91.4,91.8,,,91.8,1.0,92.0,4.0,
1889273,20230626,8155,博智,510000,420,69282000,135.5,137.5,134.5,136.5,,,136.0,21.0,136.5,16.0,
1888656,20230626,3264,欣銓,7713000,4761,458437400,59.8,60.3,58.6,59.6,-1.0,0.6,59.6,194.0,59.7,58.0,
1888657,20230626,3265,台星科,1397000,823,97316500,70.4,70.4,69.3,69.3,-1.0,1.1,69.2,16.0,69.4,5.0,
1888744,20230626,3611,鼎翰,50000,41,12450000,250.0,250.5,246.0,250.0,-1.0,2.5,249.5,1.0,250.5,1.0,


In [12]:
# Attach to every row of update_df the EPS recorded for its month ("YYYYMM").
for index, row in update_df.iterrows():
    date = str(row['date'])[0:6]
    id = row['id']
    eps_by_month = dict[id]
    if date in eps_by_month:
        eps = eps_by_month[date]
    else:
        # No entry for this month yet: use the newest recorded one for now
        # (these rows will need their EPS refreshed later).
        preDate = next(reversed(eps_by_month))
        eps = eps_by_month[preDate]
    update_df.loc[index, 'eps'] = eps

In [13]:
def writeDB(dff):
    """Write each row's ``pe`` back to the ``daily`` row with the same ``id`` and ``date``.

    Parameters
    ----------
    dff : pandas.DataFrame
        Must contain the columns ``date``, ``id`` and ``pe``.

    Values are bound as SQL parameters. A missing or infinite ``pe`` (for example a close
    divided by an EPS of 0) is stored as NULL; formatted into the SQL text it would appear
    as the bare word ``nan`` / ``inf``, which SQLite rejects.
    """
    def plain(value):
        # numpy scalars -> builtin Python values, so sqlite3 can bind them directly.
        return value.item() if hasattr(value, 'item') else value

    sql = "UPDATE daily SET pe = ? WHERE id = ? and date = ?"
    for index, row in dff.iterrows():
        pe = row['pe']
        pe = float(pe) if pd.notna(pe) and np.isfinite(pe) else None
        db.execute(sql, (pe, plain(row['id']), plain(row['date'])))
        
# P/E = close / eps, rounded to 2 decimals, computed on the date-sorted rows and then
# written back to the database.
a = update_df.sort_values(by=['date']).assign(
    pe=lambda frame: (frame['close'] / frame['eps']).round(2)
)
writeDB(a)

In [14]:
# search db
# Read back every daily row of one stock id, ordered by date, and display it.
# NOTE: `id` shadows the builtin of the same name for the rest of the session.
id = 5425
df1 = pd.read_sql_query(f"select * from daily where id={id} order by date", db)
df1

Unnamed: 0,date,id,name,tradeVolumn,transaction,tradeValue,open,high,low,close,dir,change,bidPrice,bidVolumn,askPrice,askVolumn,pe
0,20200430,5425,台半,2530000,1304,96962900,38.55,38.75,38.00,38.30,1.0,0.20,38.25,8,38.30,38,18.41
1,20200504,5425,台半,1045000,653,39410800,37.80,38.00,37.50,37.55,-1.0,0.75,37.55,50,37.70,12,18.05
2,20200505,5425,台半,997000,589,38061750,38.20,38.45,37.85,38.15,1.0,0.60,38.10,8,38.15,7,18.34
3,20200506,5425,台半,787000,498,30075000,38.40,38.40,38.00,38.00,-1.0,0.15,38.00,67,38.10,1,18.27
4,20200507,5425,台半,1585000,950,61208400,38.20,39.15,38.20,38.55,1.0,0.55,38.55,1,38.60,25,18.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,20230630,5425,台半,3471000,2241,336137600,97.50,97.50,96.30,97.10,1.0,0.50,97.00,86,97.10,15,17.16
772,20230703,5425,台半,6296000,4364,622418300,99.20,99.60,97.90,99.00,1.0,1.90,98.90,69,99.00,21,17.49
773,20230704,5425,台半,8203000,5801,808492800,100.00,100.50,97.60,98.10,-1.0,0.90,98.10,136,98.30,1,17.33
774,20230705,5425,台半,8967000,6248,887054300,98.80,100.50,97.60,97.60,-1.0,0.50,97.60,141,97.70,1,17.24


# OTC API return columns


|代號|名稱|收盤|漲跌|開盤|最高|最低|成交股數|成交金額(元)|成交筆數|最後買價|最後買量(千股)|最後賣價|最後賣量(千股)|發行股數|次日漲停價|次日跌停價|
|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|
|id|name|close|change|open|high|low|tradeVolumn|tradeValue|transaction|bidPrice|bidVolumn|askPrice|askVolumn|(not stored)|(not stored)|(not stored)|


In [None]:
# Manual check of the OTC downloader for a single date (the year is in the ROC calendar).
a = download_otc("107/01/01")
a

# Search DB 

In [None]:
# find duplicate
# Load every daily row, keep stock 5425 (newest date first) and collect the date of each
# repeated occurrence.
df = pd.read_sql_query("select * from daily", db)
a = df.loc[df['id'] == 5425].sort_values(by=['date'], ascending=False)
duplicated = a.loc[a['date'].duplicated(), 'date'].tolist()
print(f"dulicate: {duplicated}")

In [None]:
# delete row
# Delete the rows of every date collected in `duplicated`.
# NOTE(review): the WHERE clause filters on date only, so ALL rows with that date are
# removed -- for every stock id, including the first (non-duplicate) copy. Confirm this is
# intended (e.g. the date is re-downloaded afterwards) before running.
for date in duplicated:
    sql = f"delete from daily where date={date}"
    db.execute(sql)

In [7]:
# Close the shared connection; cells that use `db` cannot run after this until it is reopened.
db.close()