In [1]:
import os
from IPython.display import display, HTML
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [2]:
# --- Dataset location -------------------------------------------------------
DATASET_ROOT = './0601'
TRAIN_PATH = os.path.join(DATASET_ROOT, 'tasharep.csv')

# Window sizes. NOTE(review): presumably the number of input / output time
# steps for a downstream model -- they are not used in the cells shown here.
TS_IN = 20
TS_OUT = 5

# Columns kept from the raw CSV: stock code, open, high, low, close, volume.
fields = ['代碼', '開盤價(元)', '最高價(元)', '最低價(元)', '收盤價(元)', '成交張數(張)']

# Column name -> position of that column inside `fields`.
fields_dict = {name: position for position, name in enumerate(fields)}

In [3]:
def _to_float(value):
    """Parse a thousands-separated numeric string (e.g. '1,234.5') to float.

    Non-string values (already numeric, NaN, ...) are returned unchanged.
    """
    return float(value.replace(',', '')) if isinstance(value, str) else value


# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the recorded output of this cell ends
# with a warning tail that looks like a DtypeWarning (mixed types) -- confirm.
train_csv = pd.read_csv(TRAIN_PATH, low_memory=False)

# Order rows by (date, stock code); reassign instead of sorting in place so the
# cell is easy to re-run and chain.
train_csv = train_csv.sort_values(by=['日期', '代碼'], ascending=True)

# Column-wise Series.map replaces DataFrame.applymap, which is deprecated in
# pandas >= 2.1; Series.map is available in every pandas version.
train_csv[fields] = train_csv[fields].apply(lambda column: column.map(_to_float))

display(train_csv.head(n=10))

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,代碼,日期,中文簡稱,開盤價(元),最高價(元),最低價(元),收盤價(元),成交張數(張)
0,1101,20130102,台泥,30.41,30.53,30.18,30.45,6374.0
1327,1102,20130102,亞泥,28.74,28.9,28.7,28.78,2856.0
2654,1103,20130102,嘉泥,12.29,12.33,12.24,12.24,315.0
3981,1104,20130102,環泥,11.87,12.17,11.87,12.06,551.0
5308,1108,20130102,幸福,5.18,5.22,5.16,5.18,447.0
6635,1109,20130102,信大,9.24,9.28,9.19,9.24,104.0
7962,1110,20130102,東泥,13.41,13.46,13.23,13.23,169.0
9289,1201,20130102,味全,35.69,37.74,35.44,37.69,8971.0
10616,1203,20130102,味王,16.61,16.78,16.61,16.73,143.0
11943,1210,20130102,大成,16.84,16.99,16.84,16.87,1373.0


In [4]:
# Distinct stock codes and distinct trading days, each in ascending order.
# sorted() already returns a list, so no extra list() wrapper is needed.
codes = sorted(set(train_csv['代碼']))
days = sorted(set(train_csv['日期']))

# Number of distinct trading days in the dataset.
print(len(days))

1327


In [5]:
# One DataFrame per trading day (restricted to `fields`), in ascending date
# order. A single groupby pass replaces the original loop, which re-scanned the
# whole frame once per day. groupby(sort=True) yields the groups in ascending
# key order and keeps the original row order inside each group.
split_data = [
    day_frame[fields]
    for _, day_frame in train_csv.groupby('日期', sort=True)
]

# The groups must line up one-to-one with `days`; this would only differ if the
# date column contained missing values, which groupby drops.
assert len(split_data) == len(days), 'number of per-day frames does not match the number of distinct days'

In [6]:
# Start from every code seen anywhere in the dataset ...
subset = set(codes)
print(len(subset))

# ... and keep only the codes that appear on every single trading day
# (the intersection of the per-day code sets).
subset = subset.intersection(*(daily['代碼'] for daily in split_data))

# To inspect the surviving codes: display(sorted(subset))
print(len(subset))

1690
1388


In [7]:
# Persist the sorted set of always-present stock codes for later reuse.
common_codes = np.asarray(sorted(subset))
np.save('codes_subset.npy', common_codes)

In [8]:
print(len(split_data))

# Walk the days from last to first (same order as before) and drop every row
# whose stock code is not in the common set.
for idx in range(len(split_data) - 1, -1, -1):
    daily = split_data[idx]
    kept = daily.loc[daily['代碼'].isin(subset)]
    split_data[idx] = kept
    # After filtering, each day must hold exactly one row per common code;
    # any other count means something went wrong upstream.
    assert len(kept)==len(subset), '若濾除後，同一天股票數量與我們想要的股票集和大小不同的話，那一定有錯誤發生'

print(len(split_data))
display(split_data[0].head(n=10))

1327
1327


Unnamed: 0,代碼,開盤價(元),最高價(元),最低價(元),收盤價(元),成交張數(張)
0,1101,30.41,30.53,30.18,30.45,6374.0
1327,1102,28.74,28.9,28.7,28.78,2856.0
2654,1103,12.29,12.33,12.24,12.24,315.0
3981,1104,11.87,12.17,11.87,12.06,551.0
5308,1108,5.18,5.22,5.16,5.18,447.0
6635,1109,9.24,9.28,9.19,9.24,104.0
7962,1110,13.41,13.46,13.23,13.23,169.0
9289,1201,35.69,37.74,35.44,37.69,8971.0
10616,1203,16.61,16.78,16.61,16.73,143.0
11943,1210,16.84,16.99,16.84,16.87,1373.0


In [9]:
# Turn each per-day frame into a float32 matrix and drop column 0 (the stock
# code, i.e. the first entry of `fields`), then stack all days into one array.
split_data = np.asarray(
    [np.asarray(daily, dtype=np.float32)[..., 1:] for daily in split_data],
    dtype=np.float32,
)

# shape: (days, stocks present on every day, features)
print(split_data.shape)

(1327, 1388, 5)


In [10]:
# Everything looks fine so far. Collapse the last two axes into one so that
# each day becomes a single flat feature vector (stocks * features); this is
# the matrix that PCA reduces next.
n_days = split_data.shape[0]
split_data = split_data.reshape((n_days, -1))

# shape: (days, stocks * features)
print(split_data.shape)

(1327, 6940)


In [11]:
# Sanity check: make sure the feature matrix has no NaN / +-inf entries.
# The original only counted infinities (np.isinf) while reporting them as
# "nans", so real NaNs were never detected or cleaned.
nan_count = int(np.isnan(split_data).sum())
inf_count = int(np.isinf(split_data).sum())
print('nans: {:d}'.format(nan_count))
print('infs: {:d}'.format(inf_count))

# Total number of dirty entries (name kept for compatibility with the original cell).
oops = nan_count + inf_count
if oops > 0:
    # np.nan_to_num maps NaN -> 0 and +-inf -> the largest finite values of the
    # dtype. NOTE(review): such huge replacements would dominate the PCA below;
    # consider a different imputation if infinities ever show up.
    split_data = np.nan_to_num(split_data)

nans: 0


In [12]:
# PCA starts here: compress each day's flat feature vector to a few components.
from sklearn.decomposition import PCA

# Number of principal components to keep.
# NOTE(review): the rationale for 122 is not shown in this notebook.
n_components = 122

# For an input of this size the default 'auto' solver uses a randomized SVD,
# so a fixed seed is needed for the saved components to be reproducible.
PCA_RANDOM_STATE = 0

pca = PCA(n_components=n_components, random_state=PCA_RANDOM_STATE)
# NOTE(review): the features are fed in unscaled (prices and volumes share one
# matrix) -- confirm this is intended, since PCA is sensitive to feature scale.
pca.fit(split_data)

# Fraction of the total variance captured by the retained components.
print(np.sum(pca.explained_variance_ratio_))

# Save the projection basis and the per-feature mean so the same transform can
# be re-applied elsewhere without refitting.
np.save('pca_components.npy', pca.components_)
np.save('pca_mean.npy', pca.mean_)

0.939707


In [13]:
# Print the share of variance explained by each principal component,
# one value per line, in component order.
component_ratios = pca.explained_variance_ratio_
for ratio in component_ratios:
    print(ratio)

0.1738416
0.10271885
0.073690705
0.05901826
0.045348506
0.031895794
0.028235544
0.026436388
0.023544608
0.02196812
0.019774375
0.016510328
0.016089484
0.015207369
0.014056422
0.013532259
0.011852904
0.010209615
0.009744663
0.009417584
0.0085505815
0.008252952
0.007705429
0.006853384
0.0066358647
0.0062570646
0.006009864
0.005554784
0.0052299956
0.005205884
0.0048589404
0.004811724
0.004640503
0.0045092474
0.0040546437
0.0039903177
0.0038595228
0.0037310445
0.0036249293
0.0034906762
0.003367313
0.0033078345
0.0031700858
0.0030975433
0.0030029588
0.0028291845
0.0026107747
0.0025317294
0.0024825209
0.0024314916
0.0022823655
0.002177283
0.0021291787
0.0020811502
0.0019580233
0.0019356026
0.0019201619
0.001861337
0.0018238672
0.001768437
0.0017045101
0.0016801043
0.0016359009
0.0015940411
0.0015729746
0.0015281222
0.001459286
0.001429391
0.0014118567
0.0013763945
0.0013298099
0.0012999784
0.0012828164
0.0012401696
0.0012149498
0.0011876298
0.0011739663
0.0011496058
0.0011244862
0.001113162


![](https://tbrain.trendmicro.com.tw/Content/img/18etf.png)

### Todo:
1. 是否需要針對 outlier 做處理 (資料清理)