# Final Project

In [3]:
# 移除不必要的警告
import warnings
warnings.filterwarnings('ignore')

### 獲取歷史資料

In [4]:
from ml_finlab.data import Data

data = Data()

rev = data.get("當月營收")

close = data.get_adj("收盤價")
open_ = data.get_adj("開盤價")
high = data.get_adj("最高價")
low = data.get_adj("最低價")
vol = data.get("成交股數")

PB = data.get("股價淨值比")
pe = data.get("本益比")
DY = data.get("殖利率(%)")

rev.index = rev.index.shift(3, "d")

### 計算features

In [5]:
def bias(n):
    return close / close.rolling(n, min_periods=1).mean()

def acc(n):
    return close.shift(n) / (close.shift(2*n) + close) * 2

def rsv(n):
    l = close.rolling(n, min_periods=1).min()
    h = close.rolling(n, min_periods=1).max()
    
    return (close - l) / (h - l)

def mom(n):
    return (rev / rev.shift(1)).shift(n)

#####上為原版資料###
def yoy(n):
    denominator = rev.shift(12 + n)
    result = rev.shift(n) / denominator - 1
    result[denominator < 0] = 0  # 如果分母为负，将对应的结果设置为0
    return result

def delta_yoy(n):
    yoy_current = yoy(n)
    yoy_previous = yoy(n + 1)
    delta = yoy_current - yoy_previous
    return delta

def willr(n):
    highest_high = high.rolling(n, min_periods=1).max()
    lowest_low = low.rolling(n, min_periods=1).min()
    willr = (highest_high - close) / (highest_high - lowest_low) * -100
    return willr

def linearreg_slope(n):
    slope = (close - close.shift(n)) / n
    return slope

def adx(n):
    true_range = high - low
    true_range = true_range.fillna(0)
    
    plus_dm = high.diff()
    minus_dm = low.diff()
    plus_dm[plus_dm < 0] = 0
    minus_dm[minus_dm > 0] = 0
    
    plus_dm = plus_dm.rolling(n).sum()
    minus_dm = minus_dm.abs().rolling(n).sum()
    
    tr_sum = true_range.rolling(n).sum()
    tr_sum[tr_sum == 0] = 0.0001
    
    plus_di = (plus_dm / tr_sum) * 100
    minus_di = (minus_dm / tr_sum) * 100
    
    dx = ((plus_di - minus_di).abs() / (plus_di + minus_di).abs()) * 100
    adx = dx.rolling(n).mean()
    
    return adx


def adxr(n):
    adx_value = adx(n)  # 先計算ADX值
    adxr = adx_value.rolling(n).mean()  # 計算ADX值的平均值
    return adxr

def rsi(n):
    diff = close.diff()
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    avg_gain = up.rolling(n).mean()
    avg_loss = down.rolling(n).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def kd(n):
    l = close.rolling(n).min()
    h = close.rolling(n).max()
    rsv = (close - l) / (h - l)
    k = rsv.rolling(3).mean()
    d = k.rolling(3).mean()
    return k - d


#---------

def ma_ratio(n):
    ma_short = close.rolling(n).mean()
    ma_long = close.rolling(2 * n).mean()
    return ma_short / ma_long

def volume_change(n):
    return vol.pct_change(n)

def range_ratio(n):
    range_high = high.rolling(n).max()
    range_low = low.rolling(n).min()
    return (range_high - range_low) / range_low


In [6]:
features = {
    'mom{}'.format(n): mom(n) for n in range(1, 10)
}

features.update({
    'bias_{}'.format(n): bias(n) for n in [5, 60, 120, 240]
})

features.update({
    'PB': PB,
    'PE': pe,
    'DY': DY
})


features.update({
    'acc_{}'.format(n): acc(n) for n in [5, 10, 20, 60, 120, 240]
})

features.update({
    'rsv_{}'.format(n): rsv(n) for n in [60, 120, 240]
})

####上為原版資料###
features.update({
    'yoy_{}'.format(n): yoy(n) for n in range(0, 0)
})
features.update({
    'delta_yoy_{}'.format(n): delta_yoy(n) for n in range(0, 2)
})

features.update({
    'willr_{}'.format(n): willr(n) for n in [14, 20, 60]
})

features.update({
    'linearreg_slope_{}'.format(n): linearreg_slope(n) for n in [14, 112, 224]
})

features.update({
    f'kd_{n}': kd(n) for n in [ 9, 14, 20, 60]
})

features.update({
    'adxr_{}'.format(n): adxr(n) for n in [14, 20, 30]
})

features.update({
    'ma_ratio_{}'.format(n): ma_ratio(n) for n in [20, 40, 60]
})

features.update({
    'volume_change_{}'.format(n): volume_change(n) for n in [20, 40, 60]
})

features.update({
    'range_ratio_{}'.format(n): range_ratio(n) for n in [10, 30, 60]
})


In [7]:
#import pandas as pd
#t1 = pd.read_pickle("history/items/bargin_report/外陸資買賣超股數(不含外資自營商).pkl")
#t2 = pd.read_pickle("history/items/bargin_report/投信買賣超股數.pkl")
#t3 = pd.read_pickle("history/items/bargin_report/自營商買賣超股數(自行買賣).pkl")

### 製作dataset

##### 設定買賣頻率

In [8]:
every_month = rev.index
every_month

DatetimeIndex(['2005-02-13', '2005-03-13', '2005-04-13', '2005-05-13',
               '2005-06-13', '2005-07-13', '2005-08-13', '2005-09-13',
               '2005-10-13', '2005-11-13',
               ...
               '2023-01-13', '2023-02-13', '2023-03-13', '2023-04-13',
               '2023-05-13', '2023-06-13', '2023-07-13', '2023-08-13',
               '2023-09-13', '2023-10-13'],
              dtype='datetime64[ns]', name='date', length=225, freq=None)

##### 將dataframe 組裝起來

In [9]:
# features['bias20'].reindex(every_month, method='ffill')

for name, f in features.items():
    features[name] = f.reindex(every_month, method='ffill')

In [10]:

for name, f in features.items():
    features[name] = f.unstack()

In [11]:
import pandas as pd
dataset = pd.DataFrame(features)

In [12]:
##我要把金融股拿掉 所以把28開頭全去掉
#dataset = dataset[~dataset.index.get_level_values('stock_id').str.startswith('28')]

In [13]:
feature_names = list(dataset.columns)
feature_names

['mom1',
 'mom2',
 'mom3',
 'mom4',
 'mom5',
 'mom6',
 'mom7',
 'mom8',
 'mom9',
 'bias_5',
 'bias_60',
 'bias_120',
 'bias_240',
 'PB',
 'PE',
 'DY',
 'acc_5',
 'acc_10',
 'acc_20',
 'acc_60',
 'acc_120',
 'acc_240',
 'rsv_60',
 'rsv_120',
 'rsv_240',
 'delta_yoy_0',
 'delta_yoy_1',
 'willr_14',
 'willr_20',
 'willr_60',
 'linearreg_slope_14',
 'linearreg_slope_112',
 'linearreg_slope_224',
 'kd_9',
 'kd_14',
 'kd_20',
 'kd_60',
 'adxr_14',
 'adxr_20',
 'adxr_30',
 'ma_ratio_20',
 'ma_ratio_40',
 'ma_ratio_60',
 'volume_change_20',
 'volume_change_40',
 'volume_change_60',
 'range_ratio_10',
 'range_ratio_30',
 'range_ratio_60']

In [16]:
from ml_finlab import ml

vol=data.get('成交股數')/1000
vol_ma5=vol.rolling(20).mean()

股本 = data.get('股本合計').reindex(close.index, method='ffill')
市值 = 股本 * close / 10 * 1000

In [17]:
ml.add_feature(dataset, 'vol_ma5', vol_ma5)

ml.add_feature(dataset, '市值', 市值)

### 新增 label

In [18]:
from finlab import ml

ml.add_profit_prediction(dataset)
ml.add_rank_prediction(dataset)

ModuleNotFoundError: No module named 'finlab'

### 刪除太大太小的歷史資料

In [None]:
print(dataset.shape)

def drop_extreme_case(dataset, feature_names, thresh=0.01):
    
    extreme_cases = pd.Series(False, index=dataset.index)
    for f in feature_names:
        tf = dataset[f]
        extreme_cases = extreme_cases | (tf < tf.quantile(thresh)) | (tf > tf.quantile(1-thresh))
    dataset = dataset[~extreme_cases]
    return dataset

dataset_drop_extreme_case = drop_extreme_case(dataset,
                                              feature_names, thresh=0.01)

print(dataset_drop_extreme_case.shape)

In [None]:
dataset_dropna = dataset_drop_extreme_case.dropna(how='any')
dataset_dropna = dataset_dropna.reset_index().set_index("date")

In [None]:
dataset_drop_extreme_case.index.get_level_values("date")

In [None]:
dataset_drop_extreme_case.dropna(how='any')

## Split Train Test dataset

In [None]:
dataset_unique = dataset_dropna.index.astype(str).drop_duplicates().sort_values()
dataset_unique

# 計算要分割的索引位置
split_index = int(len(dataset_unique) * 0.9)
Bef = dataset_unique[split_index] 
Aft = dataset_unique[split_index+1]


# 分割資料集
dataset_train = dataset_dropna.loc[:Bef]
dataset_test = dataset_dropna.loc[Aft:]
Bef

In [None]:
train = dataset_train[feature_names].astype(float) , dataset_train['return'] > 1.00
test = dataset_test[feature_names].astype(float) , dataset_test['return'] > 0.99

In [None]:
#profit(return) rank
predi_target = 'rank'

### 神經網路模型

In [None]:
import os

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import initializers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# 建立模型
model = keras.models.Sequential()
model.add(layers.BatchNormalization(input_shape=(len(feature_names),)))
model.add(layers.Dense(100, activation='relu',
                      kernel_initializer=initializers.he_normal(seed=0)))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(100, activation='relu',
                      kernel_initializer=initializers.he_normal(seed=0)))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(1, activation='sigmoid'))

# 摘要模型
model.summary()

# 建立優化器
optimizer = keras.optimizers.Adam(learning_rate=0.001)

# 建立損失函數和評估指標
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# 設定早停
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)

# 設定模型回檔
get_best_model = keras.callbacks.ModelCheckpoint(
    filepath='./best_model/v2_12.h5',
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=10,
    save_best_only=True)

# 訓練模型
history = model.fit(dataset_train[feature_names], dataset_train[predi_target],
                    batch_size=888,
                    epochs=225,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[get_best_model]) #es, get_best_model


In [None]:
#import os
#from tensorflow import keras
#from tensorflow.keras import layers
#from tensorflow.keras import initializers
##
#os.environ['KMP_DUPLICATE_LIB_OK']='True'
##
#model = keras.models.Sequential()
#model.add(layers.Dense(256, activation='relu',
#                      input_shape=(len(feature_names),),
#                      kernel_initializer=initializers.he_normal(seed=0)))
#model.add(layers.Dense(128, activation='relu',
#                      kernel_initializer=initializers.he_normal(seed=0)))
#model.add(layers.Dropout(0.4))
##model.add(layers.Dense(64, activation='relu',
##                      kernel_initializer=initializers.he_normal(seed=0)))
##model.add(layers.Dense(32, activation='relu',
##                      kernel_initializer=initializers.he_normal(seed=0)))
#model.add(layers.Dense(16, activation='relu',
#                      kernel_initializer=initializers.he_normal(seed=0)))
#model.add(layers.Dropout(0.3))
#model.add(layers.Dense(1, activation='sigmoid'))
##
#model.summary()
##
#model.compile(loss='mean_squared_error',
#              optimizer="adam",)
##
#print('start fitting')
#history = model.fit(dataset_train[feature_names], dataset_train['rank'],
#                    batch_size=1000,
#                    epochs=200,
#                    verbose=1,
#                    validation_split=0.1)
##

In [None]:
#import os
#
#from tensorflow import keras
#from tensorflow.keras import layers
#from tensorflow.keras import initializers
#
#os.environ['KMP_DUPLICATE_LIB_OK']='True'
#
#model = keras.models.Sequential()
#model.add(layers.Dense(100, activation='relu',
#                      input_shape=(len(feature_names),),
#                      kernel_initializer=initializers.he_normal(seed=0)))
#model.add(layers.Dense(100, activation='relu',
#                      kernel_initializer=initializers.he_normal(seed=0)))
#model.add(layers.Dropout(0.35))
#model.add(layers.Dense(1, activation='sigmoid'))
#
#
#model.summary()
#
#model.compile(loss='mean_squared_error',
#              optimizer="adam",)
#
#print('start fitting')
#history = model.fit(dataset_train[feature_names], dataset_train['rank'],
#                    batch_size=1000,
#                    epochs=225,
#                    verbose=1,
#                    validation_split=0.1, )

#### 結果

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# 訓練過程中的損失函數
train_loss = history.history['loss'][5:]
val_loss = history.history['val_loss'][5:]

# 繪製損失函數圖表
plt.plot(range(len(train_loss)), train_loss, label='Training Loss')
plt.plot(range(len(val_loss)), val_loss, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

### lightgbm Model

In [None]:
import lightgbm as lgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV

fit_params = {
    "early_stopping_rounds": 30,
    "eval_metric": 'logloss',  # 使用对数损失作为分类任务的评估指标
    "eval_set": [(test[0], test[1])],
    'eval_names': ['valid'],
    'verbose': 100,
    'categorical_feature': 'auto'
}

param_test = {
    'num_leaves': sp_randint(10, 100),
    'min_child_samples': sp_randint(50, 200),
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    'subsample': sp_uniform(loc=0.1, scale=0.9),
    'colsample_bytree': sp_uniform(loc=0.2, scale=0.8),
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
}

# 調整為較小的值，例如100
n_HP_points_to_test = 100

# 将n_estimators设置为10000
clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, n_jobs=4, n_estimators=10000)

gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test,
    n_iter=n_HP_points_to_test,
    scoring='neg_log_loss',  # 使用对数损失作为分类任务的评估指标
    cv=3,
    refit=True,
    random_state=314,
    verbose=True
)


In [None]:
# 替換為實際的訓練數據和適合的參數
gs.fit(*train, **fit_params)

In [None]:
best_params = gs.best_params_
print("Best Parameters:", best_params)
print("----------------")
gs_best_model = lgb.LGBMRegressor(**best_params )
gs_best_model.fit(*train)

#### 結果

In [None]:
print('#Training accuracy {:.4f}'.format(gs_best_model.score(*train)))
print('#Testing accuracy {:.4f}'.format(gs_best_model.score(*test)))

#l1那個
#Training accuracy 0.1127
#Testing accuracy -0.0138

#Training accuracy 0.0581
#Testing accuracy -0.0043

#Training accuracy 0.1361
#Testing accuracy -0.0167



In [None]:
feature_imp = pd.DataFrame(zip(gs_best_model.feature_importances_, feature_names), 
                           columns=['Value','Feature']).sort_values('Value', ascending=False)
feature_imp

%matplotlib inline
import seaborn as sns
plt.figure(figsize=(10,10))
sns.barplot(x="Value", y="Feature", data=feature_imp)

### xgboost Model

In [None]:
import xgboost as xgb
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

cf_xgb = xgb.XGBClassifier()  # 使用XGBClassifier作为分类器
cf_xgb.fit(*train)
accuracy = cf_xgb.score(*test)  # 计算分类准确率等评估指标
print("Accuracy:", accuracy)


In [None]:
import xgboost as xgb
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

from sklearn.model_selection import RandomizedSearchCV

clf = xgb.XGBClassifier(objective='binary:logistic')  # 使用XGBClassifier作为分类器，设置分类的目标函数

param_grid = {
    'max_depth': [6, 10, 15, 20],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    'gamma': [0, 0.25, 0.5, 1.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'n_estimators': [100]
}

fit_params = {
    'early_stopping_rounds': 10,
    'eval_set': [(test[0], test[1])],
    'verbose': False,
    'eval_metric': 'logloss'  # 使用对数损失作为分类任务的评估指标
}

rs = RandomizedSearchCV(clf, param_grid, n_iter=20, n_jobs=1, verbose=2, cv=2, scoring='neg_log_loss', refit=True, random_state=42)
rs.fit(train[0], train[1], **fit_params)


In [None]:
xgb_best_params = rs.best_params_
print("Best Parameters:", xgb_best_params)
print("----------------")
xgb_best_model = xgb.XGBClassifier(**best_params )
xgb_best_model.fit(*train)

#### 結果

In [None]:
xgb_best_model.score(*test)

In [None]:
feature_imp = pd.DataFrame(zip(xgb_best_model.feature_importances_, feature_names), 
                           columns=['Value','Feature']).sort_values('Value', ascending=False)
feature_imp

%matplotlib inline
import seaborn as sns
plt.figure(figsize=(10,10))
sns.barplot(x="Value", y="Feature", data=feature_imp)

### Random Forest Model

In [None]:
#from sklearn.ensemble import RandomForestRegressor
#
#cf2 = RandomForestRegressor(n_estimators=100)
#cf2.fit(dataset_train[feature_names].astype(float), dataset_train['rank'])

In [None]:
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.model_selection import RandomizedSearchCV
#
## 建立隨機森林回歸模型
#cf2 = RandomForestRegressor()
#
## 定義超參數範圍
#param_dist = {
#    'n_estimators': [50, 100, 200, 300],
#    'max_features': ['auto', 'sqrt', 'log2'],
#    'max_depth': [None, 10, 20, 30],
#    'min_samples_split': [2, 5, 10],
#    'min_samples_leaf': [1, 2, 4],
#    'bootstrap': [True, False]
#}
#
## 執行隨機參數搜尋
#rs = RandomizedSearchCV(cf2, param_distributions=param_dist, n_iter=100, cv=5, verbose=2, random_state=42)
#rs.fit(dataset_train[feature_names].astype(float), dataset_train['rank'])

在以上程式碼中，我們建立了 RandomForestRegressor 模型，
並定義了超參數的範圍。使用 RandomizedSearchCV 進行參數搜尋，
設定了迭代次數為 100，交叉驗證次數為 5，並設定了 random_state 來保持結果的可重現性。
透過這樣的方式，您可以進一步優化 RandomForestRegressor 模型，以提高其性能。

In [None]:
#feature_imp = pd.DataFrame(zip(cf2.feature_importances_, feature_names), 
#                           columns=['Value','Feature']).sort_values('Value', ascending=False)
#feature_imp
#
#%matplotlib inline
#import seaborn as sns
#plt.figure(figsize=(10,10))
#sns.barplot(x="Value", y="Feature", data=feature_imp)

### Ensemble learning

In [None]:
dataset_drop = dataset.dropna(subset=feature_names+['return'])

vals = model.predict(dataset_drop[feature_names].astype(float))
dataset_drop['result1'] = pd.Series(vals.swapaxes(0,1)[0], dataset_drop.index)

vals = gs_best_model.predict(dataset_drop[feature_names].astype(float))
dataset_drop['result2'] = pd.Series(vals, dataset_drop.index)

vals = xgb_best_model.predict(dataset_drop[feature_names].astype(float))
dataset_drop['result3'] = pd.Series(vals, dataset_drop.index)

dataset_drop = dataset_drop.reset_index().set_index("date")

# backtest

In [None]:
import math
from tqdm import tqdm

dates = sorted(list(set(dataset_drop.index)))
fil_ma_s = 1000  # start
fil_ma_e = 3000  # end

rs_1 = []
rs_2 = []
rs_3 = []  
rs_4 = []  
rs_5 = []  # 收益数据列表

for d in tqdm(dates):
    dataset_time = dataset_drop.loc[d]
    dataset_time = drop_extreme_case(dataset_time, feature_names, thresh=0.01)
    predi_target_0 = dataset_time['result1'] + dataset_time['result2'] + dataset_time['result3']
    predi_target_1 = predi_target_0.where((dataset_time['vol_ma5'] > fil_ma_s) & (dataset_time['vol_ma5'] < fil_ma_e) & (dataset_time['市值'] < 1e10), 0)
    predi_target_2 = predi_target_0.where((dataset_time['vol_ma5'] > fil_ma_s) & (dataset_time['市值'] < 1e9), 0)
    predi_target_3 = predi_target_0.where((dataset_time['vol_ma5'] < fil_ma_e) & (dataset_time['市值'] < 1e9), 0)
    predi_target_4 = predi_target_0.where((dataset_time['vol_ma5'] > fil_ma_e) & (dataset_time['市值'] > 1e9), 0)
    
        

    condition_0 = (predi_target_0 >= predi_target_0.nlargest(20).iloc[-1])
    condition_1 = (predi_target_1 >= predi_target_1.nlargest(20).iloc[-1])
    condition_2 = (predi_target_2 >= predi_target_2.nlargest(20).iloc[-1])
    condition_3 = (predi_target_3 >= predi_target_3.nlargest(20).iloc[-1])
    condition_4 = (predi_target_4 >= predi_target_4.nlargest(20).iloc[-1])

    
    r0 = dataset_time['return'][condition_0].mean()
    r1 = dataset_time['return'][condition_1].mean()
    r2 = dataset_time['return'][condition_2].mean()
    r3 = dataset_time['return'][condition_3].mean()
    r4 = dataset_time['return'][condition_4].mean()

    
    rs_1.append(r0 * (1 - 3 / 1000 - 1.425 / 1000 * 2 * 0.6))
    rs_2.append(r1 * (1 - 3 / 1000 - 1.425 / 1000 * 2 * 0.6))
    rs_3.append(r2 * (1 - 3 / 1000 - 1.425 / 1000 * 2 * 0.6))
    rs_4.append(r3 * (1 - 3 / 1000 - 1.425 / 1000 * 2 * 0.6))
    rs_5.append(r4 * (1 - 3 / 1000 - 1.425 / 1000 * 2 * 0.6))
    

rs_1 = pd.Series(rs_1, index=dates)[Aft:].cumprod()
rs_2 = pd.Series(rs_2, index=dates)[Aft:].cumprod()
rs_3 = pd.Series(rs_3, index=dates)[Aft:].cumprod()
rs_4 = pd.Series(rs_4, index=dates)[Aft:].cumprod()
rs_5 = pd.Series(rs_5, index=dates)[Aft:].cumprod()

s0050 = close['0050'][Aft:]

pd.DataFrame(
    {
        'nn strategy return': rs_1.reindex(s0050.index, method='ffill'),
        'nn strategy return_ma': rs_2.reindex(s0050.index, method='ffill'),
        'nn strategy return_mktcap->100 10e': rs_3.reindex(s0050.index, method='ffill'),
        #'nn strategy return_mktcap-<300 10e': rs_4.reindex(s0050.index, method='ffill'),
        'nn strategy return_mktcap->300 10e': rs_5.reindex(s0050.index, method='ffill'),
        '0050 return': s0050 / s0050[0],
    }
).plot()


In [None]:
    predi_target = dataset_time['result1'] + dataset_time['result2'] + dataset_time['result3']
predi_target

### 當月持股狀況

In [None]:
dataset.index.levels[1].max()

In [None]:
# get the latest dataset
last_date = dataset.index.levels[1].max()#"2022-10-15"
is_last_date = dataset.index.get_level_values('date') == last_date
last_dataset = dataset[is_last_date].copy()


last_dataset = drop_extreme_case(last_dataset,
                                 feature_names, thresh=0.01)


# remove NaN testcases
last_dataset = last_dataset.dropna(subset=feature_names)

# predict

vals = model.predict(last_dataset[feature_names].astype(float))
last_dataset['result1'] = pd.Series(vals.swapaxes(0,1)[0], last_dataset.index)

vals = gs_best_model.predict(last_dataset[feature_names].astype(float))
last_dataset['result2'] = pd.Series(vals, last_dataset.index)

vals = xgb_best_model.predict(last_dataset[feature_names].astype(float))
last_dataset['result3'] = pd.Series(vals, last_dataset.index)

# calculate score

rank = last_dataset['result1'] + last_dataset['result2'] + last_dataset['result3']
#rank_target = rank.where((last_dataset['vol_ma5'] > fil_ma_s)  & (last_dataset['vol_ma5'] < fil_ma_e)  & (last_dataset['市值'] < 1e10), 0)
rank_target = rank.where((last_dataset['vol_ma5'] > fil_ma_e)  & (last_dataset['市值'] < 1e10), 0)

condition = (rank_target >= rank_target.nlargest(20).iloc[-1]) 

# plot rank distribution
rank_target[rank_target != 0].hist(bins=20)


# show the best 20 stocks
slist1 = rank_target[rank_target != 0].reset_index()['stock_id']

In [None]:
#rank_target['8299']

# 平均分配資產於股票之中

In [None]:
close = data.get("收盤價")

money = 662919
stock_prices = close[rank_target[condition].reset_index()['stock_id']].iloc[-1]


print("股票平分張數:")
money / len(stock_prices) / stock_prices / 1000


In [None]:
T_stock = '1101'
(last_dataset['vol_ma5'][T_stock] > fil_ma_s)
(last_dataset['vol_ma5'][T_stock] < fil_ma_e)  & (last_dataset['市值'] < 1e10)

In [None]:
import pyfolio as pf

import pickle
pickle.dump(rs_2, open('230618_return_history.pkl', 'wb'))

In [None]:
 rs_2

In [None]:
pf.tears.create_capacity_tear_sheet(rs_2)

In [None]:
import pickle
import pandas as pd

# 得到 上面的 回測結果
ret = pickle.load(open("230618_return_history.pkl", "rb"))

# 將回測報酬率取出來
ret = ret.pct_change().dropna()
#ret.index = pd.to_datetime(ret.index).tz_localize('Asia/Taipei')

# 利用pyfolio 比較報酬率

pf.create_returns_tear_sheet(ret, benchmark_rets=close['0050'].reindex(ret.index, method='ffill').pct_change())

###################################################################################################
## 移動窗格最佳化
[有空來試試看連結](https://hahow.in/courses/5b9d3a6dca498a001e917383/discussions/61b4c90147843d0006cf2593)

###################################################################################################

In [None]:
train = dataset_train[feature_names].astype(float) , dataset_train['return'] #> 1.00
test = dataset_test[feature_names].astype(float) , dataset_test['return'] #> 1.00

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 創建Random Forest模型
random_forest = RandomForestRegressor(n_estimators=100)

# 使用訓練資料訓練模型
random_forest.fit(*train)

# 使用測試資料評估模型
accuracy = random_forest.score(*test)
print("Random Forest 模型的準確率：", accuracy)

In [None]:
import lightgbm as lgb

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

import numpy as np

%matplotlib inline

def select(df):

    rank = df['pre']

    condition1 = (rank >= rank.nlargest(1).iloc[-1])

    return df['return'][condition1].mean() * (1-3/1000-1.425/1000*2*0.6)

end = 5

cf = lgb.LGBMRegressor(n_estimators=500)



train_time = ['2015','2016','2017','2018','2019']

s_time = ['2007','2008','2009','2010','2011']

test_time = ['2016','2017','2018','2019','2020']

dataset_copy = dataset_dropna.copy()

store_mse = []

In [None]:
for time in range(end):

    print('%d 次執行中'%(time+1))

    dataset_dropna2_train = dataset_copy.loc[s_time[time]:train_time[time]] #2007~ 2015   2008~2016   2009~2017  2010~2018  ....

    dataset_dropna2_test = dataset_copy.loc[test_time[time]:test_time[time]]#            2016                2017               2018              2019  .....

    
    cf.fit(dataset_dropna2_train[feature_names].astype(float), dataset_dropna2_train['rank'])
    predict = cf.predict(dataset_dropna2_test[feature_names])

    dataset_dropna2_test['pre'] = predict

    dates = dataset_dropna2_test.index.get_level_values('date')
    
    b = dataset_dropna2_test.groupby(dates).apply(select).cumprod()
    
    s0050 = close['0050'][test_time[time]:test_time[time]]
    
    s0056 = close['0056'][test_time[time]:test_time[time]]
    
    pd.DataFrame({'Best 1 stocks return(include handling fee)':b.reindex(s0050.index, method='ffill'), 
    
                  '0050':s0050/s0050[0],'0056':s0056/s0056[0]}).plot()
    
    plt.ylabel('return')



In [None]:
cf.predict(dataset_dropna2_test[feature_names])

In [None]:
dataset_dropna2_train[feature_names].astype(float)