In [65]:
import json
import time
import random
import numpy as np
import pandas as pd
import xgboost as xgb
import datetime
# import pandas as pd
from core.initialization.load_data_file_paths import load_selected_data_files_paths
# from core.classification.learning_utils import encode_categorical_labels_to_numerical
# from core.classification.main import classify
from core.feature_extraction.FE_v0 import load_preprocess_extract_features
import warnings
warnings.filterwarnings("ignore")
np.random.seed(135)
random.seed(135)
pd.set_option("display.max_cols", None)

def get_predictions(model_folder, start_day, end_day):
    configs = json.load(open('Results/' + model_folder + '/configs.json', 'r'))
    configs["data_selection"]["time_filtering"] = [start_day, end_day]
    performances = pd.read_csv("Results/" + model_folder + '/performances.csv')
    precision_1_test = performances.iloc[1, 4]
    # 1:
    ""
    data_paths, metadata = load_selected_data_files_paths(configs)

    # 2: Get Xy for prediction (future)
    Xy = load_preprocess_extract_features(data_paths, metadata, configs)
    print("Xy.shape", Xy.shape)

    samples_for_prediction = Xy[Xy['Label'].isnull()]
    y_samples_for_prediction = samples_for_prediction['Label']
    cols_not_for_training = ["Ticker", 'Date', 'close', 'open', 'high', 'low', 'volume', 'Label', 'Max_increase', 
                             'Max_decrease']
    X_samples_for_prediction = samples_for_prediction.drop(cols_not_for_training, axis=1)

    print("X_samples_for_prediction", X_samples_for_prediction.shape)

    # 3:
    model = xgb.Booster()  # init model
    model.load_model('Results/' + model_folder + '/model_sklearn.json')  # load data

    d_test = xgb.DMatrix(X_samples_for_prediction, label=y_samples_for_prediction, nthread=2)

    prob_y_test = model.predict(d_test)
    result_fold_ts = samples_for_prediction[['Ticker', 'Date']]
    result_fold_ts[model_folder] = prob_y_test

    # n =2
    # import shap
    # import matplotlib.pyplot as plt
    # top_n_predictions = result_fold_ts[model_folder].nlargest(n).reset_index()['index'].values
    # X_top_n = X_samples_for_prediction.loc[top_n_predictions]
    # for x in top_n_predictions:
    #     explainer = shap.Explainer(model)
    #     shap_values = explainer(X_top_n)
    #     shap.plots.force(shap_values[0])
    #     plt.show()

    # configs = json.load(open(model_folder + '/configs.json', 'r'))
    # first_day = samples_for_prediction['Date'].min().strftime("%Y%m%d")
    # last_day = samples_for_prediction['Date'].max().strftime("%Y%m%d")
    # Market = str(configs['data_selection']['ticker_filtering']['Country'][0])
    # m_caps = configs['data_selection']['ticker_filtering']['MarketCap_in_Billion']

    # result_fold_ts.to_csv(f'Results/Future_prediction/{first_day}_to_{last_day}_{Market}_{m_caps[0]}_{m_caps[1]}B.csv', 
    # index=False)

    # result_fold_ts = pd.concat([X_test[cols_not_for_training].reset_index(drop=True),
    #                             result_fold_ts.reset_index(drop=True)], axis=1)
    return pd.DataFrame(result_fold_ts), precision_1_test

OptionError: "No such keys(s): 'display.max_cols'"

In [62]:
st = time.time()

today = datetime.datetime.today().strftime('%Y-%m-%d')
end_date = today
start_date = str(datetime.date.today() + datetime.timedelta(days=-15))  # "2022-02-03"
print(start_date, end_date)

models = [
    ['Jan2022/2022.02.01-1153_All_10_3000B_20210101_to_20220131, 5_in_5days', start_date, end_date],
    ['Jan2022/2022.02.01-1203_All_30_3000B_20210101_to_20220131, 5_in_5days', start_date, end_date],
    ['Jan2022/+2022.01.30-2245_All_40_3000B_20210101_to_20220128, 7_in_14days', start_date, end_date],
    ['Jan2022/+2022.01.30-2356_All_40_3000B_20210101_to_20220128, 5_in_10days', start_date, end_date],
    ['Jan2022/+2022.01.31-2246_All_10_3000B_20210101_to_20220131, 5_in_10days', start_date, end_date],
    ['Jan2022/+2022.01.31-2256_All_10_3000B_20210501_to_20211231, 7_in_14days', start_date, end_date],
    ['Jan2022/2022.01.31-2305_All_10_3000B_20210101_to_20220131, 7_in_14days',  start_date, end_date],
    ['2022.03.02-1516_Canada_All_30_3000B_20210101_to_20220228, 5_in_10days', start_date, end_date],
    ['2022.03.02-1514_Canada_All_30_3000B_20210101_to_20220228, 4_in_4days', start_date, end_date],
    ['2022.03.02-1505_Canada_All_5_3000B_20210101_to_20220228, 4_in_4days', start_date, end_date],
    ['2022.03.02-1500_Canada_All_5_3000B_20210101_to_20220228, 7_in_14days', start_date, end_date],
    # ['',  start_date, end_date],
    ]

res, Precision_1_test = get_predictions(models[0][0], models[0][1], models[0][2])
classifiers_weights = [Precision_1_test]

for model_folder1, from_date, to_date in models[1:]:
    print(f"\n {model_folder1}")
    res1, Precision_1_test = get_predictions(model_folder1, from_date, to_date)
    res = pd.merge(res, res1, how='outer', on=['Ticker', 'Date'])
    classifiers_weights.append(Precision_1_test)

res['Weighted Avg'] = np.average(res.iloc[:, 2:2+len(models)], axis=1, weights=classifiers_weights)
res['Mean'] = np.mean(res.iloc[:, 2:2+len(models)], axis=1)
res['Prod'] = np.prod(res.iloc[:, 2:2+len(models)], axis=1)

now = time.strftime('%Y.%m.%d-%H%M')
res.to_csv(f'Results/+ Future_prediction/{now}.csv', index=False)

print(classifiers_weights)
print("Elapsed time (seconds) : ", (time.time() - st)/60)

2022-03-08 2022-03-23
selected_meta_data.shape =  (83, 13)
Finance                   16
Non-Energy Minerals       13
Retail Trade               8
Technology Services        7
Communications             6
Utilities                  6
Industrial Services        6
Energy Minerals            6
Consumer Non-Durables      4
Transportation             3
Commercial Services        3
Consumer Services          2
Health Technology          1
Producer Manufacturing     1
Process Industries         1
Name: Sector, dtype: int64
Processing 1 / 83: 	 ABX.to
Duplicated column names [] []
Processing 2 / 83: 	 AEM.to
Processing 3 / 83: 	 AQN.to
Processing 4 / 83: 	 ATD.to
Processing 5 / 83: 	 BAM-A.to
Processing 6 / 83: 	 BCE.to
Processing 7 / 83: 	 BEP-UN.to
Processing 8 / 83: 	 BHC.to
Processing 9 / 83: 	 BIP-UN.to
Processing 10 / 83: 	 BMO.to
Processing 11 / 83: 	 BNS.to
Processing 12 / 83: 	 CAE.to
Processing 13 / 83: 	 CCL-A.to
Processing 14 / 83: 	 CCL-B.to
Processing 15 / 83: 	 CCO.to
Processing 

Processing 3 / 83: 	 AQN.to
Processing 4 / 83: 	 ATD.to
Processing 5 / 83: 	 BAM-A.to
Processing 6 / 83: 	 BCE.to
Processing 7 / 83: 	 BEP-UN.to
Processing 8 / 83: 	 BHC.to
Processing 9 / 83: 	 BIP-UN.to
Processing 10 / 83: 	 BMO.to
Processing 11 / 83: 	 BNS.to
Processing 12 / 83: 	 CAE.to
Processing 13 / 83: 	 CCL-A.to
Processing 14 / 83: 	 CCL-B.to
Processing 15 / 83: 	 CCO.to
Processing 16 / 83: 	 CDAY.to
Processing 17 / 83: 	 CM.to
Processing 18 / 83: 	 CNQ.to
Processing 19 / 83: 	 CNR.to
Processing 20 / 83: 	 CP.to
Processing 21 / 83: 	 CSU.to
Processing 22 / 83: 	 CTC-A.to
Processing 23 / 83: 	 CTC.to
Processing 24 / 83: 	 CVE.to
Processing 25 / 83: 	 DOL.to
Processing 26 / 83: 	 EMA.to
Processing 27 / 83: 	 EMP-A.to
Processing 28 / 83: 	 ENB.to
Processing 29 / 83: 	 FFH-U.to
Short history of ticker, Skip it !
Processing 30 / 83: 	 FFH.to
Processing 31 / 83: 	 FM.to
Processing 32 / 83: 	 FNV.to
Processing 33 / 83: 	 FSV.to
Processing 34 / 83: 	 FTS.to
Processing 35 / 83: 	 GFL.to

Processing 54 / 83: 	 OTEX.to
Processing 55 / 83: 	 OVV.to
Processing 56 / 83: 	 POW.to
Processing 57 / 83: 	 PPL.to
Processing 58 / 83: 	 QSP-UN.to
Processing 59 / 83: 	 QSR.to
Processing 60 / 83: 	 RCI-A.to
Processing 61 / 83: 	 RCI-B.to
Processing 62 / 83: 	 RY.to
Processing 63 / 83: 	 SAP.to
Processing 64 / 83: 	 SJR-B.to
Processing 65 / 83: 	 SLF.to
Processing 66 / 83: 	 SU.to
Processing 67 / 83: 	 Shop.to
Processing 68 / 83: 	 T.to
Processing 69 / 83: 	 TD.to
Processing 70 / 83: 	 TECK-A.to
Processing 71 / 83: 	 TECK-B.to
Processing 72 / 83: 	 TFII.to
Processing 73 / 83: 	 TIXT.to
Processing 74 / 83: 	 TOU.to
Processing 75 / 83: 	 TPX-A.to
Processing 76 / 83: 	 TPX-B.to
Processing 77 / 83: 	 TRI.to
Processing 78 / 83: 	 TRP.to
Processing 79 / 83: 	 WCN.to
Processing 80 / 83: 	 WFG.to
Processing 81 / 83: 	 WN.to
Processing 82 / 83: 	 WPM.to
Processing 83 / 83: 	 WSP.to
Whole Dataset Generated with Shape of (33757, 163)
FinTA.OBV_Normalized column .isnull().values.any()
FinTA.Pct_c

Processing 93 / 100: 	 TPX-B.to
Processing 94 / 100: 	 TRI.to
Processing 95 / 100: 	 TRP.to
Processing 96 / 100: 	 WCN.to
Processing 97 / 100: 	 WFG.to
Processing 98 / 100: 	 WN.to
Processing 99 / 100: 	 WPM.to
Processing 100 / 100: 	 WSP.to
Whole Dataset Generated with Shape of (40859, 163)
FinTA.OBV_Normalized column .isnull().values.any()
FinTA.Pct_change_400 column .isnull().values.any()
Feature extraction elapsed time (seconds) :  149.66118168830872
Xy.shape (1156, 163)
X_samples_for_prediction (388, 153)

 2022.03.02-1500_Canada_All_5_3000B_20210101_to_20220228, 7_in_14days
selected_meta_data.shape =  (100, 13)
Finance                   20
Non-Energy Minerals       14
Utilities                 10
Technology Services        9
Retail Trade               8
Industrial Services        7
Energy Minerals            7
Communications             6
Consumer Non-Durables      4
Commercial Services        4
Transportation             4
Consumer Services          2
Consumer Durables          

In [66]:
res2 = res[['Ticker', 'Date', 'Weighted Avg', 'Mean','Prod']]


def best_by_Weighted_Avg(df):
    top_candidates = []
    print(df.shape)
    n = 6
    cand1 = list(df.sort_values(by='Weighted Avg', ascending=False)['Ticker'].iloc[0:n])
    cand2 = list(df.sort_values(by='Mean', ascending=False)['Ticker'].iloc[0:n])
    cand3 = list(df.sort_values(by='Prod', ascending=False)['Ticker'].iloc[0:n])
    top_candidates = pd.Series([cand1, cand2, cand3])
    return top_candidates

d = pd.DataFrame(res2.groupby('Date').apply(best_by_Weighted_Avg), )
d.columns = ['Weighted Avg', 'Mean', 'Prod']
d

(97, 5)
(97, 5)
(95, 5)
(95, 5)
(96, 5)
(96, 5)
(97, 5)
(96, 5)
(97, 5)
(97, 5)
(96, 5)
(97, 5)


Unnamed: 0_level_0,Weighted Avg,Mean,Prod
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-03-08,"[ABX.to, ATD.to, BAM-A.to, BCE.to, BMO.to, BNS...","[Shop.to, DOO.to, MG.to, TIXT.to, FSV.to, ONEX...","[DOO.to, ONEX.to, LSPD.to, AC.to, MG.to, CIGI.to]"
2022-03-09,"[ABX.to, ATD.to, BAM-A.to, BCE.to, BMO.to, BNS...","[FSV.to, MG.to, Shop.to, GFL.to, BHC.to, CDAY.to]","[LSPD.to, FSV.to, AC.to, DOO.to, ONEX.to, RBA.to]"
2022-03-10,"[ABX.to, ATD.to, BAM-A.to, BCE.to, BMO.to, BNS...","[Shop.to, CDAY.to, MG.to, TIXT.to, LSPD.to, FS...","[LSPD.to, ONEX.to, GLXY.to, AC.to, STN.to, CIG..."
2022-03-11,"[ABX.to, ATD.to, BAM-A.to, BCE.to, BMO.to, BNS...","[LSPD.to, TIXT.to, Shop.to, CDAY.to, MG.to, FS...","[LSPD.to, ONEX.to, GLXY.to, DOO.to, AC.to, STN..."
2022-03-14,"[ABX.to, ATD.to, BAM-A.to, BCE.to, BMO.to, BNS...","[LSPD.to, Shop.to, TIXT.to, CDAY.to, MG.to, GL...","[LSPD.to, GLXY.to, ONEX.to, DOO.to, AC.to, STN..."
2022-03-15,"[ABX.to, ATD.to, BAM-A.to, BCE.to, BMO.to, BNS...","[TIXT.to, MG.to, GLXY.to, Shop.to, FSV.to, BHC...","[GLXY.to, LSPD.to, ONEX.to, CIGI.to, AC.to, DS..."
2022-03-16,"[CCL-A.to, CTC.to, ABX.to, ATD.to, BAM-A.to, B...","[CDAY.to, MG.to, CCL-B.to, CAE.to, Shop.to, BH...","[LSPD.to, DOO.to, GLXY.to, RBA.to, STN.to, K.to]"
2022-03-17,"[ABX.to, AEM.to, AQN.to, ATD.to, BAM-A.to, BCE...","[MG.to, CDAY.to, CAE.to, TIXT.to, FSV.to, Shop...","[STN.to, GLXY.to, DOO.to, ONEX.to, LSPD.to, DS..."
2022-03-18,"[Shop.to, NGT.to, SU.to, ABX.to, CSU.to, CP.to]","[TIXT.to, Shop.to, FSV.to, TOU.to, CCL-A.to, C...","[CIGI.to, K.to, RBA.to, BEPC.to, STN.to, ONEX.to]"
2022-03-21,"[Shop.to, CSU.to, ATD.to, TRI.to, BAM-A.to, AB...","[CDAY.to, Shop.to, TIXT.to, MG.to, NVEI.to, ON...","[ONEX.to, GLXY.to, DOO.to, RBA.to, CIGI.to, DS..."


In [57]:
res2 = res[['Ticker', 'Date', 'Weighted Avg', 'Mean','Prod']]

def best_by_Weighted_Avg(df):
    top_candidates = []
    print(df.shape)
    cand1 = list(df.sort_values(by='Weighted Avg', ascending=False)['Ticker'].iloc[0:5])
    cand2 = list(df.sort_values(by='Mean', ascending=False)['Ticker'].iloc[0:5])
    cand3 = list(df.sort_values(by='Prod', ascending=False)['Ticker'].iloc[0:5])
    top_candidates = pd.Series([cand1, cand2, cand3])
    return top_candidates

d = pd.DataFrame(res2.groupby('Date').apply(best_by_Weighted_Avg), )
d.columns = ['Weighted Avg', 'Mean', 'Prod']
d

(80, 5)
(79, 5)


Unnamed: 0_level_0,Weighted Avg,Mean,Prod
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-03-21,"[Shop.to, CSU.to, ATD.to, TRI.to, BAM-A.to]","[CDAY.to, TIXT.to, Shop.to, NVEI.to, MG.to]","[CDAY.to, TIXT.to, NVEI.to, CCL-A.to, BHC.to]"
2022-03-22,"[Shop.to, CSU.to, SU.to, BAM-A.to, TRI.to]","[TIXT.to, CDAY.to, FSV.to, GFL.to, Shop.to]","[TIXT.to, CDAY.to, FSV.to, GFL.to, CAE.to]"
