In [None]:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import io
from dowhy import CausalModel
import pandas as pd
import numpy as np
import qlib
from qlib.utils import init_instance_by_config
from tqdm.auto import tqdm
import yaml
# Initialise qlib with its defaults (no arguments are passed here) before any
# data handler is built further down.
qlib.init()

In [None]:
# Handler class to use. The name is interpolated into the handler config and
# into the benchmark workflow file name further down.
# Alternative: DATASET = "Alpha158"
DATASET = "Alpha360"
# Substring that selects the price feature columns kept for the analysis.
PRICE = "CLOSE"

In [None]:
# Seed NumPy's global random generator so that repeated runs give consistent results.
np.random.seed(seed=42)

In [None]:
# 1) Load data: describe the data handler in YAML, then let qlib instantiate it.
conf = f"""
class: {DATASET}
module_path: qlib.contrib.data.handler
kwargs:
    start_time: 2008-01-01
    end_time: 2020-08-01
    fit_start_time: 2008-01-01
    fit_end_time: 2014-12-31
    instruments: csi300
    infer_processors:
        - class: RobustZScoreNorm
          kwargs:
              fields_group: feature
              clip_outlier: true
        - class: Fillna
          kwargs:
              fields_group: feature
    learn_processors:
        - class: DropnaLabel
        - class: CSRankNorm
          kwargs:
              fields_group: label
"""
# safe_load accepts the YAML text itself, so it is parsed without a stream wrapper.
hconf = yaml.safe_load(conf)
hd = init_instance_by_config(hconf)

In [None]:
# Fetch the handler's learn-stage data (data_key=hd.DK_L) for 2008-01-01 .. 2014-12-31.
df = hd.fetch(slice("2008-01-01", "2014-12-31"), data_key=hd.DK_L)
# Keep only the columns whose name matches the price pattern or "LABEL".
wanted_cols = df.columns.str.contains(f'{PRICE}|LABEL')
df = df.loc[:, wanted_cols]

In [None]:
# Show the columns whose standard deviation is below 0.2.
# NOTE(review): the original remark on this cell was "have no vwap"; its meaning
# is not evident from this notebook -- confirm or drop.
low_std = df.std() < 0.2
df.columns[low_std]

In [None]:
# Outcome: the label column. It is skipped when looping over treatment columns
# and used as the target `y` for the LightGBM fit later in the notebook.
LABEL = "LABEL0"

# Explain features

## Causality

In [None]:
def estimate_causal(df, col, label="LABEL0"):
    """Estimate the effect of one price feature on the label with DoWhy.

    A ``CausalModel`` is built with ``col`` as treatment and ``label`` as
    outcome. Every other ``PRICE``-prefixed column whose numeric suffix is
    smaller than the suffix of ``col`` is declared a common cause; columns
    that do not start with ``PRICE`` are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the treatment, the outcome and the candidate common causes.
    col : str
        Treatment column; must be ``PRICE`` followed by an integer.
    label : str, default "LABEL0"
        Outcome column.

    Returns
    -------
    The ``value`` attribute of the estimate produced by the
    ``backdoor.linear_regression`` method.

    Raises
    ------
    ValueError
        If ``col`` is not ``PRICE`` followed by an integer.
    """

    def _suffix_index(name):
        # Remove the prefix by slicing. str.lstrip(PRICE) would strip any leading
        # run of the *characters* found in PRICE, which is not prefix removal.
        if not name.startswith(PRICE):
            raise ValueError(f"column {name!r} does not start with {PRICE!r}")
        return int(name[len(PRICE):])

    # Building the causal graph
    n = _suffix_index(col)
    common_causes = [
        other
        for other in df.columns
        if other not in (col, label) and other.startswith(PRICE) and _suffix_index(other) < n
    ]

    model = CausalModel(data=df, treatment=col, outcome=label, common_causes=common_causes)
    # https://github.com/microsoft/dowhy/issues/261
    # - identify_effect is really slow with the default method, so the
    #   "maximal-adjustment" method is requested to make it faster.
    identified_estimand = model.identify_effect(proceed_when_unidentifiable=True, method_name="maximal-adjustment")

    estimate = model.estimate_effect(identified_estimand, method_name="backdoor.linear_regression")
    print("Causal Estimate is " + str(estimate.value))
    return estimate.value

In [None]:
# causal calculation is often slow. So multiprocessing is used.
from joblib import Parallel, delayed
import multiprocessing as mp

In [None]:
# Queue one estimation job per feature column; the label is the outcome, not a treatment.
keys = []
jobs = []
for col in tqdm(df.columns):
    if col == LABEL:  # skip label
        continue
    keys.append(col)
    # Pass LABEL explicitly so the outcome used by the estimator always matches
    # the column skipped above instead of relying on the function's default.
    jobs.append(delayed(estimate_causal)(df, col, label=LABEL))
# Run the jobs on half of the CPU cores, but on at least one worker.
res = Parallel(n_jobs=max(1, mp.cpu_count() // 2), verbose=10)(jobs)

In [None]:
# Map each treatment column to its estimated effect (same order as the jobs).
cause_dict = {key: effect for key, effect in zip(keys, res)}

In [None]:
cause_s = pd.Series(cause_dict)
# Keep the 10 entries with the largest absolute effect, preserving their sign.
top_cause_index = cause_s.abs().nlargest(10).index
cause_s_top = cause_s.reindex(top_cause_index)

In [None]:
# Display the selected effects as a one-column table.
cause_s_top.to_frame(name='causal effect')

##  SHAP

In [None]:
import lightgbm as lgb
import shap
# Split the frame into the feature matrix (everything except the label) and the target.
is_feature = ~df.columns.isin([LABEL])
X = df.loc[:, is_feature]
y = df[LABEL]

In [None]:
# Fit a LightGBM model with the "mse" objective on the price features.
lgb_params = {"objective": "mse"}
lgb_train_set = lgb.Dataset(X, label=y)
model = lgb.train(lgb_params, train_set=lgb_train_set)

In [None]:
# Wrap the model returned by lgb.train in a SHAP explainer.
explainer = shap.Explainer(model)
# Explain the same feature matrix the model was trained on; the `.values`
# attribute of the result is averaged per feature in the next cell.
shap_values = explainer(X)

In [None]:
# Average the SHAP values over axis 0 and label the result with the feature names.
# NOTE(review): this is a signed mean, so positive and negative contributions can
# cancel out -- confirm this is intended rather than a mean of absolute values.
feature_names = df.columns[~df.columns.isin([LABEL])]
mean_shap = shap_values.values.mean(axis=0)
shap_s = pd.Series(mean_shap, index=feature_names)

In [None]:
# Keep the 10 features with the largest absolute average SHAP value, preserving their sign.
top_shap_index = shap_s.abs().nlargest(10).index
shap_s_top = shap_s.reindex(top_shap_index)

In [None]:
# Display the selected SHAP averages as a one-column table.
shap_s_top.to_frame(name="avg SHAP values")

## Compare

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(color_codes=True)

In [None]:
# Align both per-feature series on their index and plot one against the other
# with a fitted regression line.
effects_df = pd.DataFrame({'causal': cause_s, 'SHAP': shap_s})
sns.regplot(x="causal", y="SHAP", data=effects_df)
plt.show()

# Select and Train models

In [None]:
# Experiment name passed to every task_train call below, so all four runs share it.
exp_name = "fea_select"

In [None]:
from copy import deepcopy
from qlib.model.trainer import task_train
import ruamel.yaml as yaml

In [None]:
# Load the benchmark LightGBM workflow config for the selected dataset.
# `yaml` is ruamel.yaml at this point. Its module-level `safe_load` is deprecated
# and raises in ruamel.yaml >= 0.18; a `YAML(typ="safe", pure=True)` loader is
# the supported equivalent.
with open(f"../../benchmarks/LightGBM/workflow_config_lightgbm_{DATASET}.yaml") as fp:
    config = yaml.YAML(typ="safe", pure=True).load(fp)

In [None]:
# Call qlib.init again, this time unpacking the `qlib_init` mapping of the
# benchmark config as keyword arguments.
# NOTE(review): this raises TypeError if the config has no `qlib_init` key
# (`**None`) -- confirm the benchmark files always define it.
qlib.init(**config.get("qlib_init"))

## Full feature

In [None]:
full_task = deepcopy(config.get('task'))

# Baseline: keep every price feature column that is present in `df`.
# PRICE is used instead of a hard-coded "CLOSE" so that this filter stays in
# sync with the column selection applied when `df` was built.
full_task['dataset']['kwargs']['handler']['kwargs']['infer_processors'] = [{
    "class": "FilterCol",
    "kwargs": {
        "fields_group": "feature",
        "col_list": df.columns[df.columns.str.contains(PRICE)].to_list()
    },
}]

# Train the task configured above. The previous code passed a fresh copy of the
# unmodified config here, so the FilterCol processor was never applied.
recorder_full = task_train(full_task, experiment_name=exp_name)

## Causal Feature

In [None]:
# Restrict the features to the columns selected by causal effect, then train.
causal_filter = {
    "class": "FilterCol",
    "kwargs": {
        "fields_group": "feature",
        "col_list": cause_s_top.index.to_list(),
    },
}

causal_task = deepcopy(config.get('task'))
causal_handler_kwargs = causal_task['dataset']['kwargs']['handler']['kwargs']
causal_handler_kwargs['infer_processors'] = [causal_filter]

recorder_cause = task_train(causal_task, experiment_name=exp_name)

## SHAP feature

In [None]:
# Start from an independent copy of the benchmark task so that the edits made
# to it do not touch `config`.
shap_task = deepcopy(config.get('task'))

In [None]:
# Replace the handler's infer processors with a single column filter that keeps
# only the features selected by average SHAP value.
shap_filter = {
    "class": "FilterCol",
    "kwargs": {
        "fields_group": "feature",
        "col_list": shap_s_top.index.to_list(),
    },
}
shap_task['dataset']['kwargs']['handler']['kwargs']['infer_processors'] = [shap_filter]

In [None]:
# Train the SHAP-filtered task under the shared experiment name; the returned
# recorder is read in the comparison section at the end.
recorder_shap = task_train(shap_task, experiment_name=exp_name)

## Random

In [None]:
rand_task = deepcopy(config.get('task'))

# Random baseline: 10 feature columns drawn from all candidates.
# random_state is pinned (same value as the notebook's NumPy seed) so the draw
# does not depend on how much of the global RNG state earlier cells consumed,
# and re-running this cell selects the same columns.
rand_task['dataset']['kwargs']['handler']['kwargs']['infer_processors'] = [{
    "class": "FilterCol",
    "kwargs": {
        "fields_group": "feature",
        "col_list": cause_s.index.to_series().sample(10, random_state=42).to_list()
    },
}]

recorder_rand = task_train(rand_task, experiment_name=exp_name)

## Compare

In [None]:
# Collects, per feature-selection strategy, the metrics filled in by the next cell.
cmp_res = {}

In [None]:
# For every trained recorder, store the mean of its saved IC and rank-IC objects.
recorders = {
    'full': recorder_full,
    'causal': recorder_cause,
    'SHAP': recorder_shap,
    'rand': recorder_rand,
}
for name, rec in recorders.items():
    ic_mean = rec.load_object('sig_analysis/ic.pkl').mean()
    ric_mean = rec.load_object('sig_analysis/ric.pkl').mean()
    cmp_res[name] = {"IC": ic_mean, "Rank IC ": ric_mean}

In [None]:
# One column per strategy, one row per metric; draw the table as a bar chart.
metrics_df = pd.DataFrame(cmp_res)
metrics_df.plot.bar()