# Intelligent Asset Allocator to build a Globally Diverse Portfolio
The objective of this research is to build a comprehensive robo-advisory solution for portfolio optimization, using the Qlib framework on ETFs listed in the U.S. market, in order to construct a globally diverse portfolio.

### Install Prerequisites

%pip install --upgrade pip

%pip install --upgrade numpy

%pip install --upgrade pandas

%pip install --upgrade cython

%pip install --upgrade xgboost

%pip install --upgrade catboost

%pip install --upgrade torch

%pip install --upgrade matplotlib

%pip install ../../

### Download the `get_data.py` script from the microsoft/qlib repository

In [None]:
import site
import sys
from pathlib import Path

# Look for get_data.py in ../scripts relative to the current working directory.
scripts_dir = Path.cwd().parent.joinpath("scripts")
if not scripts_dir.joinpath("get_data.py").exists():
    # No local copy: download get_data.py into a scratch directory instead and
    # point scripts_dir at it so later cells can import from there.
    scripts_dir = Path("~/tmp/qlib_code/scripts").expanduser().resolve()
    scripts_dir.mkdir(parents=True, exist_ok=True)
    import requests

    with requests.get(
        "https://raw.githubusercontent.com/microsoft/qlib/main/scripts/get_data.py",
        timeout=30,  # do not hang the notebook forever on a stalled connection
    ) as resp:
        # Fail loudly on HTTP errors so an error page is never saved as get_data.py.
        resp.raise_for_status()
        scripts_dir.joinpath("get_data.py").write_bytes(resp.content)

### Download U.S. Market data from Yahoo Finance using Qlib Data Collector

In [None]:
import site
import sys
from pathlib import Path

import qlib

# Download into the same directory that the initialization cell passes to
# qlib.init ("~/.qlib/qlib_data/v2/us_data"); the two paths previously differed,
# so a fresh run initialised qlib on a directory this cell never populated.
provider_uri = "~/.qlib/qlib_data/v2/us_data"  # target_dir

# Make the directory holding get_data.py importable (only once, so re-running
# the cell does not keep growing sys.path).
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))
from get_data import GetData

# exists_skip=True / delete_old=False: keep any data that is already present.
GetData().qlib_data(target_dir=provider_uri, region="us", delete_old=False, exists_skip=True)

### Initialize Qlib with the downloaded data

In [1]:
import qlib

# Local data directory handed to qlib as its provider, plus the market region.
provider_uri = "~/.qlib/qlib_data/v2/us_data"  # target_dir
init_kwargs = {"provider_uri": provider_uri, "region": 'us'}
qlib.init(**init_kwargs)

[1289:MainThread](2022-11-09 01:09:43,643) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[1289:MainThread](2022-11-09 01:09:43,940) INFO - qlib.workflow - [expm.py:31] - experiment manager uri is at file:/home/studio-lab-user/qlib/qlib/examples/global_opportunity/mlruns
[1289:MainThread](2022-11-09 01:09:43,941) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[1289:MainThread](2022-11-09 01:09:43,942) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/home/studio-lab-user/.qlib/qlib_data/v2/us_data')}


### Setup Control Parameters

In [2]:
# --- Universe ---------------------------------------------------------------
# `market` is passed as the handler's "instruments"; `filter_instruments` is the
# name regex given to the NameDFilter; `benchmark` is used by the backtest.
market = "all"
benchmark = "SPY"
filter_instruments = "^(GLD|SHV|HYG|AGG|BNDX|NFTY|EWZ|EWJ|ASHR|VGK|EWS|EWA|VNQ|PDBC|PXE|IRBO|BITQ|UVXY|DOG|HDGE)$"

# --- Date ranges (start, end) -----------------------------------------------
filter_start_time, filter_end_time = "2007-01-01", "2022-10-01"
fit_start_time, fit_end_time = "2007-01-01", "2015-12-31"
val_start_time, val_end_time = "2016-01-01", "2018-12-31"
test_start_time, test_end_time = "2019-01-01", "2022-10-01"

# --- Feature handler --------------------------------------------------------
# `datasetClass` is the handler class name; `inputSize` is fed to the TRA
# model config as its "input_size".
datasetClass = "Alpha158" #  "Alpha360"
inputSize = 158

### Prepare the model by training on the filtered data

In [None]:
from qlib.utils import exists_qlib_data, init_instance_by_config
from qlib.utils import flatten_dict
from qlib.workflow import R


def _build_handler_config():
    """Return a fresh data-handler config used by both task definitions.

    A new dict is built on every call so the LightGBM task and the TRA task
    never share (and cannot accidentally mutate) the same nested objects.
    """
    return {
        "class": datasetClass,
        "module_path": "qlib.contrib.data.handler",
        "kwargs": {
            "start_time": filter_start_time,
            "end_time": filter_end_time,
            "fit_start_time": fit_start_time,
            "fit_end_time": fit_end_time,
            "instruments": market,
            # Restrict the instrument pool to names matching the ETF regex.
            "filter_pipe": [
                {
                    "filter_type": "NameDFilter",
                    "name_rule_re": filter_instruments,
                    "filter_start_time": filter_start_time,
                    "filter_end_time": filter_end_time,
                }
            ],
            "infer_processors": [
                {
                    "class": "RobustZScoreNorm",
                    "kwargs": {
                        "fields_group": "feature",
                        # A real boolean: the string "true" only worked because
                        # any non-empty string is truthy ("false" would be too).
                        "clip_outlier": True,
                    },
                },
                {
                    "class": "Fillna",
                    "kwargs": {"fields_group": "feature"},
                },
            ],
            "learn_processors": [
                {
                    "class": "CSRankNorm",
                    "kwargs": {"fields_group": "label"},
                }
            ],
            "label": ["Ref($close, -2) / Ref($close, -1) - 1"],
        },
    }


def _build_segments():
    """Return the train/valid/test date segments as a fresh dict."""
    return {
        "train": (fit_start_time, fit_end_time),
        "valid": (val_start_time, val_end_time),
        "test": (test_start_time, test_end_time),
    }


# LightGBM task: this is the configuration that is actually trained below.
task = {
    "model": {
        "class": "LGBModel",
        "module_path": "qlib.contrib.model.gbdt",
        "kwargs": {
            "loss": "mse",
            "colsample_bytree": 0.8879,
            "learning_rate": 0.0421,
            "subsample": 0.8789,
            "lambda_l1": 205.6999,
            "lambda_l2": 580.9768,
            "max_depth": 8,
            "num_leaves": 210,
            "num_threads": 20,
        },
    },
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": _build_handler_config(),
            "segments": _build_segments(),
        },
    },
}

# Number of TRA states, shared by the model's tra_config and the dataset.
# NOTE(review): the original used 3 for the model and 5 for the dataset; the
# upstream qlib TRA configs use one shared value, so 3 is used for both here.
# Confirm 3 is the intended value.
tra_num_states = 3

# TRA task: alternative configuration, defined for reference. It is not used by
# the training code in this cell (only `task` is).
tra_task = {
    "model": {
        "class": "TRAModel",
        "module_path": "qlib.contrib.model.pytorch_tra",
        "kwargs": {
            "tra_config": {
                "num_states": tra_num_states,
                "rnn_arch": "LSTM",
                "hidden_size": 32,
                "num_layers": 1,
                "dropout": 0.0,
                "tau": 1.0,
                "src_info": "LR_TPE",
            },
            "model_config": {
                "input_size": inputSize,
                "hidden_size": 256,
                "num_layers": 2,
                "rnn_arch": "LSTM",
                "use_attn": True,
                "dropout": 0.2,
            },
            "model_type": "RNN",
            "lr": 0.0001,
            "n_epochs": 100,
            "early_stop": 20,
            "logdir": "output_tra",
            "seed": 0,
            "lamb": 1.0,
            "rho": 0.99,
            "alpha": 0.5,
            "transport_method": "router",
            "memory_mode": "sample",
            "eval_train": False,
            "eval_test": True,
            "pretrain": True,
            "freeze_model": False,
            "freeze_predictors": False,
        },
    },
    "dataset": {
        "class": "MTSDatasetH",
        "module_path": "qlib.contrib.data.dataset",
        "kwargs": {
            "handler": _build_handler_config(),
            "segments": _build_segments(),
            "seq_len": 60,
            "horizon": 2,
            "num_states": tra_num_states,
            "batch_size": 1024,
            "memory_mode": "sample",
            "drop_last": True,
        },
    },
}

# model initialization
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])

# start exp to train model
with R.start(experiment_name="train_model"):
    R.log_params(**flatten_dict(task))
    model.fit(dataset)
    R.save_objects(trained_model=model)
    # Keep the recorder id so the backtest cell can reload the trained model.
    rid = R.get_recorder().id

### Backtest the model and prepare the statistics

In [None]:
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord

# Top-k / dropout strategy: this is the strategy used by the backtest below.
topk_strategy = {
    "class": "TopkDropoutStrategy",
    "module_path": "qlib.contrib.strategy.signal_strategy",
    "kwargs": {
        "model": model,
        "dataset": dataset,
        "topk": 9,
        "n_drop": 2,
    },
}

# Enhanced-indexing strategy: alternative configuration, not referenced by
# port_analysis_config in this cell.
enhanced_index_strategy = {
    "class": "EnhancedIndexingStrategy",
    "module_path": "qlib.contrib.strategy.signal_strategy",
    "kwargs": {
        "model": model,
        "dataset": dataset,
        "riskmodel_root": "./riskdata",
        "market": benchmark,
        "optimizer_kwargs": {
            "solver_kwargs": {
                # A real boolean rather than the always-truthy string "true".
                "verbose": True,
            }
        },
    },
}

port_analysis_config = {
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    },
    "strategy": topk_strategy,
    "backtest": {
        "start_time": test_start_time,
        "end_time": test_end_time,
        "account": 100000000,
        "benchmark": benchmark,
        "exchange_kwargs": {
            "freq": "day",
            # No daily price limit: 0.095 is the China-market convention and
            # would treat any U.S. ETF moving 9.5% or more in a day as
            # untradable, distorting the backtest. None disables the limit.
            "limit_threshold": None,
            "deal_price": "close",
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        },
    },
}

# backtest and analysis
with R.start(experiment_name="backtest_analysis"):
    # Reload the model saved by the "train_model" experiment.
    recorder = R.get_recorder(recorder_id=rid, experiment_name="train_model")
    model = recorder.load_object("trained_model")

    # prediction: switch to the recorder of the current backtest run and keep
    # its id so the analysis cells can reopen it.
    recorder = R.get_recorder()
    ba_rid = recorder.id
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # backtest & analysis
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()


### Analyze the statistics

In [None]:
from qlib.utils import flatten_dict
from qlib.workflow import R
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data import D

# Reopen the recorder of the backtest run by the id saved in the backtest cell.
recorder = R.get_recorder(experiment_name="backtest_analysis", recorder_id=ba_rid)

# Predictions first, then the three portfolio-analysis artifacts in order.
pred_df = recorder.load_object("pred.pkl")
report_normal_df, positions, analysis_df = (
    recorder.load_object(artifact_path)
    for artifact_path in (
        "portfolio_analysis/report_normal_1day.pkl",
        "portfolio_analysis/positions_normal_1day.pkl",
        "portfolio_analysis/port_analysis_1day.pkl",
    )
)

In [None]:
# Draw the report graph from the backtest report loaded from
# "portfolio_analysis/report_normal_1day.pkl".
analysis_position.report_graph(report_normal_df)

In [None]:
# Draw the risk-analysis graph from the portfolio analysis results together
# with the backtest report.
analysis_position.risk_analysis_graph(analysis_df, report_normal_df)

In [None]:
import pandas as pd

# Fetch the labels of the test segment and give the label column a plain name.
label_df = dataset.prepare("test", col_set="label")
label_df.columns = ['label']

# Put labels and predictions side by side, then restore the label index order.
labels_and_scores = pd.concat([label_df, pred_df], axis=1, sort=True)
pred_label = labels_and_scores.reindex(label_df.index)

analysis_position.score_ic_graph(pred_label)

In [None]:
# Draw the model performance graphs from the joined label/prediction frame.
analysis_model.model_performance_graph(pred_label)

*

*

*

# ---------------------------------------- END -------------------------------------------- ####