# Notebook for downloading/refreshing training data

## 1. Download data from Bloomberg for training
Note:
- In the future, the download script will be rewritten to use a REST API
of our choice.
- Use the `asia_dd_env` environment when downloading data from Bloomberg (bbg).

In [1]:
import pandas as pd
from xbbg import blp
import numpy as np
from typing import Union, Optional, Any, List, Dict

In [None]:
# Load the full security universe and keep only rows above the market-cap cut-off.
# NOTE(review): this cell was originally labelled "market cap > 500m", but the
# filter below is CUR_MKT_CAP_USD > 1e3 — confirm the column's units and the
# intended threshold.
# NOTE(review): this reads "mpax_universe_all.csv" while other cells refer to
# "impax_universe_all.csv" — confirm the file name.
universe = pd.read_csv("./datafiles/mpax_universe_all.csv", header=[0], index_col=[0], low_memory=False)
universe = universe.query("`CUR_MKT_CAP_USD`>1e3")
# ID_BB_GLOBAL column of the filtered universe.
figi_list = universe.ID_BB_GLOBAL

In [None]:
# Keep only the descriptive columns of the universe; DataFrame.filter silently
# ignores any listed column that is not present.
training_desc = universe.filter(
    items=[
        "SECURITY_NAME",
        "PARSEKYABLE_DES_SOURCE",
        "GICS_SUB_INDUSTRY",
        "ID_ISIN",
        "UD_ECONOMIC_CORRELATION",
        "UD_ALGO_RATING",
        "ID_BB_GLOBAL",
    ]
)

In [None]:
# Display training_desc (the notebook renders the frame as the cell output).
training_desc

Unnamed: 0,SECURITY_NAME,PARSEKYABLE_DES_SOURCE,GICS_SUB_INDUSTRY,ID_ISIN,UD_ECONOMIC_CORRELATION,UD_ALGO_RATING,ID_BB_GLOBAL
1,ITOCHU Corp,8001 JP Equity,20107010,JP3143600009,Cyclical,2B,BBG000B9WJ55
5,Enerpac Tool Group Corp,ATU US Equity,20106020,US2927651040,Cyclical,3,BBG000B9WX45
8,Tatneft PJSC,ATAD LI Equity,10102020,US8766292051,Commodity,7+,BBG000B9X7K3
9,Ameren Corp,AEE US Equity,55103010,US0236081024,Defensive,2A,BBG000B9X8C0
10,Woodside Petroleum Ltd,WPL AU Equity,10102020,AU000000WPL2,Commodity,3+,BBG000B9XBS6
...,...,...,...,...,...,...,...
22510,Apogee Therapeutics Inc,APGE US Equity,35201010,US03770N1019,Defensive,,BBG01H51WYQ5
22511,BGC Group Inc,BGC US Equity,40203020,US0889291045,Cyclical,,BBG01H9FTGX5
22512,Atlanta Braves Holdings Inc,BATRA US Equity,50202010,US0477261046,Cyclical,,BBG01HCDRG86
22513,Atlanta Braves Holdings Inc,BATRK US Equity,50202010,US0477263026,Cyclical,,BBG01HCX3Y34


In [None]:
# Bloomberg field mnemonics requested by this notebook, grouped by category.
# Every group is a set of field names except "best_overrides", which is a list
# of override dicts (best_fperiod_override = 1FY / 2FY / 3FY).
# Fix: "tot_int_exp" and "cf_cap_expend_prpty_add" used to be fused into one
# bogus field name because the comma between the two literals was missing.
bbg_fields = dict(
    return_fields={
        "return_com_eqy",
        "normalized_roe",
        "operating_roic",
        "return_on_asset",
    },
    margin_fields={
        "ebitda_margin",
        "gross_margin",
        "ebit_margin",
        "eff_tax_rate",
        "fcf_margin_after_oper_lea_pymt",
    },
    is_fields={
        "sales_rev_turn",
        "net_income",
        "is_rd_expend",
        "ardr_selling_general_admin_exp",
        "is_selling_expenses",
        "is_opex_adjusted",
        "tot_int_exp",
        "cf_cap_expend_prpty_add",
        "cf_cash_from_oper",
    },
    leverage_fields={
        "total_debt_to_tot_eqy",
        "net_debt_to_shrhldr_eqty",
        "net_debt_to_ebitda",
        "fixed_charge_coverage_ratio",
    },
    bs_ratios={
        "invent_days",
        "acct_rcv_days",
        "days_accounts_payable",
        "cash_conversion_cycle",
    },
    est_fields={"best_sales", "best_gross_margin", "best_net_income"},
    best_overrides=[
        {"best_fperiod_override": "1FY"},
        {"best_fperiod_override": "2FY"},
        {"best_fperiod_override": "3FY"},
    ],
)

In [None]:
import itertools
# (group, field) pairs for every field group of bbg_fields except the estimate /
# override groups; used as the row labels of the historical-financials table.
default_columns = pd.MultiIndex.from_tuples(
    [
        (group, field)
        for group, fields in bbg_fields.items()
        if group not in ("est_fields", "default_override", "best_overrides")
        for field in fields
    ]
)

In [None]:
from typing import Collection, Sequence, Literal, Dict
import datetime as dt


# Local calendar date captured once, when this cell runs; used as the default
# end date of the download helpers.
TODAY = dt.date.today()

def get_hist_financials(
        tickers: Collection[str], 
        start_date: dt.date=dt.date(1995, 1, 1), 
        end_date: dt.date=TODAY) -> pd.DataFrame:
    """Download yearly historical fundamentals and derive growth rates.

    All field groups of ``bbg_fields`` except "est_fields", "default_override"
    and "best_overrides" are requested through ``blp.bdh`` with ``Per="Y"``.

    Args:
        tickers: identifiers passed straight to ``blp.bdh``.
        start_date: first date of the request.
        end_date: last date of the request. The default ``TODAY`` is bound when
            the function is defined, not when it is called.

    Returns:
        The reshaped fundamentals without the "is_fields" group, with the
        ("growth", "<field>_growth") rows for sales and net income appended.

    NOTE(review): ``temp_ratios`` (income-statement items divided by sales) is
    computed but is not part of the returned frame — confirm whether it should
    be concatenated into ``res`` as well.
    """
    # Union of every historical field group (estimate/override groups excluded).
    hist_fields = set()
    for fld_name, fld in bbg_fields.items():
        if fld_name not in ("est_fields", "default_override", "best_overrides"):
            hist_fields = hist_fields.union(fld)
    hist_financials = blp.bdh(tickers, hist_fields, start_date=start_date, 
                    end_date=end_date, 
                    Per="Y",
                    # **bbg_fields.get("default_override")
                    )
    # Replace the date index by yearly Periods so rows can be grouped per year.
    hist_financials.index = hist_financials.index.astype("datetime64[ns]").to_series().apply(lambda d: pd.Period(d, freq="Y"))
    hist_financials = hist_financials.rename_axis("year", axis=0).rename_axis(["figi", "field"], axis=1)
    # Average rows that share a year, then reshape and reindex by the field
    # names of default_columns.
    # presumably the result has one row per field and (figi, year) columns — TODO confirm
    hist_financials = hist_financials.reset_index().groupby("year").mean().stack()\
        .unstack(0).reindex(default_columns.get_level_values(1))
    # Relabel the rows with the matching (group, field) pairs of default_columns.
    hist_financials.index = default_columns[default_columns.get_level_values(1).isin(hist_financials.index)]

    # calculated ratios fields
    # Every "is_fields" row except sales, divided by the sales row, labelled
    # ("margins", "<field>_to_sales"). Not used further below (see docstring).
    temp_ratios = hist_financials.loc['is_fields'].drop(['sales_rev_turn'], axis=0) / hist_financials.loc['is_fields'].loc['sales_rev_turn']
    temp_ratios.index = pd.MultiIndex.from_product((["margins"], temp_ratios.index.to_series().apply(lambda x: f"{x}_to_sales").values))
    
    # calculated growth fields
    # One-period percentage change of sales and net income, labelled
    # ("growth", "<field>_growth").
    temp_growth = hist_financials.loc['is_fields'].loc[["sales_rev_turn", "net_income"]]
    temp_growth = temp_growth.stack(1).unstack(0).pct_change(periods=1).stack(1).unstack(0)
    temp_growth.index = pd.MultiIndex.from_product((["growth"], temp_growth.index.to_series().apply(lambda x: f"{x}_growth").values))

    # The raw income-statement rows are dropped; only the growth rows are added.
    res = pd.concat([hist_financials.drop("is_fields", axis=0), temp_growth])
    return res

# def get_estimates(
#         tickers,
#         start_date: dt.date=dt.date(1995, 1, 1), 
#         end_date: dt.date=TODAY):


def get_price_multiples(
        tickers: Collection[str], 
        start_date: dt.date=dt.date(2000, 1, 1), 
        end_date: dt.date=TODAY):
    """Download weekly prices and valuation multiples, forward-filled.

    Requests the fields listed below from ``blp.bdh`` with ``Per="W"``,
    converts the index to weekly Periods and forward-fills missing values.
    The default ``end_date`` is the value of ``TODAY`` at definition time.
    """
    requested_fields = [
        "px_last",
        "best_cur_ev_to_ebitda",
        "fcf_yield_with_cur_entp_val",
        "best_pe_next_ear",
        "px_to_book_ratio",
        "px_to_sales_ratio",
    ]
    weekly = blp.bdh(
        tickers,
        requested_fields,
        start_date=start_date,
        end_date=end_date,
        Per="W",
    )
    weekly.index = weekly.index.astype("datetime64[ns]").to_period(freq="W")
    return weekly.ffill()
    
def get_future_returns(ref_date: dt.date, price_df: pd.DataFrame):
    """Average price relative to the reference price at four forward horizons.

    The reference price is the first row dated on or after ``ref_date``. For
    each horizon (90, 180, 365 and 3 * 365 days after ``ref_date``) the rows in
    a window around the first row on or after the horizon date are divided by
    the reference price and averaged. The window half-widths are 5, 10, 20 and
    60 rows. The values are price ratios (1.0 means unchanged), not ratio - 1.

    Args:
        ref_date: reference date.
        price_df: prices, one column per series; the index must be convertible
            to ``datetime64[ns]``. Row positions are found by counting earlier
            dates, so the index is assumed to be sorted ascending.

    Returns:
        DataFrame indexed by the columns of ``price_df`` with columns
        "3m", "6m", "1yr", "3yr". A horizon whose window holds no rows is NaN.

    Raises:
        IndexError: if no row is dated on or after ``ref_date``.
    """
    df = price_df.copy(deep=True)  # never touch the caller's index
    df.index = df.index.astype("datetime64[ns]")
    dates = df.index.to_series()

    def _position_on_or_after(day):
        # Number of rows strictly before `day` == position of the first row
        # on or after it (for an ascending index).
        return int(dates.lt(np.datetime64(day)).sum())

    base_date_ind = _position_on_or_after(ref_date)
    if base_date_ind >= len(df):
        raise IndexError(f"no price row on or after ref_date {ref_date}")
    base_price = df.iloc[base_date_ind]

    # (label, days ahead, window half-width in rows)
    horizons = (("3m", 90, 5), ("6m", 180, 10), ("1yr", 365, 20), ("3yr", 365 * 3, 60))
    averaged = []
    for _, days_ahead, half_width in horizons:
        centre = _position_on_or_after(ref_date + dt.timedelta(days=days_ahead))
        # Clamp at 0: a negative start would wrap around to the end of the
        # frame and select the wrong (usually empty) window.
        start = max(centre - half_width, 0)
        averaged.append((df.iloc[start: centre + half_width] / base_price).mean())

    res = pd.concat(averaged, axis=1)
    res.columns = [label for label, _, _ in horizons]
    return res


In [None]:
# Download the yearly fundamentals for every ID_BB_GLOBAL in training_desc and
# save them as CSV.
# NOTE(review): this writes to the current working directory, whereas the
# commented-out reader in section 2 looks in "./datafiles/" — confirm the path.
hist_financial_data = get_hist_financials(training_desc.ID_BB_GLOBAL)
hist_financial_data.to_csv("historical_financial_data.csv")

  hist_financials = hist_financials.reset_index().groupby("year").mean().stack()\
  res = pd.concat([hist_financials.drop("is_fields", axis=0), temp_growth])


In [None]:
# Download the weekly price/multiple history for every ID_BB_GLOBAL in training_desc.
px = get_price_multiples(training_desc.ID_BB_GLOBAL)

In [None]:
# Save the download as CSV in the current working directory.
# NOTE(review): section 2 reads "./datafiles/price_multiples.csv" — confirm the path.
px.to_csv("price_multiples.csv")

## 2. Organizing data for SQL server

In [2]:
# universe = pd.read_csv("./datafiles/impax_universe_all.csv", 
#                        header=[0], index_col=[0], low_memory=False)
# financial_data = pd.read_csv("./datafiles/historical_financial_data.csv", 
#                              header=[0,1], index_col=[0,1])
# financial_data_forsql = financial_data.T
# financial_data_forsql.columns = financial_data_forsql.columns.get_level_values(1)
# financial_data_forsql.reset_index().to_csv(
#     "./datafiles/historical_financial_data_sql.csv",
#     index=False)
# Load the price/multiple CSV: the first two rows form the column MultiIndex
# and the first column becomes the row index.
price_multiples = pd.read_csv("./datafiles/price_multiples.csv", 
                             header=[0,1], index_col=[0])

In [3]:
import os, sys
# Move the working directory up one level; relative paths used after this cell
# (e.g. "./data") resolve against the new directory.
os.chdir("../")
# NOTE(review): machine-specific absolute path — presumably needed so that
# `utils.database` can be imported; confirm or replace with a portable path.
sys.path.append('c:\\Users\\p.peng\\StockEncoder')
from utils.database import SQLDatabase

In [4]:
import datetime as dt
# Name the axes so the reshape below can refer to them.
price_multiples.index.names = ["period"]
price_multiples.columns.names = ["figi", "field"]
# Go from the wide layout (one column per figi/field pair) to one row per
# (figi, period) with one column per field.
price_multiples_pivot = price_multiples\
    .unstack().to_frame("value")\
        .pivot_table(index=["figi", "period"], columns=["field"], aggfunc="mean")  # remove duplicates by averaging
# Keep only the second column level (the field names).
price_multiples_pivot.columns = price_multiples_pivot.columns.get_level_values(1)
price_multiples_pivot = price_multiples_pivot.reset_index()
# Parse the text after the last "/" of each period label as a %Y-%m-%d date.
# presumably the labels are "start/end" week ranges, so this keeps the end date — TODO confirm
price_multiples_pivot["period"] = price_multiples_pivot.period.apply(
    lambda x: dt.datetime.strptime(x.split("/")[-1], "%Y-%m-%d")).values

In [5]:
# Turn +/- infinity into NaN before the frame is uploaded.
price_multiples_pivot = price_multiples_pivot.replace(
    [float("inf"), float("-inf")], np.nan
)

In [12]:
# Troubleshooting: append the rows from position 900000 onwards to the SQL
# table in batches of chunk_size rows, printing each batch's start row.
# Uses `tqdm` and `sql_engine` from the upload-setup cell.

chunk_size = 10000
for i in tqdm(range(900000, len(price_multiples_pivot), chunk_size)):
    print(i)
    price_multiples_pivot.iloc[i: i + chunk_size].to_sql(
        name="price_multiples_stock_encoder",
        con=sql_engine,
        if_exists="append",
        index=False,
    )

# price_multiples_pivot\
#             .replace(float("inf"), np.nan)\
#             .replace(float("-inf"), np.nan)\
#             .iloc[(i+1) * chunk_size - 1 : (i+1) * chunk_size].to_sql( 
#             "price_multiples_stock_encoder", 
#             index=False, 
#             if_exists="append",
#             con=sql_engine
#             )

 86%|████████▌ | 606/703 [4:08:27<34:06, 21.10s/it]

6960000


 86%|████████▋ | 607/703 [4:08:51<35:02, 21.90s/it]

6970000


 86%|████████▋ | 608/703 [4:09:24<39:59, 25.25s/it]

6980000


 87%|████████▋ | 609/703 [4:09:47<38:26, 24.54s/it]

6990000


 87%|████████▋ | 610/703 [4:10:09<36:53, 23.80s/it]

7000000


 87%|████████▋ | 611/703 [4:10:39<39:27, 25.73s/it]

7010000


 87%|████████▋ | 612/703 [4:11:07<39:53, 26.30s/it]

7020000


 87%|████████▋ | 613/703 [4:11:39<42:02, 28.03s/it]

7030000


 87%|████████▋ | 614/703 [4:12:09<42:41, 28.78s/it]

7040000


 87%|████████▋ | 615/703 [4:12:30<38:43, 26.40s/it]

7050000


 88%|████████▊ | 616/703 [4:13:02<40:32, 27.96s/it]

7060000


 88%|████████▊ | 617/703 [4:13:34<41:53, 29.23s/it]

7070000


 88%|████████▊ | 618/703 [4:13:57<38:37, 27.27s/it]

7080000


 88%|████████▊ | 619/703 [4:14:20<36:40, 26.20s/it]

7090000


 88%|████████▊ | 620/703 [4:14:50<37:42, 27.25s/it]

7100000


 88%|████████▊ | 621/703 [4:15:24<39:56, 29.23s/it]

7110000


 88%|████████▊ | 622/703 [4:15:58<41:12, 30.52s/it]

7120000


 89%|████████▊ | 623/703 [4:16:25<39:37, 29.71s/it]

7130000


 89%|████████▉ | 624/703 [4:16:56<39:24, 29.93s/it]

7140000


 89%|████████▉ | 625/703 [4:17:21<37:06, 28.54s/it]

7150000


 89%|████████▉ | 626/703 [4:17:43<34:11, 26.64s/it]

7160000


 89%|████████▉ | 627/703 [4:18:04<31:35, 24.95s/it]

7170000


 89%|████████▉ | 628/703 [4:18:26<29:58, 23.98s/it]

7180000


 89%|████████▉ | 629/703 [4:18:49<29:03, 23.56s/it]

7190000


 90%|████████▉ | 630/703 [4:19:13<28:59, 23.82s/it]

7200000


 90%|████████▉ | 631/703 [4:19:39<29:27, 24.55s/it]

7210000


 90%|████████▉ | 632/703 [4:20:03<28:46, 24.32s/it]

7220000


 90%|█████████ | 633/703 [4:20:27<28:22, 24.32s/it]

7230000


 90%|█████████ | 634/703 [4:20:55<29:04, 25.28s/it]

7240000


 90%|█████████ | 635/703 [4:21:18<27:54, 24.63s/it]

7250000


 90%|█████████ | 636/703 [4:21:42<27:08, 24.31s/it]

7260000


 91%|█████████ | 637/703 [4:22:05<26:23, 23.99s/it]

7270000


 91%|█████████ | 638/703 [4:22:34<27:45, 25.62s/it]

7280000


 91%|█████████ | 639/703 [4:23:10<30:39, 28.75s/it]

7290000


 91%|█████████ | 640/703 [4:23:36<29:10, 27.78s/it]

7300000


 91%|█████████ | 641/703 [4:23:57<26:40, 25.81s/it]

7310000


 91%|█████████▏| 642/703 [4:24:17<24:23, 24.00s/it]

7320000


 91%|█████████▏| 643/703 [4:24:36<22:41, 22.69s/it]

7330000


 92%|█████████▏| 644/703 [4:24:56<21:24, 21.78s/it]

7340000


 92%|█████████▏| 645/703 [4:25:18<20:58, 21.69s/it]

7350000


 92%|█████████▏| 646/703 [4:25:37<19:58, 21.02s/it]

7360000


 92%|█████████▏| 647/703 [4:25:57<19:12, 20.57s/it]

7370000


 92%|█████████▏| 648/703 [4:26:18<19:09, 20.91s/it]

7380000


 92%|█████████▏| 649/703 [4:26:38<18:28, 20.53s/it]

7390000


 92%|█████████▏| 650/703 [4:26:57<17:52, 20.23s/it]

7400000


 93%|█████████▎| 651/703 [4:27:18<17:34, 20.28s/it]

7410000


 93%|█████████▎| 652/703 [4:27:38<17:09, 20.18s/it]

7420000


 93%|█████████▎| 653/703 [4:27:59<17:08, 20.57s/it]

7430000


 93%|█████████▎| 654/703 [4:28:28<18:55, 23.17s/it]

7440000


 93%|█████████▎| 655/703 [4:28:48<17:40, 22.09s/it]

7450000


 93%|█████████▎| 656/703 [4:29:07<16:33, 21.15s/it]

7460000


 93%|█████████▎| 657/703 [4:29:26<15:46, 20.58s/it]

7470000


 94%|█████████▎| 658/703 [4:29:46<15:20, 20.45s/it]

7480000


 94%|█████████▎| 659/703 [4:30:10<15:46, 21.52s/it]

7490000


 94%|█████████▍| 660/703 [4:30:35<16:01, 22.35s/it]

7500000


 94%|█████████▍| 661/703 [4:30:59<15:57, 22.79s/it]

7510000


 94%|█████████▍| 662/703 [4:31:27<16:50, 24.63s/it]

7520000


 94%|█████████▍| 663/703 [4:32:01<18:17, 27.43s/it]

7530000


 94%|█████████▍| 664/703 [4:32:26<17:16, 26.59s/it]

7540000


 95%|█████████▍| 665/703 [4:32:53<16:50, 26.58s/it]

7550000


 95%|█████████▍| 666/703 [4:33:16<15:46, 25.58s/it]

7560000


 95%|█████████▍| 667/703 [4:33:38<14:42, 24.51s/it]

7570000


 95%|█████████▌| 668/703 [4:34:01<14:00, 24.01s/it]

7580000


 95%|█████████▌| 669/703 [4:34:27<13:55, 24.56s/it]

7590000


 95%|█████████▌| 670/703 [4:34:49<13:11, 23.97s/it]

7600000


 95%|█████████▌| 671/703 [4:35:15<13:08, 24.64s/it]

7610000


 96%|█████████▌| 672/703 [4:35:42<13:06, 25.36s/it]

7620000


 96%|█████████▌| 673/703 [4:36:07<12:30, 25.02s/it]

7630000


 96%|█████████▌| 674/703 [4:36:30<11:50, 24.50s/it]

7640000


 96%|█████████▌| 675/703 [4:37:00<12:09, 26.06s/it]

7650000


 96%|█████████▌| 676/703 [4:37:43<14:00, 31.14s/it]

7660000


 96%|█████████▋| 677/703 [4:38:05<12:22, 28.55s/it]

7670000


 96%|█████████▋| 678/703 [4:38:34<11:57, 28.69s/it]

7680000


 97%|█████████▋| 679/703 [4:38:58<10:52, 27.19s/it]

7690000


 97%|█████████▋| 680/703 [4:39:22<10:06, 26.37s/it]

7700000


 97%|█████████▋| 681/703 [4:39:45<09:14, 25.22s/it]

7710000


 97%|█████████▋| 682/703 [4:40:13<09:05, 25.99s/it]

7720000


 97%|█████████▋| 683/703 [4:40:48<09:37, 28.87s/it]

7730000


 97%|█████████▋| 684/703 [4:41:16<09:00, 28.44s/it]

7740000


 97%|█████████▋| 685/703 [4:41:43<08:26, 28.12s/it]

7750000


 98%|█████████▊| 686/703 [4:42:06<07:34, 26.73s/it]

7760000


 98%|█████████▊| 687/703 [4:42:41<07:46, 29.16s/it]

7770000


 98%|█████████▊| 688/703 [4:43:08<07:04, 28.29s/it]

7780000


 98%|█████████▊| 689/703 [4:43:37<06:41, 28.71s/it]

7790000


 98%|█████████▊| 690/703 [4:44:01<05:54, 27.28s/it]

7800000


 98%|█████████▊| 691/703 [4:44:23<05:06, 25.54s/it]

7810000


 98%|█████████▊| 692/703 [4:44:45<04:30, 24.57s/it]

7820000


 99%|█████████▊| 693/703 [4:45:11<04:10, 25.02s/it]

7830000


 99%|█████████▊| 694/703 [4:45:34<03:38, 24.25s/it]

7840000


 99%|█████████▉| 695/703 [4:45:55<03:07, 23.38s/it]

7850000


 99%|█████████▉| 696/703 [4:46:21<02:50, 24.33s/it]

7860000


 99%|█████████▉| 697/703 [4:46:48<02:29, 24.95s/it]

7870000


 99%|█████████▉| 698/703 [4:47:16<02:08, 25.79s/it]

7880000


 99%|█████████▉| 699/703 [4:47:42<01:43, 25.94s/it]

7890000


100%|█████████▉| 700/703 [4:48:05<01:15, 25.10s/it]

7900000


100%|█████████▉| 701/703 [4:48:32<00:51, 25.60s/it]

7910000


100%|█████████▉| 702/703 [4:48:53<00:24, 24.32s/it]

7920000


100%|██████████| 703/703 [4:49:01<00:00, 24.67s/it]


In [6]:
from tqdm import tqdm
import time
import threading
import json


class TimeoutException(Exception):
    """Raised when a decorated call has not finished within its time limit."""


def timeout(seconds):
    """Build a decorator that limits how long the caller waits for a function.

    The wrapped function runs in a separate thread and the caller waits at most
    ``seconds`` for it.

    Returns (from the wrapped call):
        The function's return value if it finished in time.

    Raises:
        TimeoutException: if the function has not finished after ``seconds``.
            The worker thread is not stopped; it keeps running in the
            background.
        Exception: any exception raised by the function is re-raised in the
            calling thread. A function that *returns* an exception instance is
            treated the same way.

    Fix: the factory previously fell off the end without returning
    ``decorator``, so ``@timeout(n)`` failed with "'NoneType' object is not
    callable".
    """
    import functools  # local import: only needed to preserve the wrapped metadata

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Slot 0 holds the outcome; it stays a TimeoutException unless the
            # worker overwrites it before the join below gives up.
            res = [TimeoutException('function timeout')]

            def target(result, *args, **kwargs):
                try:
                    result[0] = func(*args, **kwargs)
                except Exception as e:
                    result[0] = e

            thread = threading.Thread(target=target, args=(res,)+args, kwargs=kwargs)
            thread.start()
            thread.join(seconds)
            if isinstance(res[0], BaseException):
                raise res[0]
            return res[0]
        return wrapper
    return decorator

def timeout_no_return_func(seconds):
    """Timeout decorator factory for functions whose return value is ignored.

    The wrapped call runs in a worker thread and the caller waits at most
    ``seconds``. On success the wrapper returns None. An exception raised by
    the function is printed and re-raised in the caller. If the worker has not
    finished in time, TimeoutException is raised and the worker keeps running.
    """
    def decorator(func):
        def wrapper(*args, **kwargs):
            # Pre-loaded with the timeout error; the worker replaces it on completion.
            outcome = [TimeoutException('function timeout')]

            def runner(result, *call_args, **call_kwargs):
                try:
                    func(*call_args, **call_kwargs)
                except Exception as err:
                    print(err)
                    result[0] = err
                else:
                    # The function's own return value is deliberately discarded.
                    result[0] = None

            worker = threading.Thread(target=runner, args=(outcome, *args), kwargs=kwargs)
            worker.start()
            worker.join(seconds)
            final = outcome[0]
            if isinstance(final, BaseException):
                raise final
            return final
        return wrapper
    return decorator

# Database handle used by the upload cells.
sql = SQLDatabase()
sql_engine = sql.engine
# Rows per upload chunk, and the chunk index the upload loop starts from.
chunk_size = 50
last_i = 101
# Restore the list of already-uploaded chunk indices from the checkpoint file,
# if one exists for this chunk size. The file is opened in a `with` block so
# the handle is closed, and a missing "./data" directory simply means "no
# checkpoint yet" instead of raising.
_passed_i_path = f"./data/passed_price_multiple_index_chunksize={chunk_size}.json"
if os.path.isfile(_passed_i_path):
    with open(_passed_i_path, "r") as _passed_i_file:
        passed_i = json.load(_passed_i_file)
else:
    passed_i = []
# price_multiples_pivot\
#     .replace(float("inf"), np.nan)\
#     .replace(float("-inf"), np.nan)\
#     .iloc[: chunk_size]\
#     .to_sql(
#     "price_multiples_stock_encoder", 
#     index=False, 
#     if_exists="replace", # 1st write, need to replace
#     con=sql_engine
#     )

# @timeout_no_return_func(15)
def load_chunk_to_sql(i: int):
    """Append the i-th chunk of ``price_multiples_pivot`` to the SQL table.

    Rows ``i * chunk_size`` up to (not including) ``(i + 1) * chunk_size`` are
    appended to ``price_multiples_stock_encoder`` through ``sql_engine``. A
    positional slice that runs past the end of the frame is truncated, so the
    final, shorter chunk needs no special case.

    Fix: the chunk bound used to be checked against ``len(price_multiples)``
    (the wide frame) instead of the pivoted frame being uploaded. Whenever
    that check failed, the open-ended branch re-sent every remaining row.

    Args:
        i: zero-based chunk index.

    Returns:
        True once ``to_sql`` has returned.
    """
    first_row = i * chunk_size
    price_multiples_pivot.iloc[first_row: first_row + chunk_size].to_sql(
        "price_multiples_stock_encoder",
        index=False,
        if_exists="append",
        con=sql_engine,
    )
    return True

# Upload every chunk from last_i onwards, saving a checkpoint after each one.
# Ceiling division so that the final, shorter chunk is uploaded too (floor
# division used to stop one chunk early).
n_chunks = (len(price_multiples_pivot) + chunk_size - 1) // chunk_size
for i in tqdm(range(last_i, n_chunks)):
    try:
        load_chunk_to_sql(i)
    except TimeoutException:
        # Best effort: carry on with the next chunk, but do not record a chunk
        # that did not complete as passed.
        print(f"chunk {i} timed out; not recorded as passed")
        continue
    passed_i.append(i)
    # `with` closes the checkpoint file after every write.
    with open(f"./data/passed_price_multiple_index_chunksize={chunk_size}.json", "w") as checkpoint_file:
        json.dump(passed_i, checkpoint_file)
    last_i = i

  0%|          | 0/158323 [02:12<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Reload the universe file, using the first column as the index.
# NOTE(review): the path is relative to the current working directory, unlike
# the "./datafiles/" paths used in earlier cells — confirm the location.
universe = pd.read_csv("impax_universe_all.csv", header=[0], index_col=[0], low_memory=False)

In [None]:
# Reload the saved historical financials with one header row and one index column.
# NOTE(review): the commented-out reader in section 2 uses header=[0,1] and
# index_col=[0,1] for this file — confirm which layout is correct.
financial_data = pd.read_csv("historical_financial_data.csv", header=0, index_col=0)

## 3. Load data into database