# Data Exploration

This explores historical UK short disclosures as of 2022-12-29.

Findings:
- It's difficult to tell when historical shorts were closed
    - Funds disclose when their net short position crosses the 0.5% threshold
    - However some historical shorts end above the threshold
        - This could be due to delistings or ISIN changes, though it's hard to tell without further investigation
- There are 3 duplicated disclosures (6 rows sharing the same holder, ISIN and position date but reporting different positions); these are easy to deal with (take the max disclosed position)

See https://www.fca.org.uk/markets/short-selling/notification-disclosure-net-short-positions for more details on the data.



In [77]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from short_tracker.data import (
    query_all_sec_metadata, query_uk_si_disclosures,
    SHORT_URL_UK, DATE_COL, FUND_COL, ISIN_COL, SHORT_POS_COL,
    SHARE_ISSUER_COL, UK_DISCL_THRESHOLD,
)
from short_tracker.processing import (
    check_cur_hist_discl_overlap, remove_dupl_shorts, ffill_discl_data, calc_fund_short_flow_bounds
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [78]:
# Query the UK short-interest disclosures from SHORT_URL_UK.
# `discl_data` is indexed by 'current' / 'historic' in the next cell;
# `rept_date` is echoed as this cell's output.
discl_data, rept_date = query_uk_si_disclosures(SHORT_URL_UK) # exp_upd_time
rept_date

datetime.date(2022, 12, 28)

In [203]:
# Unpack the currently-open and the historical disclosure tables.
cur_discl, hist_discl = (discl_data[key] for key in ('current', 'historic'))

# Every ISIN mentioned in either table (reused later for metadata / price lookups).
all_isins = pd.concat([tbl[ISIN_COL] for tbl in (cur_discl, hist_discl)]).unique()

# Sanity check: the helper must report no overlap between the two tables.
has_overlap = check_cur_hist_discl_overlap(cur_discl, hist_discl)
assert not has_overlap

# De-duplicate the historical table (the helper reports the duplicated rows it finds).
hist_discl = remove_dupl_shorts(hist_discl)

# The newest current disclosure must not be later than the day before the report date.
max_discl_date = cur_discl[DATE_COL].max().date()
exp_max_discl_date = rept_date - timedelta(days=1)

assert exp_max_discl_date >= max_discl_date
max_discl_date

Found 6 duplicated rows:                   Position Holder       Name of Share Issuer          ISIN  Net Short Position (%) Position Date
23713  Bercheva Opportunities Ltd                  EQTEC PLC  IE00BH3XCL94                    1.41    2018-03-28
23714  Bercheva Opportunities Ltd                  EQTEC PLC  IE00BH3XCL94                    2.19    2018-03-28
46092      Jane Street Group, LLC  MICRO FOCUS INTERNATIONAL  GB00BJ1F4N75                    0.64    2017-09-01
46093      Jane Street Group, LLC  MICRO FOCUS INTERNATIONAL  GB00BJ1F4N75                    0.00    2017-09-01
57953                  Roble S.L.     QUINDELL PORTFOLIO PLC  GB00BMTS9H89                    0.54    2014-02-28
57954                  Roble S.L.     QUINDELL PORTFOLIO PLC  GB00BMTS9H89                    0.61    2014-02-28
Assuming the max disclosure is correct...


datetime.date(2022, 12, 23)

In [216]:
# Identify each disclosure series by its (ISIN, fund) pair.
pair_cols = [ISIN_COL, FUND_COL]
cur_discl_ind = cur_discl.set_index(pair_cols).index
hist_pairs = hist_discl.set_index(pair_cols).index

# Historical rows whose (ISIN, fund) pair also appears in the current table,
# i.e. positions opened in the past that are still open today. These need to
# be forward-filled up to the current disclosures.
# FIXME: this logic doesn't work, see below
cont_discl_ind = hist_pairs.isin(cur_discl_ind)
cont_hist_discl = hist_discl.loc[cont_discl_ind]

# Historical rows whose pair is not currently marked as open.
closed_discl = hist_discl.loc[~cont_discl_ind]

# Forward-fill the continuing positions up to the expected latest disclosure date.
cont_discl = pd.concat([cont_hist_discl, cur_discl])
cont_discl_ffill = ffill_discl_data(cont_discl, exp_max_discl_date, UK_DISCL_THRESHOLD)

# Forward-fill the closed positions with no explicit end date.
closed_discl_ffill = ffill_discl_data(closed_discl, None, UK_DISCL_THRESHOLD)

In [206]:
# Latest disclosure date per (fund, ISIN) among the shorts not currently open.
last_dates = closed_discl.groupby([FUND_COL, ISIN_COL])[[DATE_COL]].max()
max_closed_idx = last_dates.set_index(DATE_COL, append=True).index

# Keep only the rows that sit on that latest date.
closed_keys = closed_discl.set_index([FUND_COL, ISIN_COL, DATE_COL]).index
latest_closed_shorts = closed_discl[closed_keys.isin(max_closed_idx)]

# Final disclosures that are still at or above the disclosure threshold,
# i.e. positions that were never reported as having dropped below it.
still_above = latest_closed_shorts[SHORT_POS_COL] >= UK_DISCL_THRESHOLD
unclosed_hist_shorts = latest_closed_shorts[still_above].sort_values(by=DATE_COL)
unclosed_hist_shorts
# ... so our ffill logic doesn't work since some shorts end >= the threshold.

Unnamed: 0,Position Holder,Name of Share Issuer,ISIN,Position Date,Net Short Position (%)
7114,Anthion Management LLC,VALIANT PETROLEUM PLC,GB00B2NJD643,2013-03-07,1.9
68696,QUANTATATIVE LTD,RENOVO GROUP PLC,GB00B081NX89,2013-08-23,0.69
3531,"AQR Capital Management, LLC",EURASIAN NATURAL RESOURCES,GB00B29BCK10,2013-09-19,0.66
8169,"Bain Capital Public Equity Management, LLC",WEIR GROUP PLC/THE,GB0009465807,2013-12-11,0.96
65791,Oxford Asset Management,F&C ASSET MANAGEMENT,GB0004658141,2014-03-20,0.68
65640,Oxford Asset Management,AFRICAN MINERALS LTD,BMG0114P1005,2014-06-30,0.7
76576,Wolverine Asset Management LLC,LONDON MINING PLC,GB00B1VZK334,2014-08-19,0.52
37052,GSA Capital Partners LLP,AFRICAN MINERALS LTD,BMG0114P1005,2014-09-11,0.54
65912,Oxford Asset Management,LONDON MINING PLC,GB00B1VZK334,2014-09-23,0.99
65502,Oceanwood Capital Management LLP,AFRICAN MINERALS LTD,BMG0114P1005,2014-10-08,0.6


In [228]:
# Columns that together should identify a single forward-filled observation.
ix = [FUND_COL, ISIN_COL, DATE_COL]

# Diagnostics (uncomment to count keys present in both forward-filled frames):
# closed_discl_ffill.set_index(ix).index.isin(cont_discl_ffill.set_index(ix).index).sum()
# cont_discl_ffill.set_index(ix).index.isin(closed_discl_ffill.set_index(ix).index).sum()

# Stack closed and continuing disclosures, keeping the first row for each
# (fund, ISIN, date) key so the pivot below never sees duplicate entries.
discl_ffill = pd.concat([closed_discl_ffill, cont_discl_ffill])
discl_ffill_ = discl_ffill.drop_duplicates(subset=ix)


def pivot_discl(df):
    """Reshape long disclosures into a wide table of net short positions.

    Args:
        df: Disclosure rows containing DATE_COL, FUND_COL and SHORT_POS_COL.

    Returns:
        DataFrame indexed by DATE_COL with one column per FUND_COL value.
    """
    return df.pivot(index=DATE_COL, columns=FUND_COL, values=SHORT_POS_COL)


# One date x fund table per ISIN.
hist_discl_dict = {isin: pivot_discl(df) for isin, df in discl_ffill_.groupby(ISIN_COL)}

In [293]:
# Short alias for the forward-filled continuing disclosures, used by the inspection cells below.
a = cont_discl_ffill

In [294]:
a

Unnamed: 0,Position Holder,Name of Share Issuer,ISIN,Position Date,Net Short Position (%)
0,AHL Partners LLP,ASOS PLC,GB0030927254,2019-01-09,0.67
1,AHL Partners LLP,ASOS PLC,GB0030927254,2019-01-10,0.67
2,AHL Partners LLP,ASOS PLC,GB0030927254,2019-01-11,0.67
3,AHL Partners LLP,ASOS PLC,GB0030927254,2019-01-14,0.67
4,AHL Partners LLP,ASOS PLC,GB0030927254,2019-01-15,0.67
...,...,...,...,...,...
183985,XIB Asset Management Inc,SOLGOLD PLC,GB00B0WD0R35,2022-12-21,0.50
183986,XIB Asset Management Inc,SOLGOLD PLC,GB00B0WD0R35,2022-12-22,0.50
183987,XIB Asset Management Inc,SOLGOLD PLC,GB00B0WD0R35,2022-12-23,0.50
183988,XIB Asset Management Inc,SOLGOLD PLC,GB00B0WD0R35,2022-12-26,0.50


In [298]:
# Spot-check the forward-filled rows for a single fund.
a[a[FUND_COL]=='AHL Partners LLP']

Unnamed: 0,Position Holder,Name of Share Issuer,ISIN,Position Date,Net Short Position (%)
0,AHL Partners LLP,ASOS PLC,GB0030927254,2019-01-09,0.67
1,AHL Partners LLP,ASOS PLC,GB0030927254,2019-01-10,0.67
2,AHL Partners LLP,ASOS PLC,GB0030927254,2019-01-11,0.67
3,AHL Partners LLP,ASOS PLC,GB0030927254,2019-01-14,0.67
4,AHL Partners LLP,ASOS PLC,GB0030927254,2019-01-15,0.67
...,...,...,...,...,...
1030,AHL Partners LLP,ASOS PLC,GB0030927254,2022-12-21,0.90
1031,AHL Partners LLP,ASOS PLC,GB0030927254,2022-12-22,0.90
1032,AHL Partners LLP,ASOS PLC,GB0030927254,2022-12-23,0.90
1033,AHL Partners LLP,ASOS PLC,GB0030927254,2022-12-26,0.90


In [88]:
# Look up security metadata for every ISIN, keyed by the "ID_ISIN" id type.
# `err_isins` presumably holds the ISINs whose lookup failed (TODO confirm).
sec_metadata, err_isins = query_all_sec_metadata(all_isins, "ID_ISIN")

# For each ISIN, the exchange code of every listing returned for it.
exch_codes = {
    isin: [listing['exchCode'] for listing in listings]
    for isin, listings in sec_metadata.items()
}
print(len(err_isins))

# 140 outdated isins???

140


In [100]:
from functools import reduce

# For every ISIN that has an "LN" listing, keep the first listing carrying
# that exchange code.
ln_sec = {
    isin: listings[exch_codes[isin].index("LN")]
    for isin, listings in sec_metadata.items()
    if "LN" in exch_codes[isin]
}

# One column per ISIN, one row per metadata field.
sec_data_df = pd.DataFrame(ln_sec)

# Number of ISINs with an "LN" listing vs. number with any metadata.
print(len(ln_sec), len(sec_metadata))

501 637


In [123]:
import yfinance as yf
from tenacity import retry, stop_after_attempt, wait_fixed

# Retry policy for market-data requests: pause between attempts, and give up
# after a bounded number of tries so a persistently failing id cannot hang
# the notebook forever.
MKT_DATA_RETRY_WAIT_SECS = 10
MKT_DATA_MAX_ATTEMPTS = 5


@retry(
    wait=wait_fixed(MKT_DATA_RETRY_WAIT_SECS),
    stop=stop_after_attempt(MKT_DATA_MAX_ATTEMPTS),
    reraise=True,  # surface the underlying error rather than tenacity's RetryError
)
def query_mkt_data(sec_id, period='max'):
    """Fetch price history for one security via yfinance.

    Args:
        sec_id: Identifier passed to ``yf.Ticker`` (ISINs and tickers are both
            used in this notebook).
        period: History period forwarded to ``Ticker.history``.

    Returns:
        Whatever ``Ticker.history`` returns (later cells treat it as a
        DataFrame and check ``.empty``).

    Raises:
        Exception: The last error raised by yfinance once all
            MKT_DATA_MAX_ATTEMPTS attempts have failed.
    """
    return yf.Ticker(sec_id).history(period=period)


mkt_data_err_isins = []
mkt_data = {}

for isin in all_isins:
    try:
        mkt_data[isin] = query_mkt_data(isin)
    except Exception:
        # Record the failure and carry on with the remaining ISINs.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        mkt_data_err_isins.append(isin)

print(len(mkt_data), len(mkt_data_err_isins))

777 0


In [275]:
# Benchmark tickers: a UK market proxy and a UK size proxy
# (NOTE(review): presumably ETFs tracking broad / mid-cap UK indices - confirm).
UK_MKT_TICKER = "VUKE.L"
UK_MKT_SIZE_TICKER = "VMID.L"

mkt_tickers = [UK_MKT_TICKER, UK_MKT_SIZE_TICKER]

# Price history for each benchmark, keyed by ticker.
bm_data = {ticker: query_mkt_data(ticker) for ticker in mkt_tickers}

In [230]:
# Total disclosed net short position per ISIN and date: sum across the fund
# columns of each per-ISIN table, then align the resulting series by date.
total_hist_discl_df = pd.DataFrame(
    {isin: by_fund.sum(axis=1) for isin, by_fund in hist_discl_dict.items()}
)

In [277]:
def proc_mkt_data(df, col):
    """Return column ``col`` of ``df`` re-indexed by calendar date.

    Args:
        df: Frame whose index exposes a ``.date`` accessor (e.g. a DatetimeIndex).
        col: Column label to extract.

    Returns:
        A copy of ``df[col]`` whose index holds the ``.date`` values of the
        original index. ``df`` itself is left untouched.
    """
    column = df[col].copy()
    column.index = df.index.date
    return column
    

def mkt_data_to_df(data_dict, col):
    """Combine one column from every frame in ``data_dict`` into a single frame.

    Args:
        data_dict: Mapping of key -> market-data frame.
        col: Column label to pull from each frame.

    Returns:
        DataFrame with one column per key, built from the date-indexed
        series produced by ``proc_mkt_data``.
    """
    columns = {}
    for key, frame in data_dict.items():
        columns[key] = proc_mkt_data(frame, col)
    return pd.DataFrame(columns)

# TODO: investigate why there's a lot of missing data - probably mostly delisted/changed isins
n_empty = sum(1 for frame in mkt_data.values() if frame.empty)
print(n_empty)

# Drop ISINs with no price history, add the benchmark series, then trim
# everything to data from 2010 onwards.
non_empty = {key: frame for key, frame in mkt_data.items() if not frame.empty}
mkt_data_ = {
    key: frame.loc["2010-01-01":]
    for key, frame in {**non_empty, **bm_data}.items()
}

362


In [278]:
# One "Close" price column per security / benchmark, indexed by date.
price_data = mkt_data_to_df(mkt_data_, "Close")

In [210]:
# Period-over-period returns, computed on forward-filled prices.
filled_prices = price_data.ffill()
returns = filled_prices.pct_change()

# Sanity check: count infinite returns (a zero previous price would produce one).
np.isinf(returns.to_numpy()).sum()

0

In [282]:
# Ad-hoc yfinance handle for a single ticker.
# NOTE(review): looks like leftover exploration, not used by the analysis above - confirm.
m = yf.Ticker("MSFT")

In [289]:
# Inspect what yfinance exposes for the ticker above (no output was captured for this cell).
m.stats()

In [272]:
# price_data[[k for k in unclosed_hist_shorts[ISIN_COL] if k in mkt_data_]]

In [233]:
total_hist_discl_df

Unnamed: 0_level_0,AEDFXA1EN018,AU000000FTE4,AU000000S320,BMG0114P1005,BMG0440M1284,BMG4209G2077,BMG4593F1389,BMG5307C1055,BMG5361W1047,BMG702781250,...,US46138B1035,US46140H1068,US68234L3069,US79400X1072,USU7744C1063,VGG0472G1063,VGG379591065,VGG4392T1075,ZAE000255360,ZAE000296554
Position Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-10-31,,,,,,,,,,,...,,,,,,,,,,
2012-11-01,,,,0.61,2.68,,,,0.88,,...,,,,,,,,,,
2012-11-02,,,,0.61,2.68,,,,0.88,,...,,,,,,,,,,
2012-11-05,,,,0.61,2.68,,,,0.88,,...,,,,,,0.58,,,,
2012-11-06,,,,0.61,3.73,,,,0.88,,...,,,,,,0.65,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-21,,,,,,,0.6,,,,...,,,,,,,,,0.79,
2022-12-22,,,,,,,0.6,,,,...,,,,,,,,,0.79,
2022-12-23,,,,,,,0.6,,,,...,,,,,,,,,0.79,
2022-12-26,,,,,,,0.6,,,,...,,,,,,,,,0.79,
