# Table of Contents
## [SRC](#--SRC--)
### 1. [Preprocessing utils](#Preprocessing-utils)
### 2. [Porfolio](#Portfolio)
### 3. [Clustering](#Clustering)



## [Scripts ](#Scripts)
### 1. [Data preprocessing](#01.-Data-Preprocessing)
### 2. [Main](#02.-Main)
### 3. [Analysis](#03.-Analysis)

* Check indices
    - check if it's 0-based?
    - especially when mixing slicing and indexing
    - check if it's a mere duplicate of R code counterparts

---

# SRC
## Preprocessing utils
### `src/preprocessing-utils.R` $\rightarrow$ `src/preprocessing_utils.py`

In [1]:
import pandas as pd
import numpy as np
import os
from functools import reduce

In [2]:
def as_quarter(months):
    """
    월(month) 값을 분기(quarter) 값으로 전환
    Example: (1, 2, 3, 4, 5, 6) -> (1, 1, 1, 2, 2, 2)
    
    INPUT:
      x: 1 ~ 12 사이의 integer vector
    RETURN:
      월 -> 분기로 전환된 integer vector
    """
    months = pd.to_numeric(months)
    if not np.all(np.isin(months, np.arange(1, 13))):
        raise ValueError("range of months exceeds 1~12")
    return (np.array(months) - 1) // 3 + 1

In [3]:
def is_quarter_interval(date_str):
    """
    시간 변수가 날짜 형식(ex: "20010131")인지 
    혹은 분기 형식(ex: "2001/1 Quarter)인지 검사
    
    INPUT:
      x: date string vector
    RETURN:
      날짜 형식이면 FALSE, 분기 형식이면 TRUE
    """
    return len(str(date_str)) == 5

In [4]:
def reshape_long(df):
    """
    Short form 데이터를 long form 데이터로 변환
    시간 변수가 날짜 형식인 경우 분기 형식으로 변환
    
    INPUT:
      data: raw data를 불러들인 data frame
    RETURN:
      long form으로 전환된 data frame
    """
    df = gather(df)
    df.val = pd.to_numeric(df.val)
    df = tidyup_timeframe(df, is_quarter_interval(df.time[0]))
    return df

In [5]:
def gather(df):
    df.rename(index=str, columns={"Unnamed: 1": "code", "Unnamed: 2": "name"}, inplace=True)
    df.columns = ["y"+format_quarter(str(c)) if c[0].isnumeric() else c for c in df.columns]
    df = pd.wide_to_long(df, stubnames="y", i=["code", "name"], j="time")
    df = pd.DataFrame(df.to_records()).rename(index=str, columns={"y": "val"})
    return df

In [6]:
def format_quarter(string):
    return string.replace(" ", "").replace("/", "").replace("Quarter", "").replace("SemiAnnual", "2").replace("Annual", "4")

In [7]:
def tidyup_timeframe(df, is_quarter):
    df.val = pd.to_numeric(df.val)
    if is_quarter:
        year_quarter = df.time.astype(str).str.extract('(.{4,4})(.{1,1})')
        year_quarter.columns = ["year", "quarter"]
        yq = year_quarter.year + "-" + year_quarter.quarter
        yq.name = "time"
        df = pd.concat([df.iloc[:, :2], yq, df.val], axis=1)
    else:
        year_quarter = df.time.astype(str).str.extract('(.{4,4})(.{2,2})')
        year_quarter.columns = ["year", "quarter"]
        yq = year_quarter.year + "-" + as_quarter(year_quarter.quarter).astype(str)
        yq.name = "time"
        df = pd.concat([df.iloc[:, :2], yq, df.val], axis=1)
        df = pd.DataFrame(df.groupby(["code", "name", "time"]).mean().to_records())
    df.sort_values(by=["code", "time"], inplace=True)
    return df

In [8]:
def preprocess(path, file_names, var_names, extension=".xls"):
    dfs = []
    for name in file_names:
        print(name, end=", ")
        file_path = os.path.join(path, name+extension)
        data = reshape_long( pd.read_excel(file_path, skiprows=5).iloc[1:, 1:] )
        dfs.append(data)

    vals = reduce(lambda x, y: x.merge(y, how="left", on=["code", "name", "time"]), dfs).iloc[:, 3:]
    vals.columns = var_names
    
    features = extract_features(vals)
    df = pd.concat([data.loc[:, ["code", "time"]], features], axis=1)
    return df

In [9]:
def extract_features(vals):
    leverage = vals.leverage
    asset_growth = vals.asset_growth
    shares_turnover = vals.trade_amount / vals.stock_num
    roa = vals.net_profit / vals.asset
    roe = vals.net_profit / vals.equity
    size = vals.market_cap
    pcr = vals.pcr
    per = vals.per
    equity_turnover = vals.equity_turnover
    volatility = vals.volatility
    logret = np.log(vals.price).diff()
    
    features = pd.concat([leverage, asset_growth, shares_turnover, roa, roe, size, 
                          pcr, per, equity_turnover, volatility, logret], axis=1)
    features.columns = ["leverage", "asset_growth", "shares_turnover", "roa", "roe", "size", 
                        "pcr", "per", "equity_turnover", "volatility", "logret"]
    return features

---

## Portfolio
### `src/functions-portfolio.R` $\rightarrow$ `src/portfolio.py`

In [240]:
from quadprog import solve_qp
from itertools import product

In [263]:
def get_weight(x, method, risk_free=None):
    n_cluster = x.shape[1]
    zeros = np.zeros(n_cluster).T
    
    if method == "GMV":
        A = np.c_[np.ones(n_cluster), np.diag(np.ones(n_cluster))]
        b = np.array([1] + [0]*n_cluster, dtype=np.float64)
    elif method == "Tangency":
        if not risk_free:
            raise ValueError("method is 'Tangency'. 'risk_free' should not be None")
        rf = risk_free.r.mean()
        A = np.c_[x.mean() - rf, np.diag([1] * n_cluster)]
        b = np.array([(x.mean() - rf).sum(), [0] * n_cluster], dtype=np.float64)
    else:
        raise ValueError("invalid method")
    
    qp = solve_qp(x.cov().to_numpy(), zeros, A, b, meq=1)
    return qp[0]   # solution

In [337]:
from itertools import product

In [274]:
def get_portfolio_return(data, timeset, n_time, with_, method,
                         market, risk_free, random_state=0):
    p_return = []
    for c_time in timeset:
        time_idx = np.arange((c_time-n_time), c_time+1)
        c_return = get_cluster_return(data, time_idx, with_, market, risk_free)
        
        x = c_return.x[:-1]
        y = c_return.y[:-1]
        
        weight = get_weight(x, method, risk_free.iloc[time_idx, :])
        p_return.append(integrate_return(y, weight))
        
    return np.array(p_return)

In [336]:
def expand_grid_py(*itrs):
    prod = list(product(*itrs))
    expanded = {'Var{}'.format(i+1): [x[i] for x in prod] for i in range(len(itrs))}
    return pd.DataFrame(expanded)

In [289]:
def expand_grid(*args):
    # all possible combinations of *args
    grd = expand_grid_py(*args)
    grd.sort_values(by=grd.columns, inplace=True)
    return grd

In [335]:
def evaluate_portfolio(data, market, risk_free, start, end,
                       with_list, n_time_list, method_list, random_state):
    timeset = data.time.unique()
    st = [i for i, t in enumerate(timeset) if start in t][0]
    en = [i for i, t in enumerate(timeset) if end in t][0]
    timeset = timeset[st:en]
    
    # Model grid
    grd = expand_grid(with_list, n_time_list, method_list)
    grd.columns = ["with", "n_time", "method"]
    
    n_each = grd.shape[0] // 3
    indices = np.repeat(["_1", "_2", "_3"], n_each)
    model_names = np.core.defchararray.add(grd.iloc[:, 0].to_numpy(), indices)
    
    # Portfolio returns table
    pr_tbl = kospi.iloc[timeset, :]
    pr_tbl.columns = ["time", "kospi"]
    
    print(f"from {start} to {end}")
    
    for with_ in with_list:
        for n_time in n_time_list:
            for method in method_list:
                print(f"  with: {with_}")
                print(f"  n_time: {n_time}")
                print(f"  method: {method}")
                
                pr = get_portfolio_return(data, timeset, n_time, with_, method, market, risk_free)
                pr_tbl = pd.concat([pr_tbl, pr], axis=1)
                pr_tbl.columns = list(pr_tbl.columns[:-1]) ["pr"]
    
    pr_tbl.columns = ["time", "kospi"] + model_names.tolist()
    
    # Model performance summary
    pr_cumsum = pr_tbl.iloc[:, 2:].cumsum().iloc[-1, :]
    pr_sd = np.diag(np.sqrt(pr_tbl.iloc[:, 2:].cov()))
    pr_info_rate = pr_tbl.iloc[:, 2:].to_numpy() - pr_tbl.iloc[:, 1].to_numpy()[np.newaxis].T
    pr_info_rate = pr_info_rate.mean(axis=0) / pr_info_rate.std(axis=0)
    
    summ = pd.concat([grd, pr_cumsum, pr_sd, pr_info_rate], axis=1)
    summ.columns = grd.columns.tolist() + ["cumsum", "sd", "info_r"]
    
    return {"return": pr_tbl, "summary": summ}

---

---

## Clustering
### `src/functions-clustering.R` $\rightarrow$ `src/clustering.py`

In [16]:
import statsmodels.api as sm
from scipy.spatial.distance import squareform, pdist
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from tqdm import tqdm

In [11]:
def time_slice(data, time_idx):
    as_list = list(time_idx)
    return stock_tbl.groupby(["code"]).nth(as_list).reset_index()

In [12]:
def time_expand(data, skip=[0,1]):
    cols = [col for col in range(data.shape[1]) if col not in skip]
    
    while len(set(data.time)) > 1:
        lagged = data.iloc[:, cols].shift(1)
        lagged.columns = [f"x{c}" for c in cols]
        
        data = pd.concat([data, lagged], axis=1)
        data = data.groupby(["code"], as_index=False).apply(lambda x: x.iloc[1:]).reset_index(drop=True)
        
        cols = np.array(cols) + len(cols)
        
    return data

In [13]:
def scale_tbl(data, skip=[0,1]):
    idx = [i for i in range(data.shape[1]) if i not in skip]
    vals = data.iloc[:, idx]
    data.iloc[:, idx] = (vals - vals.mean()) / vals.std()
    return data.reset_index(drop=True)

##### PCA에서 데이터 손실

In [15]:
def pca_(data, skip=[0,1], threshold=.8):
    """
    데이터에 PCA(주성분분석)을 수행한다.
    
    INPUT:
      data: data frame
      skip: PCA 대상에서 제외할 열 번호 (integer vector)
      threshold: 주성분 개수 선택의 기준이 되는 변동의 비율 (0 ~ 1)
    
    RETURN:
      변수들이 주성분으로 대체된 data frame
    """
    idx = [i for i in range(data.shape[1]) if i not in skip]
    omit_na = data.iloc[:, idx].dropna()    #### 여기 dropna() 때문에 데이터 절반이 날아감
    
    pca = PCA()
    x_pc = pd.DataFrame(pca.fit_transform(omit_na))
    n_pc = np.where(pca.explained_variance_ratio_.cumsum() > threshold)[0][0] + 1
    x_pc = x_pc.iloc[:, :n_pc]
    x_pc.columns = [f"PC{c+1}" for c in x_pc.columns]
    
    df_pc = pd.concat([data.dropna().iloc[:, skip].reset_index(drop=True), x_pc], axis=1)
    return df_pc

In [54]:
def add_factors_residual(data, risk_free):
    idx_rf = [True if t in data.time.tolist() else False for t in risk_free.time]
    risk_free = risk_free.iloc[idx_rf, :]
    
    for code in tqdm(data.code.unique()):
        data.loc[data.code == code, "logret"] = data[data.code == code]["logret"].shift(-1)
        data.loc[data.code == code, "rf"] = risk_free.r.shift(-1)
    
    y = data.logret - data.rf
    x = pca_(data.drop(['logret', 'rf'], axis=1))
    
    df_pca = pd.concat([x, y], axis=1).iloc[:, 2:]
    df_pca.rename(index=str, columns={0: "y"}, inplace=True)
    fml = " + ".join([c for c in df_pca.columns if "PC" in c])
    lmfit = sm.OLS.from_formula(f"y ~ {fml}", data=df_pca).fit()
    
    yhat = lmfit.predict(x)
    res = y - yhat
    data["factors_res"] = (res - res.mean()) / res.std()
    
    n_time = data.time.unique().shape[0]
    dfs = [data.loc[data.code == code, :].iloc[:-1, :] for code in tqdm(data.code.unique())]
    data = pd.concat(dfs)
    
    return data

In [55]:
def add_market_residual(data, market, risk_free):
    idx_mk = [True if t in data.time.tolist() else False for t in market.time]
    market = market.iloc[idx_mk, :]
    
    idx_rf = [True if t in data.time.tolist() else False for t in risk_free.time]
    risk_free = risk_free.iloc[idx_rf, :]
    
    for code in tqdm(data.code.unique()):
        data.loc[data.code == code, "mk"] = market.logret
        data.loc[data.code == code, "rf"] = risk_free.r
        y = data.logret - data.rf
        x = data.mk - data.rf
    
    lmfit = sm.OLS.from_formula("y ~ x", data=data).fit()
    yhat = lmfit.predict(lmfit)
    res = y - yhat
    data["market_res"] = (res - res.mean()) / res.std()
    
    return data

In [152]:
def kmeanspp(x, k, random_state=0):
    #n = x.shape[0]
    #centers = [0] * k
    #centers[0] = np.random.randint(1, n+1)
    #
    #L2_mat = pd.DataFrame(squareform(pdist(x.iloc[:, 1:])), columns=x.index.unique(), index=x.index.unique())
    #L2_mat = L2_mat ** 2
    #
    #for i in range(1, k):
    #    weight = l2.iloc[:, centers].apply(np.min, axis=1)
    #    centers[i] = np.random.choice(range(1, n+1), p=weight/weight.sum())
    return KMeans(n_clusters=k, random_state=random_state).fit(x)

* Dunn index 대신 silhouette score를 사용
* 속도 느림

In [179]:
def get_kmeans_tbl(data, ncmin=2, ncmax=5):
    data = data.dropna()
    
    ncs = range(ncmin, nvmax)
    
    X = data.iloc[:, 2:]
    models = [kmeanspp(X, nc) for nc in ncs]
    silhouettes = np.array([silhouette_score(X, m.labels_) for m in tqdm(models)])
    best_model = models[silhouettes.argmax()]
    
    return pd.DataFrame(np.array([data.code.unique(), kmeans.labels_]).T, columns=["code", "cluster"])

In [180]:
def kmeans_with(data, with_, market, risk_free):
    if with_ == "return":
        return data.loc[:, ["code", "time", "logret"]]
    elif with_ == "market_residual":
        return add_market_residual(data, market, risk_free).loc[:, ["code", "time", "market_res"]]
    elif with_ == "factors":
        return pca_(data.drop(["logret"], axis=1))
    elif with_ == "factors_residual":
        return add_factors_residual(data, risk_free).loc[:, ["code", "time", "factors_res"]]
    else:
        raise ValueError("'with_' should be one of ['return', 'market_residual', 'factors', 'factors_residual']")

In [181]:
def integrate_return(return_, weight):
    weight = np.array(weight)
    weight = weight / weight.sum()
    return np.log(np.sum(weight * np.exp(return_)))

In [185]:
def integrate_return_apply(row):
    return integrate_return(row["logret"], row["size"])

In [227]:
def get_cluster_return(data, time_idx, with_, market, risk_free):
    cluster_df = time_slice(data, time_idx)
    cluster_df = scale_tbl(cluster_df)
    cluster_df = kmeans_with(with_, market, risk_free)
    cluster_df = time_expand(cluster_df)
    cluster_df = get_kmeans_tbl(cluster_df)
    
    data = data.merge(cluster_df, how="left", on=["code"]).loc[:, ["code", "time", "logret", "size", "cluster"]]
    data["size"] = data.size.shift(1)
    data.dropna(inplace=True)
    
    data["logret"] = data.groupby(["cluster", "time"]).apply(integrate_return_apply).reset_index(drop=True)
    data = pd.DataFrame(pd.pivot_table(data, values='logret', index=['cluster']).T.to_records()).iloc[:, 1:]
    data.columns = [f"time{c}" for c in data.columns]
    data = data.T[0]
    
    x = data[time_idx]
    y = data[time_idx[-1] + 1]
    y_time = data.index[time_idx[-1] + 1]
    
    if (x.shape[0] == len(time_idx)) and (y_time == data.time.unique()[time_idx[-1]+1]):
        return {"x": x, "y": y}
    else:
        raise ValueError()

---

---

---

# Scripts
## 01. Data Preprocessing
### `scripts/01_data-preprocessing.R` $\rightarrow$ `scripts/preprocess.py`

In [1]:
from src.preprocessing_utils import *

### Stock data

In [2]:
path = os.path.join("data", "raw")
file_names = ["asset", "asset-growth", "equity", "equity-turnover",
              "leverage", "market-cap", "net-profit", "pcr", "per",
              "stock-number", "stock-price", "trade-amount", "volatility"]
var_names = ["asset", "asset_growth", "equity", "equity_turnover",
             "leverage", "market_cap", "net_profit", "pcr", "per",
             "stock_num", "price", "trade_amount", "volatility"]
extension = ".xls"

In [3]:
stock_tbl = preprocess(path, file_names, var_names, extension=".xls")
stock_tbl.head()

asset, asset-growth, equity, equity-turnover, leverage, market-cap, net-profit, pcr, per, stock-number, stock-price, trade-amount, volatility, 

Unnamed: 0,code,time,leverage,asset_growth,shares_turnover,roa,roe,size,pcr,per,equity_turnover,volatility,logret
0,20,1997-1,,,0.006314,,,88504000000.0,,15.586667,,60.255281,
1,20,1997-2,206.92,23.6,0.015739,0.002507,0.007696,110488000000.0,,11.053333,2.22,62.004616,0.235505
2,20,1997-3,,,0.004035,,,108786700000.0,,10.543333,,56.33455,-0.040738
3,20,1997-4,324.23,12.41,0.004633,0.009214,0.039088,46133330000.0,,4.346667,2.89,62.702979,-0.847219
4,20,1998-1,,,0.012191,,,59733330000.0,,4.75,,73.726084,0.120535


### KOSPI index

In [4]:
def unpack_df(df):
    df = df.reset_index()
    df["time"] = df[['time', 'level_1']].astype(str).apply(lambda x: '-'.join(x), axis=1)
    df = pd.concat([df.time, df.price], axis=1)
    df.columns = ["time", "logret"]
    return df

In [14]:
kospi = pd.read_excel("data/raw/kospi-index.xlsx", names=["time", "price"])
g = kospi.groupby([pd.DatetimeIndex(kospi.time).year, as_quarter(pd.DatetimeIndex(kospi.time).month)])
g = np.log(g.mean()).diff()
kospi = unpack_df(g)
kospi.head()

Unnamed: 0,time,logret
0,1997-1,
1,1997-2,0.082477
2,1997-3,-0.064037
3,1997-4,-0.499768
4,1998-1,0.247514


### Risk-free rate

In [10]:
risk_free = pd.read_excel("data/raw/cd-risk-free.xlsx", names=["time", "r"])
risk_free.time = risk_free.time.str.replace("/", "-").str.split().str[0]
risk_free.r = np.log(1 + risk_free.r / 100)
risk_free.head()

Unnamed: 0,time,r
0,1997-1,0.119707
1,1997-2,0.118287
2,1997-3,0.119115
3,1997-4,0.14511
4,1998-1,0.204599


### Save processed data

In [11]:
stock_tbl.to_csv("data/processed/stock2.csv", index=False)
kospi.to_csv("data/processed/kospi2.csv", index=False)
risk_free.to_csv("data/processed/risk_free2.csv", index=False)

---

## 02. Main
### `scripts/02_main.R` $\rightarrow$ `scripts/main.py`

In [1]:
from src.clustering import *
from src.portfolio import *
from functools import reduce
import pandas as pd
import numpy as np

### Random state

In [2]:
random_state = 0

### Load data

In [3]:
stock_tbl = pd.read_csv("data/processed/stock.csv")
kospi = pd.read_csv("data/processed/kospi.csv")
risk_free = pd.read_csv("data/processed/risk_free.csv")

### Models

In [4]:
with_list = [
    #"return", 
    "market_residual", 
    #"factors", 
    #"factors_residual"
]
n_time_list = [
    6, 8, 
    #10, 12
]
method_list = [
    "GMV", #"Tangency"
]

### Validation period

In [5]:
start_list = ["2002-4", ]#"2005-4", "2008-4", "2011-4"]
end_list = ["2005-3", ]#"2008-3", "2011-3", "2014-3"]
valid_res = []

for st, en in zip(start_list, end_list):
    valid_res.append(evaluate_portfolio(stock_tbl, kospi, risk_free, st, en,
                                        with_list, n_time_list, method_list, 
                                        random_state=random_state))

from 2002-4 to 2005-3
  with: market_residual
  n_time: 6
  method: GMV


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [01:25<00:00,  7.30s/it]


  with: market_residual
  n_time: 8
  method: GMV


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [01:41<00:00,  8.68s/it]


In [9]:
valid_res[0]["return"].head()

Unnamed: 0,time,kospi,market_residual_6_GMV,market_residual_8_GMV
0,2002-4,-0.043557,-0.182029,-0.168141
1,2003-1,-0.166381,0.150088,0.133076
2,2003-2,0.110888,0.033397,0.123056
3,2003-3,0.131688,0.120947,0.070731
4,2003-4,0.096017,0.013463,0.015975


In [11]:
valid_res[0]["summary"]

Unnamed: 0,with,n_time,method,cumsum,sd,info_r
market_residual_6_GMV,market_residual,6,GMV,0.683359,0.09575,0.127044
market_residual_8_GMV,market_residual,8,GMV,0.496945,0.098431,0.006877


### Test period

In [12]:
start = "2014-4"
end = "2017-3"

test_res = evaluate_portfolio(stock_tbl, kospi, risk_free, start, end,
                              with_list, n_time_list, method_list)

from 2014-4 to 2017-3
  with: market_residual
  n_time: 6
  method: GMV


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [01:24<00:00,  7.41s/it]


  with: market_residual
  n_time: 8
  method: GMV


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [01:35<00:00,  7.96s/it]


### Save results

In [14]:
import os
import pickle

In [15]:
try:
    os.mkdir("outputs")
except OSError:
    print("outputs directory already exists")

outputs directory already exists


In [16]:
with open("outputs/valid_res_list.pickle", "wb") as wb:
    pickle.dump(valid_res, wb)
with open("outputs/test_res.pickle", "wb") as wb:
    pickle.dump(test_res, wb)

---

## 03. Analysis
### `scripts/03_analysis.R` $\rightarrow$ `scripts/analysis.py`

In [17]:
import pickle

In [18]:
with open("outputs/valid_res_list.pickle", "rb") as rb:
    valid_res = pickle.load(rb)
with open("outputs/test_res.pickle", "rb") as rb:
    test_res = pickle.load(rb)

---

# Trials and errors

In [6]:
timeset = stock_tbl.time.unique()
stst = [i for i, t in enumerate(timeset) if st in t][0]
enen = [i for i, t in enumerate(timeset) if en in t][0]
timeset = list(range(stst, enen+1))
timeset

[23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]

In [7]:
stock_tbl.time.unique()[timeset]

array(['2002-4', '2003-1', '2003-2', '2003-3', '2003-4', '2004-1',
       '2004-2', '2004-3', '2004-4', '2005-1', '2005-2', '2005-3'],
      dtype=object)

In [9]:
for c_time in timeset:
    print(np.arange(c_time-6+1, c_time+1))
    break

[18 19 20 21 22 23]


In [10]:
time_idx = [18, 19, 20, 21, 22, 23]

In [11]:
grd = expand_grid(with_list, n_time_list, method_list)
grd.reset_index(drop=True, inplace=True)
grd.head()

Unnamed: 0,Var1,Var2,Var3
0,factors,6,GMV
1,factors,6,Tangency
2,factors,8,GMV
3,factors,8,Tangency
4,factors,10,GMV


In [12]:
model_names = reduce(lambda x, y: x.map(str) + "_" + y.map(str), [grd[c] for c in grd.columns])

In [15]:
df1 = pd.DataFrame(np.random.randn(10).reshape(5,2), index=[0,1,2,3,4])
df2 = pd.DataFrame(np.random.randn(10).reshape(5,2), index=[5,6,7,8,9])
pd.concat([df1, df2], axis=1)

Unnamed: 0,0,1,0.1,1.1
0,-0.086208,0.772396,,
1,-1.905268,-0.804845,,
2,1.118142,-1.227595,,
3,-0.983959,-0.827903,,
4,-0.463692,-0.443045,,
5,,,2.083735,-0.218963
6,,,-0.708171,0.635759
7,,,-0.306383,1.402967
8,,,-0.389463,2.332261
9,,,-0.230438,0.932226


In [28]:
with_list[1]

'market_residual'

In [80]:
cluster_df = time_slice(stock_tbl, time_idx)
cluster_df = scale_tbl(cluster_df)
#cluster_df = kmeans_with(cluster_df, with_list[1], kospi, risk_free)
#cluster_df = time_expand(cluster_df)
#cluster_df = get_kmeans_tbl(cluster_df)

In [81]:
data = cluster_df.copy()

idx_mk = [True if t in data.time.tolist() else False for t in kospi.time]
market = kospi.iloc[idx_mk, :]

idx_rf = [True if t in data.time.tolist() else False for t in risk_free.time]
rf = risk_free.iloc[idx_rf, :]

for code in data.code.unique():
    data.loc[data.code == code, "mk"] = market.logret.to_numpy()
    data.loc[data.code == code, "rf"] = rf.r.to_numpy()
data["y"] = data.logret - data.rf
data["x"] = data.mk - data.rf

lmfit = sm.OLS.from_formula("y ~ x", data=data).fit()
yhat = lmfit.predict(data["x"])
res = data.y.to_numpy() - yhat

data["market_res"] = (res - res.mean()) / res.std()

In [None]:
data = stock_tbl.merge(cluster_df, how="left", on=["code"]).loc[:, ["code", "time", "logret", "size", "cluster"]]
data["size"] = data["size"].shift(1)
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
data.cluster = data.cluster.astype(int)

data = pd.DataFrame(data.groupby(["cluster", "time"]).apply(integrate_return_apply), columns=["logret"]).reset_index()
pivot = pd.pivot_table(data, values='logret', index=['cluster'], columns=['time']).T
data = pd.DataFrame(pivot.to_records()).iloc[:, 1:]
data.columns = [f"cluster{int(c)+1}" for c in data.columns]
data = pd.concat([pd.Series(pivot.index), data], axis=1)

In [None]:
xx = data.iloc[time_idx, :].reset_index(drop=True)
yy = data.iloc[time_idx[-1] + 1, :]
y_time = data.time[time_idx[-1] + 1]

In [None]:
x = xx.iloc[:, 1:].reset_index(drop=True)
y = yy[1:]

In [None]:
n_cluster = x.shape[1]
zeros = np.zeros(n_cluster).T

A = np.c_[np.ones(n_cluster), np.diag(np.ones(n_cluster))]
b = np.array([1] + [0]*n_cluster, dtype=np.float64)

In [None]:
qp = solve_qp(x.cov().to_numpy(), zeros, A, b, meq=1)
weight = qp[0]

In [None]:
w = np.array(weight)
w = w / w.sum()

In [None]:
np.exp(y.tolist())

In [None]:
integrate_return(y, weight)

In [40]:
rr = valid_res[0]["return"]

In [41]:
rr = pd.concat([rr.iloc[12:, :2].reset_index(drop=True), rr.iloc[:12, 2:].reset_index(drop=True)], axis=1)

In [42]:
pr_cumsum = pr_tbl.iloc[:, 2:].cumsum().iloc[-1, :]
df_index = pr_cumsum.index
pr_cumsum.reset_index(drop=True, inplace=True)
pr_sd = np.diag(np.sqrt(pr_tbl.iloc[:, 2:].cov()))
pr_info_rate = pr_tbl.iloc[:, 2:].to_numpy() - pr_tbl.iloc[:, 1].to_numpy()[np.newaxis].T
pr_info_rate = pr_info_rate.mean(axis=0) / pr_info_rate.std(axis=0)

In [43]:
grd = expand_grid(with_list, n_time_list, method_list)
grd.reset_index(drop=True, inplace=True)
grd.columns = ["with", "n_time", "method"]

In [44]:
summ = pd.concat([grd] + [pd.Series(x) for x in [pr_cumsum, pr_sd, pr_info_rate]], axis=1)
summ.columns = grd.columns.tolist() + ["cumsum", "sd", "info_r"]
summ.index = df_index

In [45]:
summ

Unnamed: 0,with,n_time,method,cumsum,sd,info_r
market_residual_6_GMV,market_residual,6,GMV,0.479461,0.096741,-0.004129
market_residual_8_GMV,market_residual,8,GMV,0.562124,0.100113,0.047305
