In [252]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import json
from tqdm import tqdm, trange

'''one hot encoding'''
DATA_FROM_PATH = './data/clean_fund_stock.xlsx'
# load data
data = pd.read_excel(DATA_PATH, header=0)
stocks = (data.loc[:, [str(i) for i in range(1, 11)]]).values
# flat data
all_stocks = []
for row in stocks:
    all_stocks.extend(row)
'''
# output stocks' names
names = [str(i) for  i in set(all_stocks)]
names[0:3]
with open('./data/stocks_names.txt', 'w') as f:
    f.write('\n'.join(names))
'''
all_stocks = np.array(all_stocks).reshape(-1,1)
# encode data
onehot_encoder = OneHotEncoder(sparse=False ,dtype=int)
onehot_encoded = onehot_encoder.fit_transform(all_stocks)
VOCAB_NUM = onehot_encoded.shape[1]
# input a int, output a stock_name_string
def idx_to_name(idx, onehot_encoder=onehot_encoder, vacab_num=VOCAB_NUM):
    assert(idx < vacab_num), "one-hot-array's idx should less than vacab_num"
    x = np.zeros((vacab_num), dtype=int).reshape(1, -1)
    x[0, idx] = 1
    return onehot_encoder.inverse_transform(x)

# input: list:['06969.HK','000001.SZ'], output a one-hot-array
def name_to_array(str_name_list, onehot_encoder=onehot_encoder):
    arr = np.array(str_name_list).reshape(-1, 1)
    return onehot_encoder.transform(arr)

def name_to_idx(str_name_list, onehot_encoder=onehot_encoder):
    arr_list = name_to_array(str_name_list, onehot_encoder=onehot_encoder)
    return [np.argmax(arr) for arr in arr_list]

print(idx_to_name(299))
print(name_to_array(['06969.HK','000001.SZ']))
print(name_to_idx(['06969.HK','000001.SZ']))

[['06969.HK']]
[[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]
[299, 1]


In [282]:
'''calculate covariance'''
def match_series(stock1, stock2):
    date_in_common = list(set(stock1['datetime']) & set(stock2['datetime']))
    date_in_common.sort(reverse=False)
    dict1 = dict(zip(stock1['datetime'], stock1['value']))
    dict2 = dict(zip(stock2['datetime'], stock2['value']))
    return [dict1[i] for i in date_in_common], [dict2[i] for i in date_in_common]

def de_mean(x):
    xmean = np.mean(x)
    return [xi - xmean for xi in x]

def covariance(x1, x2):
    n = len(x1)
    cov = abs(np.dot(de_mean(x1), de_mean(x2)) / n-1) if n>1 else 0
    return cov

with open('./data/stock_prices.json', 'r') as f:
    stock_prices = json.load(f)
    stock_prices = json.loads(stock_prices)
# stock_prices: [{'name':string, 'datetime':['20180830',...], 'value':[7.68,...], 'status:[-1,...]'},{}]d

stock_idx = name_to_idx([stock['name'] for stock in stock_prices])
temp = list(zip(stock_prices, stock_idx))
temp.sort(key=lambda x: x[1])
# 排序 cov_matrix 中x[i, j]表示stock_names中idx=i 和 idx=j 的stocks的cov
stock_prices = [i[0] for i in temp]
stock_names = [stock['name'] for stock in stock_prices]

cov_matrix = np.zeros((VOCAB_NUM, VOCAB_NUM), dtype=float)
for i in tqdm(range(VOCAB_NUM-1)):
    for j in range(i, VOCAB_NUM):
        matched_val_1, matched_val_2 = match_series(stock_prices[i], stock_prices[j])
        cov_matrix[i,j] = covariance(matched_val_1, matched_val_2)
        cov_matrix[j,i] = cov_matrix[i,j]
print(cov_matrix)



100%|██████████| 769/769 [02:44<00:00,  4.68it/s]

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          4.45583063  4.88267291 ...  1.882975    0.
   0.        ]
 [ 0.          4.88267291 14.9739953  ...  1.1158      0.
   0.        ]
 ...
 [ 0.          1.882975    1.1158     ...  7.381025    0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]





In [None]:
from explib.models.mgl_opt import solve_mgl
emp_cov_list = []
emp_cov_list.append(cov_matrix)
label_mat_list = []
label_mat_list.append(np.identity(VOCAB_NUM))
print(label_mat_list)

seed = 0
with np.errstate(all='raise'):
# emp_cov_list is the sample covariance matrix.
# Ak is the i-th variable’s attributes in the k-th task
    X_list, U, fvals = solve_mgl(emp_cov_list, label_mat_list,
                                 d=100, 
                                 lambda_1=.2,
                                 lambda_2=.04,
                                 beta=.01,
                                 outer_max_it=20,
                                 outer_tol=1e-4,
                                 seed=seed)

In [None]:
from explib._helper import *
X_filtered_list = [X.copy() for X in X_list]
for X in X_filtered_list:
    X[np.isclose(X_list[0], 0, atol=.04)] = 0
    pass
print('------')
show_tensor(X_filtered_list)

In [249]:
a = True if 2<1 else False
a

False