In [145]:
import numpy as np 
import matplotlib.pyplot as plt 
#!pip install yfinance
import yfinance as yf # https://pypi.org/project/yfinance/
import math
import random
import seaborn as sns
import datetime
import pandas as pd
#!pip install sklearn
from sklearn.linear_model import LinearRegression
from scipy import stats
import warnings
warnings.filterwarnings(action='ignore')

In [146]:
def calculate_week_number(df_data):
    """Add a ``week_num`` column counting weeks since the first row's week.

    Each row gets ``(days since the first row + weekday of the first row) // 7``.
    The first row is therefore always in week 0, and the number grows by one
    every seven days counted from the Monday of the first row's week
    (``weekday()`` returns 0 for Monday).

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame whose index holds date-like values that support ``.weekday()``
        and subtraction yielding an object with a ``.days`` attribute
        (e.g. a ``DatetimeIndex``). The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df_data`` object with the ``week_num`` column added.
    """
    dates = df_data.index
    if len(dates) == 0:
        # There is no first row to anchor the count on; add an empty integer
        # column instead of failing on ``index[0]``.
        df_data['week_num'] = np.array([], dtype='int64')
        return df_data

    start_date = dates[0]
    start_weekday = start_date.weekday()
    df_data['week_num'] = [
        ((cur_date - start_date).days + start_weekday) // 7 for cur_date in dates
    ]
    return df_data


# get data by ticker-name, start-time & end-time
def get_df_data(ticker_name="AAPL", start_time="2022-01-01", end_time="2022-10-09"):
    """Download price data for one ticker and add derived columns.

    Parameters
    ----------
    ticker_name : str
        Ticker symbol passed to ``yf.download`` as ``tickers``.
    start_time, end_time : str
        Passed to ``yf.download`` as ``start`` and ``end``.

    Returns
    -------
    pandas.DataFrame
        The ``Open``, ``High``, ``Low``, ``Close`` and ``Volume`` columns of
        the download plus:

        * ``Volume_log`` - base-2 logarithm of ``Volume``
        * ``previous_Close`` - ``Close`` shifted down by one row
        * ``daily_return`` - ``(Close - previous_Close) / previous_Close``
        * ``overnight_return`` - ``(Open - previous_Close) / previous_Close``
        * ``MA1`` / ``MA2`` - rolling means of ``Close`` over 5 and 20 rows
        * ``weekday`` - ``index.weekday``
        * ``week_num`` - added by ``calculate_week_number``

    Notes
    -----
    Assumes the downloaded frame has flat (single-level) column names and a
    datetime-like index - TODO confirm against the installed yfinance version.
    """
    raw_data = yf.download(tickers=ticker_name, start=start_time, end=end_time)
    # Work on an explicit copy: adding columns to a column-subset of another
    # frame is what triggers pandas' SettingWithCopyWarning.
    df_data = raw_data[["Open", "High", "Low", "Close", "Volume"]].copy()

    # NOTE(review): a row with Volume == 0 would give -inf here.
    df_data['Volume_log'] = np.log2(df_data['Volume'])
    df_data['previous_Close'] = df_data['Close'].shift(1)
    df_data['daily_return'] = (df_data['Close'] - df_data['previous_Close']) / df_data['previous_Close']
    df_data['overnight_return'] = (df_data['Open'] - df_data['previous_Close']) / df_data['previous_Close']

    # Short and long moving-average windows, measured in rows.
    MA1, MA2 = 5, 20
    df_data['MA1'] = df_data['Close'].rolling(MA1).mean()
    df_data['MA2'] = df_data['Close'].rolling(MA2).mean()

    df_data['weekday'] = df_data.index.weekday
    df_data = calculate_week_number(df_data)
    return df_data

def merge_stocks(df_data1, df_data2):
    """Pair every target row with the latest strictly earlier reference row.

    For each index value ``t`` of ``df_data1`` the function looks up the last
    row of ``df_data2`` whose index value is strictly smaller than ``t``.
    Target rows that have no earlier reference row are skipped.

    Parameters
    ----------
    df_data1 : pandas.DataFrame
        Target frame with ``Open``, ``High``, ``Low`` and ``Close`` columns.
    df_data2 : pandas.DataFrame
        Reference frame with the same four columns. Its index must be sorted
        in ascending order (the lookup is a binary search).

    Returns
    -------
    pandas.DataFrame
        One row per matched target row with the columns ``target_time``,
        ``target_open``, ``target_high``, ``target_low``, ``target_close``,
        ``ref_time``, ``ref_open``, ``ref_high``, ``ref_low``, ``ref_close``.
        The frame is empty (but keeps these columns) when nothing matches,
        including when either input frame is empty.
    """
    import bisect  # local import keeps this block self-contained

    col_names = ['target_time', 'target_open', 'target_high', 'target_low', 'target_close',
                 'ref_time', 'ref_open', 'ref_high', 'ref_low', 'ref_close']

    ref_times = list(df_data2.index)
    data = []
    for i, target_time in enumerate(df_data1.index):
        # j is the position of the first reference time >= target_time, so
        # j - 1 (when it exists) is the latest reference row strictly earlier.
        j = bisect.bisect_left(ref_times, target_time)
        if j == 0:
            # No reference row precedes this target row.
            continue

        it1 = df_data1.iloc[i]
        it2 = df_data2.iloc[j - 1]
        data.append([
            target_time, it1['Open'], it1['High'], it1['Low'], it1['Close'],
            ref_times[j - 1], it2['Open'], it2['High'], it2['Low'], it2['Close'],
        ])

    return pd.DataFrame(data, columns=col_names)

def get_datasets(merged_data, feature_list=['open', 'close', 'high', 'low'], label="close"):
    """Split a merged frame into per-row feature lists and a label column.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Frame containing a ``ref_<name>`` column for every name in
        ``feature_list`` and a ``target_<label>`` column.
    feature_list : list of str
        Feature names, in the order they should appear within each row.
        The default list is only read, never modified.
    label : str
        Name selecting the ``target_<label>`` column used as labels.

    Returns
    -------
    (list of list, pandas.Series)
        ``packed_features[i]`` holds the feature values of row ``i``;
        the second item is the ``target_<label>`` column of ``merged_data``.
    """
    # One plain list per feature column, in feature_list order.
    columns = [list(merged_data['ref_' + name]) for name in feature_list]

    # Transpose column-major data into one list per row.
    n_rows = len(columns[0])
    packed_features = [[column[row] for column in columns] for row in range(n_rows)]

    labels = merged_data['target_' + label]
    return packed_features, labels

def train_model(train_X, train_y):
    """Fit a ``LinearRegression`` on the given data and print fit diagnostics.

    Prints the model's score on the training data itself, followed by the
    fitted intercept and coefficients.

    Parameters
    ----------
    train_X : array-like
        Feature rows passed to ``fit`` and ``score``.
    train_y : array-like
        Target values passed to ``fit`` and ``score``.

    Returns
    -------
    LinearRegression
        The fitted model.
    """
    model = LinearRegression()
    model.fit(train_X, train_y)

    # Scored on the same rows the model was fitted on (in-sample).
    r_sq = model.score(train_X, train_y)
    print(f"coefficient of determination: {r_sq}")
    print(f"intercept: {model.intercept_}\tslope: {model.coef_}")
    return model

def basic_info(df):
    """Print summary statistics of the ``dif`` column of ``df``.

    Prints, one per line: mean, median, standard deviation and the
    25%, 50% and 75% quantiles. Returns ``None``.
    """
    dif = df['dif']

    # Each statistic is evaluated right before its line is printed.
    for caption, statistic in (("mean:\t", dif.mean),
                               ("median:\t", dif.median),
                               ("std:\t", dif.std)):
        print(caption, statistic())

    for caption, fraction in (("25%:\t", 0.25), ("50%:\t", 0.50), ("75%:\t", 0.75)):
        print(caption, dif.quantile(fraction))
    return

def error_analyze(train_y, y_pred, printing=True):
    """Build a frame of real values, predictions and absolute percent error.

    Parameters
    ----------
    train_y : array-like
        Real values, stored in the ``y_real`` column.
    y_pred : array-like
        Predicted values, stored in the ``y_pred`` column.
    printing : bool
        When true, summary statistics of the error column are printed
        through ``basic_info``.

    Returns
    -------
    pandas.DataFrame
        Columns ``y_real``, ``y_pred`` and ``dif``, where ``dif`` is
        ``abs((y_real - y_pred) / y_real * 100)``.
    """
    df = pd.DataFrame(columns=['y_real', 'y_pred'])
    df['y_real'] = train_y
    df['y_pred'] = y_pred

    # Signed error as a percentage of the real value; the sign is dropped next.
    signed_percent = (df['y_real'] - df['y_pred']) / df['y_real'] * 100
    df['dif'] = signed_percent.abs()

    if not printing:
        return df
    basic_info(df)
    return df

In [149]:
# Stock pairs to process. Each tuple is unpacked in the loop below as
# (reference ticker, target ticker, third value, ratio).
# NOTE(review): the third and fourth values are not used by this cell -
# presumably a price and a share/currency conversion ratio; confirm their
# meaning and whether a later cell relies on them.
stocks_info = [
    ('BABA', '9988.HK', 87.79, 1),
    ('BIDU', '9888.HK', 137.69, 1),
    ('JD', '9618.HK', 44.44, 0.5 * 7.8),
    ('MPNGY', '3690.HK', 34.65, 0.5 * 7.8),
    ('NTES', '9999.HK', 77.64, 0.2 * 7.8),
    ('TCOM', '9961.HK', 35.55, 1 * 7.8),
    ('BILI', '9626.HK', 21.57, 1 * 7.8),
    ('LI', '2015.HK', 23.61, 0.5 * 7.8),
    ('NIO', '9866.HK', 9.39, 1 * 7.8),
    ('ZH', '2390.HK', 1.43, 3 * 7.8)
]

# Labels predicted for every stock pair, in the order they are appended to each result row.
OHLC_list = ['open', 'high', 'low', 'close']
# One row per stock pair: [stock_name, prediction, mean training error %, ...] for each label.
daily_prediction = []
# One row per stock pair: [stock_name, Open, High, Low, Close, previous_Close] of the last target row.
daily_real = []

for it in stocks_info[:]:  # iterates over a shallow copy of the list
    reference_stock, target_stock, st, ratio = it
    # NOTE(review): `st` unpacked from the tuple above is overwritten right here,
    # and `ratio` is never read in this cell.
    st, et = "2020-01-01", "2023-03-31"
    print("time range:\t", st, "-", et)
    df_data1 = get_df_data(ticker_name=target_stock, start_time=st, end_time=et)
    print("target stock:\t", target_stock, "\t", len(df_data1))
    df_data2 = get_df_data(ticker_name=reference_stock, start_time=st, end_time=et)
    print("reference stock:\t", reference_stock, "\t", len(df_data2))
    # Merge target and reference: each target row (time, open, high, low, close) is
    # paired with the latest reference row dated strictly before it.
    merged_data = merge_stocks(df_data1, df_data2)
    print("merged_data:\ttarget_time:\t", merged_data.iloc[0]['target_time'], "\t", merged_data.iloc[-1]['target_time'])
    print("merged_data:\tref_time:\t", merged_data.iloc[0]['ref_time'], "\t", merged_data.iloc[-1]['ref_time'])

    stock_name = reference_stock + "_" + target_stock

    # Record the actual values of the last downloaded target row for later comparison.
    points = df_data1.iloc[-1]
    daily_real.append( [stock_name, points['Open'], points['High'], points['Low'], points['Close'], points['previous_Close']] )

    info_list = [stock_name]

    # Fit one separate model per target label (open, high, low, close).
    for label_name in OHLC_list:
        print(label_name)
        #label_name = "high"
        feature_names = ['open', 'high', 'low', 'close'] # 'open', 'high', 'low', 'close'

        n_days = 60

        # The slice [-n_days:-1] takes the last n_days rows minus the final one,
        # i.e. n_days - 1 training rows; the final row is held out below.
        train_X, train_y = get_datasets(merged_data[-n_days:-1], feature_names, label_name)
        train_X = np.array(train_X)
        train_y = np.array(train_y)

        model = train_model(train_X, train_y)
        y_pred = model.predict(train_X)


        # Mean absolute percent error on the training rows themselves (in-sample).
        df = error_analyze(train_y, y_pred, False)
        error_mean = df['dif'].mean()


        # Predict the label for the final merged row from its reference features.
        test_X, test_y = get_datasets(merged_data[-1:], feature_names, label_name)
        test_X = np.array(test_X)
        test_y = np.array(test_y)

        test_y_pred = model.predict(test_X)
        print(f"reference:\t{test_X} \t predicted:\t{test_y_pred}")
        print()
        info_list.append( test_y_pred[0] )
        info_list.append( error_mean )
    daily_prediction.append( info_list )

time range:	 2020-01-01 - 2023-03-31
[*********************100%***********************]  1 of 1 completed
target stock:	 9988.HK 	 794
[*********************100%***********************]  1 of 1 completed
reference stock:	 BABA 	 809
merged_data:	target_time:	 2020-01-03 00:00:00 	 2023-03-21 00:00:00
merged_data:	ref_time:	 2020-01-02 00:00:00 	 2023-03-20 00:00:00
open
coefficient of determination: 0.9962844545174142
intercept: 1.6578944073600184	slope: [ 0.05085014 -0.11721045  0.01069462  1.02311367]
reference:	[[80.15000153 81.90000153 79.48000336 81.        ]] 	 predicted:	[79.85621247]

high
coefficient of determination: 0.9837786495797953
intercept: 2.373209487458311	slope: [-0.5624046   0.57508904  0.35516676  0.60506516]
reference:	[[80.15000153 81.90000153 79.48000336 81.        ]] 	 predicted:	[81.63520602]

low
coefficient of determination: 0.9868405178016886
intercept: 3.699737529767873	slope: [-0.34972988  0.23111313  0.39677845  0.65369811]
reference:	[[80.15000153 81.90

In [151]:
# Predictions: one row per stock pair, with a predicted value and the mean
# training error % for each of open / high / low / close.
col_names = ['stock-name', 'open_predicted', 'open_error%', 'high_predicted', 'high_error%', 'low_predicted', 'low_error%', 'close_predicted', 'close_error%']
info_df = pd.DataFrame(daily_prediction, columns=col_names)

# Actual values of the last target row for each stock pair.
col_names = ['stock-name', 'open', 'high', 'low', 'close', 'previous_Close']
real_df = pd.DataFrame(daily_real, columns=col_names)
real_df['daily_return'] = (
    real_df['close']
    .sub(real_df['previous_Close'])
    .div(real_df['previous_Close'])
    .mul(100)
)

# Compare the predicted close with the real close.
tmp_df = info_df.loc[:, ['stock-name', 'close_predicted', 'close_error%']].copy()
tmp_df['close_real'] = real_df['close']
# NOTE(review): dif% is measured relative to the predicted close, whereas
# close_error% (from error_analyze) is measured relative to the real value -
# confirm that comparing the two directly is intended.
tmp_df['dif%'] = (
    tmp_df['close_predicted']
    .sub(tmp_df['close_real'])
    .div(tmp_df['close_predicted'])
    .mul(100)
    .abs()
)
# "Correct" means the realised error stayed below the mean training error.
tmp_df['Correct'] = tmp_df['dif%'] < tmp_df['close_error%']
tmp_df['daily_return'] = real_df['daily_return']
tmp_df

Unnamed: 0,stock-name,close_predicted,close_error%,close_real,dif%,Correct,daily_return
0,BABA_9988.HK,80.223747,1.545314,80.5,0.344353,True,1.641418
1,BIDU_9888.HK,152.269248,2.163292,148.100006,2.738072,False,3.205579
2,JD_9618.HK,151.103029,1.705961,150.100006,0.6638,True,-0.19946
3,MPNGY_3690.HK,130.371431,1.763576,130.100006,0.208194,True,0.930964
4,NTES_9999.HK,134.88203,1.695863,133.0,1.395315,True,0.226076
5,TCOM_9961.HK,290.600007,1.434508,291.200012,0.206471,True,3.482594
6,BILI_9626.HK,186.370629,2.568853,181.0,2.881693,False,6.910807
7,LI_2015.HK,88.640179,2.183914,90.400002,1.985355,True,4.811596
8,NIO_9866.HK,67.78369,1.851279,69.25,2.163219,False,8.798112
9,ZH_2390.HK,18.169246,2.810601,18.26,0.499492,True,0.219544
