In [2]:
# find model between reference-stock & target-stock
    # get the details of the model
        # model parameters
        # error ranges <- analysis

# make algo-trading strategy based on the model
    # what price to buy 
        # if the low-price of stock < predicted buy price, -> buy
    # what price to sell
        # if profit > 2%, -> sell
        # if loss >= 3%, -> sell

# back-testing of the algo-trading strategy

In [49]:
import numpy as np 
import matplotlib.pyplot as plt 
#!pip install yfinance
import yfinance as yf # https://pypi.org/project/yfinance/
import math
import random
import seaborn as sns
import datetime
import pandas as pd
from scipy import stats # python -m pip install scipy
import warnings
warnings.filterwarnings(action='ignore')


# get data by ticker-name, start-time & end-time
def get_df_data(ticker_name="AAPL", start_time="2022-01-01", end_time="2022-10-09",
                ma1_window=5, ma2_window=20):
    """Download daily bars for one ticker and add derived columns.

    Parameters
    ----------
    ticker_name : str
        Ticker symbol passed to ``yf.download``.
    start_time, end_time : str
        Date range passed to ``yf.download`` as ``start`` / ``end``.
    ma1_window, ma2_window : int
        Rolling-window lengths (in rows) for the ``MA1`` and ``MA2`` columns.
        The defaults (5 and 20) are the values that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``Open, High, Low, Close, Volume`` plus ``previous_Close``
        (``Close`` shifted by one row), ``daily_return`` (relative change of
        ``Close`` versus ``previous_Close``), ``MA1`` and ``MA2`` (rolling
        means of ``Close``).
    """
    df_data = yf.download(tickers=ticker_name, start=start_time, end=end_time)
    # Some yfinance releases return (field, ticker) MultiIndex columns even for
    # a single ticker; flatten them so that df_data['Close'] is a Series.
    if isinstance(df_data.columns, pd.MultiIndex):
        df_data.columns = df_data.columns.get_level_values(0)
    # Work on an explicit copy: assigning new columns to a column-subset slice
    # is a chained assignment (SettingWithCopyWarning), which the notebook's
    # global warnings filter would otherwise hide.
    df_data = df_data[["Open", "High", "Low", "Close", "Volume"]].copy()
    df_data['previous_Close'] = df_data['Close'].shift(1)
    df_data['daily_return'] = (df_data['Close']-df_data['previous_Close'])/df_data['previous_Close']
    df_data['MA1'] = df_data['Close'].rolling(ma1_window).mean()
    df_data['MA2'] = df_data['Close'].rolling(ma2_window).mean()
    return df_data

def search_by_index(df_data, index_time):
    """Return the position just before the first index label >= `index_time`.

    The index labels are scanned in order. The result is -1 when the very
    first label is already >= `index_time`, and ``len(df_data) - 1`` when no
    label reaches `index_time`.
    """
    stamps = list(df_data.index)
    first_at_or_after = next(
        (pos for pos, stamp in enumerate(stamps) if stamp >= index_time),
        len(stamps),
    )
    return first_at_or_after - 1

def merge_stocks(df_data1, df_data2):
    """Pair every target-stock row with the latest earlier reference-stock row.

    Parameters
    ----------
    df_data1 : pandas.DataFrame
        Target-stock bars; needs ``Open, High, Low, Close`` columns.
    df_data2 : pandas.DataFrame
        Reference-stock bars; needs ``Open, High, Low, Close`` columns.
        Its index is searched with ``search_by_index`` and must be comparable
        with the index of `df_data1`.

    Returns
    -------
    pandas.DataFrame
        One row per target row that has an earlier reference row, with columns
        ``target_time, target_open, target_high, target_low, target_close,
        ref_time, ref_open, ref_high, ref_low, ref_close``.
    """
    data = []
    for index1, it1 in df_data1.iterrows():
        # position of the last reference row strictly before the target row
        j = search_by_index(df_data2, index1)
        if j < 0:
            # No earlier reference row exists. Skip explicitly instead of
            # letting -1 wrap around to the last row (or fail on an empty frame).
            continue
        index2 = df_data2.index[j]
        it2 = df_data2.iloc[j]

        if index1 > index2:
            data.append([
                index1, it1['Open'], it1['High'], it1['Low'], it1['Close'],
                index2, it2['Open'], it2['High'], it2['Low'], it2['Close'],
            ])
    col_names = ['target_time', 'target_open', 'target_high', 'target_low', 'target_close',
                'ref_time', 'ref_open', 'ref_high', 'ref_low', 'ref_close']
    return pd.DataFrame(data, columns=col_names)


def remove_timezone(dt):
    """Return `dt` with its tzinfo cleared via ``dt.replace(tzinfo=None)``.

    `dt` must offer a datetime-style ``replace`` method.
    """
    naive_dt = dt.replace(tzinfo=None)
    return naive_dt

def filter_by_time(merged_data):
    """Keep only pairs whose target time is exactly one day after the reference time.

    Three helper columns are added to the passed frame in place:
    ``target_time01`` and ``ref_time01`` (the times with tzinfo removed) and
    ``time_dif`` (their difference).

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Needs ``target_time`` and ``ref_time`` columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The rows where ``time_dif`` equals ``datetime.timedelta(days=1)``, and
        a copy of the unfiltered frame (helper columns included).
    """
    oneday = datetime.timedelta(days=1)
    merged_data['target_time01'] = merged_data['target_time'].apply(remove_timezone)
    merged_data['ref_time01'] = merged_data['ref_time'].apply(remove_timezone)
    merged_data['time_dif'] = merged_data['target_time01'] - merged_data['ref_time01']
    copied_data = merged_data.copy()
    merged_data = merged_data[ merged_data['time_dif'] == oneday ]
    print( "merged_data:\t", len(merged_data), "after filtering the time gap between reference-stock & target-stock > 1 business days" )
    print('model info:')
    if len(merged_data) == 0:
        # Nothing survived the filter; reading the first/last row would raise.
        print( "\tno rows left after filtering" )
    else:
        print( "\tref_time \t", merged_data['ref_time'].iloc[0], "\t", merged_data['ref_time'].iloc[-1] )
        print( "\ttarget_time \t", merged_data['target_time'].iloc[0], "\t", merged_data['target_time'].iloc[-1] )
    return merged_data, copied_data

def LR(x, y):
    """Fit ``y = k*x + b`` with ``scipy.stats.linregress`` and print a summary.

    Returns
    -------
    tuple
        ``(k, b, R, std_err, mymodel)``: slope, intercept, correlation
        coefficient, standard error of the slope, and the list of fitted
        values ``k*v + b`` for every ``v`` in `x`.
    """
    fit = stats.linregress(x, y)
    k, b, R, std_err = fit.slope, fit.intercept, fit.rvalue, fit.stderr  # R*R -> R2
    print( '\tlinear model: y = ', round(k, 4), '* x + ', round(b, 4), "\t R2:", round(R*R, 4), "\t std error:", round(std_err, 3) )
    mymodel = [k*v + b for v in x]
    assert len(y)==len(mymodel)
    print( "\tdata numbers (x & y): ", len(y), len(mymodel) )
    return k, b, R, std_err, mymodel

def lists_dif(y1, y2, col_name):
    """Return the element-wise percentage difference of `y1` relative to `y2`.

    Each value is ``(y1[i] - y2[i]) / y2[i] * 100`` for every position in
    `y1`. The result is a one-column DataFrame named `col_name`.
    """
    pct_gaps = [(y1[pos] - y2[pos])/y2[pos] * 100 for pos in range(len(y1))]
    return pd.DataFrame(pct_gaps, columns=[col_name])

def printing01(label, num):
    """Print `label`, a tab, `num` rounded to two decimals, and a percent sign."""
    rounded = round(num, 2)
    print( label, "\t", rounded, "%" )
    
def error_details(error_df):
    """Print summary statistics of the ``model_error`` column as percentages.

    Output order: max, mean, median, min, std, then the 10/25/50/75/90 %
    quantiles, each printed through ``printing01``.
    """
    errors = error_df['model_error']
    # Bound methods are called inside the loop so each statistic is computed
    # right before it is printed.
    aggregates = (
        ('max', errors.max),
        ('mean', errors.mean),
        ('median', errors.median),
        ('min', errors.min),
        ('std', errors.std),
    )
    for name, compute in aggregates:
        printing01('\tmodel_error ' + name, compute())
    quantiles = (('10%', 0.1), ('25%', 0.25), ('50%', 0.5), ('75%', 0.75), ('90%', 0.9))
    for name, fraction in quantiles:
        printing01('\tmodel_error ' + name, errors.quantile(fraction))

def move_line(old_list, offset_y=2):
    """Return a new list with every value lowered by `offset_y` percent of itself.

    A negative `offset_y` raises the values instead.
    """
    return [v-offset_y*v/100 for v in old_list]

def model_visualization(x, y, mymodel, label_name, error_df, pred_x, pred_y, show_or_not):
    """Print the model's error band and prediction, and optionally plot them.

    The 75 % quantile of ``error_df['model_error']`` (rounded to two decimals)
    and the prediction are always printed. When `show_or_not` is true, the
    data points, the fitted line, the line shifted up and down by that
    percentage, and the predicted point are also drawn with matplotlib.
    """
    if show_or_not:
        plt.subplots(figsize=(20, 10))
        plt.scatter(x, y, label=label_name)  # observed points
        plt.plot(x, mymodel, label=label_name+" LR")  # fitted line
    # error band around the fitted line
    for pct in [75]:
        band = round(error_df['model_error'].quantile( round(pct/100.0, 2) ), 2)
        print( "\terror_gap: ", band, "%" )
        if not show_or_not:
            continue
        plt.plot(x, move_line(mymodel,-band), label=f"{label_name} LR - safety -{pct}%")
        plt.plot(x, move_line(mymodel,+band), label=f"{label_name} LR - safety +{pct}%")
    # predicted point for the latest reference value
    print( "\t", label_name+" ref: ", pred_x, "\t", label_name+" target predicted: ", round(pred_y, 2) )
    if show_or_not:
        plt.scatter([pred_x], [pred_y], label="prediction", color ="red")
        plt.legend()
        plt.show()

def list_str(tmp_list):
    """Join the string form of every item, each followed by a tab (also the last)."""
    return "".join(str(item) + "\t" for item in tmp_list)

def recording(every_line, file_path):
    """Write `every_line` followed by a newline to `file_path` as UTF-8.

    The file is opened in ``"w"`` mode, so any existing content is replaced.
    The ``with`` block closes the handle even when the write raises.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(every_line + "\n")
    return

In [56]:
# One tuple per stock pair: (reference ticker, target ticker, date string, ratio).
# The modelling loop unpacks each tuple as reference_stock, target_stock, st, ratio.
# NOTE(review): presumably the ratio is a share-conversion factor times a
# USD/HKD rate of 7.8 - TODO confirm; nothing visible here reads it.
stocks_info = [
    ('BABA', '9988.HK', '2022-11-01', 1),
    ('BIDU', '9888.HK', '2022-10-25', 1),
    ('JD', '9618.HK', '2022-10-24', 0.5 * 7.8),
    ('MPNGY', '3690.HK', '2022-10-24', 0.5 * 7.8),
    ('NTES', '9999.HK', '2022-10-26', 0.2 * 7.8),
    ('LI', '2015.HK', '2022-10-26', 0.5 * 7.8),
    ('NIO', '9866.HK', '2022-10-26', 1 * 7.8),
    ('ZH', '2390.HK', '2022-10-26', 4 * 7.8)
]


# Fit one linear model per price field (open/high/low/close) for every stock pair.
# MODELS_parameters maps "<reference>-<target>" to {field: [k, b, R*R]}.
MODELS_parameters = {}
for it in stocks_info[:]:
        # get market data of target-stock & reference-stock
    reference_stock, target_stock, st, ratio = it
    # NOTE(review): the `st` unpacked from the tuple is overwritten on the next
    # line, so the per-stock date in stocks_info has no effect here; `ratio` is
    # not used in this loop either - confirm whether that is intended.
    st, et = "2020-11-01", "2022-11-01"
    print("time range:\t", st, "-", et)
    df_data1 = get_df_data(ticker_name=target_stock, start_time=st, end_time=et)
    print("target stock:\t", target_stock, "\t", len(df_data1))
    df_data2 = get_df_data(ticker_name=reference_stock, start_time=st, end_time=et)
    print("reference stock:\t", reference_stock, "\t", len(df_data2))
        # merge reference-stock & target-stock: target stock (datetime, open, high, low, close), 1-day previous reference stock (datetime, open, high, low, close) 
    merged_data = merge_stocks(df_data1, df_data2)
        # keep only rows where target_time - ref_time is exactly one day
        # (copied_data, the unfiltered copy, is not used in this loop)
    merged_data, copied_data = filter_by_time(merged_data)
        # modelling: Linear regression to fit data of target-stock & reference-stock
    model_parameters = {} 
    for label_name in ["open", "high", "low", "close"]:
        print( label_name )
            # modelling: x = reference price, y = target price of the same field
        x, y = list(merged_data['ref_' + label_name]), list(merged_data['target_'+label_name])
        k, b, R, std_err, mymodel = LR(x, y)
            # model_error: percentage gap between actual and fitted target price
        error_df = lists_dif(y, mymodel, 'model_error')
        #error_details(error_df)
            # record model parameters: slope, intercept, R squared
        model_parameters[label_name] = [k, b, R*R] 
            # predict from the last downloaded reference-stock row
        ref_it = df_data2.iloc[-1]
        ref_dic = {"high":ref_it['High'], "low":ref_it['Low'], "open":ref_it['Open'], "close":ref_it['Close']}
        pred_x =  round(ref_dic[label_name], 2)
        pred_y = k*pred_x+b
            # model visualization (last argument False: print only, no plot)
        model_visualization(x, y, mymodel, label_name, error_df, pred_x, pred_y, False)
    MODELS_parameters[reference_stock +"-"+ target_stock] = model_parameters

# save model parameters
# One tab-separated line per (stock pair, price field):
# "<reference>-<target>", field name, then the values k, b, R*R.
content = ""
for k in list(MODELS_parameters.keys()):
    stock_code = k
    dic = MODELS_parameters[stock_code]
    for label_name in ["open", "high", "low", "close"]:
        st = stock_code + "\t" + label_name + "\t" + list_str(dic[label_name])
        print(st)
        content = content + st + "\n"
# NOTE(review): absolute, machine-specific path - the write fails wherever this
# directory does not exist. recording() opens the file in "w" mode (replacing
# any earlier content) and appends one more newline after `content`.
file_path = "C:/Users/Admin/Desktop/stocks_analyze_predict/model_parameters.txt"
recording(content, file_path)

time range:	 2020-11-01 - 2022-11-01
[*********************100%***********************]  1 of 1 completed
target stock:	 9988.HK 	 494
[*********************100%***********************]  1 of 1 completed
reference stock:	 BABA 	 503
merged_data:	 384 after filtering the time gap between reference-stock & target-stock > 1 business days
model info:
	ref_time 	 2020-11-02 00:00:00 	 2022-10-27 00:00:00
	target_time 	 2020-11-03 00:00:00 	 2022-10-28 00:00:00
open
	linear model: y =  0.9612 * x +  2.1832 	 R2: 0.9966 	 std error: 0.003
	data numbers (x & y):  384 384
	error_gap:  1.22 %
	 open ref:  63.49 	 open target predicted:  63.21
high
	linear model: y =  0.9664 * x +  1.2558 	 R2: 0.9969 	 std error: 0.003
	data numbers (x & y):  384 384
	error_gap:  0.9 %
	 high ref:  64.85 	 high target predicted:  63.92
low
	linear model: y =  0.9625 * x +  2.1659 	 R2: 0.9977 	 std error: 0.002
	data numbers (x & y):  384 384
	error_gap:  0.89 %
	 low ref:  63.22 	 low target predicted:  63.02
c