In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
#!pip install yfinance
import yfinance as yf # https://pypi.org/project/yfinance/
import math
import random
import seaborn as sns
import datetime
import pandas as pd
#!pip install sklearn
from sklearn.linear_model import LinearRegression
from scipy import stats
import warnings
warnings.filterwarnings(action='ignore')
from bs4 import BeautifulSoup
import requests
import json
import time
import re

In [191]:
def get_realtime_info(stock_code, timeout=10):
    """Fetch the latest quote for ``stock_code`` from the CitiFirst real-time endpoint.

    The response body is passed through BeautifulSoup, the text from the first
    ``{`` to the first ``}`` is cut out, comma-separated fragments containing
    ``<`` are discarded, and what remains is parsed as JSON.

    Parameters
    ----------
    stock_code : str
        Appended verbatim to the endpoint URL.
    timeout : float, optional
        Seconds to wait for the HTTP request before ``requests`` raises.
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    dict
        Keys ``Open``, ``High``, ``Low``, ``Close``, ``previous_Close``,
        ``turnover`` and ``date_time``, copied from the payload fields
        ``open``, ``high``, ``low``, ``last``, ``lastc``, ``turnover`` and
        ``stimeNoformat``. Values are returned exactly as parsed
        (presumably strings, since callers convert them with ``float`` -- TODO confirm).

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when ``timeout`` elapses.
    json.JSONDecodeError
        When the extracted text is not valid JSON.
    KeyError
        When an expected field is missing from the payload.
    """
    URL_link = "https://www.citifirst.com.hk/en/data/json/json_realtimedata/code/" + stock_code
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}
    response = requests.get(URL_link, headers=headers, timeout=timeout)

    # Stringify the parsed document once instead of once per lookup.
    page_text = str(BeautifulSoup(response.content, 'html.parser'))
    start_index = page_text.find("{")
    end_index = page_text.find("}")
    payload = page_text[start_index:end_index + 1]

    # Drop fragments that still carry markup, then re-join the rest.
    json_str = ",".join(part for part in payload.split(",") if "<" not in part)
    dic = json.loads(json_str)

    return {
        'Open': dic['open'],
        'High': dic['high'],
        'Low': dic['low'],
        'Close': dic['last'],
        'previous_Close': dic['lastc'],
        'turnover': dic['turnover'],
        'date_time': dic['stimeNoformat'],
    }

global_datasets = {}
def local_stored_data(ticker_name="AAPL", start_time="2021-01-01", end_time="2023-12-31"):
    """Return cached price history for ``ticker_name`` restricted to a date window.

    The first request for a ticker downloads it with ``yf.download`` and stores
    a copy in the module-level ``global_datasets`` dict; later requests are
    served from that copy.

    Parameters
    ----------
    ticker_name : str
        Cache key, and the ticker passed to ``yf.download`` on a cache miss.
    start_time, end_time : str
        Window bounds in ``YYYY-MM-DD`` form.

    Returns
    -------
    pandas.DataFrame
        Rows from the last cached date on or before ``start_time`` (or the first
        row when there is none) through the last cached date on or before
        ``end_time``, both inclusive.

    Raises
    ------
    ValueError
        If either bound is not in ``YYYY-MM-DD`` form.

    Notes
    -----
    NOTE(review): the cache is keyed by ticker only, so it keeps whatever range
    the first download covered; a later call with a wider window is served from
    that narrower data.
    NOTE(review): when no cached date is on or before ``end_time`` the slice
    still runs to the last row (unchanged from the previous behaviour) -- confirm
    whether an empty frame would be preferable.
    """
    date01 = datetime.datetime.strptime(start_time, "%Y-%m-%d")
    date02 = datetime.datetime.strptime(end_time, "%Y-%m-%d")
    if ticker_name not in global_datasets:
        df_data = yf.download(tickers=ticker_name, start=start_time, end=end_time)
        global_datasets[ticker_name] = df_data.copy()
    df_data = global_datasets[ticker_name].copy()
    idx_dates = df_data.index

    # Positions of every row dated on or before each bound. The previous
    # hand-written loop stopped one row early, so the newest row could never be
    # selected as the end of the window.
    upto_start = np.flatnonzero(idx_dates <= date01)
    upto_end = np.flatnonzero(idx_dates <= date02)
    j = int(upto_start[-1]) if len(upto_start) else 0
    k = int(upto_end[-1]) if len(upto_end) else len(idx_dates) - 1
    return df_data.iloc[j:k + 1]

# get data by ticker-name, start-time & end-time
def get_df_data(ticker_name="AAPL", start_time="2022-01-01", end_time="2023-12-31", real_time=True, local_data=False):
    """Load daily price history for a ticker and add basic derived columns.

    Parameters
    ----------
    ticker_name : str
        Ticker passed to ``yf.download`` / ``local_stored_data`` and, when
        ``real_time`` is true, to ``get_realtime_info``.
    start_time, end_time : str
        Date bounds in ``YYYY-MM-DD`` form.
    real_time : bool
        When true, the Open/High/Low/Close of the last row are overwritten with
        the quote returned by ``get_realtime_info``.
    local_data : bool
        When true, read through the ``local_stored_data`` cache instead of
        downloading.

    Returns
    -------
    pandas.DataFrame
        The price history plus ``previous_Close``, ``daily_return``,
        ``Volume_log`` (log2 of ``Volume``), ``MA1`` (5-row mean of ``Close``)
        and ``MA2`` (20-row mean of ``Close``).

    Notes
    -----
    NOTE(review): the real-time quote always replaces the last historical row,
    whatever its date -- confirm that row is the current session.
    """
    if local_data:
        df_data = local_stored_data(ticker_name, start_time, end_time)
    else:
        df_data = yf.download(tickers=ticker_name, start=start_time, end=end_time)
    # Work on an independent copy so the writes below never land on a slice of
    # another frame.
    df_data = df_data.copy()

    if real_time:
        stock_info = get_realtime_info(ticker_name)
        current_price = stock_info['Close']
        last_idx = df_data.index[-1]
        try:
            quote = {
                "Open": float(stock_info['Open']),
                "High": float(stock_info['High']),
                "Low": float(stock_info['Low']),
                "Close": float(current_price),
            }
        except (TypeError, ValueError):
            # A field could not be converted to a number: show the raw payload
            # and fall back to the last traded price for all four fields.
            print(stock_info)
            fallback_price = float(current_price)
            quote = dict.fromkeys(("Open", "High", "Low", "Close"), fallback_price)
        for column, value in quote.items():
            df_data.at[last_idx, column] = value
    elif ".HK" in ticker_name:
        print("data may late for 15 minutes")

    # basic calculations such as: daily return, the log of Volume, Moving Average
    df_data['previous_Close'] = df_data['Close'].shift(1)
    df_data['daily_return'] = (df_data['Close'] - df_data['previous_Close']) / df_data['previous_Close']
    df_data['Volume_log'] = np.log2(df_data['Volume'])  # -inf where Volume is 0
    MA1, MA2 = 5, 20
    df_data['MA1'] = df_data['Close'].rolling(MA1).mean()
    df_data['MA2'] = df_data['Close'].rolling(MA2).mean()
    return df_data

# in dataframe, search the info of certain previous rows by index
def search_info_by_index(df_data, tmp_idx, num=5):
    """Return a copy of the rows immediately preceding index label ``tmp_idx``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame to search.
    tmp_idx : object
        Index label to locate; the first matching position is used.
    num : int, optional
        Maximum number of preceding rows to return.

    Returns
    -------
    pandas.DataFrame or None
        Up to ``num`` rows located directly before ``tmp_idx`` (fewer near the
        start of the frame, none when ``tmp_idx`` is the first row), or ``None``
        when the label does not occur in the index.
    """
    matches = np.flatnonzero(df_data.index == tmp_idx)
    if len(matches) == 0:
        return None
    position = int(matches[0])
    # Slice by position: a label-based slice ending at index[position - 1]
    # wraps around to the last row when position is 0 and would return the
    # whole frame.
    first = max(0, position - num)
    return df_data.iloc[first:position].copy()

# extract info from dataframe such as: rise / fall ratio, avg, std of Open, High, Low, Close, Volume
def get_info_from_df(sub_df):
    """Summarise a window of rows as a flat list of twelve statistics.

    The result is ``[rise_N, fall_N, avg_open, std_open, avg_high, std_high,
    avg_low, std_low, avg_close, std_close, avg_volume, std_volume]``:
    the number of rows with a positive / negative ``daily_return``, followed by
    the mean and standard deviation of ``Open``, ``High``, ``Low``, ``Close``
    and ``Volume_log``.
    """
    returns = sub_df['daily_return']
    summary = [int((returns > 0).sum()), int((returns < 0).sum())]
    for column in ('Open', 'High', 'Low', 'Close', 'Volume_log'):
        series = sub_df[column]
        summary.append(series.mean())
        summary.append(series.std())
    return summary

# create features
def create_features(df_data, row_offset=20, num=5):
    """Add trailing-window statistics to ``df_data`` and return the usable rows.

    For every row from ``row_offset`` onward, the up-to-``num`` rows directly
    before it are summarised with ``get_info_from_df`` and the twelve results
    are stored in the columns ``rise_N``, ``fall_N``, ``avg_open``,
    ``std_open``, ``avg_high``, ``std_high``, ``avg_low``, ``std_low``,
    ``avg_close``, ``std_close``, ``avg_volume`` and ``std_volume``.
    Rows before ``row_offset`` keep 0 in those columns.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame providing the columns read by ``get_info_from_df``. It is
        modified in place: the twelve feature columns are added or overwritten.
    row_offset : int, optional
        First row position that receives features; earlier rows are left out of
        the returned frame.
    num : int, optional
        Number of preceding rows in each window.

    Returns
    -------
    pandas.DataFrame
        ``df_data[row_offset:]``.
    """
    feature_names = [
        'rise_N', 'fall_N',
        'avg_open', 'std_open', 'avg_high', 'std_high',
        'avg_low', 'std_low', 'avg_close', 'std_close',
        'avg_volume', 'std_volume',
    ]
    n_rows = len(df_data)
    # Collect everything in one float array and assign whole columns at the
    # end: writing float statistics cell by cell into int-initialised columns
    # relies on an implicit upcast that pandas has deprecated.
    values = np.zeros((n_rows, len(feature_names)), dtype=float)
    for i in range(max(0, row_offset), n_rows):
        # Positional slice of the preceding rows; avoids a label scan per row
        # and yields an empty window (not the whole frame) for the first row.
        window = df_data.iloc[max(0, i - num):i]
        values[i, :] = get_info_from_df(window)

    for position, name in enumerate(feature_names):
        column = values[:, position]
        # The two counters stay integer-typed, as before.
        df_data[name] = column.astype(int) if name in ('rise_N', 'fall_N') else column

    return df_data[row_offset:]

# LR model
from sklearn.linear_model import LinearRegression
def train_model(train_X, train_y, printing=True):
    """Fit a ``LinearRegression`` on the given data and return the fitted model.

    When ``printing`` is true, the score on the training data, the intercept
    and the coefficients are printed.
    """
    model = LinearRegression()
    model.fit(train_X, train_y)

    r_sq = model.score(train_X, train_y)
    if not printing:
        return model
    print(f"coefficient of determination: {r_sq}")
    print(f"intercept: {model.intercept_}\tslope: {model.coef_}")
    return model

def error_analyze(train_y, y_pred):
    """Tabulate real vs. predicted values with their absolute percentage error.

    Returns a DataFrame with columns ``y_real``, ``y_pred`` and ``dif``, where
    ``dif`` is ``abs((y_real - y_pred) / y_real * 100)``.
    """
    return pd.DataFrame(columns=['y_real', 'y_pred']).assign(
        y_real=train_y,
        y_pred=y_pred,
        dif=lambda table: ((table['y_real'] - table['y_pred']) / table['y_real'] * 100).abs(),
    )

def basic_info(df, col='dif'):
    """Print max, min, median, mean, std and the 10/25/50/75/90% quantiles of ``df[col]``."""
    column = df[col]
    reducers = (
        ("max:\t", column.max),
        ("min:\t", column.min),
        ("median:\t", column.median),
        ("mean:\t", column.mean),
        ("std:\t", column.std),
    )
    for caption, reducer in reducers:
        print(caption, reducer())

    quantile_levels = (
        ("10%:\t", 0.10),
        ("25%:\t", 0.25),
        ("50%:\t", 0.50),
        ("75%:\t", 0.75),
        ("90%:\t", 0.90),
    )
    for caption, level in quantile_levels:
        print(caption, column.quantile(level))
    return

def predict_current_day(df_data, features, label="Close", train_window=120):
    """Fit a linear model for the following row's ``label`` and score it on the newest complete row.

    Each sample pairs one row's feature values (including ``next_Open``, the
    following row's ``Open``) with the following row's ``label``. Rows that
    contain NaN or +/-inf are discarded. The newest remaining row is held out as
    the test sample; the up-to-``train_window`` rows before it are used for
    training.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Feature frame (for example the output of ``create_features``). It is
        not modified; the shifted columns are added to a copy.
    features : sequence of str
        Names of the feature columns. When empty or ``None`` the legacy
        positional layout (first four columns plus columns 8 up to, but not
        including, the target) is used.
    label : str, optional
        Column whose next-row value is predicted.
    train_window : int, optional
        Maximum number of training rows.

    Returns
    -------
    tuple
        ``(error_df, test_y_pred, test_y_real, test_date)``: the in-sample error
        table from ``error_analyze`` with an added ``date-time`` column, the
        prediction for the held-out row, its real target value, and the index
        label of the held-out row.

    Raises
    ------
    ValueError
        If fewer than two usable rows remain or ``train_window`` is below 1.
    """
    target_col = 'next_' + label
    work = df_data.copy()
    work['next_Open'] = work['Open'].shift(-1)
    work[target_col] = work[label].shift(-1)
    # Rows with inf (e.g. log2 of a zero Volume) or NaN cannot be fitted.
    model_data = work.replace([np.inf, -np.inf], np.nan).dropna()

    n_rows = len(model_data)
    if n_rows < 2 or train_window < 1:
        raise ValueError("need at least two complete rows and train_window >= 1 to train and test")

    if features:
        # Select by name so the result does not depend on column order; the
        # target itself is never allowed among the inputs.
        feature_cols = [name for name in features if name != target_col]
    else:
        columns = list(model_data.columns)
        feature_cols = columns[:4] + columns[8:-1]

    # training data: the rows just before the held-out newest row
    train = model_data.iloc[max(0, n_rows - 1 - train_window):n_rows - 1]
    X_data = train[feature_cols].values.tolist()
    y_data = train[target_col].tolist()
    date_data = list(train.index)
    print("training data:", date_data[0], date_data[-1], sep="\t")

    # testing data: the newest complete row
    test_row = model_data.iloc[-1]
    X_test = [test_row[feature_cols].tolist()]
    test_y_real = test_row[target_col]
    test_date = model_data.index[-1]
    print("testing data:", test_date, sep="\t")

    model = train_model(X_data, y_data)
    y_pred = model.predict(X_data)  # error analysis
    error_df = error_analyze(y_data, y_pred)
    error_df['date-time'] = date_data

    test_y_pred = model.predict(X_test)[0]  # predict current day
    return error_df, test_y_pred, test_y_real, test_date

def predict_next_day(df_data, features, label="Close", train_window=120):
    """Fit a linear model for the following row's ``label`` and apply it to the last row.

    Each training sample pairs one row's feature values with the following
    row's ``label``. Rows that contain NaN or +/-inf are discarded before
    training. The prediction is made from the last row of ``df_data`` itself,
    whose own target is not known yet.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Feature frame (for example the output of ``create_features``). It is
        not modified; the shifted column is added to a copy.
    features : sequence of str
        Names of the feature columns. When empty or ``None`` the legacy
        positional layout (first four columns plus columns 8 up to, but not
        including, the target) is used.
    label : str, optional
        Column whose next-row value is predicted.
    train_window : int, optional
        Maximum number of training rows.

    Returns
    -------
    tuple
        ``(error_df, test_y_pred, test_y_real)``: the in-sample error table from
        ``error_analyze`` with an added ``date-time`` column, the prediction for
        the row after the last one, and the last row's target value (NaN when
        the last row has no successor).

    Raises
    ------
    ValueError
        If fewer than two usable rows remain or ``train_window`` is below 1.

    Notes
    -----
    NOTE(review): the newest complete row is left out of training, mirroring
    ``predict_current_day`` where it is the test sample -- confirm that this is
    intended here, where nothing is held out.
    """
    target_col = 'next_' + label
    work = df_data.copy()
    work[target_col] = work[label].shift(-1)
    # Rows with inf (e.g. log2 of a zero Volume) or NaN cannot be fitted.
    model_data = work.replace([np.inf, -np.inf], np.nan).dropna()

    n_rows = len(model_data)
    if n_rows < 2 or train_window < 1:
        raise ValueError("need at least two complete rows and train_window >= 1 to train")

    if features:
        # Select by name so the result does not depend on column order; the
        # target itself is never allowed among the inputs.
        feature_cols = [name for name in features if name != target_col]
    else:
        columns = list(work.columns)
        feature_cols = columns[:4] + columns[8:-1]

    # training data
    train = model_data.iloc[max(0, n_rows - 1 - train_window):n_rows - 1]
    X_data = train[feature_cols].values.tolist()
    y_data = train[target_col].tolist()
    date_data = list(train.index)

    # The newest row, taken from the unfiltered frame: its target is still
    # unknown, so it was dropped from model_data above.
    last_row = work.iloc[-1]
    X_test = [last_row[feature_cols].tolist()]
    test_y_real = last_row[target_col]

    model = train_model(X_data, y_data)
    y_pred = model.predict(X_data)  # error analysis
    error_df = error_analyze(y_data, y_pred)
    error_df['date-time'] = date_data

    test_y_pred = model.predict(X_test)[0]  # predict next day
    return error_df, test_y_pred, test_y_real

def business_dates(ticker_name='9988.HK', start_time="2022-01-01", end_time="2023-12-31"):
    """List the dates for which ``get_df_data`` returns a row, as ``YYYY-MM-DD`` strings.

    Parameters
    ----------
    ticker_name : str, optional
        Ticker whose price history supplies the dates.
    start_time, end_time : str, optional
        Range requested from ``get_df_data``.

    Returns
    -------
    list of str
        The first ten characters of each index label, in index order.
    """
    # real_time=False: only the dates matter, no live quote is needed.
    tmp_df = get_df_data(ticker_name, start_time, end_time, False)
    return [str(stamp)[:10] for stamp in tmp_df.index]

def draw_pred_real01(pred_list, real_list, x_dates):
    """Plot predicted and real values as two lines, one x position per entry of ``x_dates``."""
    tick_positions = list(range(0, len(x_dates)))

    plt.figure(figsize=(20, 6))
    for series, caption in ((pred_list, "pred"), (real_list, "real")):
        plt.plot(tick_positions, series, label=caption)

    plt.xlabel('times')
    plt.ylabel('real / predicted-value')
    plt.title('predicted vs real')
    # Label each position with its date, rotated to keep them readable.
    plt.xticks(tick_positions, x_dates, rotation=90)
    plt.legend()
    plt.grid(True)
    plt.show()
    return

def draw_pred_real02(dif_list, x_dates):
    """Draw ``dif_list`` as a bar chart, one bar per entry of ``x_dates``."""
    tick_positions = list(range(0, len(x_dates)))

    plt.figure(figsize=(20, 6))
    plt.bar(tick_positions, dif_list, label="pred")

    plt.xlabel('times')
    plt.ylabel('real-predicted dif %')
    plt.title('predicted vs real')
    # Label each bar with its date, rotated to keep them readable.
    plt.xticks(tick_positions, x_dates, rotation=90)
    plt.legend()
    plt.grid(True)
    plt.show()
    return

In [149]:
# Model inputs, in the order they are fed to the regression: raw prices,
# basic indicators, trailing-window statistics and the following row's open.
features = [
    'Open', 'High', 'Low', 'Close', 'Volume_log', 'MA1', 'MA2',
    'rise_N', 'fall_N', 'avg_open', 'std_open', 'avg_high', 'std_high',
    'avg_low', 'std_low', 'avg_close', 'std_close', 'avg_volume', 'std_volume',
    'next_Open',
]
label = "Close"

# Date range requested from the data source.
st, et = "2022-01-01", "2023-08-31"

# (display name, ticker, multiplier). The multiplier is not used by the cells
# in this notebook; presumably a share ratio times an exchange rate -- TODO confirm.
stocks_info = [
    #('HSI', '^HSI', 1),
    ('BABA', '9988.HK', 1),
    ('BIDU', '9888.HK', 1),
    ('JD', '9618.HK', 0.5 * 7.8),
    ('MPNGY', '3690.HK', 0.5 * 7.8),
    ('NTES', '9999.HK', 0.2 * 7.8),
    ('LI', '2015.HK', 0.5 * 7.8),
    ('XPEV', '9868.HK', 0.5 * 7.8),
    ('BILI', '9626.HK', 1 * 7.8),
    ('TCOM', '9961.HK', 1 * 7.8),
    ('YUMC', '9987.HK', 1 * 7.8),
    ('EDU', '9901.HK', 0.1 * 7.8),
    ('NIO', '9866.HK', 1 * 7.8),
    ('ZTO', '2057.HK', 1 * 7.8),
    ('BEKE', '2423.HK', 0.5 * 7.8),
    ('ZH', '2390.HK', 3 * 7.8),
    ('WB', '9898.HK', 1 * 7.8),
    ('MNSO', '9896.HK', 0.5 * 7.8),
    ('ZLAB', '9688.HK', 0.5 * 7.8),
    ('TENCENT', '0700.HK', 1 * 7.8),
    ('TME', '1698.HK', 1 * 7.8),
    ('SMIC', '0981.HK', 1 * 7.8),
    ('SenseTime', '0020.HK', 1 * 7.8),
    ('Kuaishou', '1024.HK', 1 * 7.8),
    ('Xiaomi', '1810.HK', 1 * 7.8),
    ('CMB', '3968.HK', 1 * 7.8),
]

In [103]:
### current-day prediction
# For every listed stock: load history with the live quote merged in, then fit
# one model per target and collect the prediction with its in-sample errors.
# pred_data rows are [name, code] followed by (prediction, median error,
# mean error) for each target in pred_feature order.

pred_feature = ['Close', 'High', 'Low']
pred_real_error_dic = {}
pred_data = []
cur_features = features
for it in stocks_info[:]:
    stock_name, stock_code, _ = it
    test_df = get_df_data(stock_code, st, et, True)

    # The engineered features do not depend on the target, so build them once
    # per stock instead of once per target.
    feature_df = create_features(test_df)

    ## add info
    each_stock_data = [stock_name, stock_code]

    for label in pred_feature:
        cur_label = label
        # Each fit gets its own copy because predict_current_day may add
        # columns to the frame it receives.
        error_df, test_y_pred, test_y_real, test_date = predict_current_day(feature_df.copy(), cur_features, cur_label)
        #basic_info(error_df)
        med_err, mean_err = error_df['dif'].median(), error_df['dif'].mean()
        print( label, test_y_pred, test_y_real, med_err, mean_err )

        ## add info
        each_stock_data.extend([test_y_pred, med_err, mean_err])

        new_name = stock_name + "_" + stock_code + "_" + label
        pred_real_error_dic[ new_name ] = error_df

    pred_data.append( each_stock_data )

[*********************100%***********************]  1 of 1 completed
coefficient of determination: 0.961259749666917
intercept: 16.020619862453245	slope: [ 0.15866482 -0.02459616  0.04172442 -0.28332239  0.15105011  0.53153624
 -0.06847185 -0.10009064  0.10009064  1.51294282  0.37336163  0.10223826
 -0.51913378 -1.51322985 -0.0572278  -0.46523388 -0.11173086 -0.54031245
 -0.40844456  0.9229966 ]
Close 93.35966455303213 92.55 0.8697648375000031 1.0798136360524813
coefficient of determination: 0.9763526020724645
intercept: 6.195892047852894	slope: [ 0.13869388 -0.06044989  0.03691754 -0.2344615   0.30360678  0.27205219
 -0.0098329  -0.02360831  0.02360831  1.0381643  -0.06679762 -0.09313124
 -0.06303181 -1.22360788  0.09517817  0.13589053 -0.07960327 -0.48385928
 -0.56723951  0.98567062]
High 94.68812861418449 93.8 0.6801133532158838 0.8524718425133808
coefficient of determination: 0.9863031073554176
intercept: 9.420762764885978	slope: [ 0.28816764 -0.13373434 -0.37733142  0.14393462 -0.

In [206]:
# Put each stored current-day prediction next to a fresh real-time quote.
stock_data = []
for it in pred_data:
    ## predicted info: name, code, then (prediction, median error, mean error)
    ## for Close, High and Low
    stock_name, stock_code = it[0], it[1]
    close_pred, close_med_err, close_mean_err = it[2], it[3], it[4]
    high_pred, high_med_err, high_mean_err = it[5], it[6], it[7]
    low_pred, low_med_err, low_mean_err = it[8], it[9], it[10]

    ## real info
    stock_info = get_realtime_info(stock_code)
    close_real, high_real, low_real = stock_info['Close'], stock_info['High'], stock_info['Low']
    open_real, prev_close = stock_info['Open'], stock_info['previous_Close']
    turnover, update_time = stock_info['turnover'], stock_info['date_time']

    new_name = stock_name + "_" + stock_code
    row_data = [
        new_name, prev_close, open_real,
        close_pred, close_real, close_mean_err,
        high_pred, high_real, high_mean_err,
        low_pred, low_real, low_mean_err,
        turnover, update_time
    ]
    stock_data.append( row_data )

###
col_names = [
    "stock-name", "previous_close", "cur_open",
    "close_pred", "close_real", "close_err_range%", #"cw_close_er2",
    "high_pred", "high_real", "high_err_range%", #"cw_high_er2",
    "low_pred", "low_real", "low_err_range%", #"cw_low_er2",
    "turnover", "update_time"
]
# Every listed column is rounded to two decimals for display.
round_dic = dict.fromkeys(
    ['close_pred', 'high_pred', 'low_pred', 'close_err_range%', 'high_err_range%', 'low_err_range%', 'daily_return%'],
    2,
)
stock_df = pd.DataFrame( stock_data, columns=col_names )
# Only these two quote fields are converted to numbers; they feed the change columns.
stock_df = stock_df.astype({'previous_close': float, 'close_real': float})
stock_df['price_change'] = stock_df['close_real'] - stock_df['previous_close']
stock_df['daily_return%'] = (stock_df['close_real'] - stock_df['previous_close'])/stock_df['previous_close']*100

show_cols = [
    "stock-name", "previous_close", "cur_open",
    "close_pred", "close_real", "close_err_range%", "price_change", "daily_return%",
    "high_pred", "high_real",
    "low_pred", "low_real",
    "turnover", "update_time"
]


stock_df.round(round_dic)[ show_cols ]

Unnamed: 0,stock-name,previous_close,cur_open,close_pred,close_real,close_err_range%,price_change,daily_return%,high_pred,high_real,low_pred,low_real,turnover,update_time
0,BABA_9988.HK,94.1,93.8,93.36,94.3,1.08,0.2,0.21,94.69,94.7,92.55,92.35,3.24B,"2023-08-10, 16:08"
1,BIDU_9888.HK,141.0,139.0,138.52,140.6,1.44,-0.4,-0.28,140.78,141.3,136.74,138.8,482.19M,"2023-08-10, 16:08"
2,JD_9618.HK,149.1,147.4,146.71,149.1,1.47,0.0,0.0,149.33,149.7,144.47,145.4,801.34M,"2023-08-10, 16:08"
3,MPNGY_3690.HK,140.9,139.4,138.23,141.0,1.52,0.1,0.07,140.49,142.0,137.36,137.6,1.75B,"2023-08-10, 16:08"
4,NTES_9999.HK,168.7,167.2,167.07,170.3,1.18,1.6,0.95,170.09,171.2,165.14,166.2,762.24M,"2023-08-10, 16:08"
5,LI_2015.HK,167.7,164.0,160.18,171.8,1.63,4.1,2.44,166.28,172.1,159.26,164.0,1.40B,"2023-08-10, 16:08"
6,XPEV_9868.HK,69.1,66.75,60.98,67.9,2.63,-1.2,-1.74,64.72,68.25,62.06,65.85,906.15M,"2023-08-10, 16:08"
7,BILI_9626.HK,136.9,135.3,137.13,135.6,1.76,-1.3,-0.95,138.66,136.8,132.29,132.0,326.84M,"2023-08-10, 16:08"
8,TCOM_9961.HK,317.8,319.8,320.96,326.4,1.31,8.6,2.71,324.45,330.2,317.03,319.4,700.99M,"2023-08-10, 16:08"
9,YUMC_9987.HK,446.4,442.0,443.43,444.2,0.92,-2.2,-0.49,449.66,446.6,439.75,441.2,69.01M,"2023-08-10, 16:08"


In [199]:
### testing for current-day prediction
# Re-run the current-day Close model once per recent end date and compare each
# prediction with the value that was actually observed.
working_days = business_dates()
st, et = "2021-01-01", "2023-08-02"
stock_code = '9988.HK'

overall_evaluation = []
for tmp_et in working_days[-10:-1]:
    #print(tmp_et)

    test_df = get_df_data(stock_code, st, tmp_et, False, False)
    # Use `features` directly: `cur_features` only exists after the
    # current-day prediction cell has run, and is the same list.
    error_df, test_y_pred, test_y_real, test_date = predict_current_day( create_features(test_df) , features, 'Close')
    err_med, err_mean = round(error_df['dif'].median(), 2), round(error_df['dif'].mean(), 2)
    test_y_pred = round(test_y_pred, 2)
    test_y_real = round(test_y_real, 2)
    # Signed percentage error of the rounded prediction against the rounded real value.
    err_real = round((test_y_pred-test_y_real)/test_y_real*100, 2)
    test_date = str(test_date)[:10]
    train_date1, train_date2 = str(error_df.iloc[0]['date-time'])[:10], str(error_df.iloc[-1]['date-time'])[:10]
    #print()
    row_info = [tmp_et, test_date, train_date1, train_date2, test_y_pred, test_y_real, err_real, err_med, err_mean]
    overall_evaluation.append( row_info )

col_names = ['data-date', 'test-date', 'train-date-1', 'train-date-2',
    'pred-close', 'real-close', 'err-real',
    'err-range-med', 'err-range-mean'
    ]
evaluation_df = pd.DataFrame( overall_evaluation, columns=col_names )
evaluation_df['err-real-abs'] = evaluation_df['err-real'].abs()
basic_info(evaluation_df, 'err-real-abs')

[*********************100%***********************]  1 of 1 completed
data may late for 15 minutes
[*********************100%***********************]  1 of 1 completed
data may late for 15 minutes
training data:	2023-01-30 00:00:00	2023-07-25 00:00:00
testing data:	2023-07-26 00:00:00
coefficient of determination: 0.971019556357957
intercept: 10.757011930957063	slope: [ 0.32554505 -0.11418905 -0.14020005 -0.14019595  0.03271243  0.45029756
 -0.01733476 -0.04396522  0.04396522  1.26331258  0.4450555  -0.06543305
 -0.12073685 -1.11366652 -0.16682539 -0.40933418 -0.42680183 -0.29091646
 -0.21712374  0.90236493]
[*********************100%***********************]  1 of 1 completed
data may late for 15 minutes
training data:	2023-01-31 00:00:00	2023-07-26 00:00:00
testing data:	2023-07-27 00:00:00
coefficient of determination: 0.9704979820409654
intercept: 2.348670390846067	slope: [ 0.36402186 -0.04791785 -0.19243135 -0.21896421  0.14017298  0.4417885
 -0.02575031 -0.06959381  0.06959381  1.1

In [200]:
# Print summary statistics of the absolute percentage error collected in
# evaluation_df, then display the per-run table itself.
basic_info(evaluation_df, 'err-real-abs')
#draw_pred_real01(evaluation_df['pred-close'], evaluation_df['real-close'], evaluation_df['test-date'])
#draw_pred_real02(evaluation_df['err-real-abs'], evaluation_df['test-date'])
evaluation_df

max:	 3.6
min:	 0.04
median:	 1.17
mean:	 1.4777777777777776
std:	 1.111854057169575
10%:	 0.33599999999999997
25%:	 0.93
50%:	 1.17
75%:	 2.05
90%:	 2.7920000000000003


Unnamed: 0,data-date,test-date,train-date-1,train-date-2,pred-close,real-close,err-real,err-range-med,err-range-mean,err-real-abs
0,2023-07-28,2023-07-26,2023-01-30,2023-07-25,93.49,95.45,-2.05,0.93,1.11,2.05
1,2023-07-31,2023-07-27,2023-01-31,2023-07-26,92.45,95.9,-3.6,1.01,1.12,3.6
2,2023-08-01,2023-07-28,2023-02-01,2023-07-27,97.46,97.5,-0.04,0.99,1.14,0.04
3,2023-08-02,2023-07-31,2023-02-02,2023-07-28,99.32,97.85,1.5,1.01,1.12,1.5
4,2023-08-03,2023-08-01,2023-02-03,2023-07-31,97.61,95.15,2.59,0.98,1.13,2.59
5,2023-08-04,2023-08-02,2023-02-06,2023-08-01,94.02,93.15,0.93,0.93,1.14,0.93
6,2023-08-07,2023-08-03,2023-02-07,2023-08-02,95.54,95.15,0.41,0.91,1.14,0.41
7,2023-08-08,2023-08-04,2023-02-08,2023-08-03,94.48,95.6,-1.17,0.91,1.14,1.17
8,2023-08-09,2023-08-07,2023-02-09,2023-08-04,93.99,93.05,1.01,0.87,1.1,1.01


In [207]:
### next-day prediction
# For every listed stock: load history with the live quote merged in, record
# the last row's Open/High/Low/Close, then fit one model per target and collect
# the next-row prediction with its mean in-sample error.

# Every feature except the last one ('next_Open'), which is not known yet for
# the following day.
next_features = features[:-1]
st, et = "2022-01-01", "2023-08-31"
pred_next_data = []
for it in stocks_info[:]:
    #
    stock_name, stock_code, _ = it
    test_df = get_df_data(stock_code, st, et, True)
    today_info = test_df.iloc[-1]
    O, H, L, C = today_info['Open'], today_info['High'], today_info['Low'], today_info['Close']

    new_name = stock_name + "_" + stock_code
    each_stock_data = [new_name, O, H, L, C]

    # The engineered features do not depend on the target, so build them once
    # per stock instead of once per target.
    feature_df = create_features(test_df)

    for label in pred_feature:
        next_label = label
        # Each fit gets its own copy because predict_next_day may add columns
        # to the frame it receives.
        error_df, test_y_pred, test_y_real = predict_next_day(feature_df.copy(), next_features, next_label)
        mean_err = error_df['dif'].mean()

        each_stock_data.append(test_y_pred)
        each_stock_data.append(mean_err)

    #
    pred_next_data.append( each_stock_data )

[*********************100%***********************]  1 of 1 completed
coefficient of determination: 0.8969951872087789
intercept: 60.68683937828773	slope: [-0.16185355 -0.17841715  0.3419781   0.46598096  0.25923747  1.60513833
 -0.08384149 -0.04676671  0.04676671  0.83665008 -0.08567124  1.89431218
  1.08360323 -3.55001142  0.08927256 -0.43458622 -1.34210909 -1.92790744
 -0.8471622 ]
coefficient of determination: 0.9072764882403627
intercept: 53.89507153873861	slope: [-0.20358858 -0.22471575  0.35755928  0.56572163  0.41914036  1.41855483
 -0.02624618  0.03333646 -0.03333646  0.3159494  -0.55700009  1.82062946
  1.64853548 -3.39869256  0.2516263   0.16861926 -1.39352758 -1.96567581
 -1.0357473 ]
coefficient of determination: 0.9223741145120262
intercept: 53.69240328358763	slope: [-0.02951929 -0.28619648 -0.07973017  0.88661867  0.01698836  1.45422273
 -0.0278243  -0.02348802  0.02348802  0.38994208 -0.0233945   1.62984711
  1.33619925 -3.21661461 -0.02697439  0.04666238 -1.45726511 -1.

In [208]:
# Column layout of pred_next_data: the name and the last row's prices, then a
# (prediction, error) pair for each of close, high and low.
col_names = ["stock-name", "Open", "High", "Low", "Close"] + [
    target + suffix
    for target in ("close", "high", "low")
    for suffix in ("_pred", "_err_range%")
]
# Round every prediction and error column to two decimals for display.
round_dic = {column: 2 for column in col_names[5:]}
stock_df = pd.DataFrame(pred_next_data, columns=col_names)
stock_df.round(round_dic)

Unnamed: 0,stock-name,Open,High,Low,Close,close_pred,close_err_range%,high_pred,high_err_range%,low_pred,low_err_range%
0,BABA_9988.HK,93.8,94.7,92.35,94.3,92.7,1.77,94.44,1.68,92.18,1.49
1,BIDU_9888.HK,139.0,141.3,138.8,140.6,137.91,2.02,140.51,1.85,136.67,1.8
2,JD_9618.HK,147.4,149.7,145.4,149.1,147.29,2.12,149.85,1.92,145.31,1.85
3,MPNGY_3690.HK,139.4,142.0,137.6,141.0,138.91,1.9,142.19,1.59,137.82,1.49
4,NTES_9999.HK,167.2,171.2,166.2,170.3,169.97,1.65,172.92,1.35,167.78,1.44
5,LI_2015.HK,164.0,172.1,164.0,171.8,168.32,2.31,175.9,1.97,169.09,1.82
6,XPEV_9868.HK,66.75,68.25,65.85,67.9,72.1,4.23,77.9,3.71,69.07,3.01
7,BILI_9626.HK,135.3,136.8,132.0,135.6,133.07,2.89,135.12,2.6,129.61,2.38
8,TCOM_9961.HK,319.8,330.2,319.4,326.4,324.49,1.85,327.9,1.46,319.65,1.57
9,YUMC_9987.HK,442.0,446.6,441.2,444.2,446.5,1.47,451.06,1.14,441.09,1.19
