In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
#!pip install yfinance
import yfinance as yf # https://pypi.org/project/yfinance/
import math
import random
import seaborn as sns
import datetime
import pandas as pd
#!pip install sklearn
from sklearn.linear_model import LinearRegression
from scipy import stats
import warnings
warnings.filterwarnings(action='ignore')
from bs4 import BeautifulSoup
import requests
import json
import time
import re

In [98]:
def get_realtime_info(stock_code, timeout=10):
    """Fetch the latest quote snapshot for one stock from the CitiFirst JSON endpoint.

    The response body is rendered through BeautifulSoup, the text between the
    first ``{`` and the first ``}`` is taken as a flat JSON object, and every
    comma-separated fragment containing ``<`` is dropped before decoding.

    Parameters
    ----------
    stock_code : str
        Code appended verbatim to the endpoint URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests`` may block indefinitely.

    Returns
    -------
    dict
        Keys ``Open``, ``High``, ``Low``, ``Close``, ``previous_Close``,
        ``turnover`` and ``date_time``, copied from the payload fields
        ``open``, ``high``, ``low``, ``last``, ``lastc``, ``turnover`` and
        ``stimeNoformat``. Values are passed through unconverted.

    Raises
    ------
    requests.exceptions.RequestException
        On network failure, including a timeout.
    json.JSONDecodeError
        If the extracted text is not valid JSON.
    KeyError
        If one of the expected payload fields is missing.
    """
    URL_link = "https://www.citifirst.com.hk/en/data/json/json_realtimedata/code/"+stock_code
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}
    response = requests.get(URL_link, headers=headers, timeout=timeout)

    # Render the parsed document to text once and reuse it for every lookup.
    page_text = str(BeautifulSoup(response.content, 'html.parser'))

    # Only the span from the first "{" to the first "}" is considered.
    start_index = page_text.find("{")
    end_index = page_text.find("}")
    raw_object = page_text[start_index:end_index+1]

    # Fragments carrying markup ("<") are discarded before decoding.
    json_str = ",".join(part for part in raw_object.split(",") if "<" not in part)
    dic = json.loads(json_str)

    # Output key -> payload field, in the order the result is built.
    field_map = {
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'last',
        'previous_Close': 'lastc',
        'turnover': 'turnover',
        'date_time': 'stimeNoformat',
    }
    return {out_key: dic[src_key] for out_key, src_key in field_map.items()}

# get data by ticker-name, start-time & end-time
def get_df_data(ticker_name="AAPL", start_time="2022-01-01", end_time="2023-12-31", real_time=True):
    """Download daily price history and add the derived columns used downstream.

    Parameters
    ----------
    ticker_name : str
        Ticker passed to ``yf.download`` and, when ``real_time`` is true, to
        ``get_realtime_info``.
    start_time, end_time : str
        Date bounds forwarded to ``yf.download`` as ``start`` / ``end``.
    real_time : bool
        When true, the Open/High/Low/Close values of the last downloaded row
        are overwritten with the live quote. If any of the four live values
        cannot be converted to ``float``, the quote is printed and all four
        cells are set to the live last price instead.

    Returns
    -------
    pandas.DataFrame
        The downloaded frame with ``previous_Close``, ``daily_return``,
        ``Volume_log`` (base-2 log of ``Volume``), ``MA1`` (5-row mean of
        ``Close``) and ``MA2`` (20-row mean of ``Close``) appended.
    """
    df_data = yf.download(tickers=ticker_name, start=start_time, end=end_time)

    if real_time:
        stock_info = get_realtime_info(ticker_name)
        price_cols = ("Open", "High", "Low", "Close")
        try:
            # Convert all four quotes before touching the frame so a bad value
            # never leaves the last row half-updated.
            live_prices = [float(stock_info[col]) for col in price_cols]
        except (TypeError, ValueError):
            # Only unparsable quote values trigger the fallback; interrupts and
            # unrelated errors are no longer swallowed.
            print(stock_info)
            current_price = float(stock_info['Close'])
            live_prices = [current_price] * len(price_cols)
        last_row = df_data.index[-1]
        for col, value in zip(price_cols, live_prices):
            df_data.at[last_row, col] = value
    elif ".HK" in ticker_name:
        print("data may late for 15 minutes")

    # basic calculations such as: daily return, the log of Volume, Moving Average
    df_data['previous_Close'] = df_data['Close'].shift(1)
    df_data['daily_return'] = (df_data['Close']-df_data['previous_Close'])/df_data['previous_Close']
    df_data['Volume_log'] = np.log2(df_data['Volume'])
    MA1, MA2 = 5, 20  # rolling-window lengths, in rows
    df_data['MA1'] = df_data['Close'].rolling(MA1).mean()
    df_data['MA2'] = df_data['Close'].rolling(MA2).mean()
    return df_data

def get_pair_dates(US_dates, HK_dates):
    """Pair every HK date with a US date that is not later than it.

    For each entry of ``HK_dates`` the whole of ``US_dates`` is scanned and the
    last entry (in iteration order) satisfying ``hk_date >= us_date`` is kept.
    HK dates with no such US date are left out.

    Returns a list of ``[hk_date, us_date]`` two-element lists, in the order of
    ``HK_dates``.
    """
    pair_indexes = []
    for hk_day in HK_dates:
        # Every US date that does not come after this HK date, in scan order.
        not_later = [us_day for us_day in US_dates if hk_day >= us_day]
        latest_listed = not_later[-1] if not_later else None
        if latest_listed is None:
            continue
        pair_indexes.append([hk_day, latest_listed])
    return pair_indexes

def get_merged_df(HK_stock_df, US_stock_df, target_name="Close", offset_row=20):
    """Join each HK trading day with its paired US day into one flat table.

    Both inputs are expected to carry the columns ``Open``, ``High``, ``Low``,
    ``Close``, ``Volume_log``, ``MA1`` and ``MA2`` (the layout produced by
    ``get_df_data``).

    Parameters
    ----------
    HK_stock_df, US_stock_df : pandas.DataFrame
        Date-indexed price frames. ``HK_stock_df`` is modified in place: a
        ``'next_' + target_name`` column is added (or overwritten).
    target_name : str
        HK column whose next-row value becomes the label.
    offset_row : int
        Number of leading merged rows to drop from the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``HK_date``, seven ``HK_*`` features, ``HK_next_Close`` (the
        label, whatever ``target_name`` is), ``US_date`` and seven ``US_*``
        features. ``HK_Volume`` / ``US_Volume`` hold ``Volume_log``.
    """
    feature_cols = ["Open", "High", "Low", "Close", "Volume_log", "MA1", "MA2"]
    label_col = 'next_' + target_name

    # add label for HK stock: the target value of the following row
    HK_stock_df[label_col] = HK_stock_df[target_name].shift(-1)

    pair_indexes = get_pair_dates(US_stock_df.index, HK_stock_df.index)

    merged_col = [ "HK_date", "HK_Open", "HK_High", "HK_Low", "HK_Close", "HK_Volume", "HK_MA1", "HK_MA2", "HK_next_Close",
                    "US_date", "US_Open", "US_High", "US_Low", "US_Close", "US_Volume", "US_MA1", "US_MA2"
                  ]
    merged_data = []
    for hk_d, us_d in pair_indexes:
        hk_info, us_info = HK_stock_df.loc[hk_d], US_stock_df.loc[us_d]
        # Select by column name rather than list position, so extra columns on
        # either frame (for example a label column left by an earlier call)
        # cannot shift the features or the label.
        hk_use = [hk_info[name] for name in feature_cols]
        us_use = [us_info[name] for name in feature_cols]
        merged_data.append( [hk_d] + hk_use + [hk_info[label_col]] + [us_d] + us_use )

    merged_df = pd.DataFrame( merged_data, columns=merged_col )
    return merged_df[offset_row:].copy()

def get_merged_df_update(HK_stock_df, US_stock_df, target_name="Close", offset_row=20):
    """Join HK and US days into one table, with the next HK open as an extra feature.

    Both inputs are expected to carry the columns ``Open``, ``High``, ``Low``,
    ``Close``, ``Volume_log``, ``MA1`` and ``MA2`` (the layout produced by
    ``get_df_data``).

    Parameters
    ----------
    HK_stock_df, US_stock_df : pandas.DataFrame
        Date-indexed price frames. ``HK_stock_df`` is modified in place:
        ``next_Open`` and ``'next_' + target_name`` columns are added (or
        overwritten).
    target_name : str
        HK column whose next-row value becomes the label.
    offset_row : int
        Number of leading merged rows to drop from the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``HK_date``, seven ``HK_*`` features, ``HK_next_Open``,
        ``HK_next_Close`` (the label, whatever ``target_name`` is),
        ``US_date`` and seven ``US_*`` features. ``HK_Volume`` / ``US_Volume``
        hold ``Volume_log``.
    """
    feature_cols = ["Open", "High", "Low", "Close", "Volume_log", "MA1", "MA2"]
    label_col = 'next_' + target_name

    # next Open is one of the features
    HK_stock_df['next_Open'] = HK_stock_df['Open'].shift(-1)

    # add label for HK stock: the target value of the following row
    HK_stock_df[label_col] = HK_stock_df[target_name].shift(-1)

    pair_indexes = get_pair_dates(US_stock_df.index, HK_stock_df.index)

    merged_col = [ "HK_date", "HK_Open", "HK_High", "HK_Low", "HK_Close", "HK_Volume", "HK_MA1", "HK_MA2", "HK_next_Open", "HK_next_Close",
                    "US_date", "US_Open", "US_High", "US_Low", "US_Close", "US_Volume", "US_MA1", "US_MA2"
                  ]
    hk_feature_cols = feature_cols + ['next_Open']
    merged_data = []
    for hk_d, us_d in pair_indexes:
        hk_info, us_info = HK_stock_df.loc[hk_d], US_stock_df.loc[us_d]
        # Select by column name rather than list position: if the label column
        # already exists on the frame, a newly added 'next_Open' lands after it
        # and positional slicing would swap feature and label.
        hk_use = [hk_info[name] for name in hk_feature_cols]
        us_use = [us_info[name] for name in feature_cols]
        merged_data.append( [hk_d] + hk_use + [hk_info[label_col]] + [us_d] + us_use )

    merged_df = pd.DataFrame( merged_data, columns=merged_col )
    return merged_df[offset_row:].copy()


# LR model
from sklearn.linear_model import LinearRegression
def train_model(train_X, train_y, printing=True):
    """Fit a ``LinearRegression`` on the given data and return the fitted model.

    The score on the training data itself is always computed; when
    ``printing`` is true it is printed together with the fitted intercept and
    coefficients.
    """
    model = LinearRegression()
    model.fit(train_X, train_y)

    # Scored on the same data the model was fitted on.
    r_sq = model.score(train_X, train_y)
    if not printing:
        return model

    print(f"coefficient of determination: {r_sq}")
    print(f"intercept: {model.intercept_}\tslope: {model.coef_}")
    return model

def error_analyze(train_y, y_pred):
    """Tabulate real values, predictions and their absolute percentage gap.

    Returns a DataFrame with columns ``y_real``, ``y_pred`` and ``dif``, where
    ``dif`` is ``|y_real - y_pred| / y_real * 100`` computed row by row.
    """
    report = pd.DataFrame(columns = ['y_real', 'y_pred'])
    report['y_real'] = train_y
    report['y_pred'] = y_pred

    # Signed percentage gap relative to the real value, then its magnitude.
    signed_pct = (report['y_real'] - report['y_pred'])/report['y_real'] * 100
    report['dif'] = signed_pct
    report['dif'] = report['dif'].abs()
    return report

def basic_info(df, col='dif'):
    """Print summary statistics of one column, one labelled line per statistic.

    Prints, in order: max, min, median, mean, std and the 10/25/50/75/90 %
    quantiles of ``df[col]``. Returns ``None``.
    """
    # (label, Series method, positional arguments) in print order.
    summary_steps = (
        ("max:\t", "max", ()),
        ("min:\t", "min", ()),
        ("median:\t", "median", ()),
        ("mean:\t", "mean", ()),
        ("std:\t", "std", ()),
        ("10%:\t", "quantile", (0.10,)),
        ("25%:\t", "quantile", (0.25,)),
        ("50%:\t", "quantile", (0.50,)),
        ("75%:\t", "quantile", (0.75,)),
        ("90%:\t", "quantile", (0.90,)),
    )
    for label, method_name, args in summary_steps:
        # Each statistic is computed right before it is printed.
        print(label, getattr(df[col], method_name)(*args))
    return

def predict_current_next_days01(merged_df, train_rows=120, test_row1=-2, test_row2=-1):
    """Fit a linear model on recent rows and predict the label for two test rows.

    Columns are addressed by position: column 0 is the HK date, columns 1-7
    and 10 onward are the features, column 8 is the label and column 9 is the
    US date (the column order built by ``get_merged_df``).

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Merged HK/US table.
    train_rows : int
        Number of rows used for fitting. The training window always ends two
        rows before the end of ``merged_df``, whatever ``test_row1`` and
        ``test_row2`` are.
    test_row1, test_row2 : int
        Positional row indexes of the two rows to predict.

    Returns
    -------
    tuple
        ``(error_df, (date1, pred1, label1), (date2, pred2, label2))`` where
        ``error_df`` is the in-sample error table from ``error_analyze`` with
        ``HK_date`` and ``US_date`` columns added.
    """
    hk_date_pos, label_pos, us_date_pos = 0, 8, 9
    feature_pos = list(range(1, 8)) + list(range(10, merged_df.shape[1]))

    def split_row(position):
        # Explicit .iloc: integer keys on a string-labelled Series are not
        # reliably positional across pandas versions.
        row = merged_df.iloc[position]
        return row.iloc[hk_date_pos], [row.iloc[feature_pos].tolist()], row.iloc[label_pos]

    d1, test_data1, label1 = split_row(test_row1)
    d2, test_data2, label2 = split_row(test_row2)

    # Training window: the `train_rows` rows that precede the last two rows.
    train_df = merged_df.iloc[-train_rows-2:-2]
    X_data = train_df.iloc[:, feature_pos].to_numpy().tolist()
    y_data = train_df.iloc[:, label_pos].tolist()
    dates01 = train_df.iloc[:, hk_date_pos].tolist()
    dates02 = train_df.iloc[:, us_date_pos].tolist()

    model = train_model(X_data, y_data)
    y_pred = model.predict(X_data) # in-sample predictions, for error analysis
    error_df = error_analyze(y_data, y_pred)
    error_df['HK_date'] = dates01
    error_df['US_date'] = dates02

    test_y_pred1 = model.predict(test_data1)[0]
    test_y_pred2 = model.predict(test_data2)[0]
    return error_df, (d1, test_y_pred1, label1), (d2, test_y_pred2, label2)

def predict_current_next_days02(merged_df, train_rows=120, test_row1=-2, test_row2=-1):
    """Fit a linear model (next open included as a feature) and predict one test row.

    Columns are addressed by position: column 0 is the HK date, columns 1-8
    and 11 onward are the features, column 9 is the label and column 10 is the
    US date (the column order built by ``get_merged_df_update``).

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Merged HK/US table.
    train_rows : int
        Number of rows used for fitting. The training window always ends two
        rows before the end of ``merged_df``, whatever ``test_row1`` and
        ``test_row2`` are.
    test_row1, test_row2 : int
        Positional row indexes of the two test rows.

    Returns
    -------
    tuple
        ``(error_df, (date1, pred1, label1), (date2, features2, label2))``.
        Only the first test row is predicted; for the second row the raw
        feature list is returned in place of a prediction.
    """
    hk_date_pos, label_pos, us_date_pos = 0, 9, 10
    feature_pos = list(range(1, 9)) + list(range(11, merged_df.shape[1]))

    def split_row(position):
        # Explicit .iloc: integer keys on a string-labelled Series are not
        # reliably positional across pandas versions.
        row = merged_df.iloc[position]
        return row.iloc[hk_date_pos], [row.iloc[feature_pos].tolist()], row.iloc[label_pos]

    d1, test_data1, label1 = split_row(test_row1)
    d2, test_data2, label2 = split_row(test_row2)

    # Training window: the `train_rows` rows that precede the last two rows.
    train_df = merged_df.iloc[-train_rows-2:-2]
    X_data = train_df.iloc[:, feature_pos].to_numpy().tolist()
    y_data = train_df.iloc[:, label_pos].tolist()
    dates01 = train_df.iloc[:, hk_date_pos].tolist()
    dates02 = train_df.iloc[:, us_date_pos].tolist()

    model = train_model(X_data, y_data)
    y_pred = model.predict(X_data) # in-sample predictions, for error analysis
    error_df = error_analyze(y_data, y_pred)
    error_df['HK_date'] = dates01
    error_df['US_date'] = dates02

    test_y_pred1 = model.predict(test_data1)[0]
    # The second row is handed back unpredicted (features and label only).
    return error_df, (d1, test_y_pred1, label1), (d2, test_data2, label2)

In [122]:
# One entry per dual-listed company: (US ticker, HK ticker, factor).
# NOTE(review): the third element looks like a US-to-HK price conversion factor
# (share ratio times a 7.8 exchange rate) - confirm; it is not used in this cell.
stocks_info = [
    ('BABA', '9988.HK', 1),
    ('BIDU', '9888.HK', 1),
    ('JD', '9618.HK', 0.5 * 7.8),
    ('MPNGY', '3690.HK', 0.5 * 7.8),
    ('NTES', '9999.HK', 0.2 * 7.8),
    ('LI', '2015.HK', 0.5 * 7.8),
    ('XPEV', '9868.HK', 0.5 * 7.8),
    ('BILI', '9626.HK', 1 * 7.8),
    ('TCOM', '9961.HK', 1 * 7.8),
    ('YUMC', '9987.HK', 1 * 7.8),
    ('EDU', '9901.HK', 0.1 * 7.8),
    ('NIO', '9866.HK', 1 * 7.8),
    ('ZTO', '2057.HK', 1 * 7.8),
    ('BEKE', '2423.HK', 0.5 * 7.8),
    ('ZH', '2390.HK', 3 * 7.8),
    ('WB', '9898.HK', 1 * 7.8),
    ('MNSO', '9896.HK', 0.5 * 7.8),
    ('ZLAB', '9688.HK', 0.5 * 7.8),
    ('TME', '1698.HK', 1 * 7.8),
]

# Download window passed to get_df_data for both listings.
st = "2021-01-01"
et = "2023-08-31"

# Work on the first pair of the list; only the HK side takes the live quote.
US_stock_code, HK_stock_code, _ = stocks_info[0]
US_stock_df = get_df_data(ticker_name=US_stock_code, start_time=st, end_time=et, real_time=False)
HK_stock_df = get_df_data(ticker_name=HK_stock_code, start_time=st, end_time=et, real_time=True)

# mode 1: base feature set; mode 2: next HK open added as a feature.
mode = 1
if mode == 2:
    target_name = "Close"
    merged_df = get_merged_df_update(HK_stock_df, US_stock_df, target_name, 20)
    error_df, pred_it1, it2 = predict_current_next_days02(merged_df, 200, -2, -1)
elif mode == 1:
    target_name = "Close"
    merged_df = get_merged_df(HK_stock_df, US_stock_df, target_name, 20)
    error_df, pred_it1, it2 = predict_current_next_days01(merged_df, 200, -2, -1)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
coefficient of determination: 0.9728742650715233
intercept: -3.458779987225256	slope: [ 0.01324108  0.14816035 -0.13530156 -0.16530105  0.1159919   0.12011254
  0.23849598 -0.36136073  0.48692786  0.67832353  0.29561466  0.00455057
 -0.10962024 -0.22110116]


In [123]:
pred_it1, it2

((Timestamp('2023-08-10 00:00:00'), 97.94499491833758, 96.2),
 (Timestamp('2023-08-11 00:00:00'), 97.79693654984897, nan))

In [124]:
pred_it1, it2

((Timestamp('2023-08-10 00:00:00'), 97.94499491833758, 96.2),
 (Timestamp('2023-08-11 00:00:00'), 97.79693654984897, nan))