In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
#!pip install yfinance
import yfinance as yf # https://pypi.org/project/yfinance/
import math
import random
import seaborn as sns
import datetime
import pandas as pd
#!pip install sklearn
from sklearn.linear_model import LinearRegression
from scipy import stats
import warnings
warnings.filterwarnings(action='ignore')
from bs4 import BeautifulSoup
import requests
import json
import time
import re

In [211]:
def get_realtime_info(stock_code):
    URL_link = "https://www.citifirst.com.hk/en/data/json/json_realtimedata/code/"+stock_code
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}
    soup = BeautifulSoup(requests.get(URL_link, headers=headers).content, 'html.parser')

    start_index = str(soup).find("{")
    end_index = str(soup).find("}")
    st = str(soup)[start_index:end_index+1]
    json_str = ""
    for ln in st.split(","):
        if "<" not in ln:
            json_str = json_str + ln + ","
    dic = json.loads(json_str[:-1])

    stock_info = {}
    stock_info['Open'] = dic['open']
    stock_info['High'] = dic['high']
    stock_info['Low'] = dic['low']
    stock_info['Close'] = dic['last']
    stock_info['previous_Close'] = dic['lastc']
    stock_info['turnover'] = dic['turnover']
    stock_info['date_time'] = dic['stimeNoformat']
    return stock_info

# add week number for the dataframe with date as index
def calculate_week_number(df_data):
    start_weekday = df_data.index[0].weekday() # offset
    start_date = df_data.index[0]
    week_nums = []
    i = 0
    while i<len(df_data.index):
        cur_date = df_data.index[i]
        cur_week_num = ( int((cur_date- start_date).days) + start_weekday ) // 7
        week_nums.append( cur_week_num )
        i += 1
    df_data['week_num'] = week_nums
    return df_data

# get data by ticker-name, start-time & end-time
def get_df_data(ticker_name="AAPL", start_time="2022-01-01", end_time="2023-12-31", real_time=True):
    df_data = None
    df_data = yf.download(tickers=ticker_name, start=start_time, end=end_time) 

    if real_time:
        stock_info = get_realtime_info(ticker_name)
        open_price, high_price, low_price, current_price = stock_info['Open'], stock_info['High'], stock_info['Low'], stock_info['Close']
        try:
            df_data.at[df_data.index[-1], "Open"] = float(open_price)
            df_data.at[df_data.index[-1], "High"] = float(high_price)
            df_data.at[df_data.index[-1], "Low"] = float(low_price)
            df_data.at[df_data.index[-1], "Close"] = float(current_price)
        except:
            print(stock_info)
            df_data.at[df_data.index[-1], "Open"] = float(current_price)
            df_data.at[df_data.index[-1], "High"] = float(current_price)
            df_data.at[df_data.index[-1], "Low"] = float(current_price)
            df_data.at[df_data.index[-1], "Close"] = float(current_price)
    elif ".HK" in ticker_name:
        print("data may late for 15 minutes")
    # basic calculations such as: daily return, the log of Volume, Moving Average
    df_data['previous_Close'] = df_data['Close'].shift(1)
    df_data['daily_return'] = (df_data['Close']-df_data['previous_Close'])/df_data['previous_Close']
    df_data['Volume_log'] = np.log2(df_data['Volume'])
    MA1, MA2 = 5, 20
    df_data['MA1'] = df_data['Close'].rolling(MA1).mean()
    df_data['MA2'] = df_data['Close'].rolling(MA2).mean()
    df_data['daily_range'] = (df_data['High']-df_data['Low'])/df_data['previous_Close'] * 100
    df_data['daily_to'] = (df_data['Open']+df_data['High']+df_data['Low']+df_data['Close'])/4*df_data['Volume'] * 100 # turnover
    df_data['daily_to'] = np.log2(df_data['daily_to'])

    # add week number
    df_data['weekday'] = df_data.index.weekday
    df_data = calculate_week_number(df_data) 
    return df_data

def check_valid(sub_df, printing=True):
    for f in ['Open', 'Close', 'High', 'Low', 'Volume_log']:
        for v in list(sub_df[f]):
            """if math.isnan(v):
                if printing:
                    print( sub_df[['Open', 'Close', 'High', 'Low', 'Volume', 'week_num']], f, "NaN" )
                return False"""
            if math.isinf(v):
                if printing:
                    print( sub_df[['Open', 'Close', 'High', 'Low', 'Volume', 'week_num']], f, "INF" )
                return False
    return True


def weekly_info(df_data, week_num):
    tmp_df = df_data[ df_data['week_num']==week_num ].copy()
    if not check_valid(tmp_df.copy(), False):
        """tmp_df.replace([np.inf, -np.inf], np.nan, inplace=True)
        tmp_df.dropna(inplace=True)
        if len(tmp_df)<=1:
            print( len(tmp_df), "week_num", week_num )
            return []"""
        return []
        
    Len = len(tmp_df)
    start_date, end_date = tmp_df.index[0], tmp_df.index[-1]
    open_price, close_price = tmp_df['Open'][0], tmp_df['Close'][-1]

    high_price, high_date = open_price, start_date
    low_price, low_date = open_price, start_date
    i = 0
    while i<Len:
        cur_date = tmp_df.index[i]
        cur_high, cur_low = tmp_df['High'][i], tmp_df['Low'][i]
        if cur_high>=high_price:
            high_price = cur_high
            high_date = cur_date
        if cur_low<=low_price:
            low_price = cur_low
            low_date = cur_date
        i += 1
    
    # rise-fall ratio
    rise_N, fall_N = len( tmp_df[tmp_df['daily_return']>0] ), len( tmp_df[tmp_df['daily_return']<0] )
    # open avg, open std, 
    open_avg, open_std = tmp_df['Open'].mean(), tmp_df['Open'].std()
    #high avg, high std, 
    high_avg, high_std = tmp_df['High'].mean(), tmp_df['High'].std()
    #low avg, low std, 
    low_avg, low_std = tmp_df['High'].mean(), tmp_df['High'].std()
    # close avg, close std, 
    close_avg, close_std = tmp_df['Close'].mean(), tmp_df['Close'].std()
    # volume avg, volume std
    volume_avg, volume_std = tmp_df['Volume_log'].mean(), tmp_df['Volume_log'].std() # Volume, Volume_log
    # MA
    ma_avg1, ma_avg2 = tmp_df['MA1'].mean(), tmp_df['MA2'].mean()
    #ma_avg1, ma_avg2 = tmp_df['MA1'][-1], tmp_df['MA2'][-1]
    ma_std1, ma_std2 = tmp_df['MA1'].std(), tmp_df['MA2'].std()
    #
    weekly_change = (close_price - tmp_df['previous_Close'][0]) / tmp_df['previous_Close'][0] * 100
    # 
    range_avg, range_std = tmp_df['daily_range'].std(), tmp_df['daily_range'].std()
    turnover_avg, turnover_std = tmp_df['daily_to'].std(), tmp_df['daily_to'].std()

    it_info = [
                week_num, 
                start_date, open_price, end_date, close_price, 
                high_date, high_price, low_date, low_price,  
                rise_N, fall_N, 
                open_avg, open_std, 
                high_avg, high_std,
                low_avg, low_std,
                close_avg, close_std, 
                volume_avg, volume_std, 
                ma_avg1, ma_avg2,
                ma_std1, ma_std2,
                weekly_change,
                range_avg, range_std,
                turnover_avg, turnover_std
            ]
    
    return it_info

# week num, start date, end date, open price, close price, high price, high date, low price, low date
def generate_weekly_df(df_data):
    data_col = []

    weeknum_list = list( set(list(df_data['week_num'])) )
    weeknum_list = sorted(weeknum_list)
    invalid_weeks = []
    for i in weeknum_list[1:]:
        wn = i
        it_info = weekly_info(df_data, wn)
        if len(it_info)>0:
            data_col.append( it_info )
        else:
            invalid_weeks.append( wn )
    print( len(invalid_weeks), "invalid_weeks: ", invalid_weeks )

    col_names = [   
                    'week-num', 
                    'start_date', 'open_price', 'end_date', 'close_price',                     
                    'high_date', 'high_price', 'low_date', 'low_price',                     
                    'rise_N', 'fall_N',
                    'open_avg', 'open_std',
                    'high_avg', 'high_std',
                    'low_avg', 'low_std',
                    'close_avg', 'close_std',
                    'volume_avg', 'volume_std',
                    'ma_avg1', 'ma_avg2',
                    'ma_std1', 'ma_std2',
                    'weekly_change',
                    'range_avg', 'range_std',
                    'turnover_avg', 'turnover_std'
                ]
    weekly_df = pd.DataFrame(data_col, columns = col_names)
    # next week features & labels
    weekly_df['nw_open'] = weekly_df['open_price'].shift(-1)
    weekly_df['nw_high'] = weekly_df['high_price'].shift(-1)
    weekly_df['nw_low'] = weekly_df['low_price'].shift(-1)
    weekly_df['nw_close'] = weekly_df['close_price'].shift(-1)
    return weekly_df

def get_datasets(weekly_df, features, label):
    X_data, y_data = [], []
    i = 0
    while i < len(weekly_df): # 
        row_data = weekly_df.iloc[i]
        x_tmp = []
        for col in features:
            v = row_data[col]
            x_tmp.append( v )
        y_tmp = row_data[label]
        #print( x_tmp, y_tmp )
        X_data.append( x_tmp )
        y_data.append( y_tmp )
        #print( row_data['week-num'], i )
        i += 1
    return X_data, y_data

from sklearn.linear_model import LinearRegression
def train_model(train_X, train_y, printing=True):
    model = LinearRegression().fit(train_X, train_y)

    r_sq = model.score(train_X, train_y)
    if printing:
        print(f"coefficient of determination: {r_sq}")
        print(f"intercept: {model.intercept_}\tslope: {model.coef_}")
    return model

def error_analyze(train_y, y_pred):
    df = pd.DataFrame(columns = ['y_real', 'y_pred'])
    df['y_real'] = train_y
    df['y_pred'] = y_pred
    df['dif'] = (df['y_real'] - df['y_pred'])/df['y_real'] * 100
    df['dif'] = df['dif'].abs()
    return df

def basic_info(df, col='dif'):
    print("max:\t", df[col].max())
    print("min:\t", df[col].min())
    print("median:\t", df[col].median())
    print("mean:\t", df[col].mean())
    print("std:\t", df[col].std())
    print("10%:\t", df[col].quantile(0.10))
    print("25%:\t", df[col].quantile(0.25))
    print("50%:\t", df[col].quantile(0.50))
    print("75%:\t", df[col].quantile(0.75))
    print("90%:\t", df[col].quantile(0.90))
    return

In [215]:
stocks_info = [
    #('HSI', '^HSI', 1),
    ('BABA', '9988.HK', 1),
    ('BIDU', '9888.HK', 1),
    ('JD', '9618.HK', 0.5 * 7.8),
    ('MPNGY', '3690.HK', 0.5 * 7.8),
    ('NTES', '9999.HK', 0.2 * 7.8),
	('LI', '2015.HK', 0.5 * 7.8),
	('XPEV', '9868.HK', 0.5*7.8),	
	('BILI', '9626.HK', 1 * 7.8),
	('TCOM', '9961.HK', 1 * 7.8),
	('YUMC', '9987.HK', 1*7.8),
	('EDU', '9901.HK', 0.1*7.8), 
    ('NIO', '9866.HK', 1 * 7.8),
	('ZTO', '2057.HK', 1*7.8),
    ('BEKE', '2423.HK', 0.5*7.8),
    ('ZH', '2390.HK', 3 * 7.8), 
    ('WB', '9898.HK', 1*7.8),
    ('MNSO', '9896.HK', 0.5*7.8),
    ('ZLAB', '9688.HK', 0.5*7.8),
    ('TENCENT', '0700.HK', 1*7.8),
    ('TME', '1698.HK', 1*7.8),
    ('SMIC', '0981.HK', 1*7.8),
    ('SenseTime', '0020.HK', 1*7.8),
    ('Kuaishou', '1024.HK', 1*7.8),
    ('Xiaomi', '1810.HK', 1*7.8),
    ('CMB', '3968.HK', 1*7.8),
]

st, et = "2020-01-01", "2023-08-31"

features1 = [   
                'open_price', 'close_price', 'high_price', 'low_price', 
                'rise_N', 'fall_N', 
                'open_avg', 'open_std',
                'high_avg', 'high_std',
                'low_avg', 'low_std',
                'close_avg', 'close_std',
                'volume_avg', 'volume_std',
                'ma_avg1', 'ma_avg2',
                'ma_std1', 'ma_std2',
                'weekly_change', 
                'range_avg', 'range_std',
                'turnover_avg', 'turnover_std',
                'nw_open'
            ]
feature2  = features1[:-1]

labels12 = ["nw_close", "nw_high", "nw_low"]

In [218]:
predicted_data = []
for it in stocks_info[:]:# 
    reference_stock, target_stock, ratio = it   

    test_df = get_df_data(target_stock, st, et, True)
    weekly_df = generate_weekly_df(test_df)
    label = 'nw_close'
    
    new_name = reference_stock + "_" + target_stock
    each_row = []
    for label in labels12:
        # training data
        training_data = weekly_df[10:-2].copy()
        X_data, y_data = get_datasets(training_data, features1, label)

        # testing / predicting this week
        testing_data1 = weekly_df[-2:-1].copy()
        X_test1, y_test1 = get_datasets(testing_data1, features1, label)

        # testing / predicting next week
        testing_data2 = weekly_df[-1:].copy()
        X_test2, y_test2 = get_datasets(testing_data2, features1, label)

        # fit model
        model = train_model(X_data, y_data) 
        # error analysis
        y_pred = model.predict(X_data) 
        error_df = error_analyze(y_data, y_pred)
        err1, err2 = round(error_df['dif'].median(), 2), round(error_df['dif'].mean(), 2)

        pred_this_week = model.predict(X_test1)
        y_pred1 = round(pred_this_week[0], 2)

        # 
        last_week_close = round(X_test1[0][1], 2)
        each_row = each_row + [ y_pred1, y_test1[0], err2 ]
    predicted_data.append( [new_name, last_week_close]+each_row )

[*********************100%***********************]  1 of 1 completed
5 invalid_weeks:  [51, 52, 58, 103, 109]
coefficient of determination: 0.9861343427226609
intercept: -45.658420100456794	slope: [ -0.08903484   0.27643351   0.57877055  -0.86380256  -3.05053396
  -2.60530671   2.13815912  -0.77471966  -0.16978933   0.34059661
  -0.16978933   0.34059661  -1.73327297  -1.89731554   2.39149429
  18.19793083  -0.27056725   0.12998924  -0.09539441  -0.40080073
  -0.15093653  -0.02458812  -0.02458812 -11.33047368 -11.33047368
   1.17472808]
coefficient of determination: 0.9933799890099754
intercept: -16.654605364073007	slope: [-0.03133773  0.30276649  1.0605286  -0.90850801 -2.1619643  -1.64973814
  0.23317873 -0.35661372 -0.19632228 -0.62114092 -0.19632228 -0.62114092
 -0.01242365 -0.42360382  1.13480577  4.77455482 -0.458102    0.11209824
 -0.26867965 -0.34491893 -0.34331581 -0.13272622 -0.13272622 -4.24413674
 -4.24413674  1.10800843]
coefficient of determination: 0.9933629898843971
inte

In [219]:
col_names1 = [
            "stock-name", 'last_week_close', 
            "close_pred", "close_real", "close_err_range%", 
            "high_pred", "high_real", "high_err_range%", 
            "low_pred", "low_real", "low_err_range%",    
            ]
col_names2 = [
            "stock-name", 'last_week_close', 
            "close_pred", "close_real", "close_err_range%", "nd_close",
            "high_pred", "high_real", "high_err_range%", "nd_high",
            "low_pred", "low_real", "low_err_range%", "nd_low",            
            ]

col_names = col_names1
round_dic = {'close_pred': 2, 'high_pred': 2, 'low_pred': 2, 
             'close_err_range%': 2, 'high_err_range%':2, 'low_err_range%':2, 
             'nd_close':2, 'nd_high':2, 'nd_low':2,
              'weekly_change%':2 
            }
stock_df = pd.DataFrame( predicted_data, columns=col_names )
stock_df['price_change'] = stock_df['close_real'] - stock_df['last_week_close']
stock_df['weekly_change%'] = stock_df['price_change'] / stock_df['last_week_close'] * 100


show_cols = [
        "stock-name", 'last_week_close', 'price_change', 'weekly_change%',
        "close_pred", "close_real", "close_err_range%", 
        "high_pred", "high_real", "high_err_range%", 
        "low_pred", "low_real", "low_err_range%",
        ]


stock_df.round(round_dic)[ show_cols ] 

Unnamed: 0,stock-name,last_week_close,price_change,weekly_change%,close_pred,close_real,close_err_range%,high_pred,high_real,high_err_range%,low_pred,low_real,low_err_range%
0,BABA_9988.HK,95.3,-2.5,-2.62,90.01,92.8,4.25,94.89,93.35,2.92,87.44,91.45,2.63
1,BIDU_9888.HK,137.8,-3.2,-2.32,132.64,134.6,3.96,141.0,135.4,2.76,127.0,132.1,2.48
2,JD_9618.HK,146.1,-1.9,-1.3,152.42,144.2,4.79,155.65,144.4,3.32,137.74,140.5,3.3
3,MPNGY_3690.HK,137.2,-1.8,-1.31,142.1,135.4,6.17,144.51,135.9,3.92,129.57,131.6,4.0
4,NTES_9999.HK,163.6,-1.4,-0.86,158.63,162.2,4.17,165.98,162.3,2.59,151.29,157.5,2.82
5,LI_2015.HK,165.4,-4.5,-2.72,144.08,160.9,5.99,156.48,161.0,3.73,145.72,150.0,3.22
6,XPEV_9868.HK,66.1,-2.0,-3.03,58.98,64.1,7.91,68.3,64.6,5.33,56.25,61.0,4.73
7,BILI_9626.HK,131.0,-0.6,-0.46,123.82,130.4,8.92,138.24,131.2,5.16,118.69,126.1,5.15
8,TCOM_9961.HK,320.0,-5.2,-1.62,303.96,314.8,4.58,321.33,316.8,2.95,295.79,306.4,3.28
9,YUMC_9987.HK,436.4,-6.0,-1.37,434.99,430.4,3.08,448.69,433.2,2.0,419.18,424.2,2.05
