In [1]:
# predict HK stocks according to US stocks
    # there is jet-lag between US stocks and HK stocks
    # US stock markets have much influence over HK stock markets
    # exploit the information from US stock markets to predict HK stock markets

In [2]:
# import packages needed
!pip install yfinance
import yfinance as yf #
import math
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting yfinance
  Downloading yfinance-0.1.86-py2.py3-none-any.whl (29 kB)
Collecting requests>=2.26
  Downloading requests-2.28.1-py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 1.2 MB/s 
Installing collected packages: requests, yfinance
  Attempting uninstall: requests
    Found existing installation: requests 2.23.0
    Uninstalling requests-2.23.0:
      Successfully uninstalled requests-2.23.0
Successfully installed requests-2.28.1 yfinance-0.1.86


In [45]:
# get data by ticker-name, start-time & end-time
def get_df_data(ticker_name="AAPL", start_time="2022-01-01", end_time="2022-10-09"):
  df_data = yf.download(tickers=ticker_name, start=start_time, end=end_time) 
  #df_data.head()
  return df_data

# calculate the daily return by (current_index - previous_index) / previous_index
def calculate_daily_return(df_data, OHLC_index="Close"):
  name1 = OHLC_index+"_previous"
  df_data[name1] = df_data[OHLC_index].shift(1)
  name2 = OHLC_index+"_delta"
  df_data[name2] = df_data[OHLC_index] - df_data[name1]
  name3 = OHLC_index+"_return"
  df_data[name3] = df_data[name2] / df_data[name1]
  del df_data[name1]
  del df_data[name2]
  new_feature = name3
  return df_data #, new_feature

# calculate the daily change of points & volumes
    # by (current_index - previous_index) / previous_index
    # by (current_volume - previous_volume) / previous_volume
def calculate_daily_change(df_data):
  # , OHLC_index="Close"
  df_data = calculate_daily_return(df_data, "Close")
  df_data = calculate_daily_return(df_data, "Volume")
  return df_data #, new_feature

# convert the time to be string type: yyyy-mm-dd
def get_ymt_date(df_data):
  df_data["ymd_time"] = df_data.index
  df_data["ymd_time"] = df_data["ymd_time"].astype(str)
  df_data["ymd_time"] = df_data["ymd_time"].str.slice(0,10)
  return df_data

# dict type: key - time, value - daily return
def get_date_return(df_data):
  return_list = list(df_data["Close_return"])
  time_list = list(df_data["ymd_time"])
  time_return_dic = {}
  ii = 0
  while ii<len(return_list):
    time_return_dic[ time_list[ii] ] = return_list[ii]
    """if return_list[ii]>0:
      time_return_dic[ time_list[ii] ] = 1
    else:
      time_return_dic[ time_list[ii] ] = 0"""
    ii = ii + 1
  return time_return_dic

# get useful features and pack as dict-type
    # key: date, value:
def get_date_features(df_data, features=["Close_return", "Volume_return"]):
    time_list = list(df_data["ymd_time"])
    values_list = []
    L = len(df_data)
    for ii in range(L):
        fv = []
        for fn in features:
            v = df_data.iloc[ii][fn]
            fv.append(v)
        values_list.append( fv )
    time_return_dic = {}
    ii = 0
    while ii<L:
        time_return_dic[ time_list[ii] ] = values_list[ii]
        ii = ii + 1
    return time_return_dic

# compare 2 string-type dates, 
  # for example: '2022-01-01' -> 20220101, '2022-10-10' -> 20221010,
def compare_date_str(date_str1, date_str2):
  num1 = date_str1[0:4]+date_str1[5:7]+date_str1[8:10]
  num1 = int(num1)
  num2 = date_str2[0:4]+date_str2[5:7]+date_str2[8:10]
  num2 = int(num2)
  if num1>num2:
    return 1
  elif num1<num2:
    return -1
  return 0

# 3 US stock indexes: Nasdaq, DJI, SP500, 1 HK index: HSI
# get their data for certain year
# close points -> moving average close points, parameter: nn
# calculate the daily return based on the MA close points
# get the date-return dic
def get_useful_data(tn, st, et, nn, features=["Close_return", "Volume_return"]):
  stock_index = get_df_data(ticker_name=tn, start_time=st, end_time=et)
  stock_index['SMA'+str(nn)] = stock_index['Close'].rolling(nn).mean() # moving average, smoothening function
  stock_index['Close'] = stock_index['SMA'+str(nn)] # moving average, smoothening function
  del stock_index['SMA'+str(nn)] # 
  stock_index = calculate_daily_change(stock_index)
  stock_index = get_ymt_date(stock_index)
  time_return = get_date_features(stock_index, features)
  return stock_index, time_return

# match US stocks with HK stock by time, because there is jet-lag between HK time & US time
def US_HK_stock_signal(year = 2022, nn = 5):
  st, et = str(year)+"-01-01", str(year)+"-12-31"
  # ^IXIC, ^DJI, ^GSPC
  nasdaq_df, nasdaq_time_return = get_date_return_dic("^IXIC", st, et, nn)
  dowjones_df, dowjones_time_return = get_date_return_dic("^DJI", st, et, nn)
  sp500_df, sp500_time_return = get_date_return_dic("^GSPC", st, et, nn)

  # ^HSI
  hsi_df, hsi_time_return = get_date_return_dic("^HSI", st, et, nn)  
  
  # 
  hk_us_stock_signal = []

  for k, v in sorted(hsi_time_return.items()):
    date_str = k
    rise_fall_signal = v
    for pk, pv in sorted(sp500_time_return.items(), reverse=True):
      if compare_date_str(k, pk)==1:
        v1 = sp500_time_return[pk]
        v2 = dowjones_time_return[pk]
        v3 = nasdaq_time_return[pk]
        #print(k, v, pk, v1, v2, v3)
        hk_us_stock_signal.append( (k, v, pk, v1, v2, v3) )
        break
  return hk_us_stock_signal

# filter signals according to HK stock rise/fall
def US_HK_info(hk_us_stock_signal, hk_s, us_stock_num):
  tmp_list = []
  for it in hk_us_stock_signal:
    hkt, hk_signal, ust, us_signal1, us_signal2, us_signal3 = it
    hk_signal = rise_fall_judge(hk_signal)
    us_signal1 = rise_fall_judge(us_signal1)
    us_signal2 = rise_fall_judge(us_signal2)
    us_signal3 = rise_fall_judge(us_signal3)
    #
    if hk_signal==hk_s and sum( [us_signal1, us_signal2, us_signal3] ) == us_stock_num:
      tmp_list.append( it )
  return tmp_list

# rise -> 1, fall -> 0
def rise_fall_judge(point):
  if point > 0:
    return 1
  return 0

# match US stocks with HK stock by time, because there is jet-lag between HK time & US time
def stocks_signal(refer_list=["^IXIC", "^DJI", "^GSPC"], target="^HSI", st="2022-01-01", et="2022-12-31", nn=5, features=["Close_return", "Volume_return"]):
  # reference list
  refer_dic_list = []
  for it in refer_list:
    _, tmp = get_useful_data(it, st, et, nn, features)
    refer_dic_list.append( tmp )
  # target
  _, target_dic = get_useful_data(target, st, et, nn, features) # ["Close_return"]
  # integrate references & target
  hk_us_stock_signal = []
  for k, v in sorted(target_dic.items()):
    date_str = k
    rise_fall_signal = v
    for pk, pv in sorted(refer_dic_list[0].items(), reverse=True):
      if compare_date_str(k, pk)==1:
        vs = []
        for rd in refer_dic_list:
          vs.append( rd[pk] )
        #print(k, v, pk, vs)
        hk_us_stock_signal.append( (k, v, pk, vs) )
        break
  return hk_us_stock_signal

# list of lists -> dataframe
def lists_to_dataframe(hk_us_stock_signal):
  tmp_list = []
  L = 0
  for it in hk_us_stock_signal[ : ]:
    hkt, hk_hsi, ust, vs = it
    # flatten list of lists to be list 
    vs = np.array(vs)
    vs = vs.reshape(-1)
    vs = list(vs)
    L = len(vs)
    sub_list = []
    sub_list.append( hkt )
    sub_list += hk_hsi
    sub_list.append( ust )
    sub_list += vs
    tmp_list.append( sub_list )
  cols = ["HKT", "HKDR", "HKVC", "UST"]
  for i in range(L):
    cols.append( "feature"+str(i+1) )
  #print( len(cols), len(tmp_list[0]) )
  assert len(cols)==len(tmp_list[0])
  df_data = pd.DataFrame(tmp_list, columns=cols)
  return df_data


# scale list
def scale_list_values(vs):
  tmp = []
  for v in vs:
    tmp.append( v*100 )
  return tmp

### 
tn = "9988.HK"
year = 2022
st, et = str(year)+"-01-01", str(year)+"-12-31"
df_data = get_df_data(tn, st, et)
df_data = calculate_daily_change(df_data)
df_data

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Close_return,Volume_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-01-03,117.000000,117.500000,114.000000,115.000000,115.000000,22176946,,
2022-01-04,118.400002,118.900002,115.699997,116.900002,116.900002,23228903,0.016522,0.047435
2022-01-05,119.000000,119.099998,113.900002,114.500000,114.500000,30717509,-0.020530,0.322383
2022-01-06,117.500000,121.099998,117.300003,121.000000,121.000000,47231895,0.056769,0.537621
2022-01-07,126.500000,128.800003,122.800003,128.800003,128.800003,58778943,0.064463,0.244476
...,...,...,...,...,...,...,...,...
2022-11-08,68.750000,69.000000,66.400002,67.099998,67.099998,64928791,-0.037303,-0.349366
2022-11-09,66.699997,67.750000,65.150002,66.000000,66.000000,63076446,-0.016393,-0.028529
2022-11-10,63.900002,64.500000,62.599998,63.000000,63.000000,58517362,-0.045455,-0.072279
2022-11-11,68.750000,71.449997,67.050003,70.800003,70.800003,110961720,0.123810,0.896219


In [39]:
### prepare data to fit models
nn = 1
dataset_xy = []
refer_list=["^IXIC", "AAPL", "GOOGL", "AMZN", "BABA", "PDD", "JD", "MPNGY", "TME", "BIDU"] # "^IXIC", "^DJI", "^GSPC"
target="9988.HK" # 0700.HK, ^HSI, 0005.HK:滙豐控股, 1299.HK:友邦保險, 0700.HK:騰訊控股, 9988.HK:阿⾥巴巴, 3690.HK:美團
st, et = str(year)+"-01-01", str(year)+"-12-31"
st, et = "2022-01-01", "2022-11-14"
features=["Close_return", "Volume_return"]

for i in range(2022, 2023):
  hk_us_stock_signal = stocks_signal(refer_list, target, st, et, nn, features)
  dataset_xy = dataset_xy + hk_us_stock_signal
print(target, ':', len(dataset_xy), "business days")

#
X_data = []
y_data = []
X_positive, X_negative = [], []
y_positive, y_negative = [], []

counter = 0
for it in dataset_xy[ : ]:
  hkt, hk_hsi, ust, vs = it
  # flatten list of lists to be list 
  vs = np.array(vs)
  vs = vs.reshape(-1)
  vs = list(vs)
  #
  counter += 1
  if counter==1:
    print("starting date:", hkt, ust)
  if counter==len(dataset_xy):
    print("ending date:", hkt, ust)
  #
  if math.isnan(hk_hsi[0]):
    continue
  fg = True
  for v in vs:
    if math.isnan(v):
      fg = False
      break
  if not fg:
    continue
  #us_sp500, us_dowjones, us_nasdaq = us_sp500*100, us_dowjones*100, us_nasdaq*100
  X_data.append( scale_list_values(vs) )
  if hk_hsi[0]>0:
    y_data.append( 1 )
    X_positive.append( scale_list_values(vs) )
    y_positive.append( 1 )
  else:
    y_data.append( 0 )
    X_negative.append( scale_list_values(vs) )
    y_negative.append( 0 )

print( "rise vs fall: ", sum(y_data), len(y_data)-sum(y_data) )
X_data[-2:], y_data[-2:]
len(X_positive), len(y_positive), len(X_negative), len(y_negative)

import random
random.seed(9)

sample_number = min(len(X_positive), len(X_negative))

X_data = random.sample(X_positive, sample_number)
y_data = random.sample(y_positive, sample_number)
X_data = X_data + random.sample(X_negative, sample_number)
y_data = y_data + random.sample(y_negative, sample_number)


len(X_data), len(y_data)
print( "rise vs fall: ", sum(y_data), len(y_data)-sum(y_data) )
np.array(X_data).shape, np.array(y_data).shape

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
9988.HK : 212 business days
starting date: 2022-01-04 2022-01-03
ending date: 2022-11-11 2022-11-10
rise vs fall:  85 126
rise vs fall:  85 85


((170, 20), (170,))

In [44]:


df_data = lists_to_dataframe(hk_us_stock_signal)
df_data

24 24


Unnamed: 0,HKT,HKDR,HKVC,UST,feature1,feature2,feature3,feature4,feature5,feature6,...,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20
0,2022-01-04,0.016522,0.047435,2022-01-03,,,,,,,...,,,,,,,,,,
1,2022-01-05,-0.020530,0.322383,2022-01-04,-0.013269,0.158275,-0.012692,-0.049551,-0.004083,-0.008588,...,-0.111943,1.359285,-0.060442,1.132382,-0.061696,1.729614,-0.058394,0.183676,-0.017237,0.235208
2,2022-01-06,0.056769,0.537621,2022-01-05,-0.033448,-0.019345,-0.026600,-0.048059,-0.045876,0.923169,...,-0.025893,-0.410963,-0.018380,-0.040105,-0.074458,0.207547,-0.017054,-0.135264,-0.018085,0.218680
3,2022-01-07,0.064463,0.244476,2022-01-06,-0.001279,-0.047901,-0.016693,0.025031,-0.000200,-0.316196,...,0.074181,0.505374,0.059664,-0.324300,0.051031,-0.020833,0.039432,0.299012,0.047748,0.095011
4,2022-01-10,-0.009317,-0.373671,2022-01-07,-0.009612,-0.115377,0.000988,-0.105206,-0.005303,-0.203170,...,0.072319,-0.138743,0.021863,0.003309,0.012376,-0.564495,0.024279,-0.158115,0.017114,-0.283147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,2022-11-07,-0.002861,-0.343454,2022-11-04,0.012792,0.068904,-0.003600,0.437080,0.037756,-0.166140,...,0.086361,0.749829,0.097428,0.836077,0.049128,1.900628,0.047493,0.505504,0.090211,0.654493
208,2022-11-08,-0.037303,-0.349366,2022-11-07,0.008522,-0.182821,0.003902,-0.407500,0.022060,-0.333167,...,-0.002747,-0.269627,0.002929,-0.491270,-0.017656,-0.529391,0.000000,-0.481622,-0.002113,-0.371486
209,2022-11-09,-0.016393,-0.028529,2022-11-08,0.004892,0.152088,0.004175,0.078368,0.004633,-0.107101,...,0.019443,-0.377069,0.022018,-0.097408,-0.012503,-0.205364,-0.020151,0.498235,0.002000,-0.334956
210,2022-11-10,-0.045455,-0.072279,2022-11-09,-0.024776,-0.030585,-0.033190,-0.166733,-0.017773,0.044040,...,-0.059758,-0.109792,-0.066608,0.019527,-0.067001,-0.201543,-0.059126,0.179761,-0.067027,0.574070


In [40]:
### data-set split and train models
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=0, shuffle=True)
print( "training data: ", len(X_train), len(y_train) )
print( "testing data: ", len(X_test), len(y_test) )
print()

# LR
LR1 = LogisticRegression()
LR1.fit(X_train, y_train)
y_pred = LR1.predict(X_test)
score = LR1.score(X_test, y_test)
print(classification_report(y_test, y_pred))
print("LR", score)


# SVM
SVM1 = svm.SVC(kernel='linear', probability=True)  # solver='lbfgs', , max_iter=1000 * 1000 * 20
SVM1.fit(X_train, y_train)
y_pred = SVM1.predict(X_test)
score = SVM1.score(X_test, y_test)
print(classification_report(y_test, y_pred))
print("SVM", score)


# RF
RM1=RandomForestClassifier(n_estimators=100)
RM1.fit(X_train, y_train)
y_pred = RM1.predict(X_test)
score = RM1.score(X_test, y_test)
print(classification_report(y_test, y_pred))
print("RF", score)

training data:  127 127
testing data:  43 43

              precision    recall  f1-score   support

           0       0.76      0.73      0.74        22
           1       0.73      0.76      0.74        21

    accuracy                           0.74        43
   macro avg       0.74      0.74      0.74        43
weighted avg       0.74      0.74      0.74        43

LR 0.7441860465116279


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.86      0.82      0.84        22
           1       0.82      0.86      0.84        21

    accuracy                           0.84        43
   macro avg       0.84      0.84      0.84        43
weighted avg       0.84      0.84      0.84        43

SVM 0.8372093023255814
              precision    recall  f1-score   support

           0       0.68      0.77      0.72        22
           1       0.72      0.62      0.67        21

    accuracy                           0.70        43
   macro avg       0.70      0.70      0.70        43
weighted avg       0.70      0.70      0.70        43

RF 0.6976744186046512


In [41]:
# get latest US stock data to predict HK stock
df_reference = pd.DataFrame()
year = 2022
st, et = str(year)+"-01-01", str(year)+"-12-31"
#refer_list=["^IXIC", "AAPL", "GOOGL", "AMZN", "BABA", "PDD", "JD", "MPNGY", "TME", "BIDU"] # "^IXIC", "^DJI", "^GSPC"
#target="9988.HK" # 0700.HK, ^HSI, 0005.HK:滙豐控股, 1299.HK:友邦保險, 0700.HK:騰訊控股, 9988.HK:阿⾥巴巴, 3690.HK:美團
lastest_data = []
lastest_day = -1
for tn in refer_list:
    tmp_df = get_df_data(tn, st, et)
    tmp_df = calculate_daily_change(tmp_df)
    #tmp_df = tmp_df[ ['Close', 'Close_return'] ]
    #df_reference[tn+"_Close"] = tmp_df[ 'Close' ]
    df_reference[tn+"_Close_return"] = tmp_df[ 'Close_return' ]
    df_reference[tn+"_Volume_return"] = tmp_df[ 'Volume_return' ]
    date_time = list(tmp_df.index)[lastest_day]
    close_point = list(tmp_df[ 'Close_return' ])[lastest_day]
    volume_point = list(tmp_df[ 'Close_return' ])[lastest_day]
    lastest_data.append( close_point )
    lastest_data.append( volume_point )
    print("date_time:", date_time, tn, close_point*100, volume_point*100)
df_reference
lastest_data = scale_list_values(lastest_data) # scale the data as the procedure of training data
print("lastest_data of US stock market info: ", lastest_data)

#
rise_or_fall = LR1.predict([lastest_data])
prob = LR1.predict_proba([lastest_data])
print( "LR predict today: ", rise_or_fall, prob )

rise_or_fall = SVM1.predict([lastest_data])
prob = SVM1.predict_proba([lastest_data])
print( "SVM predict today: ", rise_or_fall, prob )

rise_or_fall = RM1.predict([lastest_data])
prob = RM1.predict_proba([lastest_data])
print( "RF predict today: ", rise_or_fall, prob )

[*********************100%***********************]  1 of 1 completed
date_time: 2022-11-14 00:00:00 ^IXIC -1.1225441113436807 -1.1225441113436807
[*********************100%***********************]  1 of 1 completed
date_time: 2022-11-14 00:00:00 AAPL -0.9485625904429829 -0.9485625904429829
[*********************100%***********************]  1 of 1 completed
date_time: 2022-11-14 00:00:00 GOOGL -0.7364450647212569 -0.7364450647212569
[*********************100%***********************]  1 of 1 completed
date_time: 2022-11-14 00:00:00 AMZN -2.281975425008139 -2.281975425008139
[*********************100%***********************]  1 of 1 completed
date_time: 2022-11-14 00:00:00 BABA 0.7913031150936856 0.7913031150936856
[*********************100%***********************]  1 of 1 completed
date_time: 2022-11-14 00:00:00 PDD 0.32218331311664744 0.32218331311664744
[*********************100%***********************]  1 of 1 completed
date_time: 2022-11-14 00:00:00 JD 3.9239692560194337 3.92396925