In [2]:
# predict HK stocks according to US stocks

In [32]:
# import packages needed
#!pip install yfinance
import yfinance as yf
import math
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [60]:
# get data by ticker-name, start-time & end-time
def get_df_data(ticker_name="AAPL", start_time="2022-01-01", end_time="2022-10-09"):
  """Download the price history of one ticker through yfinance.

  Args:
    ticker_name: ticker symbol, forwarded to ``yf.download`` as ``tickers``.
    start_time: start date string, forwarded as ``start``.
    end_time: end date string, forwarded as ``end``.

  Returns:
    The object returned by ``yf.download`` for that request, unmodified
    (callers in this notebook use it as a DataFrame with a "Close" column).
  """
  return yf.download(tickers=ticker_name, start=start_time, end=end_time)

# calculate the daily return by (current_index - previous_index) / previous_index
def calculate_daily_return(df_data, OHLC_index="Close"):
  """Append the day-over-day simple return of one price column.

  The return of row ``t`` is ``(price[t] - price[t-1]) / price[t-1]``. The
  first row has no predecessor, so its return is NaN.

  Args:
    df_data: DataFrame containing the column ``OHLC_index``. It is modified
      in place: a ``<OHLC_index>_return`` column is added (or overwritten).
    OHLC_index: name of the price column to use (default "Close").

  Returns:
    The same ``df_data`` object, now carrying the ``<OHLC_index>_return`` column.
  """
  previous = df_data[OHLC_index].shift(1)
  # Intermediate values stay in local Series instead of scratch columns, so
  # columns that already happen to be called "<col>_previous" or
  # "<col>_delta" are no longer overwritten and then deleted.
  df_data[OHLC_index + "_return"] = (df_data[OHLC_index] - previous) / previous
  return df_data

# convert the time to be string type: yyyy-mm-dd
def get_ymt_date(df_data):
  """Add a "ymd_time" column holding the index rendered as text.

  Each index value is converted to ``str`` and cut to its first 10
  characters, which is the ``yyyy-mm-dd`` part when the index holds
  timestamps.

  Args:
    df_data: DataFrame to annotate; modified in place.

  Returns:
    The same ``df_data`` object with the "ymd_time" column added.
  """
  df_data["ymd_time"] = df_data.index
  # stringify, then keep only the leading 10 characters
  df_data["ymd_time"] = df_data["ymd_time"].astype(str).str[:10]
  return df_data

# dict type: key - time, value - daily return
def get_date_return(df_data):
  """Build a ``{date string: daily return}`` mapping from a prepared frame.

  Args:
    df_data: DataFrame with a "ymd_time" column (keys) and a
      "Close_return" column (values).

  Returns:
    dict mapping every "ymd_time" entry to the "Close_return" value of the
    same row. If a date occurs more than once, the last row wins.
  """
  dates = list(df_data["ymd_time"])
  returns = list(df_data["Close_return"])
  return dict(zip(dates, returns))

# compare 2 string-type dates, 
  # for example: '2022-01-01' -> 20220101, '2022-10-10' -> 20221010,
def compare_date_str(date_str1, date_str2):
  """Order two 'yyyy-mm-dd' date strings.

  Args:
    date_str1: first date, e.g. '2022-01-01'.
    date_str2: second date, e.g. '2022-10-10'.

  Returns:
    1 if ``date_str1`` is later than ``date_str2``, -1 if it is earlier,
    0 if both denote the same day.

  Raises:
    ValueError: if the year/month/day characters of either string do not
      form an integer.
  """
  def to_number(date_str):
    # '2022-10-10' -> 20221010: keep the year, month and day characters and
    # skip the separators at positions 4 and 7
    return int(date_str[0:4] + date_str[5:7] + date_str[8:10])

  first, second = to_number(date_str1), to_number(date_str2)
  # bool arithmetic yields exactly 1, -1 or 0
  return (first > second) - (first < second)

# 3 US stock indexes: Nasdaq, DJI, SP500, 1 HK index: HSI
# get their data for certain year
# close points -> moving average close points, parameter: nn
# calculate the daily return based on the MA close points
# get the date-return dic
def get_date_return_dic(tn, st, et, nn):
  """Download one ticker and return its smoothed daily returns keyed by date.

  Args:
    tn: ticker symbol to download.
    st: start date string of the download window.
    et: end date string of the download window.
    nn: window length of the simple moving average applied to "Close"
      before the returns are computed.

  Returns:
    dict mapping 'yyyy-mm-dd' strings to the daily return of the
    moving-averaged close.
  """
  prices = get_df_data(ticker_name=tn, start_time=st, end_time=et)
  # smoothing step: the close is replaced by its nn-row moving average
  prices['Close'] = prices['Close'].rolling(nn).mean()
  with_returns = calculate_daily_return(prices)
  with_dates = get_ymt_date(with_returns)
  return get_date_return(with_dates)


# match US stocks with HK stock by date, because of the time difference between the HK and US markets
def US_HK_stock_signal(year = 2022, nn = 5):
  """Pair each Hang Seng trading day with the latest earlier US session.

  This is the fixed-ticker case of ``stocks_signal`` (target "^HSI",
  references S&P 500, Dow Jones and Nasdaq), so the date matching is
  delegated to it instead of being duplicated here.

  Args:
    year: calendar year to download (1 January to 31 December).
    nn: moving-average window applied to the close prices.

  Returns:
    list of tuples ``(hk_date, hk_return, us_date, sp500_return,
    dowjones_return, nasdaq_return)`` ordered by ``hk_date``. HK days with
    no earlier US session in the downloaded range are omitted.
  """
  # S&P 500 is listed first on purpose: stocks_signal takes the matching
  # dates from the first reference, and this function has always matched
  # on the S&P 500 calendar.
  us_indexes = ["^GSPC", "^DJI", "^IXIC"]
  matched = stocks_signal(refer_list=us_indexes, target="^HSI", year=year, nn=nn)
  # flatten (hk_date, hk_return, us_date, [v1, v2, v3]) into a 6-tuple
  return [
    (hk_date, hk_return, us_date, *us_returns)
    for hk_date, hk_return, us_date, us_returns in matched
  ]

# rise -> 1, fall -> 0
def rise_fall_judge(point):
  """Turn a return into a direction flag.

  Args:
    point: numeric return value.

  Returns:
    1 when ``point`` is strictly positive, otherwise 0 (zero, negative
    and NaN values all map to 0).
  """
  return 1 if point > 0 else 0

# filter signals according to HK stock rise/fall
def US_HK_info(hk_us_stock_signal, hk_s, us_stock_num):
  """Select the records with a given HK direction and US rise count.

  Args:
    hk_us_stock_signal: iterable of 6-tuples ``(hk_date, hk_return,
      us_date, us_return1, us_return2, us_return3)``.
    hk_s: wanted HK direction as produced by ``rise_fall_judge``
      (1 = rise, 0 = fall).
    us_stock_num: wanted number of US returns (0 to 3) that are rises.

  Returns:
    list of the original, unmodified tuples that satisfy both conditions,
    in input order.
  """
  selected = []
  for record in hk_us_stock_signal:
    hk_date, hk_return, us_date, us_ret1, us_ret2, us_ret3 = record
    hk_direction = rise_fall_judge(hk_return)
    us_rise_count = sum(rise_fall_judge(r) for r in (us_ret1, us_ret2, us_ret3))
    if hk_direction != hk_s or us_rise_count != us_stock_num:
      continue
    selected.append(record)
  return selected
# 

# match US stocks with HK stock by date, because of the time difference between the HK and US markets
def stocks_signal(refer_list=["^IXIC", "^DJI", "^GSPC"], target="^HSI", year = 2022, nn = 5):
  """Pair every trading day of ``target`` with the latest earlier reference session.

  For each date of the target ticker, the most recent date of the FIRST
  reference ticker that is strictly earlier is looked up, and the returns
  of all reference tickers on that date are collected.

  Args:
    refer_list: reference ticker symbols; the first one supplies the dates
      used for matching. The default list is never mutated.
    target: ticker symbol whose returns are to be explained.
    year: calendar year to download (1 January to 31 December).
    nn: moving-average window applied to the close prices.

  Returns:
    list of tuples ``(target_date, target_return, refer_date,
    refer_returns)`` ordered by ``target_date``, where ``refer_returns`` is
    a list with one value per entry of ``refer_list``. A reference ticker
    that has no data on ``refer_date`` contributes NaN. Target dates with
    no earlier reference date are omitted. An empty ``refer_list`` gives
    an empty result.
  """
  from bisect import bisect_left

  if not refer_list:
    # nothing to pair the target with
    return []

  st, et = str(year)+"-01-01", str(year)+"-12-31"
  refer_dic_list = [get_date_return_dic(ticker, st, et, nn) for ticker in refer_list]
  target_dic = get_date_return_dic(target, st, et, nn)

  # Sort the matching calendar once instead of once per target date.
  # Zero-padded 'yyyy-mm-dd' strings sort chronologically.
  refer_dates = sorted(refer_dic_list[0])
  missing = float("nan")

  hk_us_stock_signal = []
  for k in sorted(target_dic):
    # bisect_left returns how many reference dates are strictly before k
    earlier = bisect_left(refer_dates, k)
    if earlier == 0:
      continue
    pk = refer_dates[earlier - 1]
    # .get: calendars of the other references may lack pk (for example a
    # ticker with a shorter history); NaN marks the gap instead of raising
    vs = [rd.get(pk, missing) for rd in refer_dic_list]
    hk_us_stock_signal.append( (k, target_dic[k], pk, vs) )
  return hk_us_stock_signal

# scale list
def scale_list_values(vs):
  """Return a new list with every element of ``vs`` multiplied by 100.

  Args:
    vs: iterable of numbers (fractional returns in this notebook).

  Returns:
    list of the scaled values, in the same order; ``vs`` is left untouched.
  """
  return [value * 100 for value in vs]


In [74]:
# --- experiment configuration ---
nn = 1  # moving-average window forwarded to stocks_signal
refer_list=["^IXIC", "AAPL", "GOOGL", "AMZN", "BIDU", "BABA"] # alternatives: "^IXIC", "^DJI", "^GSPC"
target="0700.HK" # alternatives: 0700.HK, ^HSI

# --- collect the matched (target, reference) records, one year at a time ---
dataset_xy = []
for i in range(2022, 2023):
  hk_us_stock_signal = stocks_signal(refer_list, target, i, nn)
  dataset_xy.extend(hk_us_stock_signal)

print('L:', len(dataset_xy))

#
# Build the feature matrix (reference returns in percent) and the binary
# labels (1 = target rose, 0 = target did not rise).
X_data = []
y_data = []
# the first nn records are skipped
for hkt, hk_hsi, ust, vs in dataset_xy[nn:]:
  # drop every sample with a missing target or reference return
  if math.isnan(hk_hsi) or any(math.isnan(v) for v in vs):
    continue
  X_data.append( scale_list_values(vs) )
  y_data.append( 1 if hk_hsi>0 else 0 )

print( "rise vs fall: ", sum(y_data), len(y_data)-sum(y_data) )

### data-set split
# NOTE(review): shuffle=True mixes earlier and later trading days between the
# train and test sets; confirm that is intended for this time-ordered data.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=0, shuffle=True) 

### fit and evaluate: logistic regression, linear SVM, random forest
logisticRegr = LogisticRegression()
candidate_models = [
  logisticRegr,
  svm.SVC(kernel='linear'),
  # seeded (same seed as the split) so the forest's report is reproducible
  # on a fresh Restart & Run All
  RandomForestClassifier(n_estimators=100, random_state=0),
]
# One loop replaces three copy-pasted stanzas; the models run in the same
# order and print the same two items each (classification report, accuracy).
for clf in candidate_models:
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  score = clf.score(X_test, y_test)
  print(classification_report(y_test, y_pred))
  print(score)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
L: 212
rise vs fall:  91 120
              precision    recall  f1-score   support

           0       0.64      0.96      0.77        28
           1       0.91      0.40      0.56        25

    accuracy                           0.70        53
   macro avg       0.78      0.68      0.66        53
weighted avg       0.77      0.70      0.67        53

0.6981132075471698
              precision    recall  f1-score   support

           0       0.66      0.96      0.78        28
           1       0.92      0.44