In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
!pip install yfinance
import yfinance as yf # https://pypi.org/project/yfinance/
import math
import random
import seaborn as sns
import datetime
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings(action='ignore')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting yfinance
  Downloading yfinance-0.2.11-py2.py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 KB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting cryptography>=3.3.2
  Downloading cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Collecting frozendict>=2.3.4
  Downloading frozendict-2.3.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 KB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.26
  Downloading requests-2.28.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting beautifu

In [14]:
# get data by ticker-name, start-time & end-time
def get_df_data(ticker_name="AAPL", start_time="2022-01-01", end_time="2022-10-09", ma_windows=(5, 20)):
    """Download price bars for one ticker and add return / moving-average columns.

    Parameters
    ----------
    ticker_name : str
        Ticker symbol passed to ``yf.download`` as ``tickers``.
    start_time, end_time : str
        Passed to ``yf.download`` as ``start`` and ``end``.
    ma_windows : tuple of two ints, optional
        Rolling-window lengths (in rows) for the ``MA1`` and ``MA2`` columns.
        Defaults to ``(5, 20)``, the values that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``Open, High, Low, Close, Volume`` from the download plus:
        ``previous_Close`` (Close shifted down one row), ``daily_return``
        ((Close - previous_Close) / previous_Close), and ``MA1`` / ``MA2``
        (rolling means of Close). Leading rows of the derived columns are NaN
        until enough history is available.
    """
    downloaded = yf.download(tickers=ticker_name, start=start_time, end=end_time)
    # Take an explicit copy: adding columns to a bare column selection is chained
    # assignment and may write to a temporary instead of the frame we return.
    df_data = downloaded[["Open", "High", "Low", "Close", "Volume"]].copy()
    df_data['previous_Close'] = df_data['Close'].shift(1)
    df_data['daily_return'] = (df_data['Close'] - df_data['previous_Close']) / df_data['previous_Close']
    MA1, MA2 = ma_windows
    df_data['MA1'] = df_data['Close'].rolling(MA1).mean()
    df_data['MA2'] = df_data['Close'].rolling(MA2).mean()
    return df_data

def merge_stocks(df_data1, df_data2):
    """Pair each target bar with the latest reference bar strictly before it.

    Parameters
    ----------
    df_data1 : pandas.DataFrame
        Target-stock bars; must provide ``Open``, ``High``, ``Low``, ``Close``.
    df_data2 : pandas.DataFrame
        Reference-stock bars with the same four columns. Its index must be
        sorted ascending (the lookup below is a binary search).

    Returns
    -------
    pandas.DataFrame
        One row per target bar that has an earlier reference bar, with columns
        ``target_time, target_open, target_high, target_low, target_close,
        ref_time, ref_open, ref_high, ref_low, ref_close``. Target bars with no
        strictly earlier reference bar are dropped. An empty ``df_data2``
        yields an empty frame with these columns.
    """
    price_cols = ['Open', 'High', 'Low', 'Close']
    col_names = ['target_time', 'target_open', 'target_high', 'target_low', 'target_close',
                 'ref_time', 'ref_open', 'ref_high', 'ref_low', 'ref_close']
    if len(df_data2) == 0:
        # Nothing to pair with; previously this case crashed on index[-1].
        return pd.DataFrame([], columns=col_names)

    ref_index = df_data2.index
    data = []
    for i, target_time in enumerate(df_data1.index):
        # Position of the first reference timestamp >= target_time; the bar just
        # before it is therefore the latest one strictly earlier than the target.
        j = ref_index.searchsorted(target_time, side='left')
        if j == 0:
            # No earlier reference bar exists (j - 1 would wrap around to the last row).
            continue
        target_row = df_data1.iloc[i]
        ref_row = df_data2.iloc[j - 1]
        data.append([
            target_time, *(target_row[col] for col in price_cols),
            ref_index[j - 1], *(ref_row[col] for col in price_cols),
        ])
    return pd.DataFrame(data, columns=col_names)

def LR(train_data, label_name, min_num=20):
    """Fit ``target_<label_name> = k * ref_<label_name> + b`` by least squares.

    Returns ``(k, b, R2)``. When ``train_data`` has fewer than ``min_num`` rows
    no fit is attempted and the placeholder ``(1, 0, 0)`` is returned instead.
    """
    ref_values = train_data["ref_" + label_name]
    target_values = train_data["target_" + label_name]
    enough_samples = len(ref_values) >= min_num
    if not enough_samples:
        return 1, 0, 0
    fit = stats.linregress(list(ref_values), list(target_values))
    # rvalue is the correlation coefficient; squaring it gives R^2.
    return fit.slope, fit.intercept, fit.rvalue * fit.rvalue

def daily_models(merged_data, label_name, min_num, window=100):
    """Fit a rolling linear model for every row using only the rows before it.

    For row ``i`` (``i >= 1``) the training set is the up-to-``window`` rows
    immediately preceding it, so the row itself is never part of its own fit.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Needs ``target_time``, ``ref_time``, ``target_<label_name>`` and
        ``ref_<label_name>`` columns.
    label_name : str
        Price field to model, used as the suffix of the columns above.
    min_num : int
        Minimum training rows required by ``LR``; below that ``LR`` returns
        its placeholder ``(1, 0, 0)``. The first ``min_num`` output rows are
        also dropped from the result.
    window : int, optional
        Maximum number of preceding rows used for training. Defaults to 100,
        the value that used to be hard-coded. If ``min_num > window`` no fit
        ever gets enough rows and every model is the placeholder.

    Returns
    -------
    pandas.DataFrame
        Columns ``target_time, target_point, ref_time, ref_point,
        target_from_time, target_to_time, ref_from_time, ref_to_time,
        k_<label>, b_<label>, R2<label>``, sliced with ``[min_num:]``.
        The ``R2`` column name has no underscore; it is kept as-is so existing
        consumers keep working.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    data = []
    for i in range(len(merged_data)):
        train_data = merged_data.iloc[max(0, i - window):i]
        if len(train_data) == 0:
            # The very first row has no history to train on.
            continue
        k, b, R2 = LR(train_data, label_name, min_num)
        it = merged_data.iloc[i]
        first_train, last_train = train_data.iloc[0], train_data.iloc[-1]
        data.append([
            it['target_time'], it['target_' + label_name],
            it['ref_time'], it['ref_' + label_name],
            first_train['target_time'], last_train['target_time'],
            first_train['ref_time'], last_train['ref_time'],
            k, b, R2,
        ])

    col_names = ['target_time', 'target_point', 'ref_time', 'ref_point',
                 'target_from_time', 'target_to_time', 'ref_from_time', 'ref_to_time',
                 'k_'+label_name, 'b_'+label_name, 'R2'+label_name]
    df = pd.DataFrame(data, columns=col_names)
    return df[min_num:]

In [38]:
# Candidate pairs as (reference ticker, target ticker, date, ratio) tuples.
# NOTE(review): the date and ratio fields are unpacked below but not used in this
# cell; the ratio looks like a share-conversion factor times an FX rate -- confirm.
ALL_STOCKS_INFO = [
    ('BABA', '9988.HK', '2022-11-01', 1),
    ('BIDU', '9888.HK', '2022-10-25', 1),
    ('JD', '9618.HK', '2022-10-24', 0.5 * 7.8),
    ('MPNGY', '3690.HK', '2022-10-24', 0.5 * 7.8),
    ('NTES', '9999.HK', '2022-10-26', 0.2 * 7.8),
    ('LI', '2015.HK', '2022-10-26', 0.5 * 7.8),
    ('NIO', '9866.HK', '2022-10-26', 1 * 7.8)
]

# Only the first two pairs are active. The full list used to be assigned to
# `stocks_info` and immediately overwritten by this same subset.
stocks_info = ALL_STOCKS_INFO[:2]
total_business_days = 0
gain_rate, loss_rate = 1.3, 9
trade_info_col = {}
for it in stocks_info[:1]:
    # get market data of target-stock & reference-stock
    reference_stock, target_stock, st, ratio = it
    # The per-pair date unpacked into `st` is overridden by a fixed download range.
    st, et = "2021-01-01", "2023-02-28"
    print("time range:\t", st, "-", et)
    df_data1 = get_df_data(ticker_name=target_stock, start_time=st, end_time=et)
    print("target stock:\t", target_stock, "\t", len(df_data1))
    df_data2 = get_df_data(ticker_name=reference_stock, start_time=st, end_time=et)
    print("reference stock:\t", reference_stock, "\t", len(df_data2))
    # merge reference-stock & target-stock: target stock (datetime, open, high, low, close),
    # 1-day previous reference stock (datetime, open, high, low, close)
    merged_data = merge_stocks(df_data1, df_data2)
    for label_name in ['low']:  # "open", "high", "low", "close"
        # Copy before adding columns: daily_models returns a slice of a larger frame.
        model_df = daily_models(merged_data, label_name, 100).copy()
        MA_days = 20
        # Smooth the daily slope / intercept with a rolling mean over MA_days rows.
        model_df['k_'+label_name+"_MA"] = model_df['k_'+label_name].rolling(MA_days).mean()
        model_df['b_'+label_name+"_MA"] = model_df['b_'+label_name].rolling(MA_days).mean()
        # Drop the first MA_days rows; copy so the assignment below targets a real frame.
        model_df = model_df[MA_days:].copy()
        # Buy price = 98.5% of the price predicted from the reference bar.
        model_df['buy_price'] = (model_df['ref_point']*model_df['k_'+label_name+ '_MA']+model_df['b_'+label_name+'_MA'])*0.985
        # Keep the days where the target's price for this label was at or below the buy price.
        buy_df = model_df[ model_df['buy_price']>=model_df['target_point'] ]

time range:	 2021-01-01 - 2023-02-28
[*********************100%***********************]  1 of 1 completed
target stock:	 9988.HK 	 519
[*********************100%***********************]  1 of 1 completed
reference stock:	 BABA 	 531


In [50]:
def search_by_index(df_data1, target_time):
  """Return the position of the first index label equal to `target_time`, or -1 if none matches."""
  for position, current_time in enumerate(df_data1.index):
    if target_time == current_time:
      return position
  return -1

def possible_returns(buy_df, df_data1, hold_days):
  """Collect the Low/Close path that follows each buy signal.

  Parameters
  ----------
  buy_df : pandas.DataFrame
      Buy signals; needs ``target_time`` and ``buy_price`` columns.
  df_data1 : pandas.DataFrame
      Price bars with ``Low`` and ``Close`` columns, indexed by the same
      timestamps that appear in ``buy_df['target_time']``.
  hold_days : int
      Number of rows after the buy row to include.

  Returns
  -------
  (pandas.DataFrame, list)
      The frame holds one row per signal with a complete window: ``buy_time``,
      ``buy_price`` and, for N = 1 .. hold_days + 1, ``sell_N``,
      ``sell_N_low``, ``sell_N_high``. The list holds the raw rows of signals
      whose window ran past the end of ``df_data1``. Prices are rounded to one
      decimal.

      NOTE(review): ``sell_1`` is the buy row itself, and the ``*_high``
      columns are filled from ``Close``, not ``High``. The names are kept for
      backward compatibility -- confirm which one was intended.

  Raises
  ------
  KeyError
      If a signal's ``target_time`` is not in ``df_data1.index``.
  """
  # Map each timestamp to its first position once, instead of rescanning the
  # whole index for every signal.
  first_position = {}
  for position, label in enumerate(df_data1.index):
    first_position.setdefault(label, position)

  # 2 buy fields + 3 fields for each of the (hold_days + 1) rows.
  full_length = 5 + hold_days*3
  hold_data1 = []
  hold_data2 = []
  for i in range(len(buy_df)):
    signal = buy_df.iloc[i]
    target_time = signal['target_time']
    start = first_position.get(target_time)
    if start is None:
      raise KeyError("buy time %r not found in price data index" % (target_time,))
    hold_info = [ target_time, round(signal['buy_price'],1) ]
    # Buy row plus up to hold_days following rows, clipped at the end of the data.
    stop = min(len(df_data1), start + 1 + hold_days)
    for j in range(start, stop):
      bar = df_data1.iloc[j]
      hold_info.append( df_data1.index[j] )
      hold_info.append( round(bar['Low'],1) )
      hold_info.append( round(bar['Close'],1) )
    if len(hold_info)==full_length:
      hold_data1.append( hold_info )
    else:
      hold_data2.append( hold_info )

  col_names = ["buy_time", "buy_price"]
  for day in range(hold_days+1):
    prefix = "sell_"+str(day+1)
    col_names.extend([ prefix, prefix+"_low", prefix+"_high" ])
  df = pd.DataFrame(hold_data1, columns = col_names)
  return df, hold_data2

# Follow the last 30 buy signals for 5 rows after each buy; signals whose window
# runs past the end of the price data are returned separately in hold_data2.
hold_df, hold_data2 = possible_returns(buy_df.iloc[-30:], df_data1, hold_days=5)
hold_df

Unnamed: 0,buy_time,buy_price,sell_1,sell_1_low,sell_1_high,sell_2,sell_2_low,sell_2_high,sell_3,sell_3_low,sell_3_high,sell_4,sell_4_low,sell_4_high,sell_5,sell_5_low,sell_5_high,sell_6,sell_6_low,sell_6_high
0,2022-09-23,78.6,2022-09-23,78.4,78.4,2022-09-26,76.7,78.7,2022-09-27,76.1,77.9,2022-09-28,74.4,74.7,2022-09-29,75.8,76.8,2022-09-30,75.9,77.9
1,2022-09-27,77.2,2022-09-27,76.1,77.9,2022-09-28,74.4,74.7,2022-09-29,75.8,76.8,2022-09-30,75.9,77.9,2022-10-03,76.8,77.7,2022-10-05,81.4,84.2
2,2022-09-28,75.6,2022-09-28,74.4,74.7,2022-09-29,75.8,76.8,2022-09-30,75.9,77.9,2022-10-03,76.8,77.7,2022-10-05,81.4,84.2,2022-10-06,82.7,83.2
3,2022-10-03,77.0,2022-10-03,76.8,77.7,2022-10-05,81.4,84.2,2022-10-06,82.7,83.2,2022-10-07,81.2,81.3,2022-10-10,78.0,78.7,2022-10-11,75.6,76.0
4,2022-10-07,81.6,2022-10-07,81.2,81.3,2022-10-10,78.0,78.7,2022-10-11,75.6,76.0,2022-10-12,72.3,74.3,2022-10-13,72.6,72.8,2022-10-14,72.3,73.2
5,2022-10-10,78.9,2022-10-10,78.0,78.7,2022-10-11,75.6,76.0,2022-10-12,72.3,74.3,2022-10-13,72.6,72.8,2022-10-14,72.3,73.2,2022-10-17,70.9,73.2
6,2022-10-11,75.9,2022-10-11,75.6,76.0,2022-10-12,72.3,74.3,2022-10-13,72.6,72.8,2022-10-14,72.3,73.2,2022-10-17,70.9,73.2,2022-10-18,73.8,76.0
7,2022-10-12,72.5,2022-10-12,72.3,74.3,2022-10-13,72.6,72.8,2022-10-14,72.3,73.2,2022-10-17,70.9,73.2,2022-10-18,73.8,76.0,2022-10-19,72.5,72.7
8,2022-10-17,71.1,2022-10-17,70.9,73.2,2022-10-18,73.8,76.0,2022-10-19,72.5,72.7,2022-10-20,68.0,69.9,2022-10-21,69.3,69.6,2022-10-24,60.8,61.7
9,2022-10-19,73.2,2022-10-19,72.5,72.7,2022-10-20,68.0,69.9,2022-10-21,69.3,69.6,2022-10-24,60.8,61.7,2022-10-25,60.2,63.6,2022-10-26,61.2,62.3


In [52]:
# Print the raw rows of the buy signals whose holding window was incomplete
# (the price data ended before all following rows were available).
for it in hold_data2:
  print(it)

[Timestamp('2023-02-06 00:00:00'), 102.2, Timestamp('2023-02-06 00:00:00'), 101.5, 103.2, Timestamp('2023-02-07 00:00:00'), 103.5, 104.8, Timestamp('2023-02-08 00:00:00'), 102.1, 103.5, Timestamp('2023-02-09 00:00:00'), 103.1, 107.6, Timestamp('2023-02-10 00:00:00'), 103.4, 104.1]
[Timestamp('2023-02-10 00:00:00'), 104.1, Timestamp('2023-02-10 00:00:00'), 103.4, 104.1]
