In [75]:
import pandas
import numpy
import xgboost as xgb
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

#example.groupby('name')['number'].fillna(method='ffill')

def read_macro_data(url = 'https://raw.githubusercontent.com/nikhilchandra-stats/macrodatasetsraw/master/data/daily_fx_macro_data.csv', 
                    encoding_var = 'cp1252'):
    return pandas.read_csv(url, encoding=encoding_var)

def read_yahoo_finance(url = 'https://query1.finance.yahoo.com/v7/finance/download/AUDUSD=X?period1=1550448000&period2=1708214400&interval=1d&events=history&includeAdjustedClose=true', 
                       start_date = '2019-01-01',
                       end_date = '2024-02-16',
                       asset_symbol = 'AUDUSD'):
    
    if '.' in asset_symbol:
        symbol_string = asset_symbol + '?'
    else:
        symbol_string = asset_symbol + '=X?'         
    base_url = 'https://query1.finance.yahoo.com/v7/finance/download/'
    start_date_as_int = int(pandas.to_datetime(start_date).timestamp())
    start_date_string = 'period1=' + str(start_date_as_int) + '&'
    end_date_as_int = int(pandas.to_datetime(end_date).timestamp())
    end_date_string = 'period2=' + str(end_date_as_int) + '&interval=1d&events=history&includeAdjustedClose=true'
    complete_url = base_url + symbol_string + start_date_string + end_date_string
    returned_data = pandas.read_csv(complete_url) 
    return returned_data

# test = ['jj', 'S&P', 'RBA']
# final = [x for x in test if '(S&P|RBA)' in test]

# Build a data set that can be joined to the asset, This example looked at 2 macro features for 2 assets
# PMI for USD and RBA CPI for AUD

def join_asset_to_macro(macro_data = read_macro_data(),
                        macro_vars = ['S&P Global Manufacturing PMI', 'RBA Trimmed Mean CPI \(QoQ\)'],
                        symbol_vars = ['USD','AUD'],
                        asset_data = read_yahoo_finance(
                            start_date = '2019-01-01',
                            end_date = '2024-02-16',
                            asset_symbol = 'AUDUSD'), 
                        pip_value = 100
                        ):
    #Clean Input Variables
    macro_data_vars = "|".join(macro_vars)
    symbol_data_vars = "|".join(symbol_vars) 
    macro_data['event'] = macro_data['event'].str.replace('(?i)(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)','', regex=True)
    macro_dat_filt = macro_data[macro_data['event'].str.contains(macro_data_vars, regex=True)]
    macro_dat_filt2 =macro_dat_filt[macro_data['symbol'].str.contains(symbol_data_vars, regex=True)]

    asset_data['Date'] = pandas.to_datetime(asset_data['Date']).dt.date
    macro_dat_filt2['date'] = pandas.to_datetime(macro_dat_filt2['date']).dt.date
    asset_data = asset_data.rename(columns={'Date':'date'}) 
    
    macro_dat_filt2['actual'] = pandas.to_numeric(macro_dat_filt2['actual'])
    
    for i in  range(len(symbol_vars)):
        temp_data = macro_dat_filt2[macro_dat_filt2['event'].str.contains(macro_vars[i])]
        temp_data2 = temp_data[temp_data['symbol'].str.contains(symbol_vars[i])]
        temp_data3 = temp_data2[['date','actual']]
        new_col_name = macro_vars[i] + " "+ symbol_vars[i]
        temp_data4 = temp_data3.rename(columns={'actual':new_col_name})
        # merge_asset_macro['actual'] = merge_asset_macro.groupby('symbol')['actual'].fillna(method = "ffill")
        asset_data = pandas.merge( asset_data, temp_data4 ,
                                    on="date", how = "left" )
        asset_data[new_col_name] = asset_data[new_col_name].fillna(method='ffill')
        
    asset_data['daily return'] = (asset_data['Close'] - asset_data['Open'])*pip_value    
        
    return asset_data 

In [66]:
macro_data = read_macro_data()
macro_vars = ['S&P Global Manufacturing PMI', 'RBA Trimmed Mean CPI \(QoQ\)', 
              'CFTC AUD NC Net', 'CFTC Gold NC Net Positions', 'Consumer Price Index \(MoM\)']
symbol_vars = ['USD','AUD','AUD', 'USD', 'USD']
asset_data = read_yahoo_finance( 
                start_date = '2014-01-01',
                end_date = '2024-02-16',
                asset_symbol = 'AUDUSD'
                ) 
pip_value = 100

   
macro_test_set = join_asset_to_macro(macro_data=macro_data,
                                     macro_vars=macro_vars,
                                     symbol_vars=symbol_vars,
                                     asset_data=asset_data, 
                                     pip_value=pip_value  )

macro_test_set = macro_test_set.dropna()
macro_test_set['dependant variable'] = numpy.where(macro_test_set['daily return'] > 0, 'win', 'loss')

decison_tree_RBA_CPI = [macro_test_set['RBA Trimmed Mean CPI \\(QoQ\\) AUD'].quantile(q = 0.25), 
                        macro_test_set['RBA Trimmed Mean CPI \\(QoQ\\) AUD'].median(),
                        macro_test_set['RBA Trimmed Mean CPI \\(QoQ\\) AUD'].quantile(q = 0.75),
                        macro_test_set['RBA Trimmed Mean CPI \\(QoQ\\) AUD'].quantile(q = 0.99999)] 

decison_tree_SNP_PMI = [macro_test_set['S&P Global Manufacturing PMI USD'].quantile(q = 0.25), 
                        macro_test_set['S&P Global Manufacturing PMI USD'].median(),
                        macro_test_set['S&P Global Manufacturing PMI USD'].quantile(q = 0.75),
                        macro_test_set['S&P Global Manufacturing PMI USD'].quantile(q = 0.99999)] 



  macro_dat_filt2 =macro_dat_filt[macro_data['symbol'].str.contains(symbol_data_vars, regex=True)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  macro_dat_filt2['date'] = pandas.to_datetime(macro_dat_filt2['date']).dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  macro_dat_filt2['actual'] = pandas.to_numeric(macro_dat_filt2['actual'])
  asset_data[new_col_name] = asset_data[new_col_name].fillna(method='ffill')
  asset_data[new_col_name] = asset_data[new_col_name].fillna(method='ffill')
  asset_data[new_col_name] = asset_data[new_col_name].fillna(metho

In [67]:
reserve_test_date = pandas.to_datetime('2023-01-01').date()
modelling_data = macro_test_set.loc[macro_test_set['date'] < reserve_test_date]

dependant = modelling_data['dependant variable'].to_numpy() 
independants = modelling_data[['S&P Global Manufacturing PMI USD',
                               'RBA Trimmed Mean CPI \\(QoQ\\) AUD', 
                               'Consumer Price Index \\(MoM\\) USD', 
                               'CFTC AUD NC Net AUD', 
                               'CFTC Gold NC Net Positions USD', 'Open']].to_numpy() 

# split data into train and test sets
def create_tree(
    modelling_data = macro_test_set.loc[macro_test_set['date'] < reserve_test_date],
    depth_interval = 0.25,
    starting_value = [modelling_data['S&P Global Manufacturing PMI USD'].min()][0],
    max_mvalue = [modelling_data['S&P Global Manufacturing PMI USD'].max()][0],
    depth = 200,
    test_size = 0.33,
    variables = ['S&P Global Manufacturing PMI USD'],
    dependant_variable = 'dependant variable',
    success_state_var = 'win',
    loss_state_var = 'loss'
):
    test_size_conv = round((modelling_data.shape[0])*test_size)
    seed = 1
    random_sample = modelling_data.sample(n = test_size_conv,replace=True, random_state=seed )
    SSR = numpy.zeros(depth)
    branch_values = numpy.zeros(depth)
    branch_count = 0
    success_state_tracker = numpy.zeros(depth)
    loss_state_tracker = numpy.zeros(depth)

    for i in range(0,depth):
        random_sample_filt = random_sample[modelling_data[variables[0]] > starting_value]
        count_binaries = random_sample_filt.groupby([dependant_variable]).size()
        if(count_binaries.size != 2): 
            print( "Break Condition")
            break
        branch_values[branch_count] = starting_value
        starting_value = starting_value + depth_interval
        # print(count_binaries.size)
        success_state_tracker[branch_count] = count_binaries['win']
        loss_state_tracker[branch_count] = count_binaries['loss']
        branch_count = branch_count + 1
    results = pandas.DataFrame({'branch values': branch_values, 'success':success_state_tracker, 'loss': loss_state_tracker})
    return results   

In [78]:
import matplotlib.pyplot as plt
test_func = create_tree()
test_func = test_func[test_func['branch values'] > 0]
test_func['success %'] = test_func['success']/(test_func['loss'] + test_func['success'])
print(numpy.log2(test_func['success %']))


Break Condition
0     -1.141356
1     -1.141356
2     -1.141356
3     -1.141356
4     -1.137019
         ...   
101   -0.874469
102   -1.078003
103   -1.078003
104   -3.169925
105   -3.169925
Name: success %, Length: 106, dtype: float64


  random_sample_filt = random_sample[modelling_data[variables[0]] > starting_value]
  random_sample_filt = random_sample[modelling_data[variables[0]] > starting_value]
  random_sample_filt = random_sample[modelling_data[variables[0]] > starting_value]
  random_sample_filt = random_sample[modelling_data[variables[0]] > starting_value]
  random_sample_filt = random_sample[modelling_data[variables[0]] > starting_value]
  random_sample_filt = random_sample[modelling_data[variables[0]] > starting_value]
  random_sample_filt = random_sample[modelling_data[variables[0]] > starting_value]
  random_sample_filt = random_sample[modelling_data[variables[0]] > starting_value]
  random_sample_filt = random_sample[modelling_data[variables[0]] > starting_value]
  random_sample_filt = random_sample[modelling_data[variables[0]] > starting_value]
  random_sample_filt = random_sample[modelling_data[variables[0]] > starting_value]
  random_sample_filt = random_sample[modelling_data[variables[0]] > starting