In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from Utils.Solver import *
from Utils.Signals import *

# Prepare data

In [2]:
# Download historical stock data
tickers = ['AAPL', 'MSFT', 'TSLA', 'AMZN', 'GOOG', 'IBM', 'NFLX', 'NVDA', 'AMD', 'INTC']

# Download historical stock data
data = yf.download(tickers, start='2000-01-01', end='2024-01-01')
# Initialize the portfolio solver with appropriate penalty and max weight threshold
portfolio_solver = Portfolio_Solver(0.8, max_weight_threshold=0.3)


start_date_signal = '2000-01-01'
end_date_signal = '2021-01-01'
date_range_signal = pd.date_range(start=start_date_signal, end=end_date_signal)

start_date_eval = '2019-01-01'
end_date_eval = '2020-01-01'
date_range_eval = pd.date_range(start=start_date_eval, end=end_date_eval)


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  10 of 10 completed


In [3]:
# Initialize an empty list to store the rows for the first DataFrame
dataset_scores = []

# Step 1: Create the DataFrame with rsi_scores, macd_scores, and sma_scores
for date in date_range_signal:
    #print(f"processing {date}")

    # Step 1a: Calculate the RSI signal scores
    rsi_signal_scores = calculate_rsi_signal(data, tickers, date=date, period=14)
    rsi_scores = np.array([score[1] for score in rsi_signal_scores])

    # Step 1b: Calculate the MACD signal scores
    macd_signal_scores = calculate_macd_signal(data, tickers, date=date)
    macd_scores = np.array([score[1] for score in macd_signal_scores])

    # Step 1c: Calculate the SMA signal scores
    sma_signal_scores = calculate_sma_signal(data, tickers, date=date)
    sma_scores = np.array([score[1] for score in sma_signal_scores])

    if np.any(np.isnan(rsi_scores)) or np.any(np.isnan(macd_scores)) or np.any(np.isnan(sma_scores)):
        print(f"Skipping {date} due to NaN values in the signals.")
        continue  # Skip this date and move to the next one

    # Step 1d: Add the scores to the first dataset
    dataset_scores.append({
        'date': date,
        'rsi_scores': rsi_scores,
        'macd_scores': macd_scores,
        'sma_scores': sma_scores
    })


# Convert the first dataset into a DataFrame
df_scores = pd.DataFrame(dataset_scores)
display(df_scores)

Skipping 2000-01-01 00:00:00 due to NaN values in the signals.
Skipping 2000-01-02 00:00:00 due to NaN values in the signals.
Skipping 2000-01-03 00:00:00 due to NaN values in the signals.
Skipping 2000-01-04 00:00:00 due to NaN values in the signals.
Skipping 2000-01-05 00:00:00 due to NaN values in the signals.
Skipping 2000-01-06 00:00:00 due to NaN values in the signals.
Skipping 2000-01-07 00:00:00 due to NaN values in the signals.
Skipping 2000-01-08 00:00:00 due to NaN values in the signals.
Skipping 2000-01-09 00:00:00 due to NaN values in the signals.
Skipping 2000-01-10 00:00:00 due to NaN values in the signals.
Skipping 2000-01-11 00:00:00 due to NaN values in the signals.
Skipping 2000-01-12 00:00:00 due to NaN values in the signals.
Skipping 2000-01-13 00:00:00 due to NaN values in the signals.
Skipping 2000-01-14 00:00:00 due to NaN values in the signals.
Skipping 2000-01-15 00:00:00 due to NaN values in the signals.
Skipping 2000-01-16 00:00:00 due to NaN values in the s

Unnamed: 0,date,rsi_scores,macd_scores,sma_scores
0,2011-04-12,"[38.30999758863495, 45.97695202946077, 49.2629...","[-0.041285952148013935, 0.07719784119519542, 0...","[1.2408267736434944, 0.3653737068176284, 0.015..."
1,2011-04-13,"[43.37424857723896, 45.79289203866577, 50.7693...","[-0.03208349389212367, 0.056301495920932754, 2...","[1.2234387397766113, 0.3210000324249265, 0.016..."
2,2011-04-14,"[39.868447519141036, 41.993911275712904, 51.92...","[-0.031152558312366768, 0.03080800551427569, -...","[1.2041051101684577, 0.27382594108581415, 0.01..."
3,2011-04-15,"[35.712439846793096, 41.119317583877255, 54.33...","[-0.03778464549550134, 0.011577067483928297, -...","[1.1826032590866102, 0.23109033584594485, 0.01..."
4,2011-04-18,"[41.52272746997473, 36.38522266558832, 50.8965...","[-0.030766553371090688, -0.014564568614314308,...","[1.1610095429420468, 0.18357484817504854, 0.01..."
...,...,...,...,...
2443,2020-12-24,"[67.92099725524878, 61.41396620995696, 62.8719...","[0.6376329076354201, 0.8196506562317654, -1.10...","[20.276956481933595, 16.262737503051767, 64.36..."
2444,2020-12-28,"[73.64751185011087, 64.36460206110364, 63.1416...","[0.8774145720012818, 0.8857966942036855, -1.16...","[20.26290431976318, 16.051127243041975, 64.874..."
2445,2020-12-29,"[68.56483489941925, 62.47877454625197, 63.4838...","[0.8423491786869222, 0.8160744562189803, -1.22...","[20.211752243041985, 15.715758209228511, 65.42..."
2446,2020-12-30,"[65.48954455732145, 56.99499275035495, 67.5459...","[0.6767045544733539, 0.5607932269514513, -0.71...","[20.215960006713857, 15.502694168090812, 66.16..."


In [4]:
# Now, iterate over df_step1 to calculate combined scores, portfolio weights, and returns
dataset_returns = []

# Filter the DataFrame to only include rows where the 'date' is in the specified date range
filtered_df = df_scores[df_scores['date'].isin(date_range_eval)]

# Step 2: Create the second DataFrame with combined_scores, portfolio_weights, total_return, and annualized_return
for index, row in filtered_df.iterrows():
    date = row['date']
    rsi_scores = row['rsi_scores']
    macd_scores = row['macd_scores']
    sma_scores = row['sma_scores']

    print(f"Processing {date} for Step 2")

    # Step 2a: Combine the signals (you can later train a model to adjust these weights)
    signal_weights = [1, 1, 1]  # You can adjust these weights later based on your model
    combined_scores = combine_signals(signal_weights, [rsi_scores, macd_scores, sma_scores])
    combined_scores_with_tickers = list(zip(tickers, combined_scores))

    print(f"Combined Scores: {combined_scores_with_tickers}")

    # Step 2b: Solve the portfolio based on the combined signal scores
    portfolio_weights = portfolio_solver.SolveSignalPortfolioMVO(tickers, data, combined_scores)

    # Step 2c: Calculate the returns for the portfolio based on the optimized weights
    cumulative_returns, total_return, annualized_return = portfolio_solver.CalculatePortfolioReturns(tickers, data, portfolio_weights, start_date=date, time_period=30)

    # Step 2d: Add the calculated values to the second dataset
    dataset_returns.append({
        'date': date,
        'combined_scores': combined_scores,
        'portfolio_weights': portfolio_weights,
        'total_return': total_return,
        'annualized_return': annualized_return
    })

# Convert the second dataset into a DataFrame
df_returns = pd.DataFrame(dataset_returns)

# You can now display or use the two DataFrames for further analysis or training your model
display(df_returns)  # The second DataFrame with combined scores, portfolio weights, and returns

Processing 2019-01-02 00:00:00 for Step 2
Combined Scores: [('AAPL', np.float64(36.56628061781882)), ('MSFT', np.float64(47.12127486277017)), ('TSLA', np.float64(42.99442018025284)), ('AMZN', np.float64(44.22458897282027)), ('GOOG', np.float64(48.27451049963411)), ('IBM', np.float64(32.953452099134786)), ('NFLX', np.float64(1.8540117612227434)), ('NVDA', np.float64(36.84819906258567)), ('AMD', np.float64(49.03939929273357)), ('INTC', np.float64(49.33622193211655))]
     pcost       dcost       gap    pres   dres
 0: -1.0724e-01 -3.3443e+00  3e+00  1e-16  3e-16
 1: -1.0824e-01 -1.8473e-01  8e-02  2e-16  5e-16
 2: -1.2061e-01 -1.3096e-01  1e-02  1e-16  7e-17
 3: -1.2518e-01 -1.2820e-01  3e-03  2e-16  1e-16
 4: -1.2596e-01 -1.2684e-01  9e-04  2e-16  8e-17
 5: -1.2649e-01 -1.2656e-01  7e-05  2e-16  5e-17
 6: -1.2652e-01 -1.2652e-01  7e-07  3e-16  5e-17
 7: -1.2652e-01 -1.2652e-01  7e-09  2e-16  1e-16
Optimal solution found.
Optimized Portfolio Weights:
{'AAPL': np.float64(1.561280591908461

Unnamed: 0,date,combined_scores,portfolio_weights,total_return,annualized_return
0,2019-01-02,"[36.56628061781882, 47.12127486277017, 42.9944...","[1.5612805919084617e-08, 0.1000000834702184, 2...",0.126847,1.726873
1,2019-01-03,"[25.846104484845227, 41.10029825235928, 40.411...","[1.3918227666591354e-09, 0.29999996509567595, ...",0.255171,5.746895
2,2019-01-04,"[32.753306532385665, 49.89459663571666, 47.042...","[9.275092724726073e-09, 0.10000004992914417, 4...",0.125814,1.705946
3,2019-01-07,"[32.23301097099511, 50.14874103532683, 52.8080...","[8.696998474300601e-08, 2.173320571762828e-06,...",0.091353,1.084050
4,2019-01-08,"[35.3322105176333, 51.63731267848253, 53.07309...","[7.677196243161262e-09, 2.2608599258393123e-07...",0.101270,1.248572
...,...,...,...,...,...
247,2019-12-24,"[85.39352323416526, 88.09702888036425, 89.5700...","[0.29999993228262073, 0.29999980687937766, 0.2...",0.113998,1.476449
248,2019-12-26,"[89.82003964158392, 90.95891888252922, 90.6822...","[0.29999923048281335, 0.29999940108446055, 0.2...",0.093806,1.123725
249,2019-12-27,"[89.70331755278528, 91.67743639716288, 90.3032...","[0.29999998390931176, 0.2999999839078777, 0.29...",0.129937,1.790320
250,2019-12-30,"[91.04785571059881, 82.45904628837432, 76.9975...","[0.2999999946611208, 0.29999997472475287, 0.23...",0.151085,2.260645


Calculate all the return for each of those dates



In [5]:
average_annualized_return = df_returns['annualized_return'].mean()
print("Average return for 1/N : ", average_annualized_return)

total_return_sum = df_returns['total_return'].sum()
print(f"Total Return Sum: {total_return_sum}")

Average return for 1/N :  0.8765052016399952
Total Return Sum: 14.828914815699676


### Matrix of size signals 

In [6]:

# Define the target date range
start_date = '2011-01-01'
end_date = '2018-01-01'
date_range = pd.date_range(start=start_date, end=end_date)

# Initialize an empty list to store the rows of the dataset
dataset_weighted_returns = []
# Filter the DataFrame to only include rows where the 'date' is in the specified date range
filtered_df = df_scores[df_scores['date'].isin(date_range_eval)]

# Step 2: Create the second DataFrame with combined_scores, portfolio_weights, total_return, and annualized_return
for index, row in filtered_df.iterrows():
    date = row['date']
    rsi_scores = row['rsi_scores']
    macd_scores = row['macd_scores']
    sma_scores = row['sma_scores']

    if np.any(np.isnan(rsi_scores)) or np.any(np.isnan(macd_scores)) or np.any(np.isnan(sma_scores)):
        #print(f"Skipping {date} due to NaN values in the signals.")
        continue  # Skip this date and move to the next one

    # Calculate the average scores for each signal type (RSI, MACD, SMA)
    avg_rsi = np.mean(rsi_scores)
    avg_macd = np.mean(macd_scores)
    avg_sma = np.mean(sma_scores)

    # Add the aggregated values to the dataset
    dataset_weighted_returns.append({
        'date': date,
        'rsi_avg': avg_rsi,
        'macd_avg': avg_macd,
        'sma_avg': avg_sma,
        'rsi_scores': rsi_scores,
        'macd_scores': macd_scores,
        'sma_scores': sma_scores,
        'total_return': total_return 
    })

# Convert the dataset into a DataFrame
df = pd.DataFrame(dataset_weighted_returns)

# Prepare the feature matrix X (aggregated values for RSI, MACD, SMA)
X = df[['rsi_avg', 'macd_avg', 'sma_avg']].values

# Prepare the target y (total return)
y = df['total_return'].values.reshape(-1, 1)

# Initialize and train a linear model to learn the weight matrix W (size should be 3)
W, _, _, _ = np.linalg.lstsq(X, y, rcond=None)

# Output the learned weight matrix W
print("Learned Weight Matrix (W):")
print(W)


Learned Weight Matrix (W):
[[ 0.00399403]
 [-0.02479284]
 [ 0.00127505]]


In [7]:

# Now, iterate over df_step1 to calculate combined scores, portfolio weights, and returns
dataset_weighted_returns = []

# Filter the DataFrame to only include rows where the 'date' is in the specified date range
filtered_df = df_scores[df_scores['date'].isin(date_range_eval)]

# Step 2: Create the second DataFrame with combined_scores, portfolio_weights, total_return, and annualized_return
for index, row in filtered_df.iterrows():
    date = row['date']
    rsi_scores = row['rsi_scores']
    macd_scores = row['macd_scores']
    sma_scores = row['sma_scores']

    print(f"Processing {date} for Step 2")

    # Step 2a: Combine the signals (you can later train a model to adjust these weights)
    signal_weights = W  # You can adjust these weights later based on your model
    combined_scores = combine_signals(signal_weights, [rsi_scores, macd_scores, sma_scores])
    combined_scores_with_tickers = list(zip(tickers, combined_scores))

    print(f"Combined Scores: {combined_scores_with_tickers}")

    # Step 2b: Solve the portfolio based on the combined signal scores
    portfolio_weights = portfolio_solver.SolveSignalPortfolio(tickers, data, combined_scores)

    # Step 2c: Calculate the returns for the portfolio based on the optimized weights
    cumulative_returns, total_return, annualized_return = portfolio_solver.CalculatePortfolioReturns(tickers, data, portfolio_weights, start_date=date, time_period=30)

    # Step 2d: Add the calculated values to the second dataset
    dataset_weighted_returns.append({
        'date': date,
        'combined_scores': combined_scores,
        'portfolio_weights': portfolio_weights,
        'total_return': total_return,
        'annualized_return': annualized_return
    })


# Convert the dataset into a DataFrame
df2 = pd.DataFrame(dataset_weighted_returns)

# Display the dataset (this will include the RSI, MACD, SMA scores, portfolio weights, and returns)

# You can now use this DataFrame for training your model.
display(df2)

Processing 2019-01-02 00:00:00 for Step 2
Combined Scores: [('AAPL', np.float64(0.14507412669138128)), ('MSFT', np.float64(0.18780777371154822)), ('TSLA', np.float64(0.17434231943110465)), ('AMZN', np.float64(0.18657690634669535)), ('GOOG', np.float64(0.1964534909910239)), ('IBM', np.float64(0.16565082081120144)), ('NFLX', np.float64(0.0856225993302518)), ('NVDA', np.float64(0.15014184599951258)), ('AMD', np.float64(0.19071047866700125)), ('INTC', np.float64(0.20165553326526692))]
     pcost       dcost       gap    pres   dres
 0: -6.1656e-02 -1.0714e+00  1e+00  0e+00  4e+00
 1: -6.1817e-02 -8.0084e-02  2e-02  8e-17  6e-02
 2: -6.2369e-02 -6.2986e-02  6e-04  4e-17  1e-03
 3: -6.2396e-02 -6.2412e-02  2e-05  7e-17  1e-05
 4: -6.2396e-02 -6.2396e-02  2e-07  6e-17  1e-07
 5: -6.2396e-02 -6.2396e-02  2e-09  7e-17  1e-09
Optimal solution found.
Optimized Portfolio Weights:
{'AAPL': np.float64(0.08268336526319911), 'MSFT': np.float64(0.11440303177792573), 'TSLA': np.float64(0.104408103696721

Unnamed: 0,date,combined_scores,portfolio_weights,total_return,annualized_return
0,2019-01-02,"[0.14507412669138128, 0.18780777371154822, 0.1...","[0.08268336526319911, 0.11440303177792573, 0.1...",0.127423,1.738609
1,2019-01-03,"[0.10841013189406298, 0.16743666679703786, 0.1...","[0.0676114921649379, 0.11803607134905686, 0.11...",0.181418,3.056864
2,2019-01-04,"[0.136939689868143, 0.1961726933310499, 0.1896...","[0.0738626383727038, 0.11662550034859895, 0.11...",0.120807,1.606516
3,2019-01-07,"[0.1354066306307822, 0.19245296141933135, 0.20...","[0.0707798266506175, 0.11131323822032645, 0.12...",0.088416,1.037410
4,2019-01-08,"[0.14657534635630823, 0.1936785494826888, 0.20...","[0.0772072118090149, 0.11094135217938607, 0.12...",0.080521,0.916538
...,...,...,...,...,...
247,2019-12-24,"[0.3076484813441925, 0.3120069235069482, 0.332...","[0.11888015991965713, 0.12091850367648616, 0.1...",0.177730,2.951704
248,2019-12-26,"[0.32303397490308816, 0.3220702344772764, 0.33...","[0.11906400796024803, 0.1186342070094042, 0.12...",0.160638,2.495055
249,2019-12-27,"[0.3217449053916083, 0.3243908851377887, 0.335...","[0.12074197747975113, 0.12194053670396379, 0.1...",0.194766,3.458379
250,2019-12-30,"[0.32643710415493793, 0.29051686075922484, 0.2...","[0.13872648335712653, 0.12071047005875045, 0.1...",0.220525,4.333267


In [8]:
average_annualized_return = df2['annualized_return'].mean()
print("Average return for trained matrix : ", average_annualized_return)

total_return_sum = df2['total_return'].sum()
print(f"Total Return Sum: {total_return_sum}")

Average return for trained matrix :  0.8453735177188246
Total Return Sum: 14.408615758866157


In [None]:
average_annualized_return = df2['annualized_return'].mean()
print("Average return for trained matrix : ", average_annualized_return)

total_return_sum = df2['total_return'].sum()
print(f"Total Return Sum: {total_return_sum}")

Average return for trained matrix :  0.8311076613052832
Total Return Sum: 14.206047371171156


### Matrix of size tickers x signals 

In [9]:
# Create the feature matrix dynamically
X = np.array([[
    row['rsi_scores'][i],  # Get RSI score for the i-th ticker
    row['macd_scores'][i],  # Get MACD score for the i-th ticker
    row['sma_scores'][i]    # Get SMA score for the i-th ticker
] for _, row in df.iterrows() for i in range(len(tickers))]).reshape(len(df), len(tickers) * 3)

# Ensure the shape of X is correct (it should be 2D with each row representing a data point)
#print("Shape of X:", X.shape)
#print(X)

# Target (returns), you can choose 'total_return' or 'annualized_return' based on your preference
y = df['total_return'].values.reshape(-1, 1)

# Initialize and train a linear model to learn the weight matrix W
# Using np.linalg.lstsq to solve the least squares problem, i.e., Y = XW
W2, _, _, _ = np.linalg.lstsq(X, y, rcond=None)

# Output the learned weight matrix W
print("Learned Weight Matrix (W):", W2)


Learned Weight Matrix (W): [[ 2.49742468e-04]
 [-2.19331214e-02]
 [ 1.61116880e-03]
 [ 3.23619700e-05]
 [-2.51059768e-03]
 [ 1.53303015e-02]
 [ 2.89945680e-04]
 [-7.10456904e-03]
 [-8.51415556e-04]
 [-4.43238529e-04]
 [ 1.12428620e-02]
 [-7.16400846e-03]
 [ 1.56612138e-04]
 [-2.77770237e-03]
 [ 2.77220541e-03]
 [ 7.78636426e-05]
 [-5.72636776e-03]
 [ 1.56577278e-03]
 [ 2.13602415e-04]
 [-1.60724499e-03]
 [-4.88131068e-04]
 [-3.73457570e-04]
 [ 1.03918721e-01]
 [-1.01836202e-01]
 [ 3.38052152e-04]
 [-4.64461372e-03]
 [-3.14554744e-03]
 [-7.70701021e-05]
 [ 1.55107786e-02]
 [ 9.55044697e-03]]


In [10]:
# Iterate over the rows in df_scores to calculate the combined scores
dataset_weighted_returns = []

for index, row in filtered_df.iterrows():
    date = row['date']
    rsi_scores = row['rsi_scores']
    macd_scores = row['macd_scores']
    sma_scores = row['sma_scores']

    print(f"Processing {date} for Step 2")

    # Step 2a: Combine the signals using the learned weights
    # Reshape W into a 3xN matrix (3 signals and N stocks)
    # W is the learned weight matrix with size (3*N, 1), so we need to reshape it into a 3xN matrix
    W_reshaped = W2.reshape(3, len(tickers))

    # Multiply the scores by the weights
    weighted_rsi_scores = rsi_scores * W_reshaped[0, :]
    weighted_macd_scores = macd_scores * W_reshaped[1, :]
    weighted_sma_scores = sma_scores * W_reshaped[2, :]

    # Combine the weighted scores for each stock
    combined_scores = weighted_rsi_scores + weighted_macd_scores + weighted_sma_scores

    # Pair the combined scores with tickers
    combined_scores_with_tickers = list(zip(tickers, combined_scores))

    print(f"Combined Scores: {combined_scores_with_tickers}")

    # Step 2b: Solve the portfolio based on the combined signal scores
    portfolio_weights = portfolio_solver.SolveSignalPortfolio(tickers, data, combined_scores)

    # Step 2c: Calculate the returns for the portfolio based on the optimized weights
    cumulative_returns, total_return, annualized_return = portfolio_solver.CalculatePortfolioReturns(tickers, data, portfolio_weights, start_date=date, time_period=30)

    # Step 2d: Add the calculated values to the dataset
    dataset_weighted_returns.append({
        'date': date,
        'combined_scores': combined_scores,
        'portfolio_weights': portfolio_weights,
        'total_return': total_return,
        'annualized_return': annualized_return
    })

# Convert the dataset into a DataFrame
df2 = pd.DataFrame(dataset_weighted_returns)

# Display the dataset (this will include the RSI, MACD, SMA scores, portfolio weights, and returns)
display(df2)

average_annualized_return = df2['annualized_return'].mean()
print("Average return for trained matrix : ", average_annualized_return)

total_return_sum = df2['total_return'].sum()
print(f"Total Return Sum: {total_return_sum}")


Processing 2019-01-02 00:00:00 for Step 2
Combined Scores: [('AAPL', np.float64(0.012605342708834896)), ('MSFT', np.float64(-0.9782711466119366)), ('TSLA', np.float64(0.23496465479440123)), ('AMZN', np.float64(0.4730042148803087)), ('GOOG', np.float64(-0.12901227604285168)), ('IBM', np.float64(0.759335958234879)), ('NFLX', np.float64(0.1575167850970873)), ('NVDA', np.float64(-0.2719177952315117)), ('AMD', np.float64(-0.018676457532677215)), ('INTC', np.float64(-0.04104609782139311))]
     pcost       dcost       gap    pres   dres
 0: -2.1183e+01 -9.1647e+00  1e+02  1e+01  1e+00
 1: -6.1849e+00 -4.2521e+00  6e+00  7e-01  9e-02
 2: -3.1980e+00 -3.5679e+00  4e-01  4e-16  8e-16
 3: -3.4103e+00 -3.4268e+00  2e-02  2e-16  4e-16
 4: -3.4251e+00 -3.4253e+00  2e-04  2e-16  4e-16
 5: -3.4253e+00 -3.4253e+00  2e-06  3e-16  5e-16
Optimal solution found.
Optimized Portfolio Weights:
{'AAPL': np.float64(1.8562226707837168e-08), 'MSFT': np.float64(6.662208577599387e-09), 'TSLA': np.float64(8.0518956

Unnamed: 0,date,combined_scores,portfolio_weights,total_return,annualized_return
0,2019-01-02,"[0.012605342708834896, -0.9782711466119366, 0....","[1.8562226707837168e-08, 6.662208577599387e-09...",0.078800,0.891055
1,2019-01-03,"[0.008159325303191411, -0.8527297169897, 0.236...","[9.92713678048715e-10, 2.221949856294007e-10, ...",0.143825,2.091867
2,2019-01-04,"[0.01007835391204236, -1.0460243485167762, 0.2...","[1.8819615476349133e-08, 6.846861837459814e-09...",0.100962,1.243309
3,2019-01-07,"[0.010321930668034008, -1.0523019017591302, 0....","[1.3571303117371605e-08, 5.418721284552378e-09...",0.090342,1.067880
4,2019-01-08,"[0.012062124833112812, -1.0835191633365049, 0....","[1.413143940024108e-08, 5.286576116399463e-09,...",0.083279,0.958026
...,...,...,...,...,...
247,2019-12-24,"[0.014992629335038584, -1.6636760670704698, 0....","[1.7169784537966187e-09, 0.9999999693372277, 5...",0.059734,0.627991
248,2019-12-26,"[0.01673543811848615, -1.7233430051070744, 0.6...","[1.7979920136465459e-09, 0.9999999623342425, -...",0.066481,0.717139
249,2019-12-27,"[0.016769869714088052, -1.7358800524955658, 0....","[1.3271525014982622e-09, 0.9999999793352051, 4...",0.131659,1.826245
250,2019-12-30,"[0.017105569461461245, -1.5322546375877137, 0....","[9.96508787222529e-10, 0.9999999844723864, 5.5...",0.181898,3.070728


Average return for trained matrix :  2.1179358902059486
Total Return Sum: 26.306293653288673


# TODO : Train a neural net perhaps

In [11]:

# Define the target date range
start_date = '2011-01-01'
end_date = '2020-01-01'
date_range = pd.date_range(start=start_date, end=end_date)

dataset_returns = []
# Filter the DataFrame to only include rows where the 'date' is in the specified date range
filtered_df = df_scores[df_scores['date'].isin(date_range_eval)]

# Step 2: Create the second DataFrame with combined_scores, portfolio_weights, total_return, and annualized_return
for index, row in filtered_df.iterrows():
    date = row['date']
    rsi_scores = row['rsi_scores']
    macd_scores = row['macd_scores']
    sma_scores = row['sma_scores']

    print(f"Processing {date} for Step 2")

    # Step 2a: Combine the signals (you can later train a model to adjust these weights)
    signal_weights = [1, 1, 1]  # You can adjust these weights later based on your model
    combined_scores = combine_signals(signal_weights, [rsi_scores, macd_scores, sma_scores])
    combined_scores_with_tickers = list(zip(tickers, combined_scores))

    print(f"Combined Scores: {combined_scores_with_tickers}")

    # Step 2b: Solve the portfolio based on the combined signal scores
    portfolio_weights = portfolio_solver.SolveSignalPortfolio(tickers, data, combined_scores)

    # Step 2c: Calculate the returns for the portfolio based on the optimized weights
    cumulative_returns, total_return, annualized_return = portfolio_solver.CalculatePortfolioReturns(tickers, data, portfolio_weights, start_date=date, time_period=30)

    # Step 2d: Add the calculated values to the second dataset
    dataset_returns.append({
        'date': date,
        'combined_scores': combined_scores,
        'portfolio_weights': portfolio_weights,
        'total_return': total_return,
        'annualized_return': annualized_return
    })

# Convert the second dataset into a DataFrame
df_returns = pd.DataFrame(dataset_returns)

Processing 2019-01-02 00:00:00 for Step 2
Combined Scores: [('AAPL', np.float64(36.56635712709539)), ('MSFT', np.float64(47.12130224731517)), ('TSLA', np.float64(42.99442018025284)), ('AMZN', np.float64(44.22458897282027)), ('GOOG', np.float64(48.27450874346967)), ('IBM', np.float64(32.95343778192971)), ('NFLX', np.float64(1.8540117612227434)), ('NVDA', np.float64(36.84823739536755)), ('AMD', np.float64(49.03939929273357)), ('INTC', np.float64(49.33618617752629))]
     pcost       dcost       gap    pres   dres
 0: -6.5219e-02 -1.0764e+00  1e+00  0e+00  4e+00
 1: -6.5580e-02 -8.4915e-02  2e-02  2e-16  7e-02
 2: -6.7034e-02 -6.7860e-02  8e-04  7e-17  1e-03
 3: -6.7341e-02 -6.7396e-02  5e-05  2e-16  2e-05
 4: -6.7387e-02 -6.7388e-02  1e-06  7e-17  1e-07
 5: -6.7388e-02 -6.7388e-02  1e-08  6e-17  1e-09
Optimal solution found.
Optimized Portfolio Weights:
{'AAPL': np.float64(0.0903207468961032), 'MSFT': np.float64(0.12421915159754975), 'TSLA': np.float64(0.11096520110428375), 'AMZN': np.fl

In [12]:
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd

def train_regularized_regression(X, y, alpha=1.0):
    """
    Trains a Ridge regression model to find the optimal weights for combining signals.

    Parameters:
    - X: The feature matrix (with columns as signals).
    - y: The target values (total return).
    - alpha: Regularization strength (higher values mean more regularization).

    Returns:
    - The weight matrix of size 3.
    """
    # Initialize the Ridge regression model with regularization
    model = Ridge(alpha=alpha)

    # Fit the model to the data
    model.fit(X, y)

    # Return the weight matrix (coefficients)
    return model.coef_

# Example of how to use this function


# Prepare the feature matrix X (aggregated values for RSI, MACD, SMA)
X = df_scores[['rsi_avg', 'macd_avg', 'sma_avg']].values

# Prepare the target y (total return)
y = df['total_return'].values.reshape(-1, 1)

# Train the regularized regression model (Ridge regression)
W_ridge = train_regularized_regression(X, y, alpha=1.0)

# Output the learned weight matrix W (should be of size 3)
print("Learned Weight Matrix (W):")
print(W_ridge)

KeyError: "None of [Index(['rsi_avg', 'macd_avg', 'sma_avg'], dtype='object')] are in the [columns]"