In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from Utils.Solver import *
from Utils.Signals import *

# Prepare data

In [2]:
# Download historical stock data
tickers = ['AAPL', 'MSFT', 'TSLA', 'AMZN', 'GOOG', 'IBM', 'NFLX', 'NVDA', 'AMD', 'INTC']

# Download historical stock data
data = yf.download(tickers, start='2000-01-01', end='2024-01-01')
# Initialize the portfolio solver with appropriate penalty and max weight threshold
portfolio_solver = Portfolio_Solver(0.8, max_weight_threshold=0.3)


start_date_signal = '2000-01-01'
end_date_signal = '2021-01-01'
date_range_signal = pd.date_range(start=start_date_signal, end=end_date_signal)

start_date_eval = '2019-01-01'
end_date_eval = '2020-01-01'
date_range_eval = pd.date_range(start=start_date_eval, end=end_date_eval)


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  10 of 10 completed


In [3]:
# Initialize an empty list to store the rows for the first DataFrame
dataset_scores = []

# Step 1: Create the DataFrame with rsi_scores, macd_scores, and sma_scores
for date in date_range_signal:
    #print(f"processing {date}")

    # Step 1a: Calculate the RSI signal scores
    rsi_signal_scores = calculate_rsi_signal(data, tickers, date=date, period=14)
    rsi_scores = np.array([score[1] for score in rsi_signal_scores])

    # Step 1b: Calculate the MACD signal scores
    macd_signal_scores = calculate_macd_signal(data, tickers, date=date)
    macd_scores = np.array([score[1] for score in macd_signal_scores])

    # Step 1c: Calculate the SMA signal scores
    sma_signal_scores = calculate_sma_signal(data, tickers, date=date)
    sma_scores = np.array([score[1] for score in sma_signal_scores])

    if np.any(np.isnan(rsi_scores)) or np.any(np.isnan(macd_scores)) or np.any(np.isnan(sma_scores)):
        print(f"Skipping {date} due to NaN values in the signals.")
        continue  # Skip this date and move to the next one

    # Step 1d: Add the scores to the first dataset
    dataset_scores.append({
        'date': date,
        'rsi_scores': rsi_scores,
        'macd_scores': macd_scores,
        'sma_scores': sma_scores
    })


# Convert the first dataset into a DataFrame
df_scores = pd.DataFrame(dataset_scores)
display(df_scores)

Skipping 2000-01-01 00:00:00 due to NaN values in the signals.
Skipping 2000-01-02 00:00:00 due to NaN values in the signals.
Skipping 2000-01-03 00:00:00 due to NaN values in the signals.
Skipping 2000-01-04 00:00:00 due to NaN values in the signals.
Skipping 2000-01-05 00:00:00 due to NaN values in the signals.
Skipping 2000-01-06 00:00:00 due to NaN values in the signals.
Skipping 2000-01-07 00:00:00 due to NaN values in the signals.
Skipping 2000-01-08 00:00:00 due to NaN values in the signals.
Skipping 2000-01-09 00:00:00 due to NaN values in the signals.
Skipping 2000-01-10 00:00:00 due to NaN values in the signals.
Skipping 2000-01-11 00:00:00 due to NaN values in the signals.
Skipping 2000-01-12 00:00:00 due to NaN values in the signals.
Skipping 2000-01-13 00:00:00 due to NaN values in the signals.
Skipping 2000-01-14 00:00:00 due to NaN values in the signals.
Skipping 2000-01-15 00:00:00 due to NaN values in the signals.
Skipping 2000-01-16 00:00:00 due to NaN values in the s

Unnamed: 0,date,rsi_scores,macd_scores,sma_scores
0,2011-04-12,"[38.30992578042408, 45.977238069087065, 49.262...","[-0.0412861709446724, 0.07719825843002304, 0.0...","[1.2408269572257993, 0.365372638702393, 0.0155..."
1,2011-04-13,"[43.37425958670625, 45.79308829330016, 50.7693...","[-0.032083663538626656, 0.05630213930651831, 2...","[1.2234388756752015, 0.32099887847900277, 0.01..."
2,2011-04-14,"[39.86843592370676, 41.994011707326386, 51.922...","[-0.03115275019137005, 0.030808390814281522, -...","[1.2041052293777472, 0.27382468223571976, 0.01..."
3,2011-04-15,"[35.71248577134817, 41.119301194314076, 54.335...","[-0.037784720161673904, 0.011576903839499206, ...","[1.1826033473014839, 0.23108897209167623, 0.01..."
4,2011-04-18,"[41.5227270161251, 36.385387219612184, 50.8965...","[-0.030766611980910782, -0.014564704023122363,...","[1.1610097122192382, 0.18357330322265497, 0.01..."
...,...,...,...,...
2443,2020-12-24,"[67.92100205741967, 61.41404585699559, 62.8719...","[0.6376328089451859, 0.8196534931921744, -1.10...","[20.276958255767823, 16.262741394042962, 64.36..."
2444,2020-12-28,"[73.64746767041068, 64.36473051277122, 63.1416...","[0.8774145443665597, 0.8858006221974026, -1.16...","[20.262905635833746, 16.051132125854508, 64.87..."
2445,2020-12-29,"[68.56484282730332, 62.47885563450607, 63.4838...","[0.8423501784408707, 0.8160778529885946, -1.22...","[20.211753921508787, 15.715763854980452, 65.42..."
2446,2020-12-30,"[65.48943556221823, 56.994984681173364, 67.545...","[0.6767032297699673, 0.5607941220229089, -0.71...","[20.215961265563976, 15.502699508666979, 66.16..."


In [4]:
# Now, iterate over df_step1 to calculate combined scores, portfolio weights, and returns
dataset_returns = []

# Filter the DataFrame to only include rows where the 'date' is in the specified date range
filtered_df = df_scores[df_scores['date'].isin(date_range_eval)]

# Step 2: Create the second DataFrame with combined_scores, portfolio_weights, total_return, and annualized_return
for index, row in filtered_df.iterrows():
    date = row['date']
    rsi_scores = row['rsi_scores']
    macd_scores = row['macd_scores']
    sma_scores = row['sma_scores']

    print(f"Processing {date} for Step 2")

    # Step 2a: Combine the signals (you can later train a model to adjust these weights)
    signal_weights = [1, 1, 1]  # You can adjust these weights later based on your model
    combined_scores = combine_signals(signal_weights, [rsi_scores, macd_scores, sma_scores])
    combined_scores_with_tickers = list(zip(tickers, combined_scores))

    print(f"Combined Scores: {combined_scores_with_tickers}")

    # Step 2b: Solve the portfolio based on the combined signal scores
    portfolio_weights = portfolio_solver.SolveSignalPortfolio(tickers, data, combined_scores)

    # Step 2c: Calculate the returns for the portfolio based on the optimized weights
    cumulative_returns, total_return, annualized_return = portfolio_solver.CalculatePortfolioReturns(tickers, data, portfolio_weights, start_date=date, time_period=30)

    # Step 2d: Add the calculated values to the second dataset
    dataset_returns.append({
        'date': date,
        'combined_scores': combined_scores,
        'portfolio_weights': portfolio_weights,
        'total_return': total_return,
        'annualized_return': annualized_return
    })

# Convert the second dataset into a DataFrame
df_returns = pd.DataFrame(dataset_returns)

# You can now display or use the two DataFrames for further analysis or training your model
display(df_returns)  # The second DataFrame with combined scores, portfolio weights, and returns

Processing 2019-01-02 00:00:00 for Step 2
Combined Scores: [('AAPL', np.float64(36.56637819820095)), ('MSFT', np.float64(47.12128549431338)), ('TSLA', np.float64(42.99442018025284)), ('AMZN', np.float64(44.22458897282027)), ('GOOG', np.float64(48.274511291128995)), ('IBM', np.float64(32.95344193930719)), ('NFLX', np.float64(1.8540117612227434)), ('NVDA', np.float64(36.84821044172888)), ('AMD', np.float64(49.03939929273357)), ('INTC', np.float64(49.336187035750335))]
     pcost       dcost       gap    pres   dres
 0: -6.5219e-02 -1.0764e+00  1e+00  2e-16  4e+00
 1: -6.5580e-02 -8.4915e-02  2e-02  1e-16  7e-02
 2: -6.7034e-02 -6.7860e-02  8e-04  2e-17  1e-03
 3: -6.7341e-02 -6.7396e-02  5e-05  6e-17  2e-05
 4: -6.7387e-02 -6.7388e-02  1e-06  6e-17  1e-07
 5: -6.7388e-02 -6.7388e-02  1e-08  1e-16  1e-09
Optimal solution found.
Optimized Portfolio Weights:
{'AAPL': np.float64(0.09032081914172038), 'MSFT': np.float64(0.12421910367946547), 'TSLA': np.float64(0.1109652064771098), 'AMZN': np.

Unnamed: 0,date,combined_scores,portfolio_weights,total_return,annualized_return
0,2019-01-02,"[36.56637819820095, 47.12128549431338, 42.9944...","[0.09032081914172038, 0.12421910367946547, 0.1...",0.131942,1.832192
1,2019-01-03,"[25.84621305306367, 41.10035553688953, 40.4114...","[0.07015351182148817, 0.1271411028911172, 0.12...",0.186629,3.209641
2,2019-01-04,"[32.75342677959205, 49.894574434507895, 47.042...","[0.07149405498069726, 0.12199318719692409, 0.1...",0.121853,1.627019
3,2019-01-07,"[32.233081253986335, 50.14877646089104, 52.808...","[0.06347972227293112, 0.11265825438773104, 0.1...",0.087344,1.020613
4,2019-01-08,"[35.332311212114746, 51.63729721320271, 53.073...","[0.06905776524971373, 0.11246309024138301, 0.1...",0.079319,0.898710
...,...,...,...,...,...
247,2019-12-24,"[85.39352439244563, 88.09717616307873, 89.5700...","[0.12509755745359033, 0.12984980929054282, 0.1...",0.177234,2.937733
248,2019-12-26,"[89.81994059291887, 90.95902244410841, 90.6822...","[0.12552487731550915, 0.1274338093117356, 0.12...",0.160083,2.481027
249,2019-12-27,"[89.70329650052112, 91.67750638998781, 90.3032...","[0.1281026896585728, 0.1314722074185036, 0.129...",0.194797,3.459333
250,2019-12-30,"[91.04779307632663, 82.45888724477167, 76.9975...","[0.1485756104935649, 0.13220152183357115, 0.12...",0.221887,4.383456


Calculate all the return for each of those dates



In [5]:
average_annualized_return = df_returns['annualized_return'].mean()
print("Average return for 1/N : ", average_annualized_return)

total_return_sum = df_returns['total_return'].sum()
print(f"Total Return Sum: {total_return_sum}")

Average return for 1/N :  0.8765053368837088
Total Return Sum: 14.828915897309743


### Matrix of size signals 

In [6]:
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd


# Define the target date range
start_date = '2011-01-01'
end_date = '2018-01-01'
date_range = pd.date_range(start=start_date, end=end_date)

# Initialize an empty list to store the rows of the dataset
dataset_weighted_returns = []
# Filter the DataFrame to only include rows where the 'date' is in the specified date range
filtered_df = df_scores[df_scores['date'].isin(date_range_eval)]

# Step 2: Create the second DataFrame with combined_scores, portfolio_weights, total_return, and annualized_return
for index, row in filtered_df.iterrows():
    date = row['date']
    rsi_scores = row['rsi_scores']
    macd_scores = row['macd_scores']
    sma_scores = row['sma_scores']

    if np.any(np.isnan(rsi_scores)) or np.any(np.isnan(macd_scores)) or np.any(np.isnan(sma_scores)):
        #print(f"Skipping {date} due to NaN values in the signals.")
        continue  # Skip this date and move to the next one

    # Calculate the average scores for each signal type (RSI, MACD, SMA)
    avg_rsi = np.mean(rsi_scores)
    avg_macd = np.mean(macd_scores)
    avg_sma = np.mean(sma_scores)

    # Add the aggregated values to the dataset
    dataset_weighted_returns.append({
        'date': date,
        'rsi_avg': avg_rsi,
        'macd_avg': avg_macd,
        'sma_avg': avg_sma,
        'rsi_scores': rsi_scores,
        'macd_scores': macd_scores,
        'sma_scores': sma_scores,
        'total_return': total_return 
    })




def train_regularized_regression(X, y, alpha=1.0):
    """
    Trains a Ridge regression model to find the optimal weights for combining signals.

    Parameters:
    - X: The feature matrix (with columns as signals).
    - y: The target values (total return).
    - alpha: Regularization strength (higher values mean more regularization).

    Returns:
    - The weight matrix of size 3.
    """
    # Initialize the Ridge regression model with regularization
    model = Ridge(alpha=alpha)

    # Fit the model to the data
    model.fit(X, y)

    # Return the weight matrix (coefficients)
    return model.coef_

# Example of how to use this function

# Convert the dataset into a DataFrame
df = pd.DataFrame(dataset_weighted_returns)

# Prepare the feature matrix X (aggregated values for RSI, MACD, SMA)
X = df[['rsi_avg', 'macd_avg', 'sma_avg']].values

# Prepare the target y (total return)
y = df['total_return'].values.reshape(-1, 1)

# Train the regularized regression model (Ridge regression)
W_ridge = train_regularized_regression(X, y, alpha=1.0)

# Output the learned weight matrix W (should be of size 3)
print("Learned Weight Matrix (W):")
print(W_ridge)





Learned Weight Matrix (W):
[ 4.87141345e-34 -5.68750190e-33  1.11257411e-35]


In [7]:

# Now, iterate over df_step1 to calculate combined scores, portfolio weights, and returns
dataset_weighted_returns = []

# Filter the DataFrame to only include rows where the 'date' is in the specified date range
filtered_df = df_scores[df_scores['date'].isin(date_range_eval)]

# Step 2: Create the second DataFrame with combined_scores, portfolio_weights, total_return, and annualized_return
for index, row in filtered_df.iterrows():
    date = row['date']
    rsi_scores = row['rsi_scores']
    macd_scores = row['macd_scores']
    sma_scores = row['sma_scores']

    print(f"Processing {date} for Step 2")

    # Step 2a: Combine the signals (you can later train a model to adjust these weights)
    signal_weights = W_ridge  # You can adjust these weights later based on your model
    combined_scores = combine_signals(signal_weights, [rsi_scores, macd_scores, sma_scores])
    combined_scores_with_tickers = list(zip(tickers, combined_scores))

    print(f"Combined Scores: {combined_scores_with_tickers}")

    # Step 2b: Solve the portfolio based on the combined signal scores
    portfolio_weights = portfolio_solver.SolveSignalPortfolio(tickers, data, combined_scores)

    # Step 2c: Calculate the returns for the portfolio based on the optimized weights
    cumulative_returns, total_return, annualized_return = portfolio_solver.CalculatePortfolioReturns(tickers, data, portfolio_weights, start_date=date, time_period=30)

    # Step 2d: Add the calculated values to the second dataset
    dataset_weighted_returns.append({
        'date': date,
        'combined_scores': combined_scores,
        'portfolio_weights': portfolio_weights,
        'total_return': total_return,
        'annualized_return': annualized_return
    })


# Convert the dataset into a DataFrame
df2 = pd.DataFrame(dataset_weighted_returns)

# Display the dataset (this will include the RSI, MACD, SMA scores, portfolio weights, and returns)

# You can now use this DataFrame for training your model.
display(df2)

Processing 2019-01-02 00:00:00 for Step 2
Combined Scores: [('AAPL', np.float64(1.7417145059851118e-32)), ('MSFT', np.float64(2.3162403842429695e-32)), ('TSLA', np.float64(2.1679887790971164e-32)), ('AMZN', np.float64(2.3179809620641922e-32)), ('GOOG', np.float64(2.3972111511503804e-32)), ('IBM', np.float64(2.1987519363304424e-32)), ('NFLX', np.float64(1.242294289657495e-32)), ('NVDA', np.float64(1.8427725486109525e-32)), ('AMD', np.float64(2.293443284975506e-32)), ('INTC', np.float64(2.4816112118808206e-32))]
     pcost       dcost       gap    pres   dres
 0: -6.1284e-02 -1.0705e+00  1e+00  0e+00  4e+00
 1: -6.1414e-02 -7.9178e-02  2e-02  2e-16  6e-02
 2: -6.1845e-02 -6.2390e-02  5e-04  1e-16  1e-03
 3: -6.1857e-02 -6.1866e-02  9e-06  2e-16  1e-05
 4: -6.1857e-02 -6.1857e-02  9e-08  2e-16  1e-07
 5: -6.1857e-02 -6.1857e-02  9e-10  1e-16  1e-09
Optimal solution found.
Optimized Portfolio Weights:
{'AAPL': np.float64(0.07867343743886944), 'MSFT': np.float64(0.11287139029494211), 'TSLA'

Unnamed: 0,date,combined_scores,portfolio_weights,total_return,annualized_return
0,2019-01-02,"[1.7417145059851118e-32, 2.3162403842429695e-3...","[0.07867343743886944, 0.11287139029494211, 0.1...",0.126028,1.710280
1,2019-01-03,"[1.3479576757248283e-32, 2.0993408890156394e-3...","[0.06727305071648607, 0.11870821075722877, 0.1...",0.180864,3.040901
2,2019-01-04,"[1.701299407595349e-32, 2.3887532554910895e-32...","[0.0769011740664487, 0.11844698420065503, 0.11...",0.121771,1.625397
3,2019-01-07,"[1.6841337563245672e-32, 2.2979106094703249e-3...","[0.07168694214586685, 0.10981995251925289, 0.1...",0.088132,1.032936
4,2019-01-08,"[1.8061239873839155e-32, 2.2686613833687447e-3...","[0.07847314804511686, 0.10843259620581971, 0.1...",0.080069,0.909814
...,...,...,...,...,...
247,2019-12-24,"[3.558343726347654e-32, 3.568627284659956e-32,...","[0.11757863936884598, 0.11799068943867899, 0.1...",0.178283,2.967303
248,2019-12-26,"[3.726206200078991e-32, 3.679988814905425e-32,...","[0.11733878078283169, 0.1155733051226414, 0.12...",0.161143,2.507841
249,2019-12-27,"[3.704392592350939e-32, 3.704776207749125e-32,...","[0.11830358733983287, 0.11831842741464724, 0.1...",0.194945,3.463982
250,2019-12-30,"[3.7569139050456915e-32, 3.3204299301779555e-3...","[0.1345648680248915, 0.1160263786721775, 0.116...",0.220168,4.320169


In [8]:
average_annualized_return = df2['annualized_return'].mean()
print("Average return for trained matrix : ", average_annualized_return)

total_return_sum = df2['total_return'].sum()
print(f"Total Return Sum: {total_return_sum}")

Average return for trained matrix :  0.8311076613052832
Total Return Sum: 14.206047371171156


In [None]:
average_annualized_return = df2['annualized_return'].mean()
print("Average return for trained matrix : ", average_annualized_return)

total_return_sum = df2['total_return'].sum()
print(f"Total Return Sum: {total_return_sum}")

Average return for trained matrix :  1.3499884138215879
Total Return Sum: 18.341571994950606
