In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from Utils.Solver import *
from Utils.Signals import *

# Prepare data

In [2]:
# Download historical stock data
tickers = ['AAPL', 'MSFT', 'TSLA', 'AMZN', 'GOOG']
# Download historical stock data
data = yf.download(tickers, start='2010-01-01', end='2024-01-01')
# Initialize the portfolio solver with appropriate penalty and max weight threshold
portfolio_solver = Portfolio_Solver(0.8, max_weight_threshold=0.3)

start_date_eval = '2020-01-01'
end_date_eval = '2021-01-01'
date_range_eval = pd.date_range(start=start_date_eval, end=end_date_eval)


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  5 of 5 completed


In [3]:
# Initialize an empty list to store the rows for the first DataFrame
dataset_scores = []

# Step 1: Create the DataFrame with rsi_scores, macd_scores, and sma_scores
for date in date_range_eval:
    #print(f"processing {date}")

    # Step 1a: Calculate the RSI signal scores
    rsi_signal_scores = calculate_rsi_signal(data, tickers, date=date, period=14)
    rsi_scores = np.array([score[1] for score in rsi_signal_scores])

    # Step 1b: Calculate the MACD signal scores
    macd_signal_scores = calculate_macd_signal(data, tickers, date=date)
    macd_scores = np.array([score[1] for score in macd_signal_scores])

    # Step 1c: Calculate the SMA signal scores
    sma_signal_scores = calculate_sma_signal(data, tickers, date=date)
    sma_scores = np.array([score[1] for score in sma_signal_scores])

    if np.any(np.isnan(rsi_scores)) or np.any(np.isnan(macd_scores)) or np.any(np.isnan(sma_scores)):
        print(f"Skipping {date} due to NaN values in the signals.")
        continue  # Skip this date and move to the next one

    # Step 1d: Add the scores to the first dataset
    dataset_scores.append({
        'date': date,
        'rsi_scores': rsi_scores,
        'macd_scores': macd_scores,
        'sma_scores': sma_scores
    })


# Convert the first dataset into a DataFrame
df_scores = pd.DataFrame(dataset_scores)
display(df_scores)

Skipping 2020-01-01 00:00:00 due to NaN values in the signals.
Skipping 2020-01-04 00:00:00 due to NaN values in the signals.
Skipping 2020-01-05 00:00:00 due to NaN values in the signals.
Skipping 2020-01-11 00:00:00 due to NaN values in the signals.
Skipping 2020-01-12 00:00:00 due to NaN values in the signals.
Skipping 2020-01-18 00:00:00 due to NaN values in the signals.
Skipping 2020-01-19 00:00:00 due to NaN values in the signals.
Skipping 2020-01-20 00:00:00 due to NaN values in the signals.
Skipping 2020-01-25 00:00:00 due to NaN values in the signals.
Skipping 2020-01-26 00:00:00 due to NaN values in the signals.
Skipping 2020-02-01 00:00:00 due to NaN values in the signals.
Skipping 2020-02-02 00:00:00 due to NaN values in the signals.
Skipping 2020-02-08 00:00:00 due to NaN values in the signals.
Skipping 2020-02-09 00:00:00 due to NaN values in the signals.
Skipping 2020-02-15 00:00:00 due to NaN values in the signals.
Skipping 2020-02-16 00:00:00 due to NaN values in the s

Unnamed: 0,date,rsi_scores,macd_scores,sma_scores
0,2020-01-02,"[84.60799236858566, 76.61355921829467, 75.7266...","[0.33128680226364815, 0.1436935901958516, 0.16...","[11.474995079040532, 13.13599018096923, 5.6645..."
1,2020-01-03,"[77.73353553940395, 65.79059516925966, 78.6274...","[0.30404789862667503, 0.04923520754366528, 0.1...","[11.622064704895017, 13.367627143859863, 5.858..."
2,2020-01-06,"[79.21003592837648, 66.82519134695298, 80.3217...","[0.2906742686162922, -0.010599857637790144, 0....","[11.77049680709839, 13.601333923339837, 6.0613..."
3,2020-01-07,"[75.98196906751427, 59.922808151470505, 83.257...","[0.22762575342557567, -0.15887903693712024, 0....","[11.907038326263432, 13.747934837341319, 6.218..."
4,2020-01-08,"[79.10395137908837, 66.3937931666432, 86.18215...","[0.23217631551737394, -0.11322353000716534, 0....","[12.043761997222909, 13.918324775695794, 6.360..."
...,...,...,...,...
248,2020-12-24,"[67.92089154472352, 61.41409505928145, 62.8719...","[0.6376323142443843, 0.8196543347705583, -1.10...","[20.276957988739014, 16.262740097045878, 64.36..."
249,2020-12-28,"[73.6473568620839, 64.36474042255928, 63.14165...","[0.877413223624091, 0.8858005927771362, -1.163...","[20.262904987335205, 16.05113052368165, 64.874..."
250,2020-12-29,"[68.56475621171795, 62.478896839657615, 63.483...","[0.8423484078778318, 0.8160782682995822, -1.22...","[20.211752891540527, 15.715762100219706, 65.42..."
251,2020-12-30,"[65.48948210128901, 56.99497633472064, 67.5459...","[0.6767041941510539, 0.5607938349184944, -0.71...","[20.215960521697994, 15.502698059082036, 66.16..."


In [4]:
# Now, iterate over df_step1 to calculate combined scores, portfolio weights, and returns
dataset_returns = []

# Step 2: Create the second DataFrame with combined_scores, portfolio_weights, total_return, and annualized_return
for index, row in df_scores.iterrows():
    date = row['date']
    rsi_scores = row['rsi_scores']
    macd_scores = row['macd_scores']
    sma_scores = row['sma_scores']

    print(f"Processing {date} for Step 2")

    # Step 2a: Combine the signals (you can later train a model to adjust these weights)
    signal_weights = [1, 1, 1]  # You can adjust these weights later based on your model
    combined_scores = combine_signals(signal_weights, [rsi_scores, macd_scores, sma_scores])
    combined_scores_with_tickers = list(zip(tickers, combined_scores))

    print(f"Combined Scores: {combined_scores_with_tickers}")

    # Step 2b: Solve the portfolio based on the combined signal scores
    portfolio_weights = portfolio_solver.SolveSignalPortfolio(tickers, data, combined_scores)

    # Step 2c: Calculate the returns for the portfolio based on the optimized weights
    cumulative_returns, total_return, annualized_return = portfolio_solver.CalculatePortfolioReturns(tickers, data, portfolio_weights, start_date=date, time_period=30)

    # Step 2d: Add the calculated values to the second dataset
    dataset_returns.append({
        'date': date,
        'combined_scores': combined_scores,
        'portfolio_weights': portfolio_weights,
        'total_return': total_return,
        'annualized_return': annualized_return
    })

# Convert the second dataset into a DataFrame
df_returns = pd.DataFrame(dataset_returns)

# You can now display or use the two DataFrames for further analysis or training your model
display(df_returns)  # The second DataFrame with combined scores, portfolio weights, and returns

Processing 2020-01-02 00:00:00 for Step 2
Combined Scores: [('AAPL', np.float64(96.41427424988984)), ('MSFT', np.float64(89.89324298945975)), ('TSLA', np.float64(81.55581156073055)), ('AMZN', np.float64(70.60623838272964)), ('GOOG', np.float64(68.40737207874147))]
     pcost       dcost       gap    pres   dres
 0: -1.2152e-01 -1.1410e+00  1e+00  0e+00  3e+00
 1: -1.2182e-01 -1.4598e-01  2e-02  1e-16  6e-02
 2: -1.2220e-01 -1.2271e-01  5e-04  1e-16  9e-04
 3: -1.2220e-01 -1.2221e-01  5e-06  5e-17  9e-06
 4: -1.2220e-01 -1.2220e-01  5e-08  4e-17  9e-08
Optimal solution found.
Optimized Portfolio Weights:
{'AAPL': np.float64(0.24620214076499705), 'MSFT': np.float64(0.2261683489356103), 'TSLA': np.float64(0.20055425877283833), 'AMZN': np.float64(0.16691526736391724), 'GOOG': np.float64(0.16015998416263705)}
Total Portfolio Return: 22.20%
Annualized Portfolio Return: 438.63%
Processing 2020-01-03 00:00:00 for Step 2
Combined Scores: [('AAPL', np.float64(89.65964814292565)), ('MSFT', np.flo

Unnamed: 0,date,combined_scores,portfolio_weights,total_return,annualized_return
0,2020-01-02,"[96.41427424988984, 89.89324298945975, 81.5558...","[0.24620214076499705, 0.2261683489356103, 0.20...",0.221965,4.386326
1,2020-01-03,"[89.65964814292565, 79.20745752066318, 84.6649...","[0.24316283846470066, 0.20898700098665746, 0.2...",0.227135,4.580775
2,2020-01-06,"[91.27120700409117, 80.41592541265503, 86.5845...","[0.23504303420561917, 0.20114163714187633, 0.2...",0.237230,4.978393
3,2020-01-07,"[88.11663314720327, 73.5118639518747, 89.74124...","[0.22978959341023, 0.1834162898280712, 0.23494...",0.256517,5.807922
4,2020-01-08,"[91.37988969182865, 80.19889441233184, 92.9160...","[0.23207002787722572, 0.19755671578223308, 0.2...",0.223225,4.433181
...,...,...,...,...,...
248,2020-12-24,"[88.83548184770692, 78.49648949109789, 126.137...","[0.2160075845956291, 0.18504866739481174, 0.32...",0.134799,1.892806
249,2020-12-28,"[94.78767507304319, 81.30167153901806, 126.852...","[0.21480949404308738, 0.17713348096455309, 0.3...",0.104820,1.310192
250,2020-12-29,"[89.61885751113631, 79.0107372081769, 127.6798...","[0.20481326746724762, 0.17465120350737012, 0.3...",0.101063,1.245038
251,2020-12-30,"[86.38214681713806, 73.05846822872117, 132.993...","[0.20323288376512316, 0.16417396324876404, 0.3...",0.109215,1.388545


Calculate all the return for each of those dates



In [5]:
average_annualized_return = df_returns['annualized_return'].mean()
print("Average return for 1/N : ", average_annualized_return)

total_return_sum = df_returns['total_return'].sum()
print(f"Total Return Sum: {total_return_sum}")

Average return for 1/N :  1.8176891990558766
Total Return Sum: 23.11488333031417


### Matrix of size signals 

In [6]:

# Define the target date range
start_date = '2011-01-01'
end_date = '2020-01-01'
date_range = pd.date_range(start=start_date, end=end_date)

# Initialize an empty list to store the rows of the dataset
dataset_weighted_returns = []

# Step 2: Create the second DataFrame with combined_scores, portfolio_weights, total_return, and annualized_return
for index, row in df_scores.iterrows():
    date = row['date']
    rsi_scores = row['rsi_scores']
    macd_scores = row['macd_scores']
    sma_scores = row['sma_scores']

    if np.any(np.isnan(rsi_scores)) or np.any(np.isnan(macd_scores)) or np.any(np.isnan(sma_scores)):
        #print(f"Skipping {date} due to NaN values in the signals.")
        continue  # Skip this date and move to the next one

    # Calculate the average scores for each signal type (RSI, MACD, SMA)
    avg_rsi = np.mean(rsi_scores)
    avg_macd = np.mean(macd_scores)
    avg_sma = np.mean(sma_scores)

    # Add the aggregated values to the dataset
    dataset_weighted_returns.append({
        'date': date,
        'rsi_avg': avg_rsi,
        'macd_avg': avg_macd,
        'sma_avg': avg_sma,
        'rsi_scores': rsi_scores,
        'macd_scores': macd_scores,
        'sma_scores': sma_scores,
        'total_return': total_return 
    })

# Convert the dataset into a DataFrame
df = pd.DataFrame(dataset_weighted_returns)

# Prepare the feature matrix X (aggregated values for RSI, MACD, SMA)
X = df[['rsi_avg', 'macd_avg', 'sma_avg']].values

# Prepare the target y (total return)
y = df['total_return'].values.reshape(-1, 1)

# Initialize and train a linear model to learn the weight matrix W (size should be 3)
W, _, _, _ = np.linalg.lstsq(X, y, rcond=None)

# Output the learned weight matrix W
print("Learned Weight Matrix (W):")
print(W)


Learned Weight Matrix (W):
[[ 0.00170633]
 [-0.01155097]
 [ 0.00070175]]


In [None]:

# Now, iterate over df_step1 to calculate combined scores, portfolio weights, and returns
dataset_weighted_returns = []

# Step 2: Create the second DataFrame with combined_scores, portfolio_weights, total_return, and annualized_return
for index, row in df_scores.iterrows():
    date = row['date']
    rsi_scores = row['rsi_scores']
    macd_scores = row['macd_scores']
    sma_scores = row['sma_scores']

    print(f"Processing {date} for Step 2")

    # Step 2a: Combine the signals (you can later train a model to adjust these weights)
    signal_weights = W  # You can adjust these weights later based on your model
    combined_scores = combine_signals(signal_weights, [rsi_scores, macd_scores, sma_scores])
    combined_scores_with_tickers = list(zip(tickers, combined_scores))

    print(f"Combined Scores: {combined_scores_with_tickers}")

    # Step 2b: Solve the portfolio based on the combined signal scores
    portfolio_weights = portfolio_solver.SolveSignalPortfolio(tickers, data, combined_scores)

    # Step 2c: Calculate the returns for the portfolio based on the optimized weights
    cumulative_returns, total_return, annualized_return = portfolio_solver.CalculatePortfolioReturns(tickers, data, portfolio_weights, start_date=date, time_period=30)

    # Step 2d: Add the calculated values to the second dataset
    dataset_weighted_returns.append({
        'date': date,
        'combined_scores': combined_scores,
        'portfolio_weights': portfolio_weights,
        'total_return': total_return,
        'annualized_return': annualized_return
    })


# Convert the dataset into a DataFrame
df2 = pd.DataFrame(dataset_weighted_returns)

# Display the dataset (this will include the RSI, MACD, SMA scores, portfolio weights, and returns)

# You can now use this DataFrame for training your model.
display(df2)

Processing 2020-01-02 00:00:00 for Step 2
Combined Scores: [('AAPL', np.float64(0.14859482332094498)), ('MSFT', np.float64(0.13828617766190732)), ('TSLA', np.float64(0.13128806427162068)), ('AMZN', np.float64(0.1151909513474473)), ('GOOG', np.float64(0.11286415975393543))]
     pcost       dcost       gap    pres   dres
 0: -1.2096e-01 -1.1369e+00  1e+00  1e-16  3e+00
 1: -1.2114e-01 -1.4281e-01  2e-02  5e-17  6e-02
 2: -1.2139e-01 -1.2177e-01  4e-04  5e-17  8e-04
 3: -1.2139e-01 -1.2139e-01  4e-06  9e-17  8e-06
 4: -1.2139e-01 -1.2139e-01  4e-08  1e-16  8e-08
Optimal solution found.
Optimized Portfolio Weights:
{'AAPL': np.float64(0.23742890139320502), 'MSFT': np.float64(0.21748875612821789), 'TSLA': np.float64(0.20395222097215962), 'AMZN': np.float64(0.1728154242741844), 'GOOG': np.float64(0.16831469723223305)}
Total Portfolio Return: 22.77%
Annualized Portfolio Return: 460.37%
Processing 2020-01-03 00:00:00 for Step 2
Combined Scores: [('AAPL', np.float64(0.13728258637078022)), ('MS

In [8]:
average_annualized_return = df2['annualized_return'].mean()
print("Average return for trained matrix : ", average_annualized_return)

total_return_sum = df2['total_return'].sum()
print(f"Total Return Sum: {total_return_sum}")

KeyError: 'annualized_return'

### Matrix of size tickers x signals 

In [None]:
# Create the feature matrix dynamically
X = np.array([[
    row['rsi_scores'][i],  # Get RSI score for the i-th ticker
    row['macd_scores'][i],  # Get MACD score for the i-th ticker
    row['sma_scores'][i]    # Get SMA score for the i-th ticker
] for _, row in df.iterrows() for i in range(len(tickers))]).reshape(len(df), len(tickers) * 3)

# Ensure the shape of X is correct (it should be 2D with each row representing a data point)
#print("Shape of X:", X.shape)
#print(X)

# Target (returns), you can choose 'total_return' or 'annualized_return' based on your preference
y = df['total_return'].values.reshape(-1, 1)

# Initialize and train a linear model to learn the weight matrix W
# Using np.linalg.lstsq to solve the least squares problem, i.e., Y = XW
W2, _, _, _ = np.linalg.lstsq(X, y, rcond=None)

# Output the learned weight matrix W
print("Learned Weight Matrix (W):", W2)
