In [131]:
import pandas as pd
import os
import numpy as np
from feature_generation import prepare_features, combine_stocks, ml_preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, max_error, root_mean_squared_error, r2_score
import matplotlib.pyplot as plt
import plotly.express as px
import yfinance as yf

In [43]:
# Aktien auswählen
relevant_stocks = ['MSFT', 'AAPL', 'NVDA', 'GOOG', 'AMZN', 'BRK-B', 'LLY', 'JPM', 'XOM', 'WMT', 'UNH', 'MA', 'PG', 'JNJ', 'COST', 'HD', 'MRK', 'ORCL', 'CVX', 'BAC', 'KO', 'CRM', 'NFLX', 'PEP', 'AMD', 'TMO', 'ADBE', 'WFC', 'LIN', 'QCOM', 'CSCO', 'MCD', 'ACN', 'DIS', 'DHR', 'ABT', 'INTU', 'GE', 'CAT', 'AMAT', 'AXP', 'TXN', 'VZ', 'AMGN', 'PFE', 'MS', 'CMCSA', 'IBM', 'NEE', 'UNP']

# Feature-Generation für jede Aktie
for symbol in relevant_stocks:
    file_path = os.path.join('data', 'stock_dataframes', f'{symbol}.csv')
    if not os.path.exists(file_path): # nicht nochmals erstellen wenn schon vorhanden
        prepare_features(symbol, option_volume=False)

# Zusammenführen, Clean-up NaN-Values (im Normalfall 1 Zeile), One-Hot Encoding der kategorischen Variable 'Sector', abspeichern 
stock_df = combine_stocks(relevant_stocks, return_df=True)

# Vorbereitung des Trainings-, Validierungs- und Testdatensatzes
X_train_scaled, y_train, X_valid_scaled, y_valid, X_test_scaled, y_test, df_train, df_valid, df_test = ml_preprocessing(stock_df)

Combining Data of all Stocks into one DataFrame...
Checking for NaN Values...
Column VWAP_D has 1 NaN Value(s)
Dropped 1 Rows. (1 is the bugged AMD Row)
Saved DataFrame as combined_dataframe.csv at Path data\ML_data\combined_dataframe.csv
-------------------------------------------------------------------------------------
Preparing Train, Validation and Test Data for ML...
Data Preparation finished
-------------------------------------------------------------------------------------


In [5]:
len(stock_df.columns)
stock_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Absolute change', 'Relative change',
       'Volume', 'RSI_14', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9',
       'STOCHk_14_3_3', 'STOCHd_14_3_3', 'MOM_10', 'TSI_13_25_13',
       'TSIs_13_25_13', 'ADX_14', 'DMP_14', 'DMN_14', 'OBV', 'VWAP_D',
       'PVO_12_26_9', 'PVOh_12_26_9', 'PVOs_12_26_9', 'AD', 'MFI_14', 'CMF_20',
       'BBL_5_2.0', 'BBM_5_2.0', 'BBU_5_2.0', 'BBB_5_2.0', 'BBP_5_2.0',
       'ATRr_14', 'KCLe_20_2', 'KCBe_20_2', 'KCUe_20_2', 'ISA_9', 'ISB_26',
       'ITS_9', 'IKS_26', 'CDL_HAMMER_bullish', 'CDL_MORNINGSTAR_bullish',
       'CDL_HANGINGMAN_bearish', 'CDL_DARKCLOUDCOVER_bearish',
       'CDL_ENGULFING_bullish', 'CDL_ENGULFING_bearish',
       'CDL_DOJI_10_0.1_continuation', 'DFF', 'VIX_open', 'VIX_high',
       'VIX_low', 'VIX_close', 'VIX_abs_change', 'VIX_rel_change',
       'S_P_500_open', 'S_P_500_high', 'S_P_500_low', 'S_P_500_close',
       'S_P_500_abs_change', 'S_P_rel_change', 'Symbol',
       'Sector_Basic

In [3]:
def calculate_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mae, mape, mse, rmse, r2

def metrics(model_name, y_train, y_pred_train, y_valid, y_pred_valid, y_test, y_pred_test, df_train, df_valid, df_test):
    mae_train, mape_train, mse_train, rmse_train, r2_train = calculate_metrics(y_train, y_pred_train)
    mae_valid, mape_valid, mse_valid, rmse_valid, r2_valid = calculate_metrics(y_valid, y_pred_valid)
    mae_test, mape_test, mse_test, rmse_test, r2_test = calculate_metrics(y_test, y_pred_test)

    # print("-------------------------------------------------------------")
    print(model_name)
    print("-------------------------------------------------------------")
    print(f"MAE on training set:\t\t{mae_train}")
    print(f"MAE on validation set:\t\t{mae_valid}")
    print(f"MAE on test set:\t\t{mae_test}")
    print("-------------------------------------------------------------")
    print(f"MAPE on training set:\t\t{(mape_train)*100:.4f}%")
    print(f"MAPE on validation set:\t\t{(mape_valid)*100:.4f}%")
    print(f"MAPE on test set:\t\t{(mape_test)*100:.4f}%")
    print("-------------------------------------------------------------")
    print(f"MSE on training set:\t\t{mse_train}")
    print(f"MSE on validation set:\t\t{mse_valid}")
    print(f"MSE on test set:\t\t{mse_test}")
    print("-------------------------------------------------------------")
    print(f"RMSE on training set:\t\t{rmse_train}")
    print(f"RMSE on validation set:\t\t{rmse_valid}")
    print(f"RMSE on test set:\t\t{rmse_test}")
    print("-------------------------------------------------------------")
    print(f"R2 on training set:\t\t{r2_train}")
    print(f"R2 on validation set:\t\t{r2_valid}")
    print(f"R2 on test set:\t\t\t{r2_test}")
    print("-------------------------------------------------------------")
    print(f"Mean Close on training set:\t{df_train['Close'].mean()}")
    print(f"Mean Close on validation set:\t{df_valid['Close'].mean()}")
    print(f"Mean Close on test set:\t\t{df_test['Close'].mean()}")
    print("-------------------------------------------------------------")


def metrics_no_valid(model_name, y_train_plus_valid, y_pred_train_plus_valid, y_test, y_pred_test, df_train_plus_valid, df_test):
    mae_train, mape_train, mse_train, rmse_train, r2_train = calculate_metrics(y_train_plus_valid, y_pred_train_plus_valid)
    mae_test, mape_test, mse_test, rmse_test, r2_test = calculate_metrics(y_test, y_pred_test)

    # print("------------------------------------------------------------------")
    print(model_name)
    print("Bigger Training Set as Model doesn't have dedicated Validation Set")
    print("------------------------------------------------------------------")
    print(f"MAE on training set:\t\t{mae_train}")
    print(f"MAE on test set:\t\t{mae_test}")
    print("------------------------------------------------------------------")
    print(f"MAPE on training set:\t\t{(mape_train)*100:.4f}%")
    print(f"MAPE on test set:\t\t{(mape_test)*100:.4f}%")
    print("------------------------------------------------------------------")
    print(f"MSE on training set:\t\t{mse_train}")
    print(f"MSE on test set:\t\t{mse_test}")
    print("------------------------------------------------------------------")
    print(f"RMSE on training set:\t\t{rmse_train}")
    print(f"RMSE on test set:\t\t{rmse_test}")
    print("------------------------------------------------------------------")
    print(f"R2 on training set:\t\t{r2_train}")
    print(f"R2 on test set:\t\t\t{r2_test}")
    print("------------------------------------------------------------------")
    print(f"Mean Close on training set:\t{df_train_plus_valid['Close'].mean()}")
    print(f"Mean Close on test set:\t\t{df_test['Close'].mean()}")
    print("------------------------------------------------------------------")


In [4]:
###################
# Stable Baseline #
###################

# Ansatz: Close morgen = Close heute
# Wir predicten für den Close morgen, dass er gleich sein wird wie der heutige
# Offensichtlich wird er in fast allen Fällen nicht exakt gleich sein, aber auch nicht extrem verschieden
model_name = "Stable Baseline: Prediction for Close tomorrow = Close today"
print("-------------------------------------------------------------")
metrics(model_name, df_train['Next close'], df_train['Close'], df_valid['Next close'], df_valid['Close'], df_test['Next close'], df_test['Close'], df_train, df_valid, df_test)

-------------------------------------------------------------
Stable Baseline: Prediction for Close tomorrow = Close today
-------------------------------------------------------------
MAE on training set:		0.44450316482212837
MAE on validation set:		0.48747105839521593
MAE on test set:		1.558719346635306
-------------------------------------------------------------
MAPE on training set:		1.4332%
MAPE on validation set:		1.0897%
MAPE on test set:		1.2200%
-------------------------------------------------------------
MSE on training set:		0.5966152170874464
MSE on validation set:		0.6173514178382743
MSE on test set:		9.616506198228505
-------------------------------------------------------------
RMSE on training set:		0.7724087111675051
RMSE on validation set:		0.7857171360217838
RMSE on test set:		3.1010492092562
-------------------------------------------------------------
R2 on training set:		0.9991986857297639
R2 on validation set:		0.999508635500174
R2 on test set:			0.999066960867

# Linear Regression

In [5]:
# Train - Validation - Test Split
# Bedenke: Einige ML Modelle verwenden kein explizites Validation Set für das Finetuning, sondern arbeiten mit Cross-Validation auf dem Training Set

# Vorbereitung des Trainings-, Validierungs- und Testdatensatzes
X_train_scaled, y_train, X_valid_scaled, y_valid, X_test_scaled, y_test, df_train, df_valid, df_test = ml_preprocessing(stock_df)

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X_train_scaled, y_train)

y_pred_train = model_LinearRegression.predict(X_train_scaled)
y_pred_valid = model_LinearRegression.predict(X_valid_scaled)
y_pred_test = model_LinearRegression.predict(X_test_scaled)

model_name = "Linear Regression"
metrics(model_name, y_train, y_pred_train, y_valid, y_pred_valid, y_test, y_pred_test, df_train, df_valid, df_test)


Preparing Train, Validation and Test Data for ML...
Data Preparation finished
-------------------------------------------------------------------------------------
Linear Regression
-------------------------------------------------------------
MAE on training set:		0.4431751335350506
MAE on validation set:		0.5269566186357568
MAE on test set:		1.7311975154534747
-------------------------------------------------------------
MAPE on training set:		1.5448%
MAPE on validation set:		1.2923%
MAPE on test set:		1.5182%
-------------------------------------------------------------
MSE on training set:		0.5819651690939105
MSE on validation set:		0.6602807375771358
MSE on test set:		10.113221633677616
-------------------------------------------------------------
RMSE on training set:		0.7628664162839458
RMSE on validation set:		0.8125766041285805
RMSE on test set:		3.180129185061138
-------------------------------------------------------------
R2 on training set:		0.9992183622183626
R2 on valida

In [6]:
# Trainings-Datensatz erweitern mit Validierungs-Datensatz, da dieses Modell keine Validierungsdaten verwendet

# Vorbereitung des Trainings-, Validierungs- und Testdatensatzes
X_train_scaled, y_train, X_valid_scaled, y_valid, X_test_scaled, y_test, df_train, df_valid, df_test = ml_preprocessing(stock_df)

X_train_plus_valid = pd.concat([X_train_scaled, X_valid_scaled])
y_train_plus_valid = pd.concat([y_train, y_valid])

df_train_plus_valid = pd.concat([df_train, df_valid])

model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X_train_plus_valid, y_train_plus_valid)

y_pred_train_plus_valid = model_LinearRegression.predict(X_train_plus_valid)
y_pred_test = model_LinearRegression.predict(X_test_scaled)

model_name = "Linear Regression"
metrics_no_valid(model_name, y_train_plus_valid, y_pred_train_plus_valid, y_test, y_pred_test, df_train_plus_valid, df_test)


Preparing Train, Validation and Test Data for ML...
Data Preparation finished
-------------------------------------------------------------------------------------
Linear Regression
Bigger Training Set as Model doesn't have dedicated Validation Set
------------------------------------------------------------------
MAE on training set:		0.46145773434655607
MAE on test set:		1.560037862314247
------------------------------------------------------------------
MAPE on training set:		1.3200%
MAPE on test set:		1.2228%
------------------------------------------------------------------
MSE on training set:		0.5970263599942375
MSE on test set:		9.517915574956188
------------------------------------------------------------------
RMSE on training set:		0.7726748086965418
RMSE on test set:		3.0851119225979775
------------------------------------------------------------------
R2 on training set:		0.9994155075848459
R2 on test set:			0.9990765265982655
----------------------------------------------

In [7]:
# Training mit allen bekannten Jahren und anschliessend Prediction für folgendes Jahr
# Anschliessend neues Training mit alten Jahren + gerade bearbeitetes Jahr, Prediction für das nächste Jahr
# So wird das Modell jedes Jahr aktualisiert

# Vorbereitung des Trainings-, Validierungs- und Testdatensatzes
X_train_scaled, y_train, X_valid_scaled, y_valid, X_test_scaled, y_test, df_train, df_valid, df_test = ml_preprocessing(stock_df)

X_train = pd.concat([X_train_scaled, X_valid_scaled])
y_train = pd.concat([y_train, y_valid])
df_train = pd.concat([df_train, df_valid])

df_test_year_list = []

for year in range(2015, 2022+1):
    start = f'{year}-01-01'
    end = f'{year+1}-01-01'

    df_test_year = df_test.loc[(df_test.index >= start) & (df_test.index < end)]
    X_test_year = X_test_scaled.loc[(X_test_scaled.index >= start) & (X_test_scaled.index < end)]
    y_test_year = y_test.loc[(y_test.index >= start) & (y_test.index < end)]

    print(f"X_train: {X_train.index[0]} bis {X_train.index[-1]}")
    print(f"y_train: {y_train.index[0]} bis {y_train.index[-1]}")
    print(f"X_test: {X_test_year.index[0]} bis {X_test_year.index[-1]}")
    print(f"y_test: {y_test_year.index[0]} bis {y_test_year.index[-1]}")

    model_LinearRegression = LinearRegression()
    model_LinearRegression.fit(X_train, y_train)

    y_pred_train = model_LinearRegression.predict(X_train)
    y_pred_test_year = model_LinearRegression.predict(X_test_year)

    model_name = f"Linear Regression {year}"
    metrics_no_valid(model_name, y_train, y_pred_train, y_test_year, y_pred_test_year, df_train, df_test_year)

    # Training Set erweitern um Daten des abgeschlossenen Jahres
    X_train = pd.concat([X_train, X_test_year])
    y_train = pd.concat([y_train, y_test_year])
    df_train = pd.concat([df_train, df_test_year])

    # Predictions anfügen für spätere Auswertung
    df_test_year_with_predictions = df_test_year.copy()
    df_test_year_with_predictions.loc[:, 'Prediction'] = y_pred_test_year
    df_test_year_list.append(df_test_year_with_predictions)

df_test_all_years = pd.concat(df_test_year_list)

Preparing Train, Validation and Test Data for ML...
Data Preparation finished
-------------------------------------------------------------------------------------
X_train: 2003-04-24 00:00:00 bis 2014-12-31 00:00:00
y_train: 2003-04-24 00:00:00 bis 2014-12-31 00:00:00
X_test: 2015-01-02 00:00:00 bis 2015-12-31 00:00:00
y_test: 2015-01-02 00:00:00 bis 2015-12-31 00:00:00
Linear Regression 2015
Bigger Training Set as Model doesn't have dedicated Validation Set
------------------------------------------------------------------
MAE on training set:		0.46145773434655607
MAE on test set:		0.7555103397944662
------------------------------------------------------------------
MAPE on training set:		1.3200%
MAPE on test set:		1.1090%
------------------------------------------------------------------
MSE on training set:		0.5970263599942375
MSE on test set:		1.4785836262653798
------------------------------------------------------------------
RMSE on training set:		0.7726748086965418
RMSE on tes

# RandomForestRegressor

In [8]:
# Vorbereitung des Trainings-, Validierungs- und Testdatensatzes
X_train_scaled, y_train, X_valid_scaled, y_valid, X_test_scaled, y_test, df_train, df_valid, df_test = ml_preprocessing(stock_df)

X_train = pd.concat([X_train_scaled, X_valid_scaled])
y_train = pd.concat([y_train, y_valid])
df_train = pd.concat([df_train, df_valid])

model_RandomForestRegressor = RandomForestRegressor(n_estimators=100, random_state=6593, n_jobs=-1)
model_RandomForestRegressor.fit(X_train, y_train)

y_pred_train = model_RandomForestRegressor.predict(X_train)
y_pred_test = model_RandomForestRegressor.predict(X_test_scaled)

model_name = "RandomForestRegressor"
metrics_no_valid(model_name, y_train, y_pred_train, y_test, y_pred_test, df_train, df_test)


Preparing Train, Validation and Test Data for ML...
Data Preparation finished
-------------------------------------------------------------------------------------


In [5]:
# Vorbereitung des Trainings-, Validierungs- und Testdatensatzes
X_train_scaled, y_train, X_valid_scaled, y_valid, X_test_scaled, y_test, df_train, df_valid, df_test = ml_preprocessing(stock_df)

X_train = pd.concat([X_train_scaled, X_valid_scaled])
y_train = pd.concat([y_train, y_valid])
df_train = pd.concat([df_train, df_valid])

model_RandomForestRegressor = RandomForestRegressor(n_estimators=100, random_state=6593, n_jobs=-1)

# GridSearch
param_grid = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True]
}

kf = KFold(n_splits=5, shuffle=True, random_state=6593)
grid_search = GridSearchCV(estimator=model_RandomForestRegressor, param_grid=param_grid, 
                           cv=kf, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

grid_search.fit(X_train, y_train)

# Beste Parameter ausgeben
best_params = grid_search.best_params_
print(f"Best parameters found: {best_params}")

# Bestes Modell mit allen Trainingsdaten fitten
best_rf_regressor = grid_search.best_estimator_
best_rf_regressor.fit(X_train, y_train)

y_pred_train = best_rf_regressor.predict(X_train)
y_pred_test = best_rf_regressor.predict(X_test_scaled)

model_name = "RandomForestRegressor optimized"
metrics_no_valid(model_name, y_train, y_pred_train, y_test, y_pred_test, df_train, df_test)

Preparing Train, Validation and Test Data for ML...
Data Preparation finished
-------------------------------------------------------------------------------------
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters found: {'bootstrap': True, 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
RandomForestRegressor optimized
Bigger Training Set as Model doesn't have dedicated Validation Set
------------------------------------------------------------------
MAE on training set:		0.24435416562341272
MAE on test set:		19.735089936577303
------------------------------------------------------------------
MAPE on training set:		0.6920%
MAPE on test set:		6.2420%
------------------------------------------------------------------
MSE on training set:		0.14706404683943575
MSE on test set:		3905.061274416874
------------------------------------------------------------------
RMSE on training set:		0.383489304726267

____________
# Trading Bot
____________

In [41]:
stock_df['Prediction_for_tomorrow'] = stock_df['Close'] * np.random.uniform(0.98, 1.02, size=stock_df.shape[0])
stock_df['Prediction_for_today'] = 0


In [42]:
stock_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Absolute change,Relative change,Volume,RSI_14,MACD_12_26_9,MACDh_12_26_9,...,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Technology,Sector_Utilities,Next close,Prediction_for_tomorrow,Prediction_for_today
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003-04-24,0.241429,0.243036,0.232143,0.24,-0.001429,-0.005919,325108000,40.696732,-0.005764,-0.000714,...,0,0,0,0,0,1,0,0.238393,0.238349,0
2003-04-24,17.866255,18.032349,17.771986,17.920122,0.053867,0.003015,8208494,56.133805,0.24577,-0.043084,...,0,0,0,1,0,0,0,17.727097,18.160952,0
2003-04-24,15.85,15.99,15.7,15.72,-0.13,-0.008202,1433900,55.269183,0.02232,0.144321,...,0,0,0,0,0,1,0,15.64,15.578401,0
2003-04-24,17.65,18.0,17.555,17.865,0.215,0.012181,4520400,68.194453,0.673903,0.088887,...,0,0,0,0,0,1,0,17.309999,17.830556,0
2003-04-24,15.06,15.41,15.0,15.05,-0.01,-0.000664,24183200,61.796334,0.560173,0.200819,...,0,0,0,0,0,1,0,14.19,15.257879,0


In [55]:
trading_df = df_test[['Symbol', 'Open', 'Close']].copy()
trading_df['Prediction_for_tomorrow'] = trading_df['Close'] * np.random.uniform(0.98, 1.02, size=trading_df.shape[0])
# trading_df['Prediction_for_today'] = 0

trading_df.head()

Unnamed: 0_level_0,Symbol,Open,Close,Prediction_for_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-02,AAPL,27.8475,27.3325,27.298513
2015-01-02,ABT,45.25,44.900002,45.307181
2015-01-02,ACN,89.669998,88.839996,88.661185
2015-01-02,ADBE,72.699997,72.339996,71.964322
2015-01-02,AMAT,24.99,24.959999,24.576546


In [65]:
single_stock_df_list = []

relevant_stocks = ['MSFT', 'AAPL', 'NVDA', 'GOOG', 'AMZN', 'BRK-B', 'LLY', 'JPM', 'XOM', 'WMT', 'UNH', 'MA', 'PG', 'JNJ', 'COST', 'HD', 'MRK', 'ORCL', 'CVX', 'BAC', 'KO', 'CRM', 'NFLX', 'PEP', 'AMD', 'TMO', 'ADBE', 'WFC', 'LIN', 'QCOM', 'CSCO', 'MCD', 'ACN', 'DIS', 'DHR', 'ABT', 'INTU', 'GE', 'CAT', 'AMAT', 'AXP', 'TXN', 'VZ', 'AMGN', 'PFE', 'MS', 'CMCSA', 'IBM', 'NEE', 'UNP']

for symbol in relevant_stocks:

    single_stock_df = trading_df[trading_df['Symbol']==symbol].copy()
    single_stock_df['Prediction_for_today'] = single_stock_df['Prediction_for_tomorrow'].shift(1)
    single_stock_df.drop(single_stock_df.head(1).index, inplace=True)

    single_stock_df['Difference_open_to_prediction_today'] = single_stock_df['Prediction_for_today'] - single_stock_df['Open']

    single_stock_df_list.append(single_stock_df)

trading_df = pd.concat(single_stock_df_list)

trading_df

Unnamed: 0_level_0,Symbol,Open,Close,Prediction_for_tomorrow,Prediction_for_today,Difference_open_to_prediction_today
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-05,MSFT,46.369999,46.330002,47.029418,45.904763,-0.465236
2015-01-06,MSFT,46.380001,45.650002,45.066239,47.029418,0.649417
2015-01-07,MSFT,45.980000,46.230000,45.941678,45.066239,-0.913760
2015-01-08,MSFT,46.750000,47.590000,47.529651,45.941678,-0.808322
2015-01-09,MSFT,47.610001,47.189999,47.446706,47.529651,-0.080350
...,...,...,...,...,...,...
2022-12-22,UNP,207.029999,208.660004,206.290600,205.180306,-1.849692
2022-12-23,UNP,208.199997,209.910004,209.396500,206.290600,-1.909397
2022-12-27,UNP,210.460007,210.320007,212.091072,209.396500,-1.063506
2022-12-28,UNP,210.229996,206.869995,209.421333,212.091072,1.861076


In [117]:
trading_days = trading_df.index.unique()

# dummy_trading_date = trading_days[1]

account = 1000000
n_stocks = 5
budget_per_stock = int(account / n_stocks)

daily_account_change = []
trades = []

for trading_day in trading_days:

    trading_day_df = trading_df.loc[trading_df.index == trading_day]

    top_stock_df = trading_day_df.nlargest(n_stocks, 'Difference_open_to_prediction_today')

    account_before_trades = account
    # daily_account_change = []

    # trades = []
    for i in range(n_stocks):
        symbol = top_stock_df.iloc[i]['Symbol']
        open = top_stock_df.iloc[i]['Open']
        close = top_stock_df.iloc[i]['Close']

        units_bought = budget_per_stock // open
        money_spent = units_bought * open
        account -= money_spent

        money_earned = units_bought * close
        account += money_earned

        abs_gain = money_earned - money_spent
        rel_gain = (close - open) / open

        trades.append(
            {
                'Date' : trading_day,
                'Symbol' : symbol,
                'Abs_gain' : abs_gain,
                'Rel_gain' : rel_gain
            }
        )

    account_after_trades = account
    abs_account_change = account_after_trades - account_before_trades
    rel_account_change = (account_after_trades - account_before_trades) / account_before_trades

    daily_account_change.append(
        {
            'Date' : trading_day,
            'Account' : account,
            'Abs_change' : abs_account_change,
            'Rel_change' : rel_account_change
        }
    )

trades_df = pd.DataFrame(trades)
daily_account_change_df = pd.DataFrame(daily_account_change)

trades_df
daily_account_change_df

Unnamed: 0,Date,Account,Abs_change,Rel_change
0,2015-01-05,9.841839e+05,-15816.139717,-0.015816
1,2015-01-06,9.701514e+05,-14032.420593,-0.014258
2,2015-01-07,9.719735e+05,1822.012562,0.001878
3,2015-01-08,9.821717e+05,10198.237473,0.010492
4,2015-01-09,9.712523e+05,-10919.374458,-0.011118
...,...,...,...,...
2007,2022-12-22,1.746108e+06,-4378.128601,-0.002501
2008,2022-12-23,1.757047e+06,10939.532181,0.006265
2009,2022-12-27,1.756130e+06,-917.414978,-0.000522
2010,2022-12-28,1.743549e+06,-12580.746048,-0.007164


In [128]:
# Create a line plot using Plotly
fig = px.line(daily_account_change_df, x='Date', y='Account', title='Portfolio Value in USD',
              labels={'Date': 'Date', 'Account': 'Portfolio Value ($)'})

# Customize the layout
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Portfolio Value ($)',
    xaxis_tickformat='%Y-%m-%d',
    template='plotly_white'
)

# Save the plot as HTML file to display
fig.write_html("plots/portfolio_value.html")

In [125]:
# Create a line plot using Plotly
fig = px.line(daily_account_change_df, x='Date', y='Abs_change', title='Absolute Change of the Portfolio Over Time',
              labels={'Date': 'Date', 'Abs_change': 'Absolute Change ($)'})

# Customize the layout
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Absolute Change ($)',
    xaxis_tickformat='%Y-%m-%d',
    template='plotly_white'
)

# Save the plot as HTML file to display
fig.write_html("plots/portfolio_absolute_change.html")

In [139]:
trading_df = df_test[['Symbol', 'Open', 'Close']].copy()
trading_df['Prediction_for_tomorrow'] = trading_df['Close'] * np.random.uniform(0.98, 1.02, size=trading_df.shape[0])

single_stock_df_list = []

relevant_stocks = ['MSFT', 'AAPL', 'NVDA', 'GOOG', 'AMZN', 'BRK-B', 'LLY', 'JPM', 'XOM', 'WMT', 'UNH', 'MA', 'PG', 'JNJ', 'COST', 'HD', 'MRK', 'ORCL', 'CVX', 'BAC', 'KO', 'CRM', 'NFLX', 'PEP', 'AMD', 'TMO', 'ADBE', 'WFC', 'LIN', 'QCOM', 'CSCO', 'MCD', 'ACN', 'DIS', 'DHR', 'ABT', 'INTU', 'GE', 'CAT', 'AMAT', 'AXP', 'TXN', 'VZ', 'AMGN', 'PFE', 'MS', 'CMCSA', 'IBM', 'NEE', 'UNP']

for symbol in relevant_stocks:

    single_stock_df = trading_df[trading_df['Symbol']==symbol].copy()
    single_stock_df['Prediction_for_today'] = single_stock_df['Prediction_for_tomorrow'].shift(1)
    single_stock_df.drop(single_stock_df.head(1).index, inplace=True)

    single_stock_df['Difference_open_to_prediction_today'] = single_stock_df['Prediction_for_today'] - single_stock_df['Open']

    single_stock_df_list.append(single_stock_df)

trading_df = pd.concat(single_stock_df_list)


trading_days = trading_df.index.unique()

# dummy_trading_date = trading_days[1]

account = 1000000
n_stocks = 5
budget_per_stock = int(account / n_stocks)

daily_account_change = []
trades = []


for trading_day in trading_days:

    trading_day_df = trading_df.loc[trading_df.index == trading_day]

    top_stock_df = trading_day_df.nlargest(n_stocks, 'Difference_open_to_prediction_today')

    account_before_trades = account
    # daily_account_change = []

    # trades = []
    for i in range(n_stocks):
        symbol = top_stock_df.iloc[i]['Symbol']
        open = top_stock_df.iloc[i]['Open']
        close = top_stock_df.iloc[i]['Close']

        units_bought = budget_per_stock // open
        money_spent = units_bought * open
        account -= money_spent

        money_earned = units_bought * close
        account += money_earned

        abs_gain = money_earned - money_spent
        rel_gain = (close - open) / open

        trades.append(
            {
                'Date' : trading_day,
                'Symbol' : symbol,
                'Abs_gain' : abs_gain,
                'Rel_gain' : rel_gain
            }
        )

    account_after_trades = account
    abs_account_change = account_after_trades - account_before_trades
    rel_account_change = (account_after_trades - account_before_trades) / account_before_trades

    daily_account_change.append(
        {
            'Date' : trading_day,
            'Account' : account,
            'Abs_change' : abs_account_change,
            'Rel_change' : rel_account_change
        }
    )

trades_df = pd.DataFrame(trades)
daily_account_change_df = pd.DataFrame(daily_account_change)


# Create a line plot using Plotly
fig = px.line(daily_account_change_df, x='Date', y='Account', title='Portfolio Value in USD',
              labels={'Date': 'Date', 'Account': 'Portfolio Value ($)'})

# Customize the layout
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Portfolio Value ($)',
    xaxis_tickformat='%Y-%m-%d',
    template='plotly_white'
)

# Save the plot as HTML file to display
fig.write_html("plots/portfolio_value.html")


# Create a line plot using Plotly
fig = px.line(daily_account_change_df, x='Date', y='Abs_change', title='Absolute Change of the Portfolio Over Time',
              labels={'Date': 'Date', 'Abs_change': 'Absolute Change ($)'})

# Customize the layout
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Absolute Change ($)',
    xaxis_tickformat='%Y-%m-%d',
    template='plotly_white'
)

# Save the plot as HTML file to display
fig.write_html("plots/portfolio_absolute_change.html")



In [140]:
daily_account_change_df

Unnamed: 0,Date,Account,Abs_change,Rel_change
0,2015-01-05,9.825875e+05,-17412.512779,-0.017413
1,2015-01-06,9.786428e+05,-3944.709572,-0.004015
2,2015-01-07,9.815844e+05,2941.655212,0.003006
3,2015-01-08,9.895272e+05,7942.761894,0.008092
4,2015-01-09,9.778457e+05,-11681.507599,-0.011805
...,...,...,...,...
2007,2022-12-22,1.831735e+06,-14173.717529,-0.007678
2008,2022-12-23,1.838478e+06,6742.518280,0.003681
2009,2022-12-27,1.830610e+06,-7867.687935,-0.004279
2010,2022-12-28,1.819947e+06,-10663.002655,-0.005825


In [141]:
# Define the period for which you want the S&P 500 data
start_date = "2022-01-05"
end_date = "2022-12-30"

# Download the S&P 500 data
sp500_all_data_df = yf.download('^GSPC', start=start_date, end=end_date)

# Keep only the closing price
sp500_df = sp500_all_data_df[['Close']].copy()

# Rename the column to 'SP500'
sp500_df.rename(columns={'Close': 'SP500'}, inplace=True)

sp500_df

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,SP500
Date,Unnamed: 1_level_1
2022-01-05,4700.580078
2022-01-06,4696.049805
2022-01-07,4677.029785
2022-01-10,4670.290039
2022-01-11,4713.069824
...,...
2022-12-22,3822.389893
2022-12-23,3844.820068
2022-12-27,3829.250000
2022-12-28,3783.219971


In [142]:
daily_account_change_df['Date'] = pd.to_datetime(daily_account_change_df['Date'])
daily_account_change_df.set_index('Date', inplace=True)

# Merge the two dataframes on the Date column
combined_df = daily_account_change_df.merge(sp500_df, left_index=True, right_index=True)



In [143]:
# Normalize both columns to start at 100 for comparison
combined_df['Portfolio'] = combined_df['Account'] / combined_df['Account'].iloc[0] * 100
combined_df['SP500'] = combined_df['SP500'] / combined_df['SP500'].iloc[0] * 100

combined_df

Unnamed: 0_level_0,Account,Abs_change,Rel_change,SP500,Portfolio
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-05,1.614557e+06,-35114.761719,-0.021286,100.000000,100.000000
2022-01-06,1.609334e+06,-5223.102234,-0.003235,99.903623,99.676499
2022-01-07,1.599001e+06,-10332.331055,-0.006420,99.498992,99.036551
2022-01-10,1.627121e+06,28119.668091,0.017586,99.355611,100.778185
2022-01-11,1.634412e+06,7290.948090,0.004481,100.265706,101.229761
...,...,...,...,...,...
2022-12-22,1.831735e+06,-14173.717529,-0.007678,81.317408,113.451272
2022-12-23,1.838478e+06,6742.518280,0.003681,81.794587,113.868881
2022-12-27,1.830610e+06,-7867.687935,-0.004279,81.463350,113.381583
2022-12-28,1.819947e+06,-10663.002655,-0.005825,80.484109,112.721154


In [144]:
# Create a line plot using Plotly
fig = px.line(combined_df, x=combined_df.index, y=['Portfolio', 'SP500'], title='Portfolio vs S&P 500',
              labels={'value': 'Normalized Value', 'index': 'Date', 'variable': 'Legend'})

# Customize the layout
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Normalized Value',
    xaxis_tickformat='%Y-%m-%d',
    template='plotly_white'
)

# Save the plot as HTML file to display
fig.write_html("plots/portfolio_vs_sp500.html")

In [145]:
"""
# Aktien auswählen
relevant_stocks = ['MSFT', 'AAPL', 'NVDA', 'GOOG', 'AMZN', 'BRK-B', 'LLY', 'JPM', 'XOM', 'WMT', 'UNH', 'MA', 'PG', 'JNJ', 'COST', 'HD', 'MRK', 'ORCL', 'CVX', 'BAC', 'KO', 'CRM', 'NFLX', 'PEP', 'AMD', 'TMO', 'ADBE', 'WFC', 'LIN', 'QCOM', 'CSCO', 'MCD', 'ACN', 'DIS', 'DHR', 'ABT', 'INTU', 'GE', 'CAT', 'AMAT', 'AXP', 'TXN', 'VZ', 'AMGN', 'PFE', 'MS', 'CMCSA', 'IBM', 'NEE', 'UNP']

# Feature-Generation für jede Aktie
for symbol in relevant_stocks:
    file_path = os.path.join('data', 'stock_dataframes', f'{symbol}.csv')
    if not os.path.exists(file_path): # nicht nochmals erstellen wenn schon vorhanden
        prepare_features(symbol, option_volume=False)

# Zusammenführen, Clean-up NaN-Values (im Normalfall 1 Zeile), One-Hot Encoding der kategorischen Variable 'Sector', abspeichern 
stock_df = combine_stocks(relevant_stocks, return_df=True)

# Vorbereitung des Trainings-, Validierungs- und Testdatensatzes
X_train_scaled, y_train, X_valid_scaled, y_valid, X_test_scaled, y_test, df_train, df_valid, df_test = ml_preprocessing(stock_df)
"""

trading_df = df_test[['Symbol', 'Open', 'Close']].copy()
trading_df['Prediction_for_tomorrow'] = trading_df['Close'] * np.random.uniform(0.98, 1.02, size=trading_df.shape[0])

single_stock_df_list = []

relevant_stocks = ['MSFT', 'AAPL', 'NVDA', 'GOOG', 'AMZN', 'BRK-B', 'LLY', 'JPM', 'XOM', 'WMT', 'UNH', 'MA', 'PG', 'JNJ', 'COST', 'HD', 'MRK', 'ORCL', 'CVX', 'BAC', 'KO', 'CRM', 'NFLX', 'PEP', 'AMD', 'TMO', 'ADBE', 'WFC', 'LIN', 'QCOM', 'CSCO', 'MCD', 'ACN', 'DIS', 'DHR', 'ABT', 'INTU', 'GE', 'CAT', 'AMAT', 'AXP', 'TXN', 'VZ', 'AMGN', 'PFE', 'MS', 'CMCSA', 'IBM', 'NEE', 'UNP']

for symbol in relevant_stocks:

    single_stock_df = trading_df[trading_df['Symbol']==symbol].copy()
    single_stock_df['Prediction_for_today'] = single_stock_df['Prediction_for_tomorrow'].shift(1)
    single_stock_df.drop(single_stock_df.head(1).index, inplace=True)

    single_stock_df['Difference_open_to_prediction_today'] = single_stock_df['Prediction_for_today'] - single_stock_df['Open']

    single_stock_df_list.append(single_stock_df)

trading_df = pd.concat(single_stock_df_list)


trading_days = trading_df.index.unique()

# dummy_trading_date = trading_days[1]

account = 1000000
n_stocks = 5
budget_per_stock = int(account / n_stocks)

daily_account_change = []
trades = []


for trading_day in trading_days:

    trading_day_df = trading_df.loc[trading_df.index == trading_day]

    top_stock_df = trading_day_df.nlargest(n_stocks, 'Difference_open_to_prediction_today')

    account_before_trades = account
    # daily_account_change = []

    # trades = []
    for i in range(n_stocks):
        symbol = top_stock_df.iloc[i]['Symbol']
        open = top_stock_df.iloc[i]['Open']
        close = top_stock_df.iloc[i]['Close']

        units_bought = budget_per_stock // open
        money_spent = units_bought * open
        account -= money_spent

        money_earned = units_bought * close
        account += money_earned

        abs_gain = money_earned - money_spent
        rel_gain = (close - open) / open

        trades.append(
            {
                'Date' : trading_day,
                'Symbol' : symbol,
                'Abs_gain' : abs_gain,
                'Rel_gain' : rel_gain
            }
        )

    account_after_trades = account
    abs_account_change = account_after_trades - account_before_trades
    rel_account_change = (account_after_trades - account_before_trades) / account_before_trades

    daily_account_change.append(
        {
            'Date' : trading_day,
            'Account' : account,
            'Abs_change' : abs_account_change,
            'Rel_change' : rel_account_change
        }
    )

trades_df = pd.DataFrame(trades)
daily_account_change_df = pd.DataFrame(daily_account_change)


# Create a line plot using Plotly
fig = px.line(daily_account_change_df, x='Date', y='Account', title='Portfolio Value in USD',
              labels={'Date': 'Date', 'Account': 'Portfolio Value ($)'})

# Customize the layout
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Portfolio Value ($)',
    xaxis_tickformat='%Y-%m-%d',
    template='plotly_white'
)

# Save the plot as HTML file to display
fig.write_html("plots/portfolio_value.html")


# Create a line plot using Plotly
fig = px.line(daily_account_change_df, x='Date', y='Abs_change', title='Absolute Change of the Portfolio Over Time',
              labels={'Date': 'Date', 'Abs_change': 'Absolute Change ($)'})

# Customize the layout
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Absolute Change ($)',
    xaxis_tickformat='%Y-%m-%d',
    template='plotly_white'
)

# Save the plot as HTML file to display
fig.write_html("plots/portfolio_absolute_change.html")


# Define the period for which you want the S&P 500 data
start_date = "2022-01-05"
end_date = "2022-12-30"

# Download the S&P 500 data
sp500_all_data_df = yf.download('^GSPC', start=start_date, end=end_date)

# Keep only the closing price
sp500_df = sp500_all_data_df[['Close']].copy()

# Rename the column to 'SP500'
sp500_df.rename(columns={'Close': 'SP500'}, inplace=True)


daily_account_change_df['Date'] = pd.to_datetime(daily_account_change_df['Date'])
daily_account_change_df.set_index('Date', inplace=True)

# Merge the two dataframes on the Date column
combined_df = daily_account_change_df.merge(sp500_df, left_index=True, right_index=True)


# Normalize both columns to start at 100 for comparison
combined_df['Portfolio'] = combined_df['Account'] / combined_df['Account'].iloc[0] * 100
combined_df['SP500'] = combined_df['SP500'] / combined_df['SP500'].iloc[0] * 100


# Create a line plot using Plotly
fig = px.line(combined_df, x=combined_df.index, y=['Portfolio', 'SP500'], title='Portfolio vs S&P 500',
              labels={'value': 'Normalized Value', 'index': 'Date', 'variable': 'Legend'})

# Customize the layout
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Normalized Value',
    xaxis_tickformat='%Y-%m-%d',
    template='plotly_white'
)

# Save the plot as HTML file to display
fig.write_html("plots/portfolio_vs_sp500.html")

Combining Data of all Stocks into one DataFrame...
Checking for NaN Values...
Column VWAP_D has 1 NaN Value(s)
Dropped 1 Rows. (1 is the bugged AMD Row)
Saved DataFrame as combined_dataframe.csv at Path data\ML_data\combined_dataframe.csv
-------------------------------------------------------------------------------------
Preparing Train, Validation and Test Data for ML...
Data Preparation finished
-------------------------------------------------------------------------------------


[*********************100%%**********************]  1 of 1 completed


In [175]:
"""
# Aktien auswählen
relevant_stocks = ['MSFT', 'AAPL', 'NVDA', 'GOOG', 'AMZN', 'BRK-B', 'LLY', 'JPM', 'XOM', 'WMT', 'UNH', 'MA', 'PG', 'JNJ', 'COST', 'HD', 'MRK', 'ORCL', 'CVX', 'BAC', 'KO', 'CRM', 'NFLX', 'PEP', 'AMD', 'TMO', 'ADBE', 'WFC', 'LIN', 'QCOM', 'CSCO', 'MCD', 'ACN', 'DIS', 'DHR', 'ABT', 'INTU', 'GE', 'CAT', 'AMAT', 'AXP', 'TXN', 'VZ', 'AMGN', 'PFE', 'MS', 'CMCSA', 'IBM', 'NEE', 'UNP']

# Feature-Generation für jede Aktie
for symbol in relevant_stocks:
    file_path = os.path.join('data', 'stock_dataframes', f'{symbol}.csv')
    if not os.path.exists(file_path): # nicht nochmals erstellen wenn schon vorhanden
        prepare_features(symbol, option_volume=False)

# Zusammenführen, Clean-up NaN-Values (im Normalfall 1 Zeile), One-Hot Encoding der kategorischen Variable 'Sector', abspeichern 
stock_df = combine_stocks(relevant_stocks, return_df=True)

# Vorbereitung des Trainings-, Validierungs- und Testdatensatzes
X_train_scaled, y_train, X_valid_scaled, y_valid, X_test_scaled, y_test, df_train, df_valid, df_test = ml_preprocessing(stock_df)
"""

model_name = "Linear Regression"
predictions_path = os.path.join("data", f"{model_name}_y_pred_test.csv")
predictions_df = pd.read_csv(predictions_path, usecols=['0'])
predictions_df.rename(columns={'0': 'Prediction'}, inplace=True)

trading_df = df_test[['Symbol', 'Open', 'Close']].copy()
trading_df['Prediction_for_tomorrow'] = predictions_df['Prediction'].values

single_stock_df_list = []

relevant_stocks = ['MSFT', 'AAPL', 'NVDA', 'GOOG', 'AMZN', 'BRK-B', 'LLY', 'JPM', 'XOM', 'WMT', 'UNH', 'MA', 'PG', 'JNJ', 'COST', 'HD', 'MRK', 'ORCL', 'CVX', 'BAC', 'KO', 'CRM', 'NFLX', 'PEP', 'AMD', 'TMO', 'ADBE', 'WFC', 'LIN', 'QCOM', 'CSCO', 'MCD', 'ACN', 'DIS', 'DHR', 'ABT', 'INTU', 'GE', 'CAT', 'AMAT', 'AXP', 'TXN', 'VZ', 'AMGN', 'PFE', 'MS', 'CMCSA', 'IBM', 'NEE', 'UNP']

for symbol in relevant_stocks:

    single_stock_df = trading_df[trading_df['Symbol']==symbol].copy()
    single_stock_df['Prediction_for_today'] = single_stock_df['Prediction_for_tomorrow'].shift(1)
    single_stock_df.drop(single_stock_df.head(1).index, inplace=True)

    single_stock_df['Difference_open_to_prediction_today'] = single_stock_df['Prediction_for_today'] - single_stock_df['Open']

    single_stock_df_list.append(single_stock_df)

trading_df = pd.concat(single_stock_df_list)


trading_days = trading_df.index.unique()

# dummy_trading_date = trading_days[1]

account = 1000000
n_stocks = 5
budget_per_stock = int(account / n_stocks)

daily_account_change = []
trades = []


for trading_day in trading_days:

    trading_day_df = trading_df.loc[trading_df.index == trading_day]

    top_stock_df = trading_day_df.nlargest(n_stocks, 'Difference_open_to_prediction_today')

    account_before_trades = account
    # daily_account_change = []

    # trades = []
    for i in range(n_stocks):
        symbol = top_stock_df.iloc[i]['Symbol']
        open = top_stock_df.iloc[i]['Open']
        close = top_stock_df.iloc[i]['Close']

        units_bought = budget_per_stock // open
        money_spent = units_bought * open
        account -= money_spent

        money_earned = units_bought * close
        account += money_earned

        abs_gain = money_earned - money_spent
        rel_gain = (close - open) / open

        trades.append(
            {
                'Date' : trading_day,
                'Symbol' : symbol,
                'Abs_gain' : abs_gain,
                'Rel_gain' : rel_gain
            }
        )

    account_after_trades = account
    abs_account_change = account_after_trades - account_before_trades
    rel_account_change = (account_after_trades - account_before_trades) / account_before_trades

    daily_account_change.append(
        {
            'Date' : trading_day,
            'Account' : account,
            'Abs_change' : abs_account_change,
            'Rel_change' : rel_account_change
        }
    )

trades_df = pd.DataFrame(trades)
daily_account_change_df = pd.DataFrame(daily_account_change)


# Create a line plot using Plotly
fig = px.line(daily_account_change_df, x='Date', y='Account', title='Portfolio Value in USD',
              labels={'Date': 'Date', 'Account': 'Portfolio Value ($)'})

# Customize the layout
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Portfolio Value ($)',
    xaxis_tickformat='%Y-%m-%d',
    template='plotly_white'
)

# Save the plot as HTML file to display
fig.write_html("plots/portfolio_value.html")


# Create a line plot using Plotly
fig = px.line(daily_account_change_df, x='Date', y='Abs_change', title='Absolute Change of the Portfolio Over Time',
              labels={'Date': 'Date', 'Abs_change': 'Absolute Change ($)'})

# Customize the layout
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Absolute Change ($)',
    xaxis_tickformat='%Y-%m-%d',
    template='plotly_white'
)

# Save the plot as HTML file to display
fig.write_html("plots/portfolio_absolute_change.html")


# Define the period for which you want the S&P 500 data
start_date = "2015-01-05"
end_date = "2022-12-30"

# Download the S&P 500 data
sp500_all_data_df = yf.download('^GSPC', start=start_date, end=end_date)

# Keep only the closing price
sp500_df = sp500_all_data_df[['Close']].copy()

# Rename the column to 'SP500'
sp500_df.rename(columns={'Close': 'SP500'}, inplace=True)


daily_account_change_df['Date'] = pd.to_datetime(daily_account_change_df['Date'])
daily_account_change_df.set_index('Date', inplace=True)

# Merge the two dataframes on the Date column
combined_df = daily_account_change_df.merge(sp500_df, left_index=True, right_index=True)


# Normalize both columns to start at 100 for comparison
combined_df['Portfolio'] = combined_df['Account'] / combined_df['Account'].iloc[0] * 100
combined_df['SP500'] = combined_df['SP500'] / combined_df['SP500'].iloc[0] * 100


# Create a line plot using Plotly
fig = px.line(combined_df, x=combined_df.index, y=['Portfolio', 'SP500'], title='Portfolio vs S&P 500',
              labels={'value': 'Normalized Value', 'index': 'Date', 'variable': 'Legend'})

# Customize the layout
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Normalized Value',
    xaxis_tickformat='%Y-%m-%d',
    template='plotly_white'
)

# Save the plot as HTML file to display
fig.write_html("plots/portfolio_vs_sp500.html")

[*********************100%%**********************]  1 of 1 completed


In [206]:
trades_df.loc[(trades_df['Date'] > '2020-02-10') & (trades_df['Date'] < '2020-03-01')]['Abs_gain'].sum()

-26536.093074798642

In [189]:
trades_df.loc[(trades_df['Date'] > '2020-02-10') & (trades_df['Date'] < '2020-03-26')][['Date','Symbol','Abs_gain']].tail(50)

Unnamed: 0,Date,Symbol,Abs_gain
6525,2020-03-12,MA,-2475.509003
6526,2020-03-12,ADBE,-3722.08252
6527,2020-03-12,NFLX,-6885.0
6528,2020-03-12,UNH,-7374.707184
6529,2020-03-12,HD,-4930.867493
6530,2020-03-13,CMCSA,17112.012634
6531,2020-03-13,PFE,9209.717932
6532,2020-03-13,WMT,10827.313698
6533,2020-03-13,MRK,1390.229904
6534,2020-03-13,BAC,17755.603811


In [170]:

model_name = "XGBoost"
predictions_path = os.path.join("data", f"{model_name}_y_pred_test.csv")
predictions_df = pd.read_csv(predictions_path, usecols=['0'])

predictions_df.rename(columns={'0': 'Prediction'}, inplace=True)


df_test['Prediction'] = predictions_df['Prediction'].values

df_test



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,Open,High,Low,Close,Absolute change,Relative change,Volume,RSI_14,MACD_12_26_9,MACDh_12_26_9,...,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Technology,Sector_Utilities,Next close,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,27.847500,27.860001,26.837500,27.332500,-0.514999,-0.018494,212818400,42.591853,-0.041890,-0.070599,...,0,0,0,0,0,0,1,0,26.562500,27.001774
2015-01-02,45.250000,45.450001,44.639999,44.900002,-0.349998,-0.007735,3216600,49.967466,0.343315,-0.070000,...,0,0,0,0,1,0,0,0,44.910000,44.218735
2015-01-02,89.669998,90.089996,88.430000,88.839996,-0.830002,-0.009256,2021300,58.292391,1.689166,0.050652,...,0,0,0,0,0,0,1,0,87.339996,89.052940
2015-01-02,72.699997,73.199997,71.889999,72.339996,-0.360001,-0.004952,2349200,47.364656,0.504185,-0.271643,...,0,0,0,0,0,0,1,0,71.980003,70.843160
2015-01-02,24.990000,25.160000,24.600000,24.959999,-0.030001,-0.001201,6910200,59.481324,0.525012,-0.043217,...,0,0,0,0,0,0,1,0,24.160000,24.573229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-29,208.039993,210.360001,207.139999,209.220001,1.180008,0.005672,1484800,48.719504,-0.272214,-0.699555,...,0,0,0,0,0,1,0,0,207.070007,183.996220
2022-12-29,38.900002,39.529999,38.810001,39.259998,0.359997,0.009254,17347000,61.333953,0.229991,0.208149,...,0,0,0,0,0,0,0,0,39.400002,38.548200
2022-12-29,41.150002,41.400002,41.060001,41.330002,0.180000,0.004374,11597100,36.649059,-1.167903,-0.022514,...,0,0,0,1,0,0,0,0,41.290001,40.815033
2022-12-29,47.380001,47.673332,47.250000,47.383331,0.003330,0.000070,9171900,40.437743,-0.363537,-0.250525,...,0,1,0,0,0,0,0,0,47.263332,46.985350


In [173]:

# Aktien auswählen
relevant_stocks = ['MSFT', 'AAPL', 'NVDA', 'GOOG', 'AMZN', 'BRK-B', 'LLY', 'JPM', 'XOM', 'WMT', 'UNH', 'MA', 'PG', 'JNJ', 'COST', 'HD', 'MRK', 'ORCL', 'CVX', 'BAC', 'KO', 'CRM', 'NFLX', 'PEP', 'AMD', 'TMO', 'ADBE', 'WFC', 'LIN', 'QCOM', 'CSCO', 'MCD', 'ACN', 'DIS', 'DHR', 'ABT', 'INTU', 'GE', 'CAT', 'AMAT', 'AXP', 'TXN', 'VZ', 'AMGN', 'PFE', 'MS', 'CMCSA', 'IBM', 'NEE', 'UNP']

# Feature-Generation für jede Aktie
for symbol in relevant_stocks:
    file_path = os.path.join('data', 'stock_dataframes', f'{symbol}.csv')
    if not os.path.exists(file_path): # nicht nochmals erstellen wenn schon vorhanden
        prepare_features(symbol, option_volume=False)

# Zusammenführen, Clean-up NaN-Values (im Normalfall 1 Zeile), One-Hot Encoding der kategorischen Variable 'Sector', abspeichern 
stock_df = combine_stocks(relevant_stocks, return_df=True)

# Vorbereitung des Trainings-, Validierungs- und Testdatensatzes
X_train_scaled, y_train, X_valid_scaled, y_valid, X_test_scaled, y_test, df_train, df_valid, df_test = ml_preprocessing(stock_df)


model_name = "Linear Regression"
predictions_path = os.path.join("data", f"{model_name}_y_pred_test.csv")
predictions_df = pd.read_csv(predictions_path, usecols=['0'])
predictions_df.rename(columns={'0': 'Prediction'}, inplace=True)

trading_df = df_test[['Symbol', 'Open', 'Close']].copy()
trading_df['Prediction_for_tomorrow'] = predictions_df['Prediction'].values

trading_df

Combining Data of all Stocks into one DataFrame...
Checking for NaN Values...
Column VWAP_D has 1 NaN Value(s)
Dropped 1 Rows. (1 is the bugged AMD Row)
Saved DataFrame as combined_dataframe.csv at Path data\ML_data\combined_dataframe.csv
-------------------------------------------------------------------------------------
Preparing Train, Validation and Test Data for ML...
Data Preparation finished
-------------------------------------------------------------------------------------


Unnamed: 0_level_0,Symbol,Open,Close,Prediction_for_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-02,AAPL,27.847500,27.332500,26.964844
2015-01-02,ABT,45.250000,44.900002,44.672363
2015-01-02,ACN,89.669998,88.839996,88.718750
2015-01-02,ADBE,72.699997,72.339996,72.146484
2015-01-02,AMAT,24.990000,24.959999,24.708008
...,...,...,...,...
2022-12-29,UNP,208.039993,209.220001,208.184082
2022-12-29,VZ,38.900002,39.259998,38.443848
2022-12-29,WFC,41.150002,41.330002,40.524414
2022-12-29,WMT,47.380001,47.383331,46.623535


In [172]:
trading_df

Unnamed: 0_level_0,Symbol,Open,Close,Prediction_for_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-02,AAPL,27.847500,27.332500,27.001774
2015-01-02,ABT,45.250000,44.900002,44.218735
2015-01-02,ACN,89.669998,88.839996,89.052940
2015-01-02,ADBE,72.699997,72.339996,70.843160
2015-01-02,AMAT,24.990000,24.959999,24.573229
...,...,...,...,...
2022-12-29,UNP,208.039993,209.220001,183.996220
2022-12-29,VZ,38.900002,39.259998,38.548200
2022-12-29,WFC,41.150002,41.330002,40.815033
2022-12-29,WMT,47.380001,47.383331,46.985350
