# Data Processing

In [1]:
# Install if needed: pip install yfinance pandas numpy
import pandas as pd
import numpy as np

In [None]:
data = pd.read_csv("FTSE100_raw.csv")
print(data.shape)
data = pd.read_csv("DAX.csv")
print(data.shape)
data = pd.read_csv("SP500_raw")
print(data.shape)

In [2]:
#start_date = "2015-01-01"
#end_date = "2023-12-31"
data_raw = pd.read_csv("sp500_full.csv")
print(data_raw.columns)
print(data_raw)

Index(['Date', 'Target', 'Lag_1', 'Lag_2', 'Lag_3', 'Lag_4', 'Lag_5',
       'day_of_week', 'month', 'day_of_month'],
      dtype='object')
            Date    Target     Lag_1     Lag_2     Lag_3     Lag_4     Lag_5  \
0     2015-01-12 -0.002819 -0.008439  0.017730  0.011563 -0.008933 -0.018447   
1     2015-01-13  0.004483 -0.008127 -0.008439  0.017730  0.011563 -0.008933   
2     2015-01-14  0.025468 -0.002582 -0.008127 -0.008439  0.017730  0.011563   
3     2015-01-15  0.029252 -0.005830 -0.002582 -0.008127 -0.008439  0.017730   
4     2015-01-16  0.018482 -0.009291 -0.005830 -0.002582 -0.008127 -0.008439   
...          ...       ...       ...       ...       ...       ...       ...   
2248  2023-12-15  0.007482  0.002644  0.013558  0.004589  0.003917  0.004087   
2249  2023-12-18  0.007186 -0.000076  0.002644  0.013558  0.004589  0.003917   
2250  2023-12-19  0.002767  0.004518 -0.000076  0.002644  0.013558  0.004589   
2251  2023-12-20  0.017930  0.005849  0.004518 -0.000076  0.

In [3]:
data_raw["Target"] = (data_raw["Target"] > 0).astype(int)
print(data_raw)

            Date  Target     Lag_1     Lag_2     Lag_3     Lag_4     Lag_5  \
0     2015-01-12       0 -0.008439  0.017730  0.011563 -0.008933 -0.018447   
1     2015-01-13       1 -0.008127 -0.008439  0.017730  0.011563 -0.008933   
2     2015-01-14       1 -0.002582 -0.008127 -0.008439  0.017730  0.011563   
3     2015-01-15       1 -0.005830 -0.002582 -0.008127 -0.008439  0.017730   
4     2015-01-16       1 -0.009291 -0.005830 -0.002582 -0.008127 -0.008439   
...          ...     ...       ...       ...       ...       ...       ...   
2248  2023-12-15       1  0.002644  0.013558  0.004589  0.003917  0.004087   
2249  2023-12-18       1 -0.000076  0.002644  0.013558  0.004589  0.003917   
2250  2023-12-19       1  0.004518 -0.000076  0.002644  0.013558  0.004589   
2251  2023-12-20       1  0.005849  0.004518 -0.000076  0.002644  0.013558   
2252  2023-12-21       1 -0.014793  0.005849  0.004518 -0.000076  0.002644   

      day_of_week  month  day_of_month  
0               0     

In [4]:
data_raw.to_csv("SP500_classification.csv", index=False)

In [4]:
def convert_german_float(value):
    if isinstance(value, str):
        # Remove periods (thousands separator)
        cleaned_value = value.replace('.', '')
        # Replace comma (decimal separator) with period
        cleaned_value = cleaned_value.replace(',', '.')
        try:
            return float(cleaned_value)
        except ValueError:
            return np.nan # Or handle other errors as needed
    return value # Return as is if not a string (e.g., already a number or NaN)
data_raw['Price'] = data_raw['Price'].apply(convert_german_float)

In [5]:
# -----------------------------------
# STEP 2: Clean and enrich the data
# -----------------------------------
# First compute daily log return
data_raw["LogReturn"] = np.log(data_raw["Price"] / data_raw["Price"].shift(1))

# Define the prediction target: "HORIZON"-day forward cumulative return
HORIZON = 5 # cumulative return of 5 days 
cum_return = data_raw["LogReturn"].rolling(window=HORIZON).sum().shift(-HORIZON)
# Print more values: show the first 20 and last 20 values, plus summary stats
data_raw["Target"] = (cum_return > 0).astype(int)

# Keep only date, log return and target
data = data_raw[["Date","LogReturn", "Target"]]
print(data)

# Drop the first row with NaN return (from shift operation)
data.dropna(inplace=True)

            Date  LogReturn  Target
0     12/29/2023        NaN       0
1     12/28/2023  -0.002995       1
2     12/27/2023   0.002423       0
3     12/22/2023  -0.002146       1
4     12/21/2023  -0.001124       1
...          ...        ...     ...
2279  01/08/2015   0.019410       0
2280  01/07/2015  -0.033009       0
2281  01/06/2015  -0.005111       0
2282  01/05/2015   0.000370       0
2283  01/02/2015   0.030314       0

[2284 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)


In [6]:
# -----------------------------------
# STEP 3: Create lag features
# -----------------------------------
N_LAGS = 5
for lag in range(1, N_LAGS + 1):
    data[f"Lag_{lag}"] = data["LogReturn"].shift(lag) 

data.dropna(inplace=True)  # drop rows with NaNs introduced by lagging

data.drop(columns=["LogReturn"], inplace=True) # LogReturn no more needed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"Lag_{lag}"] = data["LogReturn"].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"Lag_{lag}"] = data["LogReturn"].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"Lag_{lag}"] = data["LogReturn"].shift(lag)
A value is trying to be set on a copy of a slice from a

In [7]:
# -----------------------------------
# STEP 4: Add temporal features
# -----------------------------------
data["Date"] = pd.to_datetime(data["Date"], format="%m/%d/%Y")
data["day_of_week"] = data["Date"].dt.dayofweek   # Monday = 0, Sunday = 6
data["month"] = data["Date"].dt.month             # 1 to 12
data["day_of_month"] = data["Date"].dt.day        # 1 to 31
print(data)

           Date  Target     Lag_1     Lag_2     Lag_3     Lag_4     Lag_5  \
6    2023-12-19       1  0.002731 -0.001124 -0.002146  0.002423 -0.002995   
7    2023-12-18       1  0.000679  0.002731 -0.001124 -0.002146  0.002423   
8    2023-12-15       1 -0.005621  0.000679  0.002731 -0.001124 -0.002146   
9    2023-12-14       0  0.006041 -0.005621  0.000679  0.002731 -0.001124   
10   2023-12-13       0  0.000047  0.006041 -0.005621  0.000679  0.002731   
...         ...     ...       ...       ...       ...       ...       ...   
2279 2015-01-08       0 -0.013731 -0.016134  0.012544 -0.021717 -0.013382   
2280 2015-01-07       0  0.019410 -0.013731 -0.016134  0.012544 -0.021717   
2281 2015-01-06       0 -0.033009  0.019410 -0.013731 -0.016134  0.012544   
2282 2015-01-05       0 -0.005111 -0.033009  0.019410 -0.013731 -0.016134   
2283 2015-01-02       0  0.000370 -0.005111 -0.033009  0.019410 -0.013731   

      day_of_week  month  day_of_month  
6               1     12          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Date"] = pd.to_datetime(data["Date"], format="%m/%d/%Y")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["day_of_week"] = data["Date"].dt.dayofweek   # Monday = 0, Sunday = 6
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["month"] = data["Date"].dt.month             # 1 to 12
A value

In [8]:
data.to_csv("DAX_classification.csv", index=False)


In [None]:
# -----------------------------------
# STEP 5: Define training set andcommon test set (latest 20%)
# -----------------------------------
full_train_size = int(len(data) * 0.8)
train_data_only = data.iloc[:full_train_size].copy()
common_test = data.iloc[full_train_size:].copy()

In [None]:
# -----------------------------------
# STEP 6: Simulate 1 single agent and 3 federated agents
# -----------------------------------
agent_data = {}
agent_data["SingleAgent_Train"] = train_data_only.copy()
agent_data["SingleAgent_Test"] = common_test.copy()

# Without shuffling: split for federated agents
federated_data = train_data_only.copy()
splits = np.array_split(federated_data, 3)

# With shuffling: shuffle and split for federated agents
#shuffled = train_data_only.sample(frac=1, random_state=4)
#splits = np.array_split(shuffled, 3)

# Assign to agents
for i, df in enumerate(splits):
    agent_data[f"Agent_{i+1}_Train"] = df.sort_index()
    agent_data[f"Agent_{i+1}_Test"] = common_test.copy()


In [None]:
# -----------------------------------
# STEP 7: Preview the datasets
# -----------------------------------
for name, df in agent_data.items():
    print(f"\n=== {name} ===")
    print(df.head())
    print(df.shape)

# Model

In [None]:
# Install if needed: pip install tensorflow scikit-learn matplotlib
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

In [None]:
# -----------------------------------
# STEP 1: Features and target column names
# -----------------------------------
feature_cols = [f"Lag_{i}" for i in range(1, 6)] + ["day_of_week", "month", "day_of_month"]
target_col = "Target"

In [None]:
# -----------------------------------
# STEP 2: Function to build the model
# -----------------------------------
def build_model(input_dim):
    model = keras.Sequential([
        layers.Dense(150, activation='relu', input_shape=(input_dim,)),
        layers.Dense(10, activation='relu'),
        layers.Dense(1)  # Linear output for regression
    ])
    model.compiale(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [None]:
# -----------------------------------
# STEP 3: Plot training loss and MAE
# -----------------------------------
def plot_training(history, title):
    plt.figure(figsize=(8, 4))
    plt.plot(history.history['loss'], label='MSE')
    plt.plot(history.history['mae'], label='MAE')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# -----------------------------------
# STEP 4: Function to train the model
# -----------------------------------
def train_model(train_df, label=""):
    X = train_df[feature_cols].values
    y = train_df[target_col].values

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X) # Normalization

    model = build_model(X.shape[1])
    print(f"\nTraining model on {label}...")
    history = model.fit(X_scaled, y, epochs=50, batch_size=32, verbose=1)

    plot_training(history, title=f"{label} Training Metrics")
    return model, scaler

In [None]:
# -----------------------------------
# STEP 5: Function to evaluate the model
# -----------------------------------
def evaluate_model(model, scaler, test_df):
    X_test = scaler.transform(test_df[feature_cols].values)
    y_true = test_df[target_col].values
    y_pred = model.predict(X_test).flatten()

    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    print(f"Test MSE: {mse:.6f}, MAE: {mae:.6f}")
    return mse, mae

In [None]:
# -----------------------
# STEP 6: Single-agent model 
# -----------------------
single_model, single_scaler = train_model(agent_data["SingleAgent_Train"], label="SingleAgent")
evaluate_model(single_model, single_scaler, agent_data["SingleAgent_Test"])

In [None]:
# -----------------------
# STEP 7: Per-agent models
# -----------------------
agent_models = {}
for i in range(1, 4):
    agent_name = f"Agent_{i}_Train"
    test_name = f"Agent_{i}_Test"
    model, scaler = train_model(agent_data[agent_name], label=agent_name)
    evaluate_model(model, scaler, agent_data[test_name])
    agent_models[agent_name] = {"model": model, "scaler": scaler}