# Data Processing

In [1]:
# Install if needed: pip install yfinance pandas numpy
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
data = pd.read_csv("FTSE100_raw.csv")
print(data.shape)
data = pd.read_csv("DAX.csv")
print(data.shape)
data = pd.read_csv("SP500_raw.csv")
print(data.shape)

In [2]:
#start_date = "2015-01-01"
#end_date = "2023-12-31"
data_raw = pd.read_csv("FTSE100_raw.csv")
print(data_raw.columns)
print(data_raw)

Index(['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %'], dtype='object')
            Date     Price      Open      High       Low     Vol. Change %
0     12/29/2023  7,733.24  7,722.74  7,746.91  7,719.02  225.64M    0.14%
1     12/28/2023  7,722.74  7,724.95  7,745.99  7,708.74  316.82M   -0.03%
2     12/27/2023  7,724.95  7,697.51  7,759.74  7,697.51  409.46M    0.36%
3     12/22/2023  7,697.51  7,694.73  7,715.21  7,676.43  320.56M    0.04%
4     12/21/2023  7,694.73  7,715.68  7,715.68  7,668.41  562.95M   -0.27%
...          ...       ...       ...       ...       ...      ...      ...
2267  01/08/2015  6,569.96  6,419.83  6,580.82  6,419.83  910.04M    2.34%
2268  01/07/2015  6,419.83  6,366.51  6,459.74  6,366.51  709.50M    0.84%
2269  01/06/2015  6,366.51  6,417.16  6,452.66  6,328.59  793.26M   -0.79%
2270  01/05/2015  6,417.16  6,547.80  6,576.74  6,404.49  750.52M   -2.00%
2271  01/02/2015  6,547.80  6,566.09  6,607.89  6,510.60  378.93M   -0.28%

[2272 rows x 7 

In [None]:
data_raw["Target"] = (data_raw["Target"] > 0).astype(int)
print(data_raw)

In [None]:
data_raw.to_csv("SP500_classification.csv", index=False)

In [3]:
def convert_german_float(value):
    if isinstance(value, str):
        # Remove periods (thousands separator)
        cleaned_value = value.replace('.', '')
        # Replace comma (decimal separator) with period
        cleaned_value = cleaned_value.replace(',', '.')
        try:
            return float(cleaned_value)
        except ValueError:
            return np.nan # Or handle other errors as needed
    return value # Return as is if not a string (e.g., already a number or NaN)
data_raw['Price'] = data_raw['Price'].apply(convert_german_float)

In [4]:
# -----------------------------------
# STEP 2: Clean and enrich the data
# -----------------------------------
# First compute daily log return
data_raw["LogReturn"] = np.log(data_raw["Price"] / data_raw["Price"].shift(1))

# Fit scaler on LogReturn and transform
split_idx = int(len(data_raw) * 0.8)
scaler = StandardScaler()
scaler.fit(data_raw.loc[:split_idx-1, ["LogReturn"]])
data_raw["LogReturn"] = scaler.transform(data_raw[["LogReturn"]])


# Define the prediction target: "HORIZON"-day forward cumulative return
HORIZON = 5 # cumulative return of 5 days 
cum_return = data_raw["LogReturn"].rolling(window=HORIZON).sum().shift(-HORIZON)
# Print more values: show the first 20 and last 20 values, plus summary stats
data_raw["Target"] = (cum_return > 0).astype(int)

# Keep only date, log return and target
data = data_raw[["Date","LogReturn", "Target"]]
print(data)

# Drop the first row with NaN return (from shift operation)
data.dropna(inplace=True)

            Date  LogReturn  Target
0     12/29/2023        NaN       0
1     12/28/2023  -0.129800       0
2     12/27/2023   0.033730       0
3     12/22/2023  -0.348501       0
4     12/21/2023  -0.030630       0
...          ...        ...     ...
2267  01/08/2015   1.052203       0
2268  01/07/2015  -2.292939       0
2269  01/06/2015  -0.823907       0
2270  01/05/2015   0.793114       0
2271  01/02/2015   2.008956       0

[2272 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)


In [5]:
# -----------------------------------
# STEP 3: Create lag features
# -----------------------------------
N_LAGS = 15
for lag in range(1, N_LAGS + 1):
    data[f"Lag_{lag}"] = data["LogReturn"].shift(lag)
# Add lagged date columns
for lag in range(1, N_LAGS + 1):
    data[f"Date_Lag_{lag}"] = data["Date"].shift(lag)

data.dropna(inplace=True)  # drop rows with NaNs introduced by lagging

data.drop(columns=["LogReturn"], inplace=True) # LogReturn no more needed
print(data)

            Date  Target     Lag_1     Lag_2     Lag_3     Lag_4     Lag_5  \
16    12/05/2023       0  0.027245 -0.532460  0.131440  0.033222 -0.069425   
17    12/04/2023       0 -0.333161  0.027245 -0.532460  0.131440  0.033222   
18    12/01/2023       0  0.311707 -0.333161  0.027245 -0.532460  0.131440   
19    11/30/2023       1  0.221939  0.311707 -0.333161  0.027245 -0.532460   
20    11/29/2023       1 -0.998018  0.221939  0.311707 -0.333161  0.027245   
...          ...     ...       ...       ...       ...       ...       ...   
2267  01/08/2015       0  0.001001 -0.616384  2.369535 -1.696924 -0.779327   
2268  01/07/2015       0  1.052203  0.001001 -0.616384  2.369535 -1.696924   
2269  01/06/2015       0 -2.292939  1.052203  0.001001 -0.616384  2.369535   
2270  01/05/2015       0 -0.823907 -2.292939  1.052203  0.001001 -0.616384   
2271  01/02/2015       0  0.793114 -0.823907 -2.292939  1.052203  0.001001   

         Lag_6     Lag_7     Lag_8  ...  Date_Lag_6  Date_Lag_7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"Lag_{lag}"] = data["LogReturn"].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"Lag_{lag}"] = data["LogReturn"].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"Lag_{lag}"] = data["LogReturn"].shift(lag)
A value is trying to be set on a copy of a slice from a

In [6]:
# -----------------------------------
# STEP 4: Add temporal features
# -----------------------------------

# Helper to add cyclical features for a given date column
def add_cyclical_features(df, date_col, prefix):
    # Ensure the date column is in datetime format
    df[date_col] = pd.to_datetime(df[date_col], format="%m/%d/%Y")
    
    # Extract raw features
    df[f"{prefix}_day_of_week"] = df[date_col].dt.dayofweek   # 0-6
    df[f"{prefix}_month"] = df[date_col].dt.month             # 1-12
    df[f"{prefix}_day_of_month"] = df[date_col].dt.day        # 1-31

    # Day of week: 0-6, max_value=7
    df[f"{prefix}_dow_sin"] = np.sin(2 * np.pi * df[f"{prefix}_day_of_week"] / 7)
    df[f"{prefix}_dow_cos"] = np.cos(2 * np.pi * df[f"{prefix}_day_of_week"] / 7)
    # Month: 1-12, max_value=12
    df[f"{prefix}_month_sin"] = np.sin(2 * np.pi * df[f"{prefix}_month"] / 12)
    df[f"{prefix}_month_cos"] = np.cos(2 * np.pi * df[f"{prefix}_month"] / 12)
    # Day of month: 1-31, max_value=31
    df[f"{prefix}_dom_sin"] = np.sin(2 * np.pi * df[f"{prefix}_day_of_month"] / 31)
    df[f"{prefix}_dom_cos"] = np.cos(2 * np.pi * df[f"{prefix}_day_of_month"] / 31)

# Add cyclical features for each lagged date column
for lag in range(1, N_LAGS + 1):
    lag_col = f"Date_Lag_{lag}"
    add_cyclical_features(data, lag_col, f"lag{lag}")

# Learn a scaler on the first 80% of the dataset, then apply to all sin and cos columns
full_train_size = int(len(data) * 0.8)
sin_cos_cols = [col for col in data.columns if col.endswith("_sin") or col.endswith("_cos")]
scaler_cyc = StandardScaler()
scaler_cyc.fit(data.loc[:full_train_size-1, sin_cos_cols])
data[sin_cos_cols] = scaler_cyc.transform(data[sin_cos_cols])

cols_to_drop = [col for col in data.columns if col.startswith("Date_Lag_") or 
            col.endswith("_day_of_week") or col.endswith("_month") or col.endswith("_day_of_month")]
data.drop(columns=cols_to_drop, inplace=True)
print(data)

            Date  Target     Lag_1     Lag_2     Lag_3     Lag_4     Lag_5  \
16    12/05/2023       0  0.027245 -0.532460  0.131440  0.033222 -0.069425   
17    12/04/2023       0 -0.333161  0.027245 -0.532460  0.131440  0.033222   
18    12/01/2023       0  0.311707 -0.333161  0.027245 -0.532460  0.131440   
19    11/30/2023       1  0.221939  0.311707 -0.333161  0.027245 -0.532460   
20    11/29/2023       1 -0.998018  0.221939  0.311707 -0.333161  0.027245   
...          ...     ...       ...       ...       ...       ...       ...   
2267  01/08/2015       0  0.001001 -0.616384  2.369535 -1.696924 -0.779327   
2268  01/07/2015       0  1.052203  0.001001 -0.616384  2.369535 -1.696924   
2269  01/06/2015       0 -2.292939  1.052203  0.001001 -0.616384  2.369535   
2270  01/05/2015       0 -0.823907 -2.292939  1.052203  0.001001 -0.616384   
2271  01/02/2015       0  0.793114 -0.823907 -2.292939  1.052203  0.001001   

         Lag_6     Lag_7     Lag_8  ...  lag14_month_sin  lag14

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[date_col] = pd.to_datetime(df[date_col], format="%m/%d/%Y")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{prefix}_day_of_week"] = df[date_col].dt.dayofweek   # 0-6
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{prefix}_month"] = df[date_col].dt.month             # 1-12
A value is tr

In [None]:
print("Data columns:", data.columns.tolist())
means = data.drop(columns=[col for col in data.columns if "date" in col.lower()]).mean()
print("Column means (excluding Date columns):")
print(means)
print("Variance (excluding Date columns):")
non_date_cols = [col for col in data.columns if "date" not in col.lower()]
print(data[non_date_cols].var())






In [7]:
data.to_csv("FTSE100_classification.csv", index=False)


In [None]:
# -----------------------------------
# STEP 5: Define training set andcommon test set (latest 20%)
# -----------------------------------
full_train_size = int(len(data) * 0.8)
train_data_only = data.iloc[:full_train_size].copy()
common_test = data.iloc[full_train_size:].copy()

In [None]:
# -----------------------------------
# STEP 6: Simulate 1 single agent and 3 federated agents
# -----------------------------------
agent_data = {}
agent_data["SingleAgent_Train"] = train_data_only.copy()
agent_data["SingleAgent_Test"] = common_test.copy()

# Without shuffling: split for federated agents
federated_data = train_data_only.copy()
splits = np.array_split(federated_data, 3)

# With shuffling: shuffle and split for federated agents
#shuffled = train_data_only.sample(frac=1, random_state=4)
#splits = np.array_split(shuffled, 3)

# Assign to agents
for i, df in enumerate(splits):
    agent_data[f"Agent_{i+1}_Train"] = df.sort_index()
    agent_data[f"Agent_{i+1}_Test"] = common_test.copy()


In [None]:
# -----------------------------------
# STEP 7: Preview the datasets
# -----------------------------------
for name, df in agent_data.items():
    print(f"\n=== {name} ===")
    print(df.head())
    print(df.shape)

# Model

In [None]:
# Install if needed: pip install tensorflow scikit-learn matplotlib
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

In [None]:
# -----------------------------------
# STEP 1: Features and target column names
# -----------------------------------
feature_cols = [f"Lag_{i}" for i in range(1, 6)] + ["day_of_week", "month", "day_of_month"]
target_col = "Target"

In [None]:
# -----------------------------------
# STEP 2: Function to build the model
# -----------------------------------
def build_model(input_dim):
    model = keras.Sequential([
        layers.Dense(150, activation='relu', input_shape=(input_dim,)),
        layers.Dense(10, activation='relu'),
        layers.Dense(1)  # Linear output for regression
    ])
    model.compiale(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [None]:
# -----------------------------------
# STEP 3: Plot training loss and MAE
# -----------------------------------
def plot_training(history, title):
    plt.figure(figsize=(8, 4))
    plt.plot(history.history['loss'], label='MSE')
    plt.plot(history.history['mae'], label='MAE')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# -----------------------------------
# STEP 4: Function to train the model
# -----------------------------------
def train_model(train_df, label=""):
    X = train_df[feature_cols].values
    y = train_df[target_col].values

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X) # Normalization

    model = build_model(X.shape[1])
    print(f"\nTraining model on {label}...")
    history = model.fit(X_scaled, y, epochs=50, batch_size=32, verbose=1)

    plot_training(history, title=f"{label} Training Metrics")
    return model, scaler

In [None]:
# -----------------------------------
# STEP 5: Function to evaluate the model
# -----------------------------------
def evaluate_model(model, scaler, test_df):
    X_test = scaler.transform(test_df[feature_cols].values)
    y_true = test_df[target_col].values
    y_pred = model.predict(X_test).flatten()

    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    print(f"Test MSE: {mse:.6f}, MAE: {mae:.6f}")
    return mse, mae

In [None]:
# -----------------------
# STEP 6: Single-agent model 
# -----------------------
single_model, single_scaler = train_model(agent_data["SingleAgent_Train"], label="SingleAgent")
evaluate_model(single_model, single_scaler, agent_data["SingleAgent_Test"])

In [None]:
# -----------------------
# STEP 7: Per-agent models
# -----------------------
agent_models = {}
for i in range(1, 4):
    agent_name = f"Agent_{i}_Train"
    test_name = f"Agent_{i}_Test"
    model, scaler = train_model(agent_data[agent_name], label=agent_name)
    evaluate_model(model, scaler, agent_data[test_name])
    agent_models[agent_name] = {"model": model, "scaler": scaler}