# Data Processing

In [None]:
# Install if needed: pip install yfinance pandas numpy scikit-learn
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw CSV export and show its columns and contents as a quick sanity check.
raw_csv_path = "FTSE100_raw.csv"
data_raw = pd.read_csv(raw_csv_path)
print(data_raw.columns)
print(data_raw)

In [None]:
def convert_german_float(value):
    """Convert a German-formatted number string to a float.

    German notation uses '.' as the thousands separator and ',' as the
    decimal separator (e.g. "8.123,45" means 8123.45).

    Args:
        value: The cell value to convert.

    Returns:
        The parsed float for string input, ``np.nan`` when the string cannot
        be parsed, or ``value`` unchanged when it is not a string (for
        example an existing number or NaN).
    """
    if not isinstance(value, str):
        return value

    # Strip thousands separators first, then switch the decimal comma to a point.
    normalized = value.replace('.', '').replace(',', '.')
    try:
        return float(normalized)
    except ValueError:
        return np.nan
# Parse every entry of the Price column with convert_german_float, element by element.
price_strings = data_raw['Price']
data_raw['Price'] = price_strings.apply(convert_german_float)

In [None]:
# -----------------------------------
# STEP 2: Clean and enrich the data
# -----------------------------------
# Daily log return. The first row is NaN because it has no previous price.
# NOTE(review): shift(1) treats the previous row as the previous day, so this
# assumes the CSV rows are in ascending date order — confirm against the file.
raw_log_return = np.log(data_raw["Price"] / data_raw["Price"].shift(1))

# Define the prediction target: sign of the "HORIZON"-day forward cumulative return
HORIZON = 5 # cumulative return of 5 days
# rolling(HORIZON).sum() at row t covers rows t-HORIZON+1..t; shifting by
# -HORIZON re-aligns it so that row t holds the sum over rows t+1..t+HORIZON.
cum_return = raw_log_return.rolling(window=HORIZON).sum().shift(-HORIZON)
# The target is derived from the RAW log returns: standardized returns have the
# training mean subtracted, so "> 0" on them would no longer mean a positive
# cumulative return. Rows without a complete forward window are kept as NaN
# (instead of being silently labelled 0) so that they are dropped below.
data_raw["Target"] = (cum_return > 0).astype(float).where(cum_return.notna())

# Standardize the log return as a model feature. The scaler is fitted on the
# first 80% of the rows only and then applied to every row.
# (scikit-learn's StandardScaler disregards NaNs in fit and keeps them in transform.)
data_raw["LogReturn"] = raw_log_return
split_idx = int(len(data_raw) * 0.8)
scaler = StandardScaler()
scaler.fit(data_raw.iloc[:split_idx][["LogReturn"]])
data_raw["LogReturn"] = scaler.transform(data_raw[["LogReturn"]])

# Keep only date, log return and target. Take an explicit copy so the in-place
# operations on `data` never act on a slice of `data_raw`.
data = data_raw[["Date", "LogReturn", "Target"]].copy()
print(data)

# Drop rows with NaNs: the first row (no previous price for the return) and the
# last HORIZON rows (no complete forward window for the target).
data.dropna(inplace=True)
data["Target"] = data["Target"].astype(int)

In [None]:
# -----------------------------------
# STEP 3: Create lag features
# -----------------------------------
N_LAGS = 15
lag_steps = range(1, N_LAGS + 1)

# Lagged returns: Lag_k holds the LogReturn from k rows earlier.
returns = data["LogReturn"]
for step in lag_steps:
    data[f"Lag_{step}"] = returns.shift(step)

# Lagged dates: Date_Lag_k holds the Date from k rows earlier.
dates = data["Date"]
for step in lag_steps:
    data[f"Date_Lag_{step}"] = dates.shift(step)

# Shifting leaves NaNs in the leading rows; remove every row containing a NaN.
data.dropna(inplace=True)

# Only the lagged values are kept as features, so the unlagged return is removed.
data.drop(columns=["LogReturn"], inplace=True)
print(data)

In [None]:
# -----------------------------------
# STEP 4: Add temporal features
# -----------------------------------

# Helper to add cyclical features for a given date column
def add_cyclical_features(df, date_col, prefix):
    """Add calendar columns and their sine/cosine encodings to ``df`` in place.

    Args:
        df: DataFrame to modify; new columns are appended to it.
        date_col: Name of the column holding dates in "%m/%d/%Y" format. The
            column is overwritten with its parsed datetime values.
        prefix: String prepended to the name of every column that is added.

    Returns:
        None. ``df`` gains ``{prefix}_day_of_week``, ``{prefix}_month`` and
        ``{prefix}_day_of_month`` followed by the ``_sin`` / ``_cos`` pairs
        for ``dow``, ``month`` and ``dom``.
    """
    parsed_dates = pd.to_datetime(df[date_col], format="%m/%d/%Y")
    df[date_col] = parsed_dates

    # Raw calendar components.
    dow_col = f"{prefix}_day_of_week"
    month_col = f"{prefix}_month"
    dom_col = f"{prefix}_day_of_month"
    df[dow_col] = parsed_dates.dt.dayofweek   # 0-6
    df[month_col] = parsed_dates.dt.month     # 1-12
    df[dom_col] = parsed_dates.dt.day         # 1-31

    # Map each component onto a circle, using its cycle length as the period.
    two_pi = 2 * np.pi
    dow_angle = two_pi * df[dow_col] / 7
    df[f"{prefix}_dow_sin"] = np.sin(dow_angle)
    df[f"{prefix}_dow_cos"] = np.cos(dow_angle)

    month_angle = two_pi * df[month_col] / 12
    df[f"{prefix}_month_sin"] = np.sin(month_angle)
    df[f"{prefix}_month_cos"] = np.cos(month_angle)

    dom_angle = two_pi * df[dom_col] / 31
    df[f"{prefix}_dom_sin"] = np.sin(dom_angle)
    df[f"{prefix}_dom_cos"] = np.cos(dom_angle)

# Add cyclical features for each lagged date column
for lag in range(1, N_LAGS + 1):
    lag_col = f"Date_Lag_{lag}"
    add_cyclical_features(data, lag_col, f"lag{lag}")

# Learn a scaler on the first 80% of the rows, then apply it to all sin and cos columns.
# The training slice is taken by position (iloc): the earlier dropna calls leave
# the index starting above 0, so a label-based `.loc[:n-1]` would select fewer
# rows than the intended 80%.
full_train_size = int(len(data) * 0.8)
sin_cos_cols = [col for col in data.columns if col.endswith(("_sin", "_cos"))]
scaler_cyc = StandardScaler()
scaler_cyc.fit(data.iloc[:full_train_size][sin_cos_cols])
data[sin_cos_cols] = scaler_cyc.transform(data[sin_cos_cols])

# Remove the lagged date columns and the raw calendar columns; only their
# scaled sin/cos encodings are kept.
raw_calendar_suffixes = ("_day_of_week", "_month", "_day_of_month")
cols_to_drop = [col for col in data.columns
                if col.startswith("Date_Lag_") or col.endswith(raw_calendar_suffixes)]
data.drop(columns=cols_to_drop, inplace=True)
print(data)

In [None]:
# Write the final feature table to disk without the index column.
output_csv_path = "FTSE100_classification.csv"
data.to_csv(output_csv_path, index=False)
