# In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import numpy as np

def preprocess_data(file_path):
    """Load one Excel sheet and normalise its leading columns.

    Steps, in order:
      1. Read the workbook at ``file_path`` with ``pd.read_excel``.
      2. Drop the first column.
      3. Rename the (up to) ten leading remaining columns to ``Q1`` .. ``Q10``.
      4. Drop ``Q1``, ``Q2`` and ``Q3``.

    Args:
        file_path: Path passed straight to ``pd.read_excel``.

    Returns:
        The resulting DataFrame; columns beyond the first ten keep their
        original names.
    """
    frame = pd.read_excel(file_path)

    # Remove the leading column by its label.
    leading_label = frame.columns[0]
    frame = frame.drop(columns=leading_label)

    # Map the first ten remaining labels to Q1..Q10.
    new_names = {}
    for position, label in enumerate(frame.columns[:10], start=1):
        new_names[label] = f'Q{position}'
    frame = frame.rename(columns=new_names)

    return frame.drop(columns=['Q1', 'Q2', 'Q3'])

def detect_anomalies(data, window=10):
    """Flag rows that follow a value above the recent rolling trend.

    For every numeric column a rolling mean and rolling standard deviation
    are computed over ``window`` rows (``min_periods=1``); the per-row
    threshold is ``mean + 2 * std``.  The row at position ``i`` (``i >= 3``)
    is flagged when any numeric value in the three rows before it
    (positions ``i-3`` .. ``i-1``) is strictly greater than the threshold at
    position ``i-1``.  The first three rows are never flagged.

    Rows are addressed by position, so the result does not depend on the
    frame's index labels.

    Args:
        data: DataFrame to label. It is modified in place: a boolean
            ``Anomaly_Label`` column is added (or overwritten).
        window: Size of the rolling window, in rows.

    Returns:
        The same ``data`` object, with the ``Anomaly_Label`` column set.
    """
    # Only numeric columns take part (datetime / string columns are ignored).
    numeric_cols = data.select_dtypes(include=[np.number])

    # Threshold for a 'significant change': rolling mean plus two rolling
    # standard deviations, per column.
    rolling = numeric_cols.rolling(window=window, min_periods=1)
    threshold = rolling.mean() + 2 * rolling.std()

    lookback = 3  # number of preceding rows compared against the threshold
    n_rows = len(data)
    flags = np.zeros(n_rows, dtype=bool)

    if n_rows > lookback:
        values = numeric_cols.to_numpy(dtype=float, na_value=np.nan)
        # Threshold at position i-1 for every candidate row i = lookback..n-1.
        limits = threshold.to_numpy(dtype=float, na_value=np.nan)[lookback - 1:n_rows - 1]

        exceeded = np.zeros(limits.shape, dtype=bool)
        for lag in range(1, lookback + 1):
            # Row i-lag for every candidate row i; NaN comparisons are False,
            # so missing values never trigger a flag.
            exceeded |= values[lookback - lag:n_rows - lag] > limits

        flags[lookback:] = exceeded.any(axis=1)

    # Positional assignment: safe for any index, never enlarges the frame.
    data['Anomaly_Label'] = flags
    return data


# Input workbooks: File_1.xlsx .. File_37.xlsx inside the "Patient data" folder.
directory = "Patient data"
file_names = [f'File_{i+1}.xlsx' for i in range(37)]
file_paths = [os.path.join(directory, name) for name in file_names]

# Preprocess each workbook, then stack them into one frame with a fresh index.
frames = [preprocess_data(path) for path in file_paths]
all_data = pd.concat(frames, ignore_index=True)

# Label anomalies on the combined frame.
# NOTE(review): detection runs after concatenation, so rolling windows span
# the boundary between consecutive files — confirm this is intended.
all_data = detect_anomalies(all_data)

# 80/20 split, stratified on the anomaly label; the label and 'Condition_Met'
# are excluded from the feature matrix.
features = all_data.drop(['Condition_Met', 'Anomaly_Label'], axis=1)
labels = all_data['Anomaly_Label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Write each split to its own CSV file, without the index column.
for csv_name, split in (
    ('X_train.csv', X_train),
    ('y_train.csv', y_train),
    ('X_test.csv', X_test),
    ('y_test.csv', y_test),
):
    split.to_csv(csv_name, index=False)
