# Preprocessing: OASIS-2 Dataset

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Load the dataset (file_path is a placeholder to be replaced with the real path)
file_path = "PATH_TO_OASIS2_DATASET"
df = pd.read_excel(file_path)

# Remove columns that this cell does not carry into the saved output
df = df.drop(columns=['Hand', 'MRI ID', 'Visit', 'MR Delay'])

# Impute missing values for 'SES' and 'MMSE' using each column's median
# (as they are integer-valued data)
imputed_columns = ['SES', 'MMSE']
df[imputed_columns] = df[imputed_columns].fillna(df[imputed_columns].median())

# Check for any remaining missing values in each column.
# The counts are assigned first and then printed: print() accepts no
# arbitrary keyword arguments, so `print(name = value)` raises TypeError.
missing_values_count = df.isnull().sum()
print(missing_values_count)

# Selecting numeric columns (float64 / int64) for normalization
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
scaler = MinMaxScaler()

# Applying min-max normalization. This runs before the encodings below, so
# the encoded 'Subject ID' and 'M/F' columns are not rescaled.
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Encoding 'Subject ID' as integer labels using Label Encoding
label_encoder = LabelEncoder()
df['Subject ID'] = label_encoder.fit_transform(df['Subject ID'])

# Encoding 'M/F' as a binary column: 'M' -> 0, 'F' -> 1
# (any other value becomes NaN, as Series.map does for unmapped keys)
df['M/F'] = df['M/F'].map({'M': 0, 'F': 1})

# Save the preprocessed data (output_file_path is a placeholder as well)
output_file_path = "PATH_TO_LOCATION"
df.to_csv(output_file_path, index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Read the raw spreadsheet (file_path is a placeholder to be replaced)
file_path = "PATH_TO_OASIS2_DATASET"
df = pd.read_excel(file_path)

# Discard the columns that are not needed
df = df.drop(['Hand', 'MRI ID'], axis=1)

# Report the shape and the column names of the loaded data
print("Initial data shape:", df.shape)
print("Columns in the dataset:", list(df.columns))

# Fill the gaps in 'SES' and 'MMSE' with the median of the respective column
median_filled = ['SES', 'MMSE']
df[median_filled] = df[median_filled].fillna(df[median_filled].median())

# Count what is still missing, column by column
missing_values_count = df.isna().sum()
print("\nMissing values in each column after imputation:\n", missing_values_count)

# One-hot encode 'M/F'; drop_first=True omits the first category's indicator
df = pd.get_dummies(df, columns=['M/F'], drop_first=True)

# Turn the 'Group' labels into integer codes, kept in a separate column
label_encoder_group = LabelEncoder()
label_encoder_group.fit(df['Group'])
df['Group_encoded'] = label_encoder_group.transform(df['Group'])

# Show which integer code each 'Group' class was assigned
group_labels = label_encoder_group.classes_
group_codes = label_encoder_group.transform(group_labels)
print("\nGroup classes and their labels:", dict(zip(group_labels, group_codes)))

In [None]:
# Order rows by subject and then by visit, so the per-subject 'first' and
# diff() computations below follow visit order
df = df.sort_values(by=['Subject ID', 'Visit'])


# Feature Engineering helpers (they read and extend the notebook-level `df`)

def _baseline(column):
    """Return, for every row, the groupby 'first' value of `column` for its subject."""
    return df.groupby('Subject ID')[column].transform('first')


def _visit_delta(column):
    """Return the row-to-row difference of `column` within each subject.

    The first row of every subject has no predecessor and is set to 0.
    """
    return df.groupby('Subject ID')[column].diff().fillna(0)


def _rate_per_day(cumulative_column):
    """Return `cumulative_column` divided by 'Days_Since_Baseline'.

    Zero elapsed days are replaced by NaN before dividing, and the resulting
    NaN rates are then set to 0.
    """
    elapsed = df['Days_Since_Baseline'].replace(0, np.nan)
    return (df[cumulative_column] / elapsed).fillna(0)


def _add_longitudinal_features(measure, with_rate=True):
    """Add the change, baseline and cumulative-change columns for `measure`.

    With `with_rate` set, the per-day rate of the cumulative change is added
    as well; that requires 'Days_Since_Baseline' to exist already.
    """
    df[f'{measure}_Change'] = _visit_delta(measure)
    df[f'{measure}_at_Baseline'] = _baseline(measure)
    df[f'Cumulative_{measure}_Change'] = df[measure] - df[f'{measure}_at_Baseline']
    if with_rate:
        df[f'{measure}_Rate_of_Change'] = _rate_per_day(f'Cumulative_{measure}_Change')


# Age features
df['Age_at_Baseline'] = _baseline('Age')
df['Age_Difference'] = df['Age'] - df['Age_at_Baseline']

# MMSE features; 'Days_Since_Baseline' is derived here, before the first rate
_add_longitudinal_features('MMSE', with_rate=False)
df['Days_Since_Baseline'] = (
    df['MR Delay'] - df.groupby('Subject ID')['MR Delay'].transform('min')
)
df['MMSE_Rate_of_Change'] = _rate_per_day('Cumulative_MMSE_Change')

# CDR features (no rate of change is computed for CDR)
_add_longitudinal_features('CDR', with_rate=False)

# nWBV features
_add_longitudinal_features('nWBV')

# SES features
df['SES_at_Baseline'] = _baseline('SES')
df['SES_Change'] = df['SES'] - df['SES_at_Baseline']

# EDUC features
df['EDUC_at_Baseline'] = _baseline('EDUC')
df['EDUC_Change'] = df['EDUC'] - df['EDUC_at_Baseline']

# Time Since Last Visit
df['Time_Since_Last_Visit'] = _visit_delta('MR Delay')

# eTIV features
_add_longitudinal_features('eTIV')

# ASF features
_add_longitudinal_features('ASF')

# Columns that make up the model input
feature_columns = [
    # Raw measurements and the sex indicator
    'Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF', 'M/F_M',
    # Age and MMSE derived features
    'Age_Difference', 'MMSE_Change', 'Cumulative_MMSE_Change', 'MMSE_Rate_of_Change',
    # CDR derived features
    'CDR_Change', 'Cumulative_CDR_Change',
    # nWBV derived features
    'nWBV_Change', 'Cumulative_nWBV_Change', 'nWBV_Rate_of_Change',
    # SES / EDUC changes and timing features
    'SES_Change', 'EDUC_Change', 'Time_Since_Last_Visit', 'Days_Since_Baseline',
    # eTIV derived features
    'eTIV_Change', 'Cumulative_eTIV_Change', 'eTIV_Rate_of_Change',
    # ASF derived features
    'ASF_Change', 'Cumulative_ASF_Change', 'ASF_Rate_of_Change'
]

# Feature matrix X, target vector y, and the subject identifier of each row
X = df.loc[:, feature_columns]
y = df['Group_encoded']
groups = df['Subject ID']

# Report how many values are missing per feature
missing_values_features = X.isna().sum()
print("\nMissing values in features:\n", missing_values_features)

# Replace any missing feature value with the constant 0.
# The rebuilt DataFrame keeps X's column names but gets a fresh default index.
imputer = SimpleImputer(strategy='constant', fill_value=0)
imputed_values = imputer.fit_transform(X)
X_imputed = pd.DataFrame(imputed_values, columns=X.columns)

# Show the distinct values taken by SES_Change and EDUC_Change
print("\nUnique values in SES_Change:", df['SES_Change'].unique())
print("Unique values in EDUC_Change:", df['EDUC_Change'].unique())

# Keep SES_Change / EDUC_Change only if they take more than one distinct value;
# otherwise drop them from both the feature list and the imputed matrix
for feature in ('SES_Change', 'EDUC_Change'):
    if df[feature].nunique() > 1:
        print(f"\n{feature} has variability and will be included in the feature set.")
        continue
    print(f"\n{feature} has no variability and will be removed from the feature set.")
    feature_columns.remove(feature)
    X_imputed = X_imputed.drop(columns=[feature])

In [None]:
# Write the imputed features, the target and the row identifiers to one CSV file

# Build the output table from a copy of the imputed features and append the
# extra columns in order. `.values` attaches them by position rather than by
# index label.
processed_data = X_imputed.assign(**{
    'Target': y.values,
    'Subject ID': df['Subject ID'].values,
    'Visit': df['Visit'].values,
})

# Destination of the CSV file (placeholder to be replaced)
output_file_path = "PATH_TO_LOCATION"

# Write the table without the index column
processed_data.to_csv(output_file_path, index=False)

print(f"\nProcessed dataset saved to {output_file_path}")

In [None]:
# Every feature except the 'M/F_M' indicator is treated as numerical
numerical_features = [name for name in feature_columns if name != 'M/F_M']

# Standardize the numerical features; columns not listed (here 'M/F_M')
# are passed through unchanged and placed after the scaled ones
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_features)],
    remainder='passthrough',
)

# Fit the scaler on the imputed features and transform them
X_processed_array = preprocessor.fit_transform(X_imputed)

# Column names in output order: scaled features first, then the passthrough column
feature_names_transformed = [*numerical_features, 'M/F_M']

# Wrap the transformed array in a DataFrame again
X_processed = pd.DataFrame(X_processed_array, columns=feature_names_transformed)

print('Ready for classification')