# Using maching learning to examine how the electrode clusters, depression, and reaction time predict prospective memory accuracy

Utilizing: random forest regression, linear regression, mean-based prediction

In [None]:
import pandas as pd

# Load the dataset
file_path = '/Users/mickey.rice/Desktop/PMD_ML.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its contents
data.head()


Unnamed: 0,ID,Accuracy,RT,PHQ9,CESD,DASS,PRMQ1,Confidence,JOL,O1,...,O2,P3,PZ,P4,Age,Sex,Gender,Race,Ethnicity,Handedness
0,PMD02,73.611111,1116.633001,16,33,7,27,11,60,-6.24,...,-8.709,5.933,4.483,-2.124,,Female,Female,\te. White,\tb. Not Hispanic or Latino,Right handed
1,PMD03,76.388889,805.957503,13,26,0,28,8,83,-8.447,...,-11.056,1.551,1.785,-0.738,18.0,Female,Female,e. White,b. Not Hispanic or Latino,Right handed
2,PMD04,77.777778,713.443193,13,37,14,27,8,95,-2.105,...,-0.879,-4.668,-4.044,-3.622,,Male,Transgender Nonbinary,\te. White,\tb. Not Hispanic or Latino,Right handed
3,PMD05,68.055556,1137.573536,16,40,16,37,22,87,-2.702,...,-2.472,2.191,0.527,-1.259,18.0,Female,Female,\te. White,\tb. Not Hispanic or Latino,Right handed
4,PMD06,18.055556,796.787629,10,32,10,27,12,70,-4.068,...,-2.215,0.483,2.186,4.175,18.0,Female,Female,e. White,b. Not Hispanic or Latino,Right handed


In [25]:
# Check for missing values
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values)

# Impute missing 'Age' with the mean age
data['Age'].fillna(data['Age'].mean(), inplace=True)

# Impute missing 'Race' with the most frequent value (mode)
data['Race'].fillna(data['Race'].mode()[0], inplace=True)

print(f"Dataset shape after dropping missing values: {data.shape}")





Missing Values:
 ID            0
Accuracy      0
RT            0
PHQ9          0
CESD          0
DASS          0
PRMQ1         0
Confidence    0
JOL           0
O1            0
OZ            0
O2            0
P3            0
PZ            0
P4            0
Age           0
Sex           0
Gender        0
Race          0
Ethnicity     0
Handedness    0
dtype: int64
Dataset shape after dropping missing values: (54, 21)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].mean(), inplace=True)


In [27]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Extracting the electrode data for clustering
electrodes_O = data[['O1', 'OZ', 'O2']].dropna()
electrodes_P = data[['P3', 'PZ', 'P4']].dropna()

# Standardizing the data
scaler_O = StandardScaler()
scaler_P = StandardScaler()

electrodes_O_scaled = scaler_O.fit_transform(electrodes_O)
electrodes_P_scaled = scaler_P.fit_transform(electrodes_P)

# Applying PCA to reduce dimensionality of each cluster for validation
pca_O = PCA(n_components=1)  # Reducing to 1 principal component for O1, OZ, O2 cluster
pca_P = PCA(n_components=1)  # Reducing to 1 principal component for P3, Pz, P4 cluster

O_cluster = pca_O.fit_transform(electrodes_O_scaled)
P_cluster = pca_P.fit_transform(electrodes_P_scaled)

# Adding the cluster outputs to the dataframe for further analysis
data['O_cluster'] = pca_O.transform(scaler_O.transform(data[['O1', 'OZ', 'O2']].fillna(0)))
data['P_cluster'] = pca_P.transform(scaler_P.transform(data[['P3', 'PZ', 'P4']].fillna(0)))

# Display the updated dataframe with the new clusters
data[['O_cluster', 'P_cluster', 'Accuracy']].head()

Unnamed: 0,O_cluster,P_cluster,Accuracy
0,3.053585,-0.909989,73.611111
1,4.04657,0.543674,76.388889
2,0.729328,4.228924,77.777778
3,1.122568,0.845241,68.055556
4,1.109637,-0.410279,18.055556


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define features (O_cluster, P_cluster) and target (Accuracy)
X = data[['O_cluster', 'P_cluster']]
y = data['Accuracy']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Regression
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Calculate performance metrics for both models
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

rf_mse, rf_r2, lr_mse, lr_r2

(819.7469134133657, -12.321184668954333, 571.0484533226318, -8.279744488401139)

In [43]:
# Adding 'RT' as an additional feature to the model
X = data[['O_cluster', 'P_cluster', 'PHQ9', 'RT']]

# Split the data again with the new feature
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Refit the models with the additional feature

# Random Forest Regression
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Linear Regression
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Calculate performance metrics for both models
rf_mse_PHQ9 = mean_squared_error(y_test, y_pred_rf)
rf_r2_PHQ9 = r2_score(y_test, y_pred_rf)

lr_mse_PHQ9 = mean_squared_error(y_test, y_pred_lr)
lr_r2_PHQ9 = r2_score(y_test, y_pred_lr)

rf_mse_PHQ9, rf_r2_PHQ9, lr_mse_PHQ9, lr_r2_PHQ9

(543.3181290506262, -7.829116661064509, 499.71257459947975, -7.12051058529762)

In [41]:
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into training and testing sets (as before)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate the mean of the training set accuracy
mean_accuracy = y_train.mean()

# Use the mean to predict for all test set samples
y_pred_mean = [mean_accuracy] * len(y_test)

# Calculate the performance metrics for the mean-based prediction
mean_mse = mean_squared_error(y_test, y_pred_mean)
mean_r2 = r2_score(y_test, y_pred_mean)

print(f"Mean-based MSE: {mean_mse}")
print(f"Mean-based R²: {mean_r2}")

Mean-based MSE: 629.8896321284295
Mean-based R²: -9.23593498596235
