# Machine learning - multimodal regression for GSR using other modalities

*This notebook contains initial work on a regression model that predicts GSR from the other physiological signals. This work was out of scope of the initial paper and is incomplete.*

In [None]:
import os
# If the current working directory path contains 'notebooks', step up one
# level so the relative 'temp' paths used below resolve from the parent folder.
if 'notebooks' in os.getcwd():
    os.chdir('..')
import pandas as pd
import numpy as np

In [None]:
# Output directory for everything this notebook writes (figures are saved here).
notebook_temp_dir = os.path.join(os.getcwd(), "temp", "5_multimodal_regression")

# exist_ok=True makes the call idempotent when the cell is re-run and avoids the
# race between a separate existence check and the creation. Unlike the previous
# exists()-guard, it also fails immediately if the path exists but is not a
# directory, instead of failing later when a figure is saved.
os.makedirs(notebook_temp_dir, exist_ok=True)

In [None]:
# Read the windowed feature table from <cwd>/temp/windowed_features.csv.
# NOTE: despite its name, `features_file_directory` is the path of the CSV file.
features_file_directory = os.path.join(
    os.getcwd(), 'temp', 'windowed_features.csv'
)
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(
    features_file_directory,
    index_col=0,
)

In [None]:
# Keep only the rows whose Segment is 'gas_inhalation', then discard every row
# that has a missing value in any column.
df = df.loc[df['Segment'] == 'gas_inhalation'].dropna()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Use the filtered feature table from the previous cells. `data` is a second
# name for the same DataFrame object as `df`, not a copy.
data = df

# Optional experiment (disabled): restrict the table to the four facial EMG
# contact channels plus the target and the metadata columns.
#emg_channels = ('Emg/Contact[RightOrbicularis]', 'Emg/Contact[LeftOrbicularis]',
#                'Emg/Contact[RightZygomaticus]', 'Emg/Contact[LeftZygomaticus]')
#kept_as_is = ('Biopac_GSR_mean', 'Condition', 'Segment', 'participant_number')
#selected_columns = [col for col in data.columns
#                    if any(channel in col for channel in emg_channels) or col in kept_as_is]
#data = data[selected_columns]

# Per-fold evaluation metrics (one entry per held-out participant)
mae_scores, rmse_scores, r2_scores = [], [], []

# Actual and predicted target values pooled over all folds
all_actual_values, all_predicted_values = [], []

# Per-fold mean of the actual and of the predicted GSR values
average_actual_means, average_predicted_means = [], []

# One coefficient vector per fold; averaged after the cross-validation loop
total_feature_importances = []


# Leave-one-subject-out cross-validation: every participant is held out once
# while a Ridge model is fitted on the rows of all remaining participants.
unique_participants = data['participant_number'].unique()

# The predictor columns are the same for every fold, so select them once.
# Metadata columns are removed first, then every column whose name starts with
# one of the GSR-related prefixes (this also removes the target
# 'Biopac_GSR_mean', so the model never sees it as an input).
#excluded_prefixes = ('Biopac', 'RSP', 'SCR', 'EDA')
excluded_prefixes = ('Biopac_GSR', 'SCR', 'EDA')
candidate_columns = data.drop(['participant_number', 'Condition', 'Segment'], axis=1).columns
feature_columns = [col for col in candidate_columns if not col.startswith(excluded_prefixes)]

for participant in unique_participants:
    # f-string instead of "+" so a non-string participant id cannot raise
    # TypeError; the printed text is unchanged for string ids.
    print(f"Running ML for participant:{participant}")

    # Split rows into training (all other participants) and validation
    # (the held-out participant)
    train_data = data[data['participant_number'] != participant]
    #train_data = train_data[train_data['Condition']=='CO2']
    val_data = data[data['participant_number'] == participant]
    #val_data = val_data[val_data['Condition']=='CO2']

    # Predictors and target for this fold
    X_train = train_data[feature_columns]
    y_train = train_data['Biopac_GSR_mean']
    X_val = val_data[feature_columns]
    y_val = val_data['Biopac_GSR_mean']

    # Fit a Ridge regressor (alpha is the regularization strength)
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)

    # The fitted coefficients are used as the per-fold "feature importance".
    # NOTE(review): no feature scaling is applied in this cell, so coefficient
    # magnitudes depend on each feature's units - confirm whether the features
    # are standardised upstream before comparing them.
    total_feature_importances.append(model.coef_)

    # Predict the target for the held-out participant
    y_val_pred = model.predict(X_val)

    # Fold metrics. RMSE is computed as sqrt(MSE) rather than with
    # mean_squared_error(..., squared=False): that keyword is deprecated in
    # scikit-learn 1.4 and removed in 1.6, whereas sqrt(MSE) works everywhere.
    mae_scores.append(mean_absolute_error(y_val, y_val_pred))
    rmse_scores.append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
    r2_scores.append(r2_score(y_val, y_val_pred))

    # Pool the fold's actual and predicted values for the overall scatter plot
    all_actual_values.extend(y_val)
    all_predicted_values.extend(y_val_pred)

    # Mean actual and mean predicted GSR for this participant
    average_actual_means.append(y_val.mean())
    average_predicted_means.append(y_val_pred.mean())

    # One figure per participant: predicted vs actual over the validation rows.
    # y_val is re-indexed from 0 so both curves share the same x positions.
    plt.figure(figsize=(10, 6))
    plt.plot(y_val_pred, label='Predicted')
    plt.plot(y_val.reset_index(drop=True), label='Actual')
    # Title added so the per-participant figures can be told apart.
    plt.title(f"Participant {participant}: actual vs predicted GSR")
    plt.legend()

# Unweighted mean of each metric over the cross-validation folds
avg_mae, avg_rmse, avg_r2 = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (mae_scores, rmse_scores, r2_scores)
)

print(f"Average Mean Absolute Error: {avg_mae:.2f}")
print(f"Average Root Mean Squared Error: {avg_rmse:.2f}")
print(f"Average R-squared: {avg_r2:.2f}")

# Scatter of predicted against actual target values, pooled over all folds
plt.figure(figsize=(8, 6))
sns.scatterplot(x=all_actual_values, y=all_predicted_values)
plt.title('Scatter Plot: Actual vs Predicted GSR (All Participants)')
plt.xlabel('Actual GSR')
plt.ylabel('Predicted GSR')
plt.show()

# Bar chart of the three fold-averaged metrics
evaluation_metrics = ['MAE', 'RMSE', 'R-squared']
average_scores = [avg_mae, avg_rmse, avg_r2]
plt.figure(figsize=(8, 6))
sns.barplot(x=evaluation_metrics, y=average_scores)
plt.title('Bar Plot: Average Evaluation Scores')
plt.ylabel('Average Score')
plt.show()

# Per-participant mean of the actual and of the predicted target, one point
# per held-out participant (same order as the cross-validation loop)
plt.figure(figsize=(10, 6))
plt.plot(unique_participants, average_actual_means, marker='o', label='Actual GSR')
plt.plot(unique_participants, average_predicted_means, marker='o', label='Predicted GSR')
plt.title('Line Plot: Average Mean Actual vs Predicted GSR across Participants')
plt.xlabel('Participant')
plt.ylabel('Average Mean GSR Value')
plt.legend()
plt.show()

# Element-wise mean of the per-fold coefficient vectors (one vector was stored
# for each participant in unique_participants).
average_feature_importances = sum(total_feature_importances) / len(unique_participants)

# Pair each coefficient with its feature name. X_train is the training matrix
# left over from the last fold; the same columns are selected in every fold.
# Rows are ordered from the largest to the smallest *signed* average
# coefficient, so strongly negative coefficients end up at the bottom.
average_feature_importance_df = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Average_Importance': average_feature_importances,
    }
).sort_values(by='Average_Importance', ascending=False)

# Show the full table
print("Average Feature Importances:")
print(average_feature_importance_df)









In [None]:
# Horizontal bar plot of the first 20 rows of the importance table.
# The table is taken in its existing order (sorted by signed average
# coefficient, descending), so these are the 20 most positive coefficients.
# NOTE(review): features with large negative coefficients are therefore not
# shown - confirm whether ranking by absolute value was intended.
top_features = average_feature_importance_df.head(20)

plt.figure(figsize=(10, 6))
sns.barplot(data=top_features, x='Average_Importance', y='Feature', orient='h')
plt.title('20 Most Relevant Features')
plt.xlabel('Average Importance')
plt.ylabel('')
# Widen the left margin so the feature names on the y-axis are not cut off.
# NOTE(review): tight_layout() below may override this margin - confirm.
plt.subplots_adjust(left=0.4)
plt.tight_layout()
# Save next to the other notebook outputs, then display
plt.savefig(os.path.join(notebook_temp_dir, 'regression_feature_importance.pdf'))
plt.show()

In [None]:
# One row per held-out participant with that fold's R-squared score
r2_df = pd.DataFrame({'Participant': unique_participants, 'R-squared': r2_scores})

# Shorten each participant label to the part before the first underscore
r2_df = r2_df.assign(Participant=r2_df['Participant'].str.split('_').str.get(0))

# Determine colors based on R-squared value ranges
def assign_color(r2_value):
    """Return the colour name of the band an R-squared value falls into.

    Bands are checked from best to worst using inclusive lower bounds:
    >= 0.75 is 'green', >= 0.5 is 'yellow', >= 0.25 is 'orange'. Any value
    that meets none of these bounds (including negative scores) is 'red'.
    """
    colour_bands = (
        (0.75, 'green'),
        (0.5, 'yellow'),
        (0.25, 'orange'),
    )
    for lower_bound, colour in colour_bands:
        if r2_value >= lower_bound:
            return colour
    return 'red'

# Colour of each participant's bar, derived from that participant's R-squared
colors = r2_df['R-squared'].map(assign_color)

# Bar plot with one bar per participant, coloured by R-squared band.
# NOTE(review): recent seaborn releases warn when `palette` is given without
# `hue` - confirm against the installed seaborn version.
plt.figure(figsize=(10, 6))
sns.barplot(data=r2_df, x='Participant', y='R-squared', palette=colors)
plt.xlabel('Participant')
plt.ylabel('R-squared')
plt.title('Ridge Regression for target GSR using all features')

# Hand-built legend that explains the colour bands (the bars carry no labels)
legend_labels = {
    'green': 'R-squared >= 0.75',
    'yellow': '0.75 > R-squared >= 0.5',
    'orange': '0.5 > R-squared >= 0.25',
    'red': 'R-squared < 0.25'
}
handles = [
    plt.Line2D([], [], color=band_color, label=band_label, linewidth=6)
    for band_color, band_label in legend_labels.items()
]
plt.legend(handles=handles, title='R-squared Range')

plt.xticks(rotation=0)
plt.tight_layout()
# Save alongside the other notebook outputs, then display
plt.savefig(os.path.join(notebook_temp_dir, 'regression_results.pdf'))
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Reuse the filtered feature table (`data` is the same object as `df`)
data = df

# Candidate predictors: every column except the metadata columns and the target.
# NOTE(review): only the target itself is removed here, so other columns with
# the 'Biopac_GSR', 'SCR' or 'EDA' prefixes (which the multi-feature model
# earlier in this notebook excludes) are still evaluated - confirm this is
# intended.
non_feature_columns = ['participant_number', 'Condition', 'Segment', 'Biopac_GSR_mean']
feature_names = list(data.drop(columns=non_feature_columns).columns)
del non_feature_columns

# Participants, each of which is held out once per feature
unique_participants = data['participant_number'].unique()

# Maps feature name -> dict of averaged metrics for that feature
feature_performance = {}

# Single-feature evaluation: for every feature, run leave-one-subject-out
# cross-validation with a Ridge model that uses only that feature, and record
# the fold-averaged MSE and R-squared.

# The participant split and the target values do not depend on the feature, so
# build them once instead of re-filtering the whole table for every
# (feature, participant) pair.
target = data['Biopac_GSR_mean']
participant_splits = []
for participant in unique_participants:
    #is_held_out = (data['participant_number'] == participant) & (data['Condition'] == 'CO2')
    is_held_out = data['participant_number'] == participant
    is_training = data['participant_number'] != participant
    participant_splits.append(
        (is_training, is_held_out, target[is_training], target[is_held_out])
    )

for feature in feature_names:
    print(f"Running ML for feature: {feature}")

    # Per-participant metrics for this feature.
    # NOTE: `r2_scores` is also the name read by the R-squared bar-plot cell
    # earlier in the notebook; re-running that cell after this one would plot
    # the scores of the last feature evaluated here.
    mse_scores = []
    r2_scores = []

    # Single-column predictor table for the current feature
    feature_values = data[[feature]]

    for is_training, is_held_out, y_train, y_val in participant_splits:
        X_train = feature_values[is_training]
        X_val = feature_values[is_held_out]

        # Fit on all other participants, predict the held-out participant
        model = Ridge(alpha=1.0)
        model.fit(X_train, y_train)
        y_val_pred = model.predict(X_val)

        # Fold metrics for the held-out participant
        mse_scores.append(mean_squared_error(y_val, y_val_pred))
        r2_scores.append(r2_score(y_val, y_val_pred))

    # Average the fold metrics for the current feature
    avg_mse = np.mean(mse_scores)
    avg_r2 = np.mean(r2_scores)

    feature_performance[feature] = {'Avg MSE': avg_mse, 'Avg R-squared': avg_r2}

# One row per feature, one column per averaged metric
feature_performance_df = pd.DataFrame.from_dict(feature_performance, orient='index')

print("Average Performance Metrics per Feature:")
print(feature_performance_df)
