# Example notebook: BP Prediction
This example notebook uses both the `features.tsv` and `participants.tsv` files in the `sample` data directory.

In [None]:
# Imports.
import os.path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model, model_selection

In [None]:
# The 'sample' data directory sits next to this notebook's parent folder.
DATA_DIR = os.path.join(os.pardir, 'sample')

In [None]:
# Read the two tab-separated tables from the sample data directory.
participants_path = os.path.join(DATA_DIR, 'participants.tsv')
features_path = os.path.join(DATA_DIR, 'features.tsv')
ppt_df = pd.read_csv(participants_path, sep='\t')
feat_df = pd.read_csv(features_path, sep='\t')

# Left-join the features onto the participants table using the shared 'pid' column.
comb_df = pd.merge(ppt_df, feat_df, how='left', on='pid')

# Display the identifying columns of the combined dataframe:
# participant id (pid), study phase, and measurement within phase.
comb_df[['pid', 'phase', 'measurement']]

In [None]:
# Print every column name of the combined dataframe in sorted order (see paper for details).
feature_list = sorted(comb_df.columns)
print(feature_list)

In [None]:
# Choose the prediction target and the model inputs.
target_feature = 'delta_sbp' # For an easier target, try 'delta_hr_pressure'.
indep_features = [
    'baseline_sbp',
    'age',
    'weight',
    'height',
    'delta_hr_ekg',
    'delta_rpat_pressure',
]

# Keep only the rows whose 'phase' is 'ambulatory'.
is_ambulatory = comb_df['phase'] == 'ambulatory'
ambulatory_df = comb_df.loc[is_ambulatory]

# Discard any row that is missing one of the inputs or the target.
required_columns = [*indep_features, target_feature]
cv_df = ambulatory_df.dropna(subset=required_columns, how='any')

# Distinct participant ids remaining after filtering.
ppts = set(cv_df['pid'])
print(f'{len(ppts)} participants, {len(cv_df)} total rows.')

In [None]:
# Leave-one-participant-out cross-validation. It is essential to group the folds by participant rather than
# by row, i.e., the training data for a given participant must not contain samples from that participant,
# since in realistic problem settings we will not have access to other ambulatory values for that participant.
#
# The fold strategy below can also be achieved with sklearn's model_selection.LeaveOneGroupOut, but we construct
# the groups manually in order to make the grouping explicit.
test_df_list = []
# Choose a model type. The same estimator object is refit from scratch on every fold.
model = linear_model.Ridge(alpha=0.001)
# Iterate over participants; each participant is held out exactly once.
for ppt in ppts:
    # Boolean mask selecting the held-out participant's rows (computed once per fold).
    is_held_out = cv_df['pid'] == ppt
    # Training subset: every row that does not belong to the held-out participant.
    fold_train_df = cv_df.loc[~is_held_out]
    # Test subset: an explicit copy, since we add a column to it below and must not
    # write into (or trigger chained-assignment warnings about) cv_df.
    fold_test_df = cv_df.loc[is_held_out].copy()
    # Fit model based on training subset.
    model.fit(fold_train_df[indep_features], fold_train_df[target_feature])
    # Add 'model_prediction' column to fold_test_df, containing predictions on test subset.
    fold_test_df['model_prediction'] = model.predict(fold_test_df[indep_features])
    # Collect the augmented fold_test_df for later assembly.
    test_df_list.append(fold_test_df)
# Stack all per-participant test frames row-wise into a single dataframe of out-of-fold predictions.
predictions_df = pd.concat(test_df_list, axis=0)

In [None]:
# Scatter the out-of-fold predictions against the measured target values.
observed = predictions_df[target_feature]
predicted = predictions_df['model_prediction']
plt.plot(observed, predicted, 'b.')

# Red identity line (y = x) for reference.
diagonal = [-60, 40]
plt.plot(diagonal, diagonal, 'r-')

plt.title(
    'Model Prediction vs. Ground Truth\n'
    'with Proper Stratification\n'
    '(leave one participant out)'
)
plt.xlabel(target_feature)
plt.ylabel('Model Prediction')

In [None]:
# How would it look if we naively used leave-one-(row)-out cross-validation? We expect this will provide us
# with unrealistically optimistic results, because each row's training fold then contains other rows from the
# same participant.
#
# Work on an explicit copy: pd.DataFrame(cv_df) does not copy the data, and on some pandas versions
# (before 2.0) the new frame shares its internals with cv_df, so adding a column here could leak into cv_df.
demo_df = cv_df.copy()
demo_df['false_prediction'] = model_selection.cross_val_predict(
    model,
    demo_df[indep_features],
    demo_df[target_feature],
    cv=model_selection.LeaveOneOut(),
)

# Plot results vs. ground truth, with a red identity line (y = x) for reference.
plt.plot(demo_df[target_feature], demo_df['false_prediction'], 'b.')
plt.plot([-60, 40], [-60, 40], 'r-')
plt.title('Model Prediction vs. Ground Truth\nwithout Proper Stratification\n(leave one row out)')
plt.xlabel(target_feature)
plt.ylabel('False Prediction')