In [1]:
import numpy as np
import scipy.stats as stats

# Metric values taken from the perturbed final Random Forest classification
# reports and Brier scores.
precision_md = [0.77, 0.71, 0.75, 0.71, 0.76, 0.72, 0.71, 0.73, 0.74, 0.74]
recall_md = [0.69, 0.65, 0.67, 0.66, 0.67, 0.65, 0.70, 0.66, 0.61, 0.70]
f1_score_md = [0.72, 0.68, 0.71, 0.68, 0.71, 0.68, 0.71, 0.69, 0.67, 0.72]
precision_watch = [0.68, 0.63, 0.66, 0.64, 0.66, 0.64, 0.66, 0.64, 0.62, 0.67]
recall_watch = [0.76, 0.69, 0.75, 0.69, 0.76, 0.71, 0.67, 0.71, 0.75, 0.73]
f1_score_watch = [0.72, 0.66, 0.70, 0.66, 0.71, 0.67, 0.67, 0.68, 0.68, 0.70]
accuracy = [0.72, 0.67, 0.70, 0.67, 0.71, 0.68, 0.69, 0.68, 0.67, 0.71]
precision_macro_avg = [0.72, 0.67, 0.71, 0.67, 0.71, 0.68, 0.69, 0.68, 0.68, 0.71]
recall_macro_avg = [0.72, 0.67, 0.71, 0.67, 0.71, 0.68, 0.69, 0.68, 0.68, 0.71]
f1_macro_avg = [0.72, 0.67, 0.70, 0.67, 0.71, 0.68, 0.69, 0.68, 0.67, 0.71]
precision_weighted_avg = [0.72, 0.67, 0.71, 0.68, 0.71, 0.68, 0.69, 0.68, 0.68, 0.71]
recall_weighted_avg = [0.72, 0.67, 0.70, 0.67, 0.71, 0.68, 0.69, 0.68, 0.67, 0.71]
f1_weighted_avg = [0.72, 0.67, 0.70, 0.67, 0.71, 0.68, 0.69, 0.68, 0.67, 0.71]
brier_score = [0.2806, 0.3316, 0.2959, 0.3265, 0.2908, 0.3214, 0.3112, 0.3163, 0.3265, 0.2908]

# Quantile of Student's t distribution used as the critical value.
# NOTE(review): "mean ± ci" built from the 0.95 quantile is a two-sided 90%
# interval (equivalently a one-sided 95% bound). Use 0.975 here if a two-sided
# 95% interval is what should be reported.
T_QUANTILE = 0.95


def _summarize(values):
    """Return ``(mean, sample standard deviation, count)`` for ``values``.

    The standard deviation is computed with ``ddof=1`` (divides by n - 1).
    """
    return np.mean(values), np.std(values, ddof=1), len(values)


def _t_critical(n, quantile=T_QUANTILE):
    """Return the Student's t ``quantile`` for a sample of size ``n`` (df = n - 1)."""
    return stats.t.ppf(quantile, df=n - 1)


def _half_width(t_value, std, n):
    """Return the interval half-width ``t_value * (std / sqrt(n))``."""
    return t_value * (std / np.sqrt(n))


# Calculate means, sample standard deviations, and sample sizes
mean_precision_md, std_precision_md, len_precision_md = _summarize(precision_md)
mean_recall_md, std_recall_md, len_recall_md = _summarize(recall_md)
mean_f1_score_md, std_f1_score_md, len_f1_score_md = _summarize(f1_score_md)
mean_precision_watch, std_precision_watch, len_precision_watch = _summarize(precision_watch)
mean_recall_watch, std_recall_watch, len_recall_watch = _summarize(recall_watch)
mean_f1_score_watch, std_f1_score_watch, len_f1_score_watch = _summarize(f1_score_watch)
mean_accuracy, std_accuracy, len_accuracy = _summarize(accuracy)
mean_precision_macro_avg, std_precision_macro_avg, len_precision_macro_avg = _summarize(precision_macro_avg)
mean_recall_macro_avg, std_recall_macro_avg, len_recall_macro_avg = _summarize(recall_macro_avg)
mean_f1_macro_avg, std_f1_macro_avg, len_f1_macro_avg = _summarize(f1_macro_avg)
mean_precision_weighted_avg, std_precision_weighted_avg, len_precision_weighted_avg = _summarize(precision_weighted_avg)
mean_recall_weighted_avg, std_recall_weighted_avg, len_recall_weighted_avg = _summarize(recall_weighted_avg)
mean_f1_weighted_avg, std_f1_weighted_avg, len_f1_weighted_avg = _summarize(f1_weighted_avg)
mean_brier_score, std_brier_score, len_brier_score = _summarize(brier_score)

# Calculate t-values (one per metric, from that metric's own sample size)
t_value_precision_md = _t_critical(len_precision_md)
t_value_recall_md = _t_critical(len_recall_md)
t_value_f1_score_md = _t_critical(len_f1_score_md)
t_value_precision_watch = _t_critical(len_precision_watch)
t_value_recall_watch = _t_critical(len_recall_watch)
t_value_f1_score_watch = _t_critical(len_f1_score_watch)
t_value_accuracy = _t_critical(len_accuracy)
t_value_precision_macro_avg = _t_critical(len_precision_macro_avg)
t_value_recall_macro_avg = _t_critical(len_recall_macro_avg)
t_value_f1_macro_avg = _t_critical(len_f1_macro_avg)
t_value_precision_weighted_avg = _t_critical(len_precision_weighted_avg)
t_value_recall_weighted_avg = _t_critical(len_recall_weighted_avg)
t_value_f1_weighted_avg = _t_critical(len_f1_weighted_avg)
t_value_brier_score = _t_critical(len_brier_score)

# Calculate confidence-interval half-widths (the "±" part of each report line).
# Each metric uses its own t-value; the F1 rows previously borrowed the
# precision t-values, which is only correct while the sample sizes are equal.
ci_precision_md = _half_width(t_value_precision_md, std_precision_md, len_precision_md)
ci_recall_md = _half_width(t_value_recall_md, std_recall_md, len_recall_md)
ci_f1_score_md = _half_width(t_value_f1_score_md, std_f1_score_md, len_f1_score_md)
ci_precision_watch = _half_width(t_value_precision_watch, std_precision_watch, len_precision_watch)
ci_recall_watch = _half_width(t_value_recall_watch, std_recall_watch, len_recall_watch)
ci_f1_score_watch = _half_width(t_value_f1_score_watch, std_f1_score_watch, len_f1_score_watch)
ci_accuracy = _half_width(t_value_accuracy, std_accuracy, len_accuracy)
ci_precision_macro_avg = _half_width(t_value_precision_macro_avg, std_precision_macro_avg, len_precision_macro_avg)
ci_recall_macro_avg = _half_width(t_value_recall_macro_avg, std_recall_macro_avg, len_recall_macro_avg)
ci_f1_macro_avg = _half_width(t_value_f1_macro_avg, std_f1_macro_avg, len_f1_macro_avg)
ci_precision_weighted_avg = _half_width(t_value_precision_weighted_avg, std_precision_weighted_avg, len_precision_weighted_avg)
ci_recall_weighted_avg = _half_width(t_value_recall_weighted_avg, std_recall_weighted_avg, len_recall_weighted_avg)
ci_f1_weighted_avg = _half_width(t_value_f1_weighted_avg, std_f1_weighted_avg, len_f1_weighted_avg)
ci_brier_score = _half_width(t_value_brier_score, std_brier_score, len_brier_score)

# Print outputs as "<metric>: <mean> ± <half-width>", three decimals each.
# Adjacent f-strings inside the parentheses are concatenated into one message.
print(f"Precision MD: {mean_precision_md:.3f} ± {ci_precision_md:.3f} \nRecall MD: {mean_recall_md:.3f} ± {ci_recall_md:.3f}"
      f"\nF1 Score MD: {mean_f1_score_md:.3f} ± {ci_f1_score_md:.3f} \nPrecision Watch: {mean_precision_watch:.3f} ± {ci_precision_watch:.3f}"
      f"\nRecall Watch: {mean_recall_watch:.3f} ± {ci_recall_watch:.3f} \nF1 Score Watch: {mean_f1_score_watch:.3f} ± {ci_f1_score_watch:.3f}"
      f"\nAccuracy: {mean_accuracy:.3f} ± {ci_accuracy:.3f} \nPrecision Macro Avg: {mean_precision_macro_avg:.3f} ± {ci_precision_macro_avg:.3f}"
      f"\nRecall Macro Avg: {mean_recall_macro_avg:.3f} ± {ci_recall_macro_avg:.3f} \nF1 Score Macro Avg: {mean_f1_macro_avg:.3f} ± {ci_f1_macro_avg:.3f}"
      f"\nPrecision Weighted Avg: {mean_precision_weighted_avg:.3f} ± {ci_precision_weighted_avg:.3f}"
      f"\nRecall Weighted Avg: {mean_recall_weighted_avg:.3f} ± {ci_recall_weighted_avg:.3f}"
      f"\nF1 Score Weighted Avg: {mean_f1_weighted_avg:.3f} ± {ci_f1_weighted_avg:.3f} \nBrier Score: {mean_brier_score:.3f} ± {ci_brier_score:.3f}")

Precision MD: 0.734 ± 0.013 
Recall MD: 0.666 ± 0.016
F1 Score MD: 0.697 ± 0.011 
Precision Watch: 0.650 ± 0.011
Recall Watch: 0.722 ± 0.019 
F1 Score Watch: 0.685 ± 0.012
Accuracy: 0.690 ± 0.011 
Precision Macro Avg: 0.692 ± 0.011
Recall Marco Avg: 0.692 ± 0.011 
F1 Score Macro Avg: 0.690 ± 0.011
Precision Weighted Avg: 0.693 ± 0.010
Recall Weighted Avg: 0.690 ± 0.011
F1 Score Weighted Avg: 0.690 ± 0.011 
Brier Score: 0.309 ± 0.011
