# Data Load and Processing

## Import raw data, rename cols

In [1]:
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import ttest_ind, ttest_rel
from scipy.stats import pearsonr, spearmanr
from scipy.stats import f_oneway

# Load the sheet into a DataFrame (already done in this case)
df_master = pd.read_excel('MEC Compiled.xlsx', sheet_name='Sheet1')

df_master = df_master.dropna(how='all')

df_metadata = pd.read_excel('Metadata.xlsx', sheet_name='Sheet1')

# Create a dictionary mapping from the `Column` to `RENAME`
column_mapping = dict(zip(df_metadata['Column'], df_metadata['RENAME']))
df_master.rename(columns=column_mapping, inplace=True)

In [2]:
df_master[0:10]

Unnamed: 0,ID,Group,Timestamp,Email,TookSurveyLastYear,CompassionateCarePatients,CompassionateCareColleagues,EmpathyPriority,EmpathyMissionStatement,SupportedMission,...,Gender,HispanicBinary,Race,Age,UnderstandFellowTeammateWork,AskingHelpIneffective,GivingHelpIdle,OfferHelpTool,AssertSafetyConcern,PersonalConflicts
0,1,Pre,2022-06-03 15:36:58.428,Smothekl@Sutterhealth.org,,Completely Agree,Completely Agree,Completely Agree,Completely Agree,Mostly Agree,...,Male,"No, Not Hispanic or Latino",Black or African-American,65 to 74,,,,,,
1,2,Pre,2022-06-03 15:51:52.673,mckinnr@sutterhealth.org,,Completely Agree,Somewhat Agree,Somewhat Agree,Completely Agree,Mostly Agree,...,Female,"No, Not Hispanic or Latino",White,35 to 44,,,,,,
2,3,Pre,2022-06-10 21:10:02.732,grossmf@sutterhealth.org,,Somewhat Agree,Mostly Agree,Completely Agree,Completely Agree,Completely Agree,...,Female,"No, Not Hispanic or Latino",White,45 to 54,,,,,,
3,4,Pre,2022-06-14 12:51:01.751,mbutcher@dpmginc.com,,Completely Agree,Completely Agree,Completely Agree,Completely Agree,Completely Agree,...,Female,"No, Not Hispanic or Latino",White,35 to 44,,,,,,
4,5,Pre,2022-06-14 15:05:38.538,abatem@sutterhealth.org,,Somewhat Agree,Mostly Agree,Somewhat Agree,Completely Agree,Completely Agree,...,Male,"No, Not Hispanic or Latino",White,55 to 64,,,,,,
5,6,Pre,2022-06-14 17:40:26.038,FalcoD@sutterhealth.org,,Completely Agree,Mostly Agree,Somewhat Disagree,Mostly Agree,Completely Agree,...,Male,"No, Not Hispanic or Latino",White,55 to 64,,,,,,
6,7,Pre,2022-06-15 11:08:18.492,Jerwers@sutterhealth.org,,Mostly Agree,Mostly Agree,Mostly Agree,Mostly Agree,Mostly Agree,...,Male,"No, Not Hispanic or Latino",White,45 to 54,,,,,,
7,8,Pre,2022-06-15 13:53:09.095,piyarar@sutterhealth.org,,Somewhat Agree,Mostly Agree,Mostly Agree,Completely Agree,Somewhat Agree,...,Male,"No, Not Hispanic or Latino",Asian,35 to 44,,,,,,
8,9,Pre,2022-06-17 14:37:43.319,schaefmh@sutterhealth.org,,Mostly Agree,Mostly Agree,Mostly Agree,Completely Agree,Completely Agree,...,Male,"No, Not Hispanic or Latino",White,65 to 74,,,,,,
9,10,Pre,2022-06-20 16:51:10.250,carrieann.drenten@vituity.com,,Mostly Agree,Mostly Agree,Completely Agree,Completely Agree,Mostly Agree,...,Female,"No, Not Hispanic or Latino",White,35 to 44,,,,,,


## Numerical Conversion

In [3]:
# Likert mapping for climate questions
likert_mapping = {
    "Completely Agree": 7,
    "Mostly Agree": 6,
    "Somewhat Agree": 5,
    "Slightly Agree": 5,
    "Unsure": 4,
    "Somewhat Disagree": 3,
    "Disagree Somewhat": 3,
    "Mostly Disagree": 2,
    "Completely Disagree": 1
}

# Specify the exact columns to convert
columns_to_convert = [
    'CompassionateCarePatients','CompassionateCareColleagues','EmpathyPriority','EmpathyMissionStatement','SupportedMission',
    'WalkthroughdoorsCaring','CuriosityAndOpenness','StressedAndPressed','LikeWorkingHere','FindsTimeNeeded','AdminInterference'
]

# Ensure all values are strings before cleaning
for col in columns_to_convert:
    df_master[col] = df_master[col].astype(str).str.strip()

# Apply Likert mapping
for col in columns_to_convert:
    df_master[col] = df_master[col].map(likert_mapping)

# Convert columns to numeric
for col in columns_to_convert:
    df_master[col] = pd.to_numeric(df_master[col], errors='coerce')

In [4]:
# Reverse the scale for StressedAndPressed and AdminInterference
df_master['StressedAndPressed'] = df_master['StressedAndPressed'].apply(lambda x: 8 - x if pd.notna(x) else x)
df_master['AdminInterference'] = df_master['AdminInterference'].apply(lambda x: 8 - x if pd.notna(x) else x)

df_master.rename(columns={
    'StressedAndPressed': 'StressedAndPressed_NOT',
    'AdminInterference': 'AdminInterference_NOT'
}, inplace=True)

In [5]:
df_master.iloc[0:10,4:13]

Unnamed: 0,TookSurveyLastYear,CompassionateCarePatients,CompassionateCareColleagues,EmpathyPriority,EmpathyMissionStatement,SupportedMission,WalkthroughdoorsCaring,MatchEyeLevel,CuriosityAndOpenness
0,,7,7,7,7,6,,Completely Agree,6
1,,7,5,5,7,6,5.0,Somewhat Agree,3
2,,5,6,7,7,7,5.0,Completely Agree,5
3,,7,7,7,7,7,7.0,Completely Agree,7
4,,5,6,5,7,7,5.0,Completely Agree,6
5,,7,6,3,6,7,6.0,Mostly Agree,6
6,,6,6,6,6,6,6.0,Mostly Agree,6
7,,5,6,6,7,5,6.0,Somewhat Agree,6
8,,6,6,6,7,7,6.0,Completely Agree,6
9,,6,6,7,7,6,3.0,Mostly Agree,6


In [6]:
# learning mapping
learning_mapping = {
    "Yes, I definitely learned this": 4,
    "YES, definitely learned this here": 4,
    "Completely Agree": 4,
    "Strongly Agree": 4,
    "Mostly Agree": 4,
    "Somewhat Agree": 3,
    "No, but I heard something about": 3,
    "NO, but I heard something about this": 3,
    "Agree": 3,
    "Unsure": 2,
    "Neither  Nor Dis": 2,
    "Neither Agree Nor Disagree": 2,
    "No, and I know nothing about this": 1,
    "NO, and I know nothing about this": 1,
    "Disagree": 1,
    "Strongly Disagree": 1
    
}

# Specify the exact columns to convert
columns_to_convert = [
    'EyeContact','IDNeedExplanation',
    'AskTellAskLearned','AskQuestions',
    'ChallengeTechniques','EmpathicBridge',
    'MatchEyeLevel','TeachBackMethod',
    'NotRushed','HearComplaintConcern',
    'TakeAMoment','EmotionalStateAwareness',
    'VentFrustration','MinimizeTechnicalLanguage',
    'BadNewsDelivery'
]
# Ensure all values are strings before cleaning
for col in columns_to_convert:
    df_master[col] = df_master[col].astype(str).str.strip()
# Apply Likert mapping
for col in columns_to_convert:
    df_master[col] = df_master[col].map(learning_mapping)


In [7]:
df_master.iloc[0:10,10:20]

Unnamed: 0,WalkthroughdoorsCaring,MatchEyeLevel,CuriosityAndOpenness,StressedAndPressed_NOT,LikeWorkingHere,FindsTimeNeeded,AdminInterference_NOT,EyeContact,IDNeedExplanation,AskTellAskLearned
0,,4,6,3,6,5,2,3,3,3
1,5.0,3,3,1,6,5,1,1,3,1
2,5.0,4,5,1,5,3,4,4,3,1
3,7.0,4,7,3,6,5,2,1,2,2
4,5.0,4,6,3,6,6,5,3,4,4
5,6.0,4,6,5,6,5,2,1,4,4
6,6.0,4,6,3,6,6,6,4,4,4
7,6.0,3,6,6,6,5,2,4,3,3
8,6.0,4,6,3,6,6,2,1,4,4
9,3.0,4,6,1,6,3,2,4,4,1


In [8]:
# TTPQ
learning_mapping = {
    "Strongly Agree": 5,
    "Agree": 4,
    "Disagree": 2,
    "Strongly Disagree": 1
    
}

# Specify the exact columns to convert
columns_to_convert = [
'UnderstandFellowTeammateWork',
'AskingHelpIneffective',
'GivingHelpIdle',
'OfferHelpTool',
'AssertSafetyConcern',
'PersonalConflicts'
]
# Ensure all values are strings before cleaning
for col in columns_to_convert:
    df_master[col] = df_master[col].astype(str).str.strip()
# Apply Likert mapping
for col in columns_to_convert:
    df_master[col] = df_master[col].map(learning_mapping)


In [9]:
# TTPQ Reversals
df_master['AskingHelpIneffective'] = df_master['AskingHelpIneffective'].apply(lambda x: 6 - x if pd.notna(x) else x)
df_master['GivingHelpIdle'] = df_master['GivingHelpIdle'].apply(lambda x: 6 - x if pd.notna(x) else x)
df_master['PersonalConflicts'] = df_master['PersonalConflicts'].apply(lambda x: 6 - x if pd.notna(x) else x)

df_master.rename(columns={
    'AskingHelpIneffective': 'AskingHelpIneffective_NOT',
    'GivingHelpIdle': 'GivingHelpIdle_NOT',
    'PersonalConflicts': 'PersonalConflicts_NOT'
}, inplace=True)

In [10]:
df_master.iloc[25:35,40:45]

Unnamed: 0,Age,UnderstandFellowTeammateWork,AskingHelpIneffective_NOT,GivingHelpIdle_NOT,OfferHelpTool
25,45-54,,,,
26,55-64,,,,
27,45-54,5.0,5.0,5.0,5.0
28,45-54,5.0,4.0,4.0,4.0
29,45-54,4.0,5.0,5.0,4.0
30,45-54,5.0,4.0,5.0,5.0
31,45-54,5.0,5.0,5.0,4.0
32,35-44,5.0,5.0,5.0,5.0
33,55-64,4.0,4.0,4.0,4.0


## Category Normalization

In [11]:
df_master['Race_normalized'] = df_master['Race'].str.strip().replace({
    'Asian': 'Asian',
    'White\xa0': 'White',
    'White':'White',
    'Black or African-American': 'Other',
    'Other\xa0': 'Other',
    'Pakistani': 'Asian',
    'Native Hawaiian or Pacific Islander': 'Asian',
    'White, Asian' :'Other',
    'Asian, Afghan': 'Asian'  
})

df_master['Age_normalized'] = df_master['Age'].str.strip().replace({
    '18-24': 'Gen Z',
    '25-34': 'Gen Z/Millenials',
    '35-44': 'Millenials', 
    '35 to 44': 'Millenials',
    '45-54': 'Gen X',
    '45 to 54': 'Gen X',
    '55-64': 'Senior',
    '55 to 64': 'Senior',
    '65-74': 'Senior',
    '65 to 74': 'Senior',
})

In [12]:
#course counts
exclude_mapping = {
    "Course 1-6": 6,
    "Course 1-3": 3,
    "Course 1-3; 6": 4,
    "Course 1, 2, 3": 3,
    "Course 1, Course 2, Course 3": 3,
    "I did not participate in any E-Learning courses.;": 0
}
# Create a new column to count semicolons with specific handling for exclude cases
df_master['CourseCount'] = df_master['CoursesCompleted'].apply(
    lambda x: exclude_mapping[x.strip()] if pd.notna(x) and x.strip() in exclude_mapping 
    else (x.strip().count(';') if pd.notna(x) else 0)
)



# Workshop counts
exclude_mapping = {
    "Workshop 1-3; Self-Empathy": 4,
    "Workshop 1-6; Self-Empathy": 7,
    "Workshop 1-3": 3,
    "Workshop 1-3; 6": 4,
    "I did not participate in any workshops.;": 0
}
# Create a new column to count semicolons with specific handling for exclude cases
df_master['WorkshopCount'] = df_master['WorkshopsCompleted'].apply(
    lambda x: exclude_mapping[x.strip()] if pd.notna(x) and x.strip() in exclude_mapping 
    else (x.strip().count(';') if pd.notna(x) else 0)
)

In [13]:
# Set the display option to wrap text
pd.set_option('display.max_colwidth', None)

# Display unique values for WorkshopsCompleted and WorkshopCount
df_master[['WorkshopsCompleted', 'WorkshopCount']].drop_duplicates()
#df_master[['CoursesCompleted', 'CourseCount']].drop_duplicates()

Unnamed: 0,WorkshopsCompleted,WorkshopCount
0,Self-Care;,1
2,,0
6,Course 1;,1
13,Course 1; Course 2;,2
14,Course 1; Self-Care;,2
19,Course 3. Delivering Bad News;Self-Empathy Workshop ;Course 2. Managing Difficult Medical Interactions;Course 1. Enhancing Empathy in Healthcare;,4
20,Course 1. Enhancing Empathy in Healthcare;Course 2. Managing Difficult Medical Interactions;Course 3. Delivering Bad News;,3
22,Course 1. Enhancing Empathy in Healthcare;Course 2. Managing Difficult Medical Interactions;Course 3. Delivering Bad News;Self-Empathy Workshop ;,4
26,Self-Empathy Workshop ;Course 3. Delivering Bad News;Course 1. Enhancing Empathy in Healthcare;,3
27,Course 1. Enhancing Empathy in Healthcare;Course 3. Delivering Bad News;Self-Empathy Workshop ;Course 2. Managing Difficult Medical Interactions;,4


In [14]:
#df_master['CourseCount'].value_counts()

#filtered_df = df_master[df_master['WorkshopCount'] == 2]
#print(filtered_df[['WorkshopsCompleted', 'WorkshopCount']])

In [15]:
# Define the logic for LearningSubgroup
def determine_learning_subgroup(row): 
    if row['CourseCount'] >= 3 and row['WorkshopCount'] >= 1:
        return "Fully Trained"
    elif row['CourseCount'] >= 1 and row['WorkshopCount'] >= 1:
        return "1+ course, 1+ workshop" 
    elif row['CourseCount'] >= 2 and row['WorkshopCount'] == 0:
        return "2+ courses, no workshop" 
    elif row['CourseCount'] == 0 and row['WorkshopCount'] > 0:
        return "No courses, >0 workshop"
    elif row['CourseCount'] < 2 and row['WorkshopCount'] < 1:
        return "No training"
    else:
        return 'No Category'  # For cases that don't match any condition (optional)

# Apply the logic to create the new column
df_master['LearningSubgroup'] = df_master.apply(determine_learning_subgroup, axis=1)


In [16]:
df_master.LearningSubgroup.value_counts()

# Set the display option to wrap text
pd.set_option('display.max_colwidth', None)

# Display unique values for WorkshopsCompleted and WorkshopCount
df_master[['CourseCount','WorkshopCount','LearningSubgroup']].drop_duplicates()

Unnamed: 0,CourseCount,WorkshopCount,LearningSubgroup
0,0,1,"No courses, >0 workshop"
2,0,0,No training
3,1,0,No training
13,2,2,"1+ course, 1+ workshop"
14,1,2,"1+ course, 1+ workshop"
15,1,1,"1+ course, 1+ workshop"
18,2,1,"1+ course, 1+ workshop"
19,3,4,Fully Trained
20,3,3,Fully Trained
21,4,3,Fully Trained


In [17]:
#filtered_df = df_master[df_master['LearningSubgroup'] == 'No Category']

#filtered_df[['CoursesCompleted', 'CourseCount','WorkshopsCompleted', 'WorkshopCount']]

In [18]:
"""
#df_master[['CourseCount', 'WorkshopCount','LearningSubgroup']].value_counts()

# Get the value counts for the selected columns
value_counts = df_master[['CourseCount', 'WorkshopCount', 'LearningSubgroup']].value_counts()

# Convert the value counts to a DataFrame
value_counts_df = value_counts.reset_index()
value_counts_df.columns = ['CourseCount', 'WorkshopCount', 'LearningSubgroup', 'Count']

# Display the DataFrame as a table
print(value_counts_df)
"""

"\n#df_master[['CourseCount', 'WorkshopCount','LearningSubgroup']].value_counts()\n\n# Get the value counts for the selected columns\nvalue_counts = df_master[['CourseCount', 'WorkshopCount', 'LearningSubgroup']].value_counts()\n\n# Convert the value counts to a DataFrame\nvalue_counts_df = value_counts.reset_index()\nvalue_counts_df.columns = ['CourseCount', 'WorkshopCount', 'LearningSubgroup', 'Count']\n\n# Display the DataFrame as a table\nprint(value_counts_df)\n"

In [19]:
# Create the ExposureGroup column
df_master['ExposureGroup'] = df_master.apply(
    lambda row: 'N' if row['CourseCount'] == 0 and row['WorkshopCount'] == 0 else 'Y',
    axis=1
)

In [21]:
df_master = df_master[df_master['Group'] != 'Pre']
df_master

Unnamed: 0,ID,Group,Timestamp,Email,TookSurveyLastYear,CompassionateCarePatients,CompassionateCareColleagues,EmpathyPriority,EmpathyMissionStatement,SupportedMission,...,GivingHelpIdle_NOT,OfferHelpTool,AssertSafetyConcern,PersonalConflicts_NOT,Race_normalized,Age_normalized,CourseCount,WorkshopCount,LearningSubgroup,ExposureGroup
19,20,Post 1,2024-01-18 18:43:00,anonymous,Yes,6,7,6,7,7,...,,,,,White,Gen X,3,4,Fully Trained,Y
20,21,Post 1,2024-01-18 19:41:00,anonymous,Yes,6,7,3,7,6,...,,,,,White,Gen X,3,3,Fully Trained,Y
21,22,Post 1,2024-01-19 00:59:00,anonymous,No,7,7,6,7,7,...,,,,,White,Millenials,4,3,Fully Trained,Y
22,23,Post 1,2024-01-19 11:59:00,anonymous,Unsure,7,7,6,7,6,...,,,,,Other,Senior,0,4,"No courses, >0 workshop",Y
23,24,Post 1,2024-01-21 12:07:00,anonymous,Yes,6,6,6,7,7,...,,,,,White,Senior,3,4,Fully Trained,Y
24,25,Post 1,2024-01-22 22:58:00,anonymous,No,7,7,2,5,2,...,,,,,White,Gen X,5,4,Fully Trained,Y
25,26,Post 1,2024-01-26 15:23:00,anonymous,No,6,6,7,7,6,...,,,,,White,Gen X,3,4,Fully Trained,Y
26,27,Post 1,2024-02-03 15:16:00,anonymous,Yes,7,6,5,7,7,...,,,,,White,Senior,3,3,Fully Trained,Y
27,28,Post 2,2025-02-14 12:09:00,anonymous,No,6,6,6,7,7,...,5.0,5.0,5.0,5.0,White,Gen X,3,4,Fully Trained,Y
28,29,Post 2,2025-02-14 13:09:00,anonymous,No,7,7,6,6,6,...,4.0,4.0,5.0,5.0,White,Gen X,3,3,Fully Trained,Y


## Averaging

In [22]:
# List of columns to average
empathy_climate_cols = [
    'CompassionateCareColleagues',
    'CompassionateCarePatients',
    'EmpathyMissionStatement',
    'EmpathyPriority',
    'SupportedMission'
]

# Create the ClimateOfEmpathy_Avg column
df_master['ClimateOfEmpathy_Avg'] = df_master[empathy_climate_cols].mean(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_master['ClimateOfEmpathy_Avg'] = df_master[empathy_climate_cols].mean(axis=1)


In [23]:
# List of columns to average for EmpathyLearning_Avg
empathy_learning_columns = [
'MatchEyeLevel',
'EyeContact',
'IDNeedExplanation',
'AskTellAskLearned',
'AskQuestions',
'ChallengeTechniques',
'EmpathicBridge',
'TeachBackMethod',
'NotRushed',
'HearComplaintConcern',
'TakeAMoment',
'EmotionalStateAwareness',
'VentFrustration',
'MinimizeTechnicalLanguage',
'BadNewsDelivery'
]

# Create the EmpathyLearning_Avg column
df_master['EmpathyLearning_Avg'] = df_master[empathy_learning_columns].mean(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_master['EmpathyLearning_Avg'] = df_master[empathy_learning_columns].mean(axis=1)


In [24]:
# List of columns to average for GeneralClimate_Avg
general_climate_columns = [
    'AdminInterference_NOT',
    'CuriosityAndOpenness',
    'FindsTimeNeeded',
    'LikeWorkingHere',
    'StressedAndPressed_NOT',
    'WalkthroughdoorsCaring'
]

# Create the GeneralClimate_Avg column
df_master['GeneralClimate_Avg'] = df_master[general_climate_columns].mean(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_master['GeneralClimate_Avg'] = df_master[general_climate_columns].mean(axis=1)


In [25]:
# Create the OverallClimate_Avg column by averaging GeneralClimate_Avg and ClimateOfEmpathy_Avg
df_master['OverallClimate_Avg'] = df_master[['GeneralClimate_Avg', 'ClimateOfEmpathy_Avg']].mean(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_master['OverallClimate_Avg'] = df_master[['GeneralClimate_Avg', 'ClimateOfEmpathy_Avg']].mean(axis=1)


In [24]:
#averages, median, std dev across all numerical cols

## Save to xlsx

In [26]:
average_row = df_master.mean(numeric_only=True)

# Create a new DataFrame with the average row
df_average = pd.DataFrame(average_row).T
df_average.index = ['Average']

# Append the original DataFrame to the new DataFrame
df_new = pd.concat([df_master, df_average], ignore_index=False)

# Save the new DataFrame as an Excel file
output_file = "MEC_Processed.xlsx"
df_new.to_excel(output_file, index=False)

print(f"DataFrame saved as {output_file}")


DataFrame saved as MEC_Processed.xlsx


# Statistical Testing

## Class Creation

In [27]:
class StatisticalAnalysis:
    def __init__(self, df):
        """
        Initialize the class with a DataFrame.
        """
        self.df = df

    def t_test_by_group(self, group_column, group1, group2):
        unique_groups = self.df[group_column].dropna().unique()
        if group1 not in unique_groups or group2 not in unique_groups:
            raise ValueError(f"Both '{group1}' and '{group2}' must be present in the column '{group_column}'.")

        print(f"Group 1: {group1} (first group in the t-test)")
        print(f"Group 2: {group2} (second group in the t-test)")
        print("Interpretation:")
        print(" - A positive t-statistic means the mean of the first group is greater than the mean of the second group.")
        print(" - A negative t-statistic means the mean of the first group is less than the mean of the second group.")
        print("\nT-Test Results:\n")

        numeric_columns = self.df.select_dtypes(include='number').columns
        group1_mask = self.df[group_column] == group1
        group2_mask = self.df[group_column] == group2

        t_test_results = {}
        for col in numeric_columns:
            group1_values = self.df.loc[group1_mask, col].dropna()
            group2_values = self.df.loc[group2_mask, col].dropna()
            t_stat, p_value = ttest_ind(group1_values, group2_values, equal_var=False)

            t_test_results[col] = {'t_stat': t_stat, 'p_value': p_value}

        t_test_results_df = pd.DataFrame(t_test_results).T.sort_values(by='p_value')
        t_test_results_df['Group1'] = group1
        t_test_results_df['Group2'] = group2

        return t_test_results_df

    def anova_by_group(self, group_column):
        unique_groups = self.df[group_column].dropna().unique()
        if len(unique_groups) < 2:
            raise ValueError(f"The column '{group_column}' must have at least two unique groups.")

        print(f"Groups: {', '.join(map(str, unique_groups))} (categories in the ANOVA test)")
        print("Interpretation:")
        print(" - The F-statistic measures the ratio of variation between group means to variation within groups.")
        print(" - A high F-statistic suggests larger differences between group means relative to within-group variability.")
        print("\nANOVA Results:\n")

        numeric_columns = self.df.select_dtypes(include='number').columns
        anova_results = {}
        for col in numeric_columns:
            group_means = {
                group: self.df.loc[self.df[group_column] == group, col].mean() for group in unique_groups
            }

            group_values = [self.df.loc[self.df[group_column] == group, col].dropna() for group in unique_groups]
            f_stat, p_value = f_oneway(*group_values)

            sorted_groups = sorted(group_means.items(), key=lambda x: x[1], reverse=True)
            highest_group, highest_mean = sorted_groups[0]

            anova_results[col] = {
                'f_stat': f_stat,
                'p_value': p_value,
                'highest_group': highest_group,
                'highest_mean': highest_mean,
                'group_means': group_means
            }

        anova_results_df = pd.DataFrame(anova_results).T.sort_values(by='p_value')
        anova_results_df['Groups'] = ', '.join(map(str, unique_groups))

        return anova_results_df

    def correlation_with_column(self, target_column):
        if target_column not in self.df.select_dtypes(include='number').columns:
            raise ValueError(f"Target column '{target_column}' is not numeric or does not exist in the DataFrame.")

        numeric_columns = [col for col in self.df.select_dtypes(include='number').columns if col != target_column]
        correlation_results = {}

        for col in numeric_columns:
            valid_data = self.df[[target_column, col]].dropna()

            if len(valid_data) < 2:
                correlation_results[col] = {
                    'correlation_coefficient': None,
                    'p_value': None,
                    'interpretation': "Not enough data to calculate correlation."
                }
                continue

            corr, p_value = pearsonr(valid_data[target_column], valid_data[col])

            correlation_results[col] = {
                'correlation_coefficient': corr,
                'p_value': p_value
            }
        correlation_results_df = pd.DataFrame(correlation_results).T.sort_values(by='correlation_coefficient', key=abs, ascending=False)
        return correlation_results_df


## Initialization/Testing

In [28]:
analysis = StatisticalAnalysis(df_master)

In [29]:
analysis.anova_by_group('Race_normalized')

Groups: White, Other, Asian (categories in the ANOVA test)
Interpretation:
 - The F-statistic measures the ratio of variation between group means to variation within groups.
 - A high F-statistic suggests larger differences between group means relative to within-group variability.

ANOVA Results:



  res = hypotest_fun_out(*samples, **kwds)
  if _f_oneway_is_too_small(samples):


Unnamed: 0,f_stat,p_value,highest_group,highest_mean,group_means,Groups
WorkshopCount,11.333333,0.00172,Other,4.0,"{'White': 3.4615384615384617, 'Other': 4.0, 'Asian': 1.0}","White, Other, Asian"
CourseCount,10.107317,0.002672,White,3.230769,"{'White': 3.230769230769231, 'Other': 0.0, 'Asian': 2.0}","White, Other, Asian"
GeneralClimate_Avg,2.295516,0.143168,Other,6.666667,"{'White': 5.423076923076922, 'Other': 6.666666666666667, 'Asian': 5.666666666666667}","White, Other, Asian"
OverallClimate_Avg,1.825182,0.203208,Other,6.633333,"{'White': 5.788461538461538, 'Other': 6.633333333333333, 'Asian': 5.833333333333334}","White, Other, Asian"
ID,1.378378,0.289162,Asian,33.0,"{'White': 26.846153846153847, 'Other': 23.0, 'Asian': 33.0}","White, Other, Asian"
LikeWorkingHere,1.222222,0.328763,Other,7.0,"{'White': 6.3076923076923075, 'Other': 7.0, 'Asian': 6.0}","White, Other, Asian"
StressedAndPressed_NOT,1.126904,0.356043,Other,6.0,"{'White': 3.769230769230769, 'Other': 6.0, 'Asian': 5.0}","White, Other, Asian"
EmpathyLearning_Avg,0.985928,0.401387,Other,3.933333,"{'White': 3.771428571428572, 'Other': 3.933333333333333, 'Asian': 3.933333333333333}","White, Other, Asian"
FindsTimeNeeded,0.933333,0.420006,Other,7.0,"{'White': 6.0, 'Other': 7.0, 'Asian': 6.0}","White, Other, Asian"
CompassionateCarePatients,0.933333,0.420006,Other,7.0,"{'White': 6.538461538461538, 'Other': 7.0, 'Asian': 6.0}","White, Other, Asian"


In [32]:
analysis.correlation_with_column('OverallClimate_Avg')

  corr, p_value = pearsonr(valid_data[target_column], valid_data[col])
  corr, p_value = pearsonr(valid_data[target_column], valid_data[col])
  corr, p_value = pearsonr(valid_data[target_column], valid_data[col])


Unnamed: 0,correlation_coefficient,p_value
AdminInterference_NOT,0.858333,4.2e-05
ClimateOfEmpathy_Avg,0.7340429,0.001836
UnderstandFellowTeammateWork,0.7331232,0.060833
GeneralClimate_Avg,0.7293212,0.002032
EmpathyPriority,0.7140774,0.002786
OfferHelpTool,0.6097583,0.146012
ChallengeTechniques,0.5820114,0.028993
StressedAndPressed_NOT,0.5272076,0.043436
AskingHelpIneffective_NOT,-0.4362498,0.327812
BadNewsDelivery,0.4189157,0.120136


In [29]:
def is_senior(age_group):
    senior_age_groups = ['Senior']
    return age_group in senior_age_groups

# Create a new column 'SeniorStatus'
df_master['SeniorStatus'] = df_master['Age_normalized'].apply(
    lambda x: 'Senior' if is_senior(x) else 'Non-Senior'
)

In [33]:
filtered_df = df_master[df_master['Gender'] != 'Prefer Not To Answer']
analysis = StatisticalAnalysis(filtered_df)
analysis.t_test_by_group('Gender', 'Female', 'Male')

Group 1: Female (first group in the t-test)
Group 2: Male (second group in the t-test)
Interpretation:
 - A positive t-statistic means the mean of the first group is greater than the mean of the second group.
 - A negative t-statistic means the mean of the first group is less than the mean of the second group.

T-Test Results:



  res = hypotest_fun_out(*samples, **kwds)


Unnamed: 0,t_stat,p_value,Group1,Group2
StressedAndPressed_NOT,-4.692519,0.001134,Female,Male
GeneralClimate_Avg,-2.90193,0.019188,Female,Male
ID,-1.786172,0.099461,Female,Male
SupportedMission,1.875974,0.104748,Female,Male
UnderstandFellowTeammateWork,-2.0,0.183503,Female,Male
VentFrustration,-1.343122,0.209507,Female,Male
CompassionateCarePatients,-1.305952,0.214366,Female,Male
WalkthroughdoorsCaring,-1.279204,0.22403,Female,Male
CourseCount,-1.195208,0.25348,Female,Male
FindsTimeNeeded,-1.180393,0.2617,Female,Male


In [None]:
analysis = StatisticalAnalysis(df_master)
analysis.t_test_by_group('SeniorStatus')

In [33]:
# Select only numeric columns
numeric_columns = df_master.select_dtypes(include='number').columns

# Group by 'Group' and calculate mean, median, and std for numeric columns
grouped_stats = df_master.groupby('Group')[numeric_columns].agg(['mean', 'median', 'std'])

# Save the results to an Excel file
output_file = "Group_Statistics.xlsx"
grouped_stats.to_excel(output_file)

print(f"Grouped statistics saved to {output_file}")

Grouped statistics saved to Group_Statistics.xlsx
