In [None]:
import pandas as pd
import ast


# Lower-case fragments that check_health_equity looks for inside each
# lower-cased keyword. Matching is by substring, so broad entries such as
# 'equity' or 'racial' also cover the longer phrases that contain them.
health_equity_substrings = [
    'health equity',
    'health inequity',
    'disparity',
    'disparities',
    'inequality',
    'inequalities',
    'social determinants of health',
    'access to health',
    'healthcare access',
    'racial health',
    'ethnic health',
    'economic impact on health',
    'healthcare barrier',
    'health justice',
    'community health',
    'health initiative',
    'health policy',
    'health education',
    'minority health',
    'cultural compet',
    'cultural sensitivity',
    'inclusive health',
    'health equality',
    'socioeconomic health',
    'underserved populations',
    'vulnerable groups',
    'health intervention',
    'health outcomes',
    'preventive health',
    'chronic health',
    'mental wellbeing',
    'physical wellbeing',
    'healthcare disparity',
    'equitable care',
    'healthcare equality',
    'racial',
    'ethnic',
    'underserved',
    'equity',
    'inequity',
    'discrimination',
    'social determinants',
]


def check_health_equity(keywords_list, substrings=None):
    """Tell whether a stringified keyword list mentions a health-equity term.

    Parameters
    ----------
    keywords_list : str
        Text holding a Python list literal, e.g. "['Asthma', 'Health equity']".
        Anything that cannot be parsed into a list (including NaN/None cells)
        yields False.
    substrings : iterable of str, optional
        Lower-case fragments to search for. Defaults to the module-level
        ``health_equity_substrings``.

    Returns
    -------
    bool
        True if any string element of the parsed list contains, case
        insensitively, at least one of the fragments; False otherwise.
    """
    if substrings is None:
        substrings = health_equity_substrings

    try:
        keywords = ast.literal_eval(keywords_list)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Exactly the failures ast.literal_eval documents for bad input;
        # a bare except would also hide KeyboardInterrupt and real bugs.
        return False

    if not isinstance(keywords, list):
        return False

    for keyword in keywords:
        # Non-string elements (numbers, None, nested lists) cannot match a
        # text fragment, so skip them instead of crashing on .lower().
        if not isinstance(keyword, str):
            continue
        lowered = keyword.lower()  # lower-case once per keyword
        if any(substring in lowered for substring in substrings):
            return True
    return False

def demographic_flag(value):
    """Return False when ``value`` contains the opt-out sentence, else True."""
    opt_out_sentence = "This study will not center on underrepresented populations"
    return opt_out_sentence not in value



# Input: the combined keyword file. A later cell in this notebook writes a
# file with this name, so that cell has to have been run beforehand.
file_path = 'final_data_combined_with_keywords.csv'

data = pd.read_csv(file_path)

# Flag rows whose stringified keyword list mentions a health-equity term.
# (The original computed this column twice; one pass is enough.)
data['Health_Equity_Flag'] = data['Keywords'].apply(check_health_equity)

# Flag rows whose 'Categories' text does not contain the opt-out sentence.
data['Demographic_Flag'] = data['Categories'].apply(demographic_flag)

data.to_csv('output_file.csv', index=False)


In [None]:
import pandas as pd
import os


# Per-condition keyword files, listed in the order they are concatenated.
# Every file name follows the pattern final_data_<topic>_with_keywords.csv.
file_paths = [
    f'final_data_{topic}_with_keywords.csv'
    for topic in (
        'asthma_pollution',
        'cardiovascular',
        'dementias_alzheimers',
        'diabetes',
        'mental_health',
    )
]

# Start from an empty frame; the loading loop below fills it.
combined_df = pd.DataFrame()


def simplify_category(name):
    """Derive a hyphenated topic label from a per-condition file name.

    For names shaped like ``final_data_<topic>_with_keywords.csv`` the label is
    every underscore-separated token of ``<topic>`` joined with '-', e.g.
    'final_data_asthma_pollution_with_keywords.csv' -> 'asthma-pollution' and
    'final_data_cardiovascular_with_keywords.csv' -> 'cardiovascular'.

    The previous fixed slice ``parts[2:4]`` always took two tokens, so
    single-word topics came out as 'cardiovascular-with' / 'diabetes-with'.

    Parameters
    ----------
    name : str
        File base name (no directory part).

    Returns
    -------
    str
        The topic label. Names without the '_with_keywords' marker fall back
        to the earlier behaviour (third and fourth tokens joined with '-').
    """
    marker = '_with_keywords'
    cut = name.rfind(marker)
    if cut == -1:
        # Unknown naming scheme: keep the historical two-token slice.
        return '-'.join(name.split('_')[2:4])
    # Drop the leading 'final' and 'data' tokens, keep the whole topic.
    return '-'.join(name[:cut].split('_')[2:])


# Read each per-condition file, tag its rows with the topic label derived from
# the file name, and collect the frames so they are concatenated only once
# (concatenating inside the loop re-copies the accumulated rows every time).
frames = []
for file_path in file_paths:

    temp_df = pd.read_csv(file_path)
    category = simplify_category(os.path.basename(file_path))
    temp_df['Category'] = category
    frames.append(temp_df)

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

output_path = 'final_data_combined_with_keywords.csv'
combined_df.to_csv(output_path, index=False)

# Bare name as the last expression so the notebook echoes the written path.
output_path


'final_data_combined_with_keywords.csv'

In [None]:
import pandas as pd


# Load the processed responses that carry the flag columns.
file_path = 'processed_responses.csv'
data = pd.read_csv(file_path)

# One contingency table per distinct value of 'Category', keyed by that value.
categories = data['Category'].unique()
tables_dict = {}

# Rows of each table are R2_flag levels, columns are Health_Equity_Flag levels.
# NOTE(review): the original kept commented-out references to
# 'Health_Equity_Flag', 'Demographic_Flag' and 'Is Multi-Institutional' here,
# presumably as alternative column variables; confirm before reusing.
for category in categories:
    category_data = data.loc[data['Category'] == category]
    table = pd.crosstab(category_data['R2_flag'], category_data['Health_Equity_Flag'])
    tables_dict[category] = table

# Print a header, the table, and a blank-line separator for every category.
for category, table in tables_dict.items():
    print(f"2x2 Table for {category}:", table, "\n", sep="\n")


2x2 Table for asthma-pollution:
Health_Equity_Flag  False  True 
R2_flag                         
False                  64     22
True                    4      2


2x2 Table for cardiovascular-with:
Health_Equity_Flag  False  True 
R2_flag                         
False                 237     91
True                   35     25


2x2 Table for dementias-alzheimers:
Health_Equity_Flag  False  True 
R2_flag                         
False                 116     27
True                   27      4


2x2 Table for diabetes-with:
Health_Equity_Flag  False  True 
R2_flag                         
False                 220     96
True                   51     40


2x2 Table for mental-health:
Health_Equity_Flag  False  True 
R2_flag                         
False                 102     94
True                   20     28




In [None]:

def calculate_odds_ratio(table):
    """Compute the odds ratio of a boolean-by-boolean contingency table.

    Parameters
    ----------
    table : pandas.DataFrame
        Counts indexed by True/False on both axes. Levels that are absent
        (a crosstab only emits observed levels) are treated as zero counts
        instead of raising KeyError.

    Returns
    -------
    float
        ``(TT * FF) / (TF * FT)`` where the first letter is the row label and
        the second the column label. When the denominator is zero the result
        is ``inf`` if the numerator is positive and ``nan`` if it is zero too.
    """
    # Force a complete 2x2 layout with rows and columns ordered True, False.
    full = table.reindex(index=[True, False], columns=[True, False], fill_value=0)
    (a, b), (c, d) = full.to_numpy().tolist()

    numerator = a * d
    denominator = b * c
    if denominator == 0:
        # Undefined (0/0) or unbounded ratio; say so explicitly rather than
        # relying on a numpy divide-by-zero warning.
        return float('nan') if numerator == 0 else float('inf')
    return numerator / denominator


# Odds ratio of every per-category table, keyed the same way as tables_dict.
odds_ratios = {
    category: calculate_odds_ratio(table)
    for category, table in tables_dict.items()
}

# Report one line per category.
for category, odds_ratio in odds_ratios.items():
    print(f"Odds Ratio for {category}: {odds_ratio}")


Odds Ratio for asthma-pollution: 5.5
Odds Ratio for cardiovascular-with: 6.845888936719759
Odds Ratio for dementias-alzheimers: 1.728813559322034
Odds Ratio for diabetes-with: 3.6059618071727995
Odds Ratio for mental-health: 3.6358024691358026


In [None]:
import pandas as pd
from collections import Counter


# Load the processed responses; this frame is passed to calculate_metrics below.
df = pd.read_csv('processed_responses.csv')


def calculate_metrics(df, top_n=15):
    """Summarise flag rates and keyword frequencies for each category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Category', 'Health_Equity_Flag', 'Keywords',
        'R1_flag' and 'R2_flag'. 'Keywords' holds stringified lists such as
        "['Asthma', 'COPD']"; missing values are ignored.
    top_n : int, default 15
        How many of the most frequent keywords to report per category. The
        result key is still called 'Top_Five_Keywords' for compatibility even
        though the default has always been 15.

    Returns
    -------
    dict
        Maps each category to a dict with the keys
        'Health_Equity_Flag_Percent', 'Top_Five_Keywords' (list of
        ``(keyword, count)`` pairs, most frequent first; counting is
        case-sensitive), 'R1_Percent', 'R2_Percent' and
        'Neither_R1_R2_Percent'. Percentages are on a 0-100 scale.
    """
    results = {}

    for category, group in df.groupby('Category'):
        health_equity_percent = group['Health_Equity_Flag'].value_counts(normalize=True).get(True, 0) * 100

        # regex=True is essential: recent pandas treats the pattern as a
        # literal string by default, which left brackets and quotes attached
        # to the keywords (e.g. "['Asthma'" and "'Asthma'" counted apart).
        keyword_lists = (
            group['Keywords']
            .dropna()
            .str.replace(r"[\[\]'\"]", '', regex=True)
            .str.split(', ')
            .dropna()  # non-string cells become NaN after the .str calls
        )
        # Flatten in linear time and drop the empty token produced by "[]".
        keyword_counts = Counter(
            keyword for keywords in keyword_lists for keyword in keywords if keyword
        )
        top_five_keywords = keyword_counts.most_common(top_n)

        r1_percent = group['R1_flag'].value_counts(normalize=True).get(True, 0) * 100
        r2_percent = group['R2_flag'].value_counts(normalize=True).get(True, 0) * 100
        # Share of rows where both flags are falsy, evaluated column-wise.
        either_flag = group['R1_flag'].astype(bool) | group['R2_flag'].astype(bool)
        neither_percent = (~either_flag).mean() * 100

        results[category] = {
            'Health_Equity_Flag_Percent': health_equity_percent,
            'Top_Five_Keywords': top_five_keywords,
            'R1_Percent': r1_percent,
            'R2_Percent': r2_percent,
            'Neither_R1_R2_Percent': neither_percent
        }

    return results

# Compute the per-category summary; the bare name on the last line makes the
# notebook display the resulting dict.
metrics = calculate_metrics(df)
metrics


{'asthma-pollution': {'Health_Equity_Flag_Percent': 26.08695652173913,
  'Top_Five_Keywords': [("['Asthma'", 21),
   ("'Asthma'", 18),
   ("'asthma'", 10),
   ("'Comorbidities'", 8),
   ("'Air pollution'", 8),
   ("'Allergic rhinitis'", 7),
   ("'COPD'", 7),
   ("'Logistic regression'", 6),
   ("'Risk factors'", 5),
   ("'Health disparities'", 4),
   ("'Genetic risk factors'", 4),
   ("'Asthma prevalence'", 3),
   ("'Genetic ancestry'", 3),
   ("'Atrial fibrillation'", 3),
   ("'Coronary heart disease'", 3)],
  'R1_Percent': 82.6086956521739,
  'R2_Percent': 6.521739130434782,
  'Neither_R1_R2_Percent': 10.869565217391305},
 'cardiovascular-with': {'Health_Equity_Flag_Percent': 29.896907216494846,
  'Top_Five_Keywords': [("'Cardiovascular disease'", 85),
   ("'Cardiovascular diseases'", 32),
   ("'Risk factors'", 32),
   ("['Cardiovascular disease'", 28),
   ("'Social determinants of health'", 25),
   ("'Hypertension'", 23),
   ("'Diabetes'", 19),
   ("['Cardiovascular diseases'", 18),