In [None]:
%load_ext autoreload
%autoreload 2
import os
from glob import glob
from matplotlib import pyplot as plt 
import numpy as np
import pandas as pd
from collections import Counter
from prettytable import PrettyTable
from prelim_analysis_helper import *
from datetime import datetime

# Preliminary Analysis
This notebook is the preliminary analysis portion of the COVID research project. The analysis is split into two parts:
1. Distributions of the demographics, PPE, testing, communication, exposure, and general satisfaction responses. All tables shown are conditional probability tables.
2. Chi2 and rank sum analysis

## Part 1: Distributions and Conditional Probability Tables
A subset of the imported data is shown below.

In [None]:
# Stamp the output with the date on which the notebook was run (MM/DD/YYYY).
date_format = "%m/%d/%Y"
today = datetime.now().strftime(date_format)
print("Date Generated: ", today)

# Read the preprocessed survey responses; every later cell works on this frame.
response_fp = "/path/to/preprocessed/no/FR/data"
response_df = pd.read_csv(filepath_or_buffer=response_fp)

In [None]:
# Collapse the Comms_vaccine answers into two groups. Any answer that is not a
# key of this mapping (including a missing answer) becomes NaN.
EARLY_OR_RECEIVED = "Earlier than general population/Already received"
SAME_TIME_OR_UNKNOWN = "Same time as general population/Don't know"
vaccine_group_by_answer = {
    "I have already received the vaccine": EARLY_OR_RECEIVED,
    "I will receive the vaccine by the school earlier than the general population": EARLY_OR_RECEIVED,
    "I will not receive the vaccine by the school": SAME_TIME_OR_UNKNOWN,
    "I will receive the vaccine by the school at the same time as the general population": SAME_TIME_OR_UNKNOWN,
    "I have not received information regarding receiving vaccine": SAME_TIME_OR_UNKNOWN,
}
vaccine_simplified = response_df["Comms_vaccine"].map(vaccine_group_by_answer)

# Report unmapped answers once, with their values and counts, instead of
# printing an anonymous "not found" line for every affected row.
unmapped_answers = response_df.loc[vaccine_simplified.isna(), "Comms_vaccine"]
if not unmapped_answers.empty:
    print("not found:", len(unmapped_answers), "Comms_vaccine response(s) could not be grouped")
    print(unmapped_answers.value_counts(dropna=False))

response_df["vaccine_simplified"] = vaccine_simplified

# NOTE(review): the column name says "thousands" but the scale factor is
# 1,000,000 - confirm the units of Endowment_per_capita before trusting the label.
response_df["Endowment_per_capita_thousands"] = response_df["Endowment_per_capita"] * 1000000

***
### Demographics


In [None]:
# Demographic columns, split by how they are summarised in this cell.
continuous_demographic_features = ["Age", "MedSchool_size", "Endowment_per_capita_thousands"]
categorical_demographic_features = [
    "Year",
    "Race",
    "Gender",
    "SES",
    "Patient_SES_Quartile",
    "Region",
    "vaccine_simplified",
]

# One histogram per continuous feature, side by side in a single row.
plt.figure(figsize=(8, 1.75), dpi=100)
for panel_number, feature_name in enumerate(continuous_demographic_features, start=1):
    plt.subplot(1, 3, panel_number)
    plt.hist(response_df[feature_name])
    plt.title(feature_name)

plt.tight_layout()
plt.show()

# Conditional tables for the categorical features together with general satisfaction.
demographic_features = categorical_demographic_features + ["General_satisfaction"]

create_conditional_matrix(response_df, demographic_features)

***
### PPE


In [None]:
# The two PPE questions treated as inputs, and the PPE satisfaction outcome.
independent_vars = ['PPE_accessibility', 'PPE_training']
dependent_var = 'PPE_satisfaction'
ppe_features = independent_vars + [dependent_var]

# Pie chart per PPE feature.
# NOTE(review): the positional arguments (1, 3, 140, (0.1, -1.3)) are passed
# straight to the helper; presumably grid rows/cols plus layout tweaks - confirm
# against prelim_analysis_helper.create_pie_chart.
plt.figure(figsize=(10, 3), dpi=100)
create_pie_chart(response_df, ppe_features, 1, 3, 140, (0.1, -1.3))
plt.tight_layout()

conditional_agree_disagree_table(response_df, ppe_features)

***
### Testing


In [None]:
# Survey columns covered by the Testing section.
testing_features = [
    'Testing_freq',
    'Testing_results_turnover',
    'M1_2_tested',
    'Testing_access_if_wanted',
    'Testing_symptoms_required',
    'Testing_satisfaction',
]

# Pie chart per testing feature (positional layout arguments are forwarded
# unchanged to the helper).
plt.figure(figsize=(10, 6), dpi=100)
create_pie_chart(response_df, testing_features, 3, 2, 160, (0.75, -1.11))
plt.tight_layout()

# The grouping helper runs after the pie charts and before the conditional matrix.
# NOTE(review): presumably it adds grouped columns to response_df that later
# cells rely on - confirm in prelim_analysis_helper.
group_unsure_and_no_responses_testing(response_df)
create_conditional_matrix(response_df, testing_features)

***
### Communication


In [None]:
# Survey columns covered by the Communication section.
comms_features = [
    'Comms_mental_health',
    'Comms_update_freq',
    'Comms_student_worker',
    'Comms_vaccine',
    'Comms_update_Information on testing protocols',
    'Comms_update_Number of COVID-19 cases in your school',
    'Comms_update_Supportive messages',
    'Comms_update_COVID-19 protocol changes',
    'Comms_update_Status on PPE inventory/availability',
    'Comms_satisfaction',
]
plt.figure(figsize=(8, 18))
create_pie_chart(response_df, comms_features, 5, 2, 145, (0, 0))
plt.tight_layout()

# Agree/disagree conditional table for mental-health communication vs. satisfaction.
conditional_agree_disagree_table(response_df, ['Comms_mental_health', 'Comms_satisfaction'])

# Add up the five Comms_update_* columns, left to right, into one per-row total.
# NOTE(review): assumes these columns are numeric 0/1 indicators - TODO confirm
# the dtype produced by read_csv.
comms_composition = [
    'Comms_update_Information on testing protocols',
    'Comms_update_Number of COVID-19 cases in your school',
    'Comms_update_Supportive messages',
    'Comms_update_COVID-19 protocol changes',
    'Comms_update_Status on PPE inventory/availability',
]
update_total = response_df[comms_composition[0]]
for update_column in comms_composition[1:]:
    update_total = update_total + response_df[update_column]
response_df["nCommunication_methods"] = update_total

# Flag rows whose total is strictly above the cutoff.
nCommunication_cutoff = 3
cutoff_column = "nCommunication_methods>" + str(nCommunication_cutoff)
response_df[cutoff_column] = response_df["nCommunication_methods"] > nCommunication_cutoff

# Slot both derived columns in just before the trailing 'Comms_satisfaction' entry.
comms_features[-1:-1] = ["nCommunication_methods", cutoff_column]

make_vaccine_groups(response_df)
count_important_messages_comms(response_df)
group_comms_update_freq(response_df)
create_conditional_matrix(response_df, comms_features)

***
### Exposure


In [None]:
# Survey columns covered by the Exposure section.
exposure_features = [
    'Exposure_COVID_positive_hx',
    'Exposure_COVID_pt_freq',
    'Exposure_quarantine_policy',
    'Exposure_discomfort_PPE_by_peers',
    'Exposure_sharing_comfort',
    'Exposure_satisfaction',
]

# Pie chart per exposure feature (positional layout arguments are forwarded
# unchanged to the helper).
plt.figure(figsize=(10, 7))
create_pie_chart(response_df, exposure_features, 3, 2, 180, (.1, -1))
plt.tight_layout()

# The grouping helper runs after the pie charts and before the conditional matrix.
# NOTE(review): presumably it adds grouped columns to response_df that later
# cells rely on - confirm in prelim_analysis_helper.
group_unsure_and_no_responses_exposure(response_df)
create_conditional_matrix(response_df, exposure_features)

***
### General Satisfaction


In [None]:
# Tally how many rows carry each General_satisfaction value and display the tally.
general_satisfaction_counts = Counter(response_df["General_satisfaction"])
general_satisfaction_counts

In [None]:
# The four section-level satisfaction columns plus overall satisfaction.
general_features = [
    'PPE_satisfaction',
    'Testing_satisfaction',
    'Comms_satisfaction',
    'Exposure_satisfaction',
    'General_satisfaction',
]

# Pie chart per satisfaction column (positional layout arguments are forwarded
# unchanged to the helper).
plt.figure(figsize=(10, 7))
create_pie_chart(response_df, general_features, 3, 2, 180, (.1, -1))
plt.tight_layout()

conditional_agree_disagree_table(response_df, general_features)

***
***
## Part 2: Chi2 and Rank Sum Analysis
***
### Chi2 Analysis
- Blue represents significant values < 0.01
- Red represents significant values < 0.05

In [None]:
# Feature lists for the chi-square tables, one list per survey section.
# NOTE(review): several names below (e.g. the *_grouped / *_group / *_invgroup
# columns) are not created in this notebook's own code; presumably the helper
# calls in the Part 1 cells add them to response_df - confirm before running
# this cell on its own.
demographic_chi = [
    'Year',
    'Race',
    'Gender',
    'SES',
    'Patient_SES_Quartile',
    'Region',
    "vaccine_simplified",
    "Comms_student_worker",
]
ppe_chi = ['PPE_accessibility', 'PPE_training']
testing_chi = [
    'Testing_freq',
    'Testing_results_turnover',
    'M1_2_tested',
    'Testing_access_if_wanted',
    'Testing_symptoms_required',
    'M1_2_tested_grouped',
    'testing_symptoms_required_group',
    'testing_access_if_wanted_group',
    'test_symptom_req_invgroup',
]
comms_chi = [
    'vacc_faster_than_gen_pop',
    'vaccinated',
    'vaccinated_faster_else',
    'Comms_update_freq',
    'Comms_student_worker',
    'Comms_vaccine',
    'Comms_mental_health',
    'Comms_update_Information on testing protocols',
    'Comms_update_Number of COVID-19 cases in your school',
    'Comms_update_Supportive messages',
    'Comms_update_COVID-19 protocol changes',
    'Comms_update_Status on PPE inventory/availability',
    # Thresholded count column built in the Communication cell.
    f"nCommunication_methods>{nCommunication_cutoff}",
    'Comms_nImportant_Messages',
    'Comms_update_freq_weekly',
    'Comms_update_freq_monthly',
    'Comms_update_freq_mle',
]
exposure_chi = [
    'Exposure_COVID_positive_hx',
    'Exposure_COVID_pt_freq',
    'Exposure_quarantine_policy',
    'Exposure_discomfort_PPE_by_peers',
    'Exposure_sharing_comfort',
    'exposure_quarantine_policy_grouped',
    'expos_quaran_policy_invgroup',
    'exposure_covid_pt_freq_bin',
]

# Section-level satisfaction columns, with overall satisfaction last.
satisfaction = [
    'PPE_satisfaction',
    'Testing_satisfaction',
    'Comms_satisfaction',
    'Exposure_satisfaction',
    'General_satisfaction',
]

# Each section is tested against its own satisfaction column and against
# overall satisfaction; demographics are tested against overall satisfaction only.
get_chi_square_table(response_df, "Demographics", demographic_chi, ['General_satisfaction'])
get_chi_square_table(response_df, "PPE", ppe_chi, ['PPE_satisfaction', 'General_satisfaction'])
get_chi_square_table(response_df, "Testing", testing_chi, ['Testing_satisfaction', 'General_satisfaction'])
get_chi_square_table(response_df, "Communication", comms_chi, ['Comms_satisfaction', 'General_satisfaction'])
get_chi_square_table(response_df, "Exposure", exposure_chi, ['Exposure_satisfaction', 'General_satisfaction'])
# The four section-level satisfaction columns against overall satisfaction.
get_chi_square_table(response_df, "Satisfaction", satisfaction[:-1], ['General_satisfaction'])

***
### Rank Sum

In [None]:
# Continuous features compared across the binary satisfaction groups.
demographic_ranksum = ['Age', 'MedSchool_size', 'Endowment_per_capita_thousands']
comms_rank_sum = ['nCommunication_methods']

# NOTE(review): this rebinds the `satisfaction` list from the chi-square cell to
# the *_binary column names, and nothing in this cell reads it afterwards.
# The *_binary columns are not created in this notebook's own code; presumably a
# helper adds them - confirm.
satisfaction = [
    'PPE_satisfaction_binary',
    'Testing_satisfaction_binary',
    'Comms_satisfaction_binary',
    'Exposure_satisfaction_binary',
    'General_satisfaction_binary',
]

# NOTE(review): the demographic chi-square table targets only overall
# satisfaction, whereas here demographics are also tested against
# PPE_satisfaction_binary - confirm this is intended.
get_ranksums(response_df, 'Demographic', demographic_ranksum,
             ['PPE_satisfaction_binary', 'General_satisfaction_binary'])
get_ranksums(response_df, 'Communication', comms_rank_sum,
             ['Comms_satisfaction_binary', 'General_satisfaction_binary'])

In [None]:
# Tidy the frame before export: renumber the rows and discard the leftover
# "Unnamed: 0" column if it is present. errors="ignore" together with rebinding
# (instead of inplace=True) keeps this cell safe to re-run; the original raised
# KeyError on a second run because the column had already been dropped.
response_df = (
    response_df
    .reset_index(drop=True)
    .drop(columns=["Unnamed: 0"], errors="ignore")
)

fp = "./../../Datasets/Survey_Data/preprocessed/after_prelim_analysis.csv"
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default, which reappears as "Unnamed: 0" when the file is read back. It is
# left unchanged here because downstream readers may expect that column; pass
# index=False once that has been confirmed.
response_df.to_csv(fp)
print("File saved to ", fp)