In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import textwrap


# Question 1:

### Part 5

In [None]:
# Load only the three columns the ICU analysis needs, each as a categorical.
part5_columns = ["age_group", "underlying_conditions_yn", "icu_yn"]
df = pd.read_csv(
    "../Datasets/DS-1/data.csv",
    usecols=part5_columns,
    dtype={column: "category" for column in part5_columns},
)


In [None]:
# Summarise the frame, then list the levels of every categorical column so
# placeholder labels can be spotted before cleaning.
df.info()
for column in ("age_group", "underlying_conditions_yn", "icu_yn"):
    print(f"Categories for '{column}':", df[column].cat.categories)


In [None]:
# Placeholder labels that mark an unusable answer in each column.
# A genuine NaN is not one of these labels, so NaN rows pass this filter
# (the same outcome as comparing with `!=`).
placeholder_labels = {
    "age_group": ["Unknown", "Missing", "NA"],
    "icu_yn": ["Missing", "Unknown"],
    "underlying_conditions_yn": ["Missing", "Unknown"],
}
keep_row = pd.Series(True, index=df.index)
for column, labels in placeholder_labels.items():
    keep_row &= ~df[column].isin(labels)

# .copy() detaches the filtered rows from the loaded frame so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[keep_row].copy()

# An absent underlying-conditions answer is treated as "No".
# fillna on a categorical only accepts an existing category, so register
# "No" first when the data happens not to contain it.
conditions = df["underlying_conditions_yn"]
if "No" not in conditions.cat.categories:
    conditions = conditions.cat.add_categories("No")
df["underlying_conditions_yn"] = conditions.fillna("No")

# Forget the levels that no longer occur so they do not show up on plot axes.
for column in placeholder_labels:
    df[column] = df[column].cat.remove_unused_categories()

df = df.reset_index(drop=True)

display(df.head())
# info() prints its own report and returns None, so it is not wrapped in display().
df.info()

In [None]:
# Two side-by-side panels (underlying conditions: Yes / No), each showing the
# percentage of cases per age group split by ICU admission.
# NOTE(review): with seaborn's default normalisation each bar is presumably a
# share of all rows in its panel, not of its age group - confirm this is the
# intended denominator.
fig, axes = plt.subplots(1, 2, figsize=(14, 8), sharey=True)
fig.suptitle('ICU Admissions by Age Group and Underlying Conditions', y=1.03)

for ax, condition in zip(axes, ('Yes', 'No')):
    sns.histplot(
        data=df[df['underlying_conditions_yn'] == condition],
        y='age_group',
        hue='icu_yn',
        multiple='dodge',
        shrink=0.8,
        stat='percent',
        element='bars',
        palette='Set2',
        ax=ax,
    )
    ax.set_title(f'Underlying Conditions: {condition}')

    # Write each bar's percentage just past its right-hand end.
    for p in ax.patches:
        percentage = '{:.1f}%'.format(p.get_width())
        ax.annotate(percentage,
                    (p.get_width(), p.get_y() + p.get_height() / 2.),
                    ha='center',
                    va='center',
                    xytext=(5, 0),
                    textcoords='offset points')

    ax.set_xlabel('Percentage (%)')
    ax.set_ylabel('Age Group')

    # Keep the legend seaborn built (its entries come from the actual hue
    # levels) and only retitle it. Passing a hard-coded labels list to
    # ax.legend() relabels the bars by draw order and can swap Yes and No.
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('ICU Admission')

# histplot returns the Axes it drew on; keep the names the cell used to define.
plot_yes, plot_no = axes

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Cross-tabulate every (age group, underlying condition) pair against ICU status.
row_factors = [df['age_group'], df['underlying_conditions_yn']]
contingency_table = pd.crosstab(index=row_factors, columns=df['icu_yn'])

print("Contingency Table:")
print(contingency_table)

# Chi-square test of independence on the table above.
test_outcome = chi2_contingency(contingency_table)
chi2, p, dof, expected = test_outcome

print(f"\nChi-square statistic: {chi2}")
print(f"p-value: {p}")
print(f"Degrees of freedom: {dof}")
print("\nExpected frequencies:")
print(expected)

# Compare the p-value with the 5% significance level and report the decision.
alpha = 0.05
verdict = (
    "\nReject the null hypothesis: There is a significant association between ICU admission and the presence of underlying conditions in different age groups."
    if p < alpha
    else "\nFail to reject the null hypothesis: There is no significant association between ICU admission and the presence of underlying conditions in different age groups."
)
print(verdict)


### Part 4

In [None]:
# Load the four columns used for the per-state timeline, each as a categorical.
# The dtype map is derived from the column list so it cannot name columns
# that are not actually loaded.
part4_columns = ["hosp_yn", "death_yn", "res_state", "case_month"]
df = pd.read_csv("../Datasets/DS-1/data.csv",
                 usecols=part4_columns,
                 dtype={column: "category" for column in part4_columns})

# info() prints its own report and returns None, so it is not wrapped in display().
df.info()
# 534  (NOTE(review): bare annotation carried over from the original cell; its meaning is not evident here)

In [None]:
# List the levels of each categorical column before cleaning.
for column in ("hosp_yn", "death_yn", "res_state", "case_month"):
    print(f"Categories for '{column}':", df[column].cat.categories)


In [None]:
# Keep only rows where none of the analysis columns holds a placeholder label.
part4_columns = ["hosp_yn", "death_yn", "res_state", "case_month"]
keep_row = pd.Series(True, index=df.index)
for column in part4_columns:
    keep_row &= ~df[column].isin(["Unknown", "Missing"])

# .copy() detaches the filtered rows from the loaded frame so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[keep_row].copy()

# Drop rows with a genuine NaN in any column, then renumber the rows.
df = df.dropna().reset_index(drop=True)

# Forget the levels that no longer occur.
for column in part4_columns:
    df[column] = df[column].cat.remove_unused_categories()

# info() prints its own report and returns None, so it is not wrapped in display().
df.info()

In [None]:
# Work on plain strings so the month labels can be normalised before parsing.
df['case_month'] = df['case_month'].astype(str)

# Show the raw labels to spot anything that is not in YYYY-MM form.
unique_case_months = df['case_month'].unique()
print(unique_case_months)

# Strip a trailing "-01" day component, but only from full YYYY-MM-DD labels.
# A bare '-01$' pattern would also cut the month off every January label
# ("2021-01" -> "2021"), which then fails to parse and silently becomes NaT.
df['case_month'] = df['case_month'].str.replace(r'^(\d{4}-\d{2})-01$', r'\1', regex=True)

# Parse to datetime; anything that still does not match YYYY-MM becomes NaT.
df['case_month'] = pd.to_datetime(df['case_month'], format='%Y-%m', errors='coerce')

# errors='coerce' hides parse failures, so report how many rows were affected.
unparsed_rows = int(df['case_month'].isna().sum())
if unparsed_rows:
    print(f"{unparsed_rows:,} rows have a case_month that could not be parsed (now NaT)")

# info() prints its own report and returns None, so it is not wrapped in print().
df.info()

In [None]:
# Set a nicer theme
sns.set_theme(style="whitegrid")

# One panel per state present in the cleaned data
states = df['res_state'].unique()

# Grid geometry: a fixed number of panels per row, as many rows as needed
states_per_row = 3
num_states = len(states)
num_rows = (num_states + states_per_row - 1) // states_per_row

# squeeze=False always yields a 2-D array of Axes, so flatten() is valid
# whatever the grid shape turns out to be.
fig, axes = plt.subplots(num_rows, states_per_row, figsize=(18, 6 * num_rows),
                         constrained_layout=True, squeeze=False)
axes = axes.flatten()

for i, state in enumerate(states):
    ax = axes[i]
    state_df = df[df["res_state"] == state]

    # Density over time of the cases that were hospitalised / ended in death.
    sns.kdeplot(data=state_df[state_df["hosp_yn"] == "Yes"], x="case_month", ax=ax, color='#e99675', linestyle='-', linewidth=2, label='Hospitalizations')
    sns.kdeplot(data=state_df[state_df["death_yn"] == "Yes"], x="case_month", ax=ax, color='#72b6a1', linestyle='--', linewidth=2, label='Deaths')

    ax.set_xlabel('Case Month')
    # Non-null case_month rows for this state, taken from the subset already built.
    ax.set_ylabel(f'Density (Total Number = {state_df["case_month"].count():,})')
    ax.set_title(f'Hospitalizations and Deaths Over Time in {state}', fontsize=14)
    # One tick per year, labelled with the year only.
    ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%Y'))
    ax.xaxis.set_major_locator(plt.matplotlib.dates.YearLocator())
    # Turn gridlines off for this panel.
    ax.grid(False)

    # Rebuild the legend from the labelled curves, with a title and font sizes.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=labels, title='Legend', fontsize=10, title_fontsize='13')

# Remove the unused panels at the end of the grid. Counting from num_states
# avoids relying on the loop index surviving the loop above.
for j in range(num_states, len(axes)):
    fig.delaxes(axes[j])

plt.show()

### Part 3

In [None]:
# Load the four columns used for the per-age-group timeline, each as a
# categorical. The dtype map is derived from the column list, which removes
# the duplicated "age_group" key and the entries for columns never loaded.
part3_columns = ["hosp_yn", "death_yn", "age_group", "case_month"]
df = pd.read_csv("../Datasets/DS-1/data.csv",
                 usecols=part3_columns,
                 dtype={column: "category" for column in part3_columns})

# info() prints its own report and returns None, so it is not wrapped in display().
df.info()

In [None]:
# List the levels of each categorical column before cleaning.
for column in ("hosp_yn", "death_yn", "age_group", "case_month"):
    print(f"Categories for '{column}':", df[column].cat.categories)

In [None]:
# Keep only rows where none of the analysis columns holds a placeholder label.
part3_columns = ["hosp_yn", "death_yn", "age_group", "case_month"]
keep_row = pd.Series(True, index=df.index)
for column in part3_columns:
    keep_row &= ~df[column].isin(["Unknown", "Missing"])

# .copy() detaches the filtered rows from the loaded frame so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[keep_row].copy()

# Drop rows with a genuine NaN in any column, then renumber the rows.
df = df.dropna().reset_index(drop=True)

# Forget the levels that no longer occur.
for column in part3_columns:
    df[column] = df[column].cat.remove_unused_categories()

# info() prints its own report and returns None, so it is not wrapped in display().
df.info()

In [None]:
# Work on plain strings so the month labels can be normalised before parsing.
df['case_month'] = df['case_month'].astype(str)

# Show the raw labels to spot anything that is not in YYYY-MM form.
unique_case_months = df['case_month'].unique()
print(unique_case_months)

# Strip a trailing "-01" day component, but only from full YYYY-MM-DD labels.
# A bare '-01$' pattern would also cut the month off every January label
# ("2021-01" -> "2021"), which then fails to parse and silently becomes NaT.
df['case_month'] = df['case_month'].str.replace(r'^(\d{4}-\d{2})-01$', r'\1', regex=True)

# Parse to datetime; anything that still does not match YYYY-MM becomes NaT.
df['case_month'] = pd.to_datetime(df['case_month'], format='%Y-%m', errors='coerce')

# errors='coerce' hides parse failures, so report how many rows were affected.
unparsed_rows = int(df['case_month'].isna().sum())
if unparsed_rows:
    print(f"{unparsed_rows:,} rows have a case_month that could not be parsed (now NaT)")

# info() prints its own report and returns None, so it is not wrapped in print().
df.info()

In [None]:
# Set a nicer theme
sns.set_theme(style="whitegrid")

# One panel per age group present in the cleaned data
age_groups = df['age_group'].unique()

# Grid geometry: a fixed number of panels per row, as many rows as needed
age_groups_per_row = 2
num_age_groups = len(age_groups)
num_rows = (num_age_groups + age_groups_per_row - 1) // age_groups_per_row

# squeeze=False always yields a 2-D array of Axes, so flatten() is valid
# whatever the grid shape turns out to be.
fig, axes = plt.subplots(num_rows, age_groups_per_row, figsize=(18, 6 * num_rows),
                         constrained_layout=True, squeeze=False)
axes = axes.flatten()

for i, age_group in enumerate(age_groups):
    ax = axes[i]
    age_df = df[df["age_group"] == age_group]

    # Density over time of the cases that were hospitalised / ended in death.
    sns.kdeplot(data=age_df[age_df["hosp_yn"] == "Yes"], x="case_month", ax=ax, color='#72b6a1', linestyle='-', linewidth=2, label='Hospitalizations')
    sns.kdeplot(data=age_df[age_df["death_yn"] == "Yes"], x="case_month", ax=ax, color='#e99675', linestyle='--', linewidth=2, label='Deaths')

    ax.set_xlabel('Case Month')
    # Non-null case_month rows for this age group, taken from the subset already built.
    ax.set_ylabel(f'Density (Total Number = {age_df["case_month"].count():,})')
    ax.set_title(f'Hospitalizations and Deaths Over Time in Range {age_group} old', fontsize=14)
    # One tick per year, labelled with the year only.
    ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%Y'))
    ax.xaxis.set_major_locator(plt.matplotlib.dates.YearLocator())
    # Turn gridlines off for this panel.
    ax.grid(False)

    # Rebuild the legend from the labelled curves, with a title and font sizes.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=labels, title='Legend', fontsize=10, title_fontsize='13')

# Remove the unused panels at the end of the grid. Counting from
# num_age_groups avoids relying on the loop index surviving the loop above.
for j in range(num_age_groups, len(axes)):
    fig.delaxes(axes[j])

plt.show()

In [None]:
# NOTE(review): this whole cell is commented out and repeats the age-group KDE
# grid from the preceding cell; it executes nothing. Presumably kept as a
# scratch copy - confirm and delete if it is no longer needed.
# df['case_month'] = pd.to_datetime(df['case_month'], format='%Y-%m')

# # List of age_groups for the example
# age_groups = df['age_group'].unique()

# # Number of age_groups per row
# age_groups_per_row = 2
# num_age_groups = len(age_groups)
# num_rows = (num_age_groups + age_groups_per_row - 1) // age_groups_per_row

# # Create subplots
# fig, axes = plt.subplots(num_rows, age_groups_per_row, figsize=(18, 6 * num_rows), constrained_layout=True)
# axes = axes.flatten()

# for i, age_group in enumerate(age_groups):
#     age_df = df[df["age_group"] == age_group]
    
#     sns.kdeplot(data=age_df[age_df["hosp_yn"] == "Yes"], x="case_month", ax=axes[i], color='#72b6a1', linestyle='-', linewidth=2, label='Hospitalizations')
#     sns.kdeplot(data=age_df[age_df["death_yn"] == "Yes"], x="case_month", ax=axes[i], color='#e99675', linestyle='--', linewidth=2, label='Deaths')

#     axes[i].set_xlabel('Case Month')
#     axes[i].set_ylabel(f'Density (Total Number = {df[df["age_group"] == age_group].count()["case_month"]:,})')
#     axes[i].set_title(f'Hospitalizations and Deaths Over Time in Range {age_group} old', fontsize=14)
#     axes[i].xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%Y'))
#     axes[i].xaxis.set_major_locator(plt.matplotlib.dates.YearLocator())
#     # Add gridlines for better readability
#     axes[i].grid(False)
    
#     # Improve legend
#     handles, labels = axes[i].get_legend_handles_labels()
#     axes[i].legend(handles=handles, labels=labels, title='Legend', fontsize=10, title_fontsize='13')

# # Remove any empty subplots
# for j in range(i + 1, len(axes)):
#     fig.delaxes(axes[j])

# plt.show()

### Part 2

In [None]:
# Load the demographic columns plus the death flag and case month, all as categoricals.
part2_columns = ["age_group", "death_yn", "sex", "race", "case_month"]
df = pd.read_csv(
    "../Datasets/DS-1/data.csv",
    usecols=part2_columns,
    dtype=dict.fromkeys(part2_columns, "category"),
)


In [None]:
# List the levels of each categorical column before cleaning.
for column in ("sex", "race", "age_group", "death_yn", "case_month"):
    print(f"Categories for '{column}':", df[column].cat.categories)

In [None]:
# Labels to discard per column. "Other" is additionally dropped for sex.
# A genuine NaN is not one of these labels, so NaN rows pass this filter.
placeholder_labels = {
    "death_yn": ["Unknown", "Missing"],
    "sex": ["Unknown", "Missing", "Other"],
    "race": ["Unknown", "Missing"],
    "age_group": ["Unknown", "Missing"],
    "case_month": ["Unknown", "Missing"],
}
keep_row = pd.Series(True, index=df.index)
for column, labels in placeholder_labels.items():
    keep_row &= ~df[column].isin(labels)

# .copy() detaches the filtered rows from the loaded frame so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[keep_row].copy()

# Forget the levels that no longer occur so they do not show up on plot axes.
for column in ("sex", "race", "age_group", "death_yn", "case_month"):
    df[column] = df[column].cat.remove_unused_categories()

# info() prints its own report and returns None, so it is not wrapped in display().
df.info()

In [None]:
# Three side-by-side panels: percentage of cases by age group, gender and
# race, each split by death outcome.
fig, axes = plt.subplots(1, 3, figsize=(24, 6))

# (column, panel title, y-axis label) for each panel, left to right
panels = [
    ("age_group", 'Deaths by Age Group', 'Age Group'),
    ("sex", 'Deaths by Gender', 'Gender'),
    ("race", 'Deaths by Race', 'Race'),
]

for ax, (column, title, ylabel) in zip(axes, panels):
    sns.histplot(data=df, y=column, hue="death_yn", multiple="dodge", shrink=.8,
                 stat="percent", palette="Set2", ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Percentage')
    ax.set_ylabel(ylabel)

    # Keep the legend seaborn built (its entries come from the actual hue
    # levels) and only retitle it. Passing a hard-coded labels list to
    # ax.legend() relabels the bars by draw order and can swap Yes and No.
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('Death')

    # Print each bar's percentage at its end.
    for container in ax.containers:
        labels = [f'{(v):.1f}%' for v in container.datavalues]
        ax.bar_label(container, labels=labels, label_type='edge')

# histplot returns the Axes it drew on; keep the names the cell used to define.
age_group_plot, gender_plot, race_plot = axes

# Race names are long: wrap them at 15 characters. Pinning the tick positions
# first avoids matplotlib's warning about set_ticklabels on non-fixed ticks.
race_plot.set_yticks(race_plot.get_yticks())
labels = [textwrap.fill(label.get_text(), 15) for label in race_plot.get_yticklabels()]
race_plot.set_yticklabels(labels)

plt.tight_layout()
plt.show()


In [None]:
# Convert 'case_month' to datetime format.
# Work on plain strings so the month labels can be normalised before parsing.
df['case_month'] = df['case_month'].astype(str)

# Show the raw labels to spot anything that is not in YYYY-MM form.
unique_case_months = df['case_month'].unique()
print(unique_case_months)

# Strip a trailing "-01" day component, but only from full YYYY-MM-DD labels.
# A bare '-01$' pattern would also cut the month off every January label
# ("2021-01" -> "2021"), which then fails to parse and silently becomes NaT.
df['case_month'] = df['case_month'].str.replace(r'^(\d{4}-\d{2})-01$', r'\1', regex=True)

# Parse to datetime; anything that still does not match YYYY-MM becomes NaT.
df['case_month'] = pd.to_datetime(df['case_month'], format='%Y-%m', errors='coerce')

# errors='coerce' hides parse failures, so report how many rows were affected.
unparsed_rows = int(df['case_month'].isna().sum())
if unparsed_rows:
    print(f"{unparsed_rows:,} rows have a case_month that could not be parsed (now NaT)")

# info() prints its own report and returns None, so it is not wrapped in print().
df.info()


In [None]:

def _smoothed_death_rates(frame, by, window=3):
    """Return the monthly death rate per level of `by`, smoothed over time.

    Parameters
    ----------
    frame : DataFrame with 'case_month', 'death_yn' and the `by` column.
    by : name of the grouping column (e.g. 'sex', 'race', 'age_group').
    window : number of consecutive monthly rows averaged by the rolling mean.

    Returns
    -------
    DataFrame with columns ['case_month', by, 'death_rate'].
    """
    # Share of rows with death_yn == 'Yes' per (month, level).
    # observed=True keeps only combinations that occur in the data and does
    # not depend on the deprecated default for categorical groupers.
    monthly = (
        frame.groupby(['case_month', by], observed=True)['death_yn']
        .apply(lambda outcomes: (outcomes == 'Yes').mean())
        .reset_index()
    )
    monthly.columns = ['case_month', by, 'death_rate']

    # Rolling mean within each level; min_periods=1 keeps the first rows.
    monthly['death_rate'] = (
        monthly.groupby(by, observed=True)['death_rate']
        .transform(lambda rates: rates.rolling(window=window, min_periods=1).mean())
    )
    return monthly


# Smoothed death rates by month for each demographic breakdown
death_rates_by_month_sex = _smoothed_death_rates(df, 'sex')
death_rates_by_month_race = _smoothed_death_rates(df, 'race')
death_rates_by_month_age = _smoothed_death_rates(df, 'age_group')

# Create side-by-side plots
fig, axes = plt.subplots(1, 3, figsize=(24, 6), sharey=True)

# (rates table, hue column, label used in the title and legend) per panel
panels = [
    (death_rates_by_month_sex, 'sex', 'Sex'),
    (death_rates_by_month_race, 'race', 'Race'),
    (death_rates_by_month_age, 'age_group', 'Age Group'),
]
for ax, (rates, column, label) in zip(axes, panels):
    sns.lineplot(data=rates, x='case_month', y='death_rate', hue=column, palette='Set2', ax=ax)
    ax.set_title(f'COVID-19 Death Rates by {label} Over Time (Smoothed)')
    ax.set_xlabel('Month-Year')
    ax.legend(title=label)

# Only the left-most panel gets an explicit y-axis label.
axes[0].set_ylabel('Death Rate')

plt.tight_layout()
plt.show()

### Part 6

In [None]:
from pprint import pprint

# Load the employment-sector and expected-job-loss columns as categoricals.
part6_columns = ["kindwork", "expctloss"]
df = pd.read_csv(
    "../Datasets/DS-2/data2.csv",
    usecols=part6_columns,
    dtype=dict.fromkeys(part6_columns, "category"),
)


In [None]:
# info() prints its own report and returns None; wrapping it in print()
# would add a stray "None" line to the output.
df.info()
print(df.shape)
for column in ("kindwork", "expctloss"):
    print(f"Categories for '{column}':", df[column].cat.categories)

In [None]:
# Readable names for the survey codes
mapping_dict_kindwork = {
    '1': 'Government',
    '2': 'Private company',
    '3': 'Non-profit organization including tax exempt and charitable organizations',
    '4': 'Self-employed',
    '5': 'Working in a family business',
}
mapping_dict_WorkLoss = {
    '1': 'Yes',
    '2': 'No',
}

# Drop rows where either answer is one of the codes "-99" / "-88".
# .copy() detaches the result from the loaded frame so the column assignments
# below do not raise SettingWithCopyWarning.
keep_row = pd.Series(True, index=df.index)
for column in ('kindwork', 'expctloss'):
    keep_row &= ~df[column].isin(["-99", "-88"])
df = df[keep_row].copy()

# Forget the discarded codes, then renumber the rows
df['expctloss'] = df['expctloss'].cat.remove_unused_categories()
df['kindwork'] = df['kindwork'].cat.remove_unused_categories()
df = df.reset_index(drop=True)

# Replace the remaining codes with their names
df['kindwork'] = df['kindwork'].cat.rename_categories(mapping_dict_kindwork)
df['expctloss'] = df['expctloss'].cat.rename_categories(mapping_dict_WorkLoss)

# Check the updated categories
pprint(df['kindwork'].cat.categories.to_list())
pprint(df['expctloss'].cat.categories.to_list())

In [None]:
# Horizontal grouped bars: percentage of respondents per employment sector,
# split by whether they expect to lose employment income.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, y='kindwork', hue='expctloss', multiple='dodge', shrink=.8, stat='percent', palette='Set2')
ax = plt.gca()

# Write each bar's percentage just past its right-hand end.
for bar in ax.patches:
    bar_length = bar.get_width()
    bar_centre = bar.get_y() + bar.get_height() / 2.
    ax.annotate(f'{bar_length:.1f}%',
                (bar_length, bar_centre),
                ha='center', va='center',
                xytext=(9, 0), textcoords='offset points')

plt.title('Rate of Expected Employment Loss Due to COVID-19 by Sector of Employment')
plt.xlabel('Employment Loss Rate (%)')
plt.ylabel('Sector of Employment')

# Wrap the long sector names at 15 characters so they fit beside the axis.
labels = [tick.get_text() for tick in ax.get_yticklabels()]
wrapped_labels = [textwrap.fill(label, 15) for label in labels]
ax.set_yticklabels(wrapped_labels)

plt.tight_layout()
plt.show()


In [None]:
def _plot_expected_loss_by(frame, column, label):
    """Draw horizontal grouped bars of expected employment loss by `column`.

    Parameters
    ----------
    frame : DataFrame with `column` and 'expctloss'.
    column : categorical column placed on the y axis.
    label : human-readable name of `column`, used in the title and y label.
    """
    plt.figure(figsize=(10, 6))
    ax = sns.histplot(data=frame, y=column, hue='expctloss', multiple='dodge',
                      shrink=.8, stat='percent', palette='Set2')

    # Write each bar's percentage just past its right-hand end.
    for p in ax.patches:
        ax.annotate(f'{p.get_width():.1f}%',
                    (p.get_width(), p.get_y() + p.get_height() / 2.),
                    ha='center', va='center',
                    xytext=(9, 0), textcoords='offset points')

    plt.title(f'Rate of Expected Employment Loss Due to COVID-19 by {label}')
    plt.xlabel('Employment Loss Rate (%)')
    plt.ylabel(label)

    # Wrap long category names at 15 characters. Pinning the tick positions
    # first avoids matplotlib's warning about set_ticklabels on non-fixed ticks.
    ax.set_yticks(ax.get_yticks())
    wrapped_labels = [textwrap.fill(item.get_text(), 15) for item in ax.get_yticklabels()]
    ax.set_yticklabels(wrapped_labels)

    plt.tight_layout()
    plt.show()


# Load the dataset
df = pd.read_csv("../Datasets/DS-2/data2.csv",
                 usecols=["egender", "rrace", "expctloss"],
                 dtype={"egender": "category", "rrace": "category", "expctloss": "category"})

# Readable names for the survey codes
mapping_dict_gender = {
    '1': 'Male',
    '2': 'Female'
}
mapping_dict_race = {
    '1': 'White, Alone',
    '2': 'Black, Alone',
    '3': 'Asian, Alone',
    '4': 'Any other race alone, or race in combination'
}
mapping_dict_WorkLoss = {
    '1': 'Yes',
    '2': 'No',
}

# Drop rows where any answer is one of the codes "-99" / "-88".
# .copy() detaches the result from the loaded frame so the column assignments
# below do not raise SettingWithCopyWarning.
keep_row = pd.Series(True, index=df.index)
for column in ('egender', 'rrace', 'expctloss'):
    keep_row &= ~df[column].isin(["-99", "-88"])
df = df[keep_row].copy()

# Forget the discarded codes, then replace the remaining codes with names
code_names = {
    'egender': mapping_dict_gender,
    'rrace': mapping_dict_race,
    'expctloss': mapping_dict_WorkLoss,
}
for column, names in code_names.items():
    df[column] = df[column].cat.remove_unused_categories()
    df[column] = df[column].cat.rename_categories(names)

# One chart per demographic column
_plot_expected_loss_by(df, 'egender', 'Gender')
_plot_expected_loss_by(df, 'rrace', 'Race')


In [None]:
# Load the dataset
df = pd.read_csv("../Datasets/DS-2/data2.csv",
                 usecols=["tbirth_year", "expctloss"],
                 dtype={"tbirth_year": "category", "expctloss": "category"})

# Readable names for the work-loss codes
mapping_dict_WorkLoss = {
    '1': 'Yes',
    '2': 'No',
}

# Drop rows where either answer is one of the codes "-99" / "-88".
# .copy() detaches the result from the loaded frame so the column assignments
# below do not raise SettingWithCopyWarning.
keep_row = pd.Series(True, index=df.index)
for column in ('tbirth_year', 'expctloss'):
    keep_row &= ~df[column].isin(["-99", "-88"])
df = df[keep_row].copy()

# Forget the discarded codes
df['tbirth_year'] = df['tbirth_year'].cat.remove_unused_categories()
df['expctloss'] = df['expctloss'].cat.remove_unused_categories()

# Birth year becomes an integer so it is binned on a numeric axis.
# NOTE(review): astype(int) presumably fails if any birth year is NaN - confirm the column has none.
df['tbirth_year'] = df['tbirth_year'].astype(int)

# Replace the work-loss codes with their names
df['expctloss'] = df['expctloss'].cat.rename_categories(mapping_dict_WorkLoss)

# Histogram of birth year on the x axis (percent of respondents), split by
# expected employment loss, with a KDE curve overlaid.
plt.figure(figsize=(12, 8))
sns.histplot(data=df, x='tbirth_year', hue='expctloss', multiple='dodge', shrink=.8, stat='percent', palette='Set2', element="step", fill=True, kde=True)

plt.title('Rate of Expected Employment Loss Due to COVID-19 by Year of Birth')
plt.ylabel('Employment Loss Rate (%)')
plt.xlabel('Year of Birth')


plt.tight_layout()
plt.show()


In [None]:
# Survey data (DS-2): state code, expected job loss, income, birth year and
# the two medical-care questions, all read as categoricals.
survey_columns = ["est_st", "expctloss", "income", "tbirth_year", "delay", "notget"]
df = pd.read_csv(
    "../Datasets/DS-2/data2.csv",
    usecols=survey_columns,
    dtype=dict.fromkeys(survey_columns, "category"),
)

# Case data (DS-1): only the state and the hospitalisation flag are needed.
hospitalization_columns = ["res_state", "hosp_yn"]
df_hospitalization = pd.read_csv(
    "../Datasets/DS-1/data.csv",
    usecols=hospitalization_columns,
    dtype=dict.fromkeys(hospitalization_columns, "category"),
)


In [None]:
# Rows flagged as hospitalised (case-insensitive match on "yes") that also
# carry a state.
is_hospitalized = df_hospitalization['hosp_yn'].str.lower() == 'yes'
hospitalized_cases = df_hospitalization[is_hospitalized].dropna(subset=['res_state'])

# Hospitalised cases per state, largest first
state_hospitalization_counts = (
    hospitalized_cases['res_state']
    .value_counts()
    .sort_values(ascending=False)
)

# Show the ten largest counts, then keep just their state codes as a list
top_10_states = state_hospitalization_counts.head(10)
display(top_10_states)
top_10_states = top_10_states.index.tolist()

In [None]:
# Postal abbreviation -> two-digit state code (as a zero-padded string)
state_map = {
    'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', 'CO': '08', 'CT': '09', 'DE': '10',
    'DC': '11', 'FL': '12', 'GA': '13', 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19',
    'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24', 'MA': '25', 'MI': '26', 'MN': '27',
    'MS': '28', 'MO': '29', 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', 'NM': '35',
    'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39', 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44',
    'SC': '45', 'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', 'VA': '51', 'WA': '53',
    'WV': '54', 'WI': '55', 'WY': '56', 'PR': '72', 'GU': '66', 'VI': '78', 'MP': '69', 'AS': '60'
}
# Two-digit state code -> full name. The last row adds the territories so
# every code that state_map can produce has a name here too.
state_mapping_inv = {
    '01': 'Alabama', '02': 'Alaska', '04': 'Arizona', '05': 'Arkansas', '06': 'California', '08': 'Colorado',
    '09': 'Connecticut', '10': 'Delaware', '11': 'District of Columbia', '12': 'Florida', '13': 'Georgia', 
    '15': 'Hawaii', '16': 'Idaho', '17': 'Illinois', '18': 'Indiana', '19': 'Iowa', '20': 'Kansas', '21': 'Kentucky',
    '22': 'Louisiana', '23': 'Maine', '24': 'Maryland', '25': 'Massachusetts', '26': 'Michigan', '27': 'Minnesota',
    '28': 'Mississippi', '29': 'Missouri', '30': 'Montana', '31': 'Nebraska', '32': 'Nevada', '33': 'New Hampshire',
    '34': 'New Jersey', '35': 'New Mexico', '36': 'New York', '37': 'North Carolina', '38': 'North Dakota', 
    '39': 'Ohio', '40': 'Oklahoma', '41': 'Oregon', '42': 'Pennsylvania', '44': 'Rhode Island', '45': 'South Carolina',
    '46': 'South Dakota', '47': 'Tennessee', '48': 'Texas', '49': 'Utah', '50': 'Vermont', '51': 'Virginia', 
    '53': 'Washington', '54': 'West Virginia', '55': 'Wisconsin', '56': 'Wyoming',
    '60': 'American Samoa', '66': 'Guam', '69': 'Northern Mariana Islands', '72': 'Puerto Rico', '78': 'U.S. Virgin Islands'
}


pprint(top_10_states)

# Translate the top states to their codes. A value missing from state_map is
# reported and skipped rather than aborting the cell with a KeyError.
top_10_full = []
for state in top_10_states:
    if state in state_map:
        top_10_full.append(state_map[state])
    else:
        print(f"No state code known for {state!r}; skipping it")
pprint(top_10_full)

In [None]:
# Keep survey rows from the top-hospitalisation states whose expected-loss
# answer is not one of the codes "-99" / "-88".
# NOTE(review): assumes est_st holds the same zero-padded two-digit strings as
# top_10_full (e.g. "06", not "6") - confirm against the printed categories.
in_top_states = df['est_st'].isin(top_10_full)
answered = ~df['expctloss'].isin(["-99", "-88"])

# .copy() detaches the result from the loaded frame so the column assignments
# below do not raise SettingWithCopyWarning.
df = df[in_top_states & answered].copy()

# Forget the discarded codes and replace the remaining ones with names.
# mapping_dict_WorkLoss comes from an earlier cell.
df['expctloss'] = df['expctloss'].cat.remove_unused_categories()
df['expctloss'] = df['expctloss'].cat.rename_categories(mapping_dict_WorkLoss)
df['est_st'] = df['est_st'].cat.rename_categories(state_mapping_inv)
df['est_st'] = df['est_st'].cat.remove_unused_categories()
print(df["est_st"].cat.categories)

In [None]:
# Horizontal grouped bars: percentage of respondents per state, split by
# whether they expect to lose employment income.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, y='est_st', hue='expctloss', multiple='dodge', shrink=.8, stat='percent', palette='Set2')

# Write each bar's percentage just past its right-hand end.
for p in plt.gca().patches:
    plt.gca().annotate(f'{p.get_width():.1f}%', 
                       (p.get_width(), p.get_y() + p.get_height() / 2.), 
                       ha='center', va='center', 
                       xytext=(9, 0), textcoords='offset points')

# The y axis is est_st, so the title and label name the state. They previously
# said "Sector of Employment", copied from the kindwork chart.
plt.title('Rate of Expected Employment Loss Due to COVID-19 by State')
plt.xlabel('Employment Loss Rate (%)')
plt.ylabel('State')

# Get the current y-axis labels
labels = [item.get_text() for item in plt.gca().get_yticklabels()]

# Wrap long state names at 15 characters
wrapped_labels = [textwrap.fill(label, 15) for label in labels]

# Set the wrapped labels
plt.gca().set_yticklabels(wrapped_labels)

plt.tight_layout()
plt.show()


In [None]:
# Load household income and the two medical-care questions as categoricals.
care_columns = ["income", "delay", "notget"]
df = pd.read_csv(
    "../Datasets/DS-2/data2.csv",
    usecols=care_columns,
    dtype=dict.fromkeys(care_columns, "category"),
)


In [None]:
# info() prints its own report and returns None; wrapping it in print()
# would add a stray "None" line to the output.
df.info()
print(df.shape)
for column in ("delay", "notget", "income"):
    print(f"Categories for '{column}':", df[column].cat.categories)

# Drop rows where any answer is one of the codes "-99" / "-88".
# .copy() detaches the result from the loaded frame so the column assignments
# below do not raise SettingWithCopyWarning.
keep_row = pd.Series(True, index=df.index)
for column in ("delay", "notget", "income"):
    keep_row &= ~df[column].isin(["-99", "-88"])
df = df[keep_row].copy()

# Readable names for the survey codes
y_n = {"1": "Yes", "2": "No"}
income_categories = {
    "1": "Less than $25,000",
    "2": "$25,000 - $34,999",
    "3": "$35,000 - $49,999",
    "4": "$50,000 - $74,999",
    "5": "$75,000 - $99,999",
    "6": "$100,000 - $149,999",
    "7": "$150,000 - $199,999",
    "8": "$200,000 and above",
    "-99": "Question seen but not selected",
    "-88": "Missing / Did not report"
}

# Forget the discarded codes, then replace the remaining codes with names
code_names = {"delay": y_n, "notget": y_n, "income": income_categories}
for column, names in code_names.items():
    df[column] = df[column].cat.remove_unused_categories()
    df[column] = df[column].cat.rename_categories(names)

for column in ("delay", "notget", "income"):
    print(f"Categories for '{column}':", df[column].cat.categories)

In [None]:

# Flag respondents who delayed care OR did not get care they needed.
# The second test previously compared notget with "No", which counted the
# respondents who DID get their care.
# NOTE(review): assumes notget == "Yes" means care was needed but not
# obtained - confirm against the survey codebook.
df['delay_or_notget'] = ((df['delay'] == "Yes") | (df['notget'] == "Yes")).map({True: 'Yes', False: 'No'})


# Grouped bars: percentage of respondents per income bracket, split by the flag.
# hue_order fixes which flag value gets which colour regardless of row order.
plt.figure(figsize=(14, 8))
ax = sns.histplot(
    data=df, 
    x='income', 
    hue='delay_or_notget', 
    hue_order=['No', 'Yes'],
    palette='Set2', 
    multiple='dodge', 
    shrink=0.8, 
    stat='percent',
    element='bars'
)

# Write each bar's percentage just above its top.
for p in ax.patches:
    percentage = '{:.1f}%'.format(p.get_height())
    ax.annotate(percentage, 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha='center', 
                va='center', 
                xytext=(0, 9), 
                textcoords='offset points')

# Set the plot labels and title
plt.title('Delayed or Unobtained Medical Treatment by Household Income', fontsize=16)
plt.xlabel('Household Income', fontsize=14)
plt.ylabel('Percentage (%)', fontsize=14)

# Keep the legend seaborn built (its entries come from the actual hue levels)
# and only restyle it. Passing a hard-coded labels list to plt.legend()
# relabels the bars by draw order and can swap Yes and No.
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Delayed/Unobtained Treatment')
    for text in legend.get_texts():
        text.set_fontsize(12)

# Wrap the income labels at 10 characters. Pinning the tick positions first
# avoids matplotlib's warning about set_ticklabels on non-fixed ticks.
ax.set_xticks(ax.get_xticks())
labels = [item.get_text() for item in ax.get_xticklabels()]
wrapped_labels = ['\n'.join(textwrap.wrap(label, 10)) for label in labels]
ax.set_xticklabels(wrapped_labels)

plt.tight_layout()
plt.show()



In [None]:
# Observed counts: one row per income bracket, one column per value of the
# delayed/unobtained flag.
# NOTE(review): chi2_contingency is not imported in the notebook's first
# import cell; presumably it comes from scipy.stats in an earlier cell -- confirm.
contingencyy_table = pd.crosstab(index=df['income'], columns=df['delay_or_notget'])

# Chi-square test of independence on the observed counts.
chi2, p, dof, expected = chi2_contingency(contingencyy_table)

# Report the statistic, p-value, degrees of freedom and expected counts.
print(
    f'Chi-square statistic: {chi2}',
    f'p-value: {p}',
    f'Degrees of freedom: {dof}',
    'Expected frequencies:',
    sep='\n',
)
print(expected)

# Decide at the 5% significance level and print the matching conclusion.
alpha = 0.05
print(
    "Reject the null hypothesis: There is a significant relationship between household income and the rate of delayed or unobtained medical treatment."
    if p < alpha
    else "Fail to reject the null hypothesis: There is no significant relationship between household income and the rate of delayed or unobtained medical treatment."
)

In [None]:
# Reload the dataset with only the two columns needed for the age-group vs.
# symptom-status analysis, both read as categoricals.
df = pd.read_csv(
    "../Datasets/DS-1/data.csv",
    usecols=["age_group", "symptom_status"],
    dtype={"age_group": "category", "symptom_status": "category"},
)
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would add a stray "None" line to the output).
df.info()
print(df.shape)


In [None]:
# Report the category levels present before any rows are removed.
print("Categories for 'age_group':", df["age_group"].cat.categories)
print("Categories for 'symptom_status':", df["symptom_status"].cat.categories)

# Keep only rows whose age group is neither "Unknown" nor "Missing" ...
age_is_placeholder = (df["age_group"] == "Unknown") | (df["age_group"] == "Missing")
df = df[~age_is_placeholder]

# ... and, of those, only rows whose symptom status is neither placeholder.
symptom_is_placeholder = (df["symptom_status"] == "Unknown") | (df["symptom_status"] == "Missing")
df = df[~symptom_is_placeholder]

# Forget the category levels that no longer occur, then renumber the index
# from zero.
for column_name in ("age_group", "symptom_status"):
    df[column_name] = df[column_name].cat.remove_unused_categories()
df = df.reset_index(drop=True)

In [None]:

# Show the category levels and frame summary that remain after cleaning.
print("Categories for 'age_group':", df["age_group"].cat.categories)
print("Categories for 'symptom_status':", df["symptom_status"].cat.categories)
# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of being wrapped in print() (which would print "None").
df.info()
print(df.shape)

In [None]:
# Create the plot
plt.figure(figsize=(14, 8))
ax = sns.histplot(data=df, x='age_group', hue='symptom_status', palette='Set2', multiple='dodge', shrink=0.8, stat='percent')

# Calculate the percentages for annotation
total_counts = df['age_group'].value_counts().sum()
for p in ax.patches:
    height = p.get_height()
    if height > 0:
        ax.annotate(f'{height:.1f}%', 
                    (p.get_x() + p.get_width() / 2., height), 
                    ha = 'center', 
                    va = 'center', 
                    xytext = (0, 9), 
                    textcoords = 'offset points',
                    fontsize=10,
                    color='black')

# Set the plot labels and title
plt.title('COVID-19 Symptom Manifestation by Age Group', fontsize=16)
plt.xlabel('Age Group', fontsize=14)
plt.ylabel('Percentage (%)', fontsize=14)

# Wrap x labels
labels = [item.get_text() for item in plt.gca().get_xticklabels()]
wrapped_labels = ['\n'.join(textwrap.wrap(label, 10)) for label in labels]
plt.gca().set_xticklabels(wrapped_labels)

plt.tight_layout()
plt.show()


In [None]:

# Observed counts: one row per symptom status, one column per age group.
# NOTE(review): chi2_contingency is not imported in the notebook's first
# import cell; presumably it comes from scipy.stats in an earlier cell -- confirm.
contingency_table = pd.crosstab(index=df['symptom_status'], columns=df['age_group'])

# Chi-square test of independence on the observed counts.
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Report the statistic, p-value, degrees of freedom and expected counts.
print(
    f'Chi-square statistic: {chi2}',
    f'p-value: {p}',
    f'Degrees of freedom: {dof}',
    'Expected frequencies:',
    sep='\n',
)
print(expected)

# Decide at the 5% significance level and print the matching conclusion.
alpha = 0.05
print(
    "Reject the null hypothesis: There is a significant relationship between COVID-19 symptom manifestation and age group."
    if p < alpha
    else "Fail to reject the null hypothesis: There is no significant relationship between COVID-19 symptom manifestation and age group."
)
