## Libraries

In [2]:
import pandas as pd

## Data

In [3]:
clin = pd.read_csv("../data-clean/clinical/tb_cases.csv")

## Descriptives

In [7]:
# 1. Counts and percentages of each category in tb_status overall
tb_status_counts = clin['tb_status'].value_counts()
tb_status_percentages = (clin['tb_status'].value_counts(normalize=True) * 100).round()
print("TB Status Counts:\n", tb_status_counts)
print("\nTB Status Percentages:\n", tb_status_percentages)

# 1.1 Count of all TB patients (number clinic_id)
tb_patients_count = clin['clinic_id'].nunique()
print(f"\nCount of all TB patients: {tb_patients_count}")

# 1.2 Count and percentage of presumptive TB (tb_status == "presumptive")
presumptive_tb_count = clin[clin['tb_status'] == 'presumptive']['clinic_id'].nunique()
presumptive_tb_percentage = round((presumptive_tb_count / tb_patients_count) * 100)
print(f"\nCount of presumptive TB cases: {presumptive_tb_count}")
print(f"Percentage of presumptive TB cases: {presumptive_tb_percentage}%")

# 1.3 Count and percentage of newly diagnosed TB (tb_test_res == "positive" or chest_xray_res == "positive")
newly_diagnosed_tb_count = clin[(clin['tb_test_res'] == 'positive') | (clin['chest_xray_res'] == 'positive')]['clinic_id'].nunique()
newly_diagnosed_tb_percentage = round((newly_diagnosed_tb_count / tb_patients_count) * 100)
print(f"\nCount of newly diagnosed TB cases: {newly_diagnosed_tb_count}")
print(f"Percentage of newly diagnosed TB cases: {newly_diagnosed_tb_percentage}%")

# 1.4 Count and percentage of TB patients already on treatment (tb_treat_status not NaN)
on_treatment_tb_count = clin[clin['tb_treat_status'].notna()]['clinic_id'].nunique()
on_treatment_tb_percentage = round((on_treatment_tb_count / tb_patients_count) * 100)
print(f"\nCount of TB patients already on treatment: {on_treatment_tb_count}")
print(f"Percentage of TB patients already on treatment: {on_treatment_tb_percentage}%")

# 2. Median, Q25, and Q75 of diagnosed and presumptive TB cases per day (date)
tb_cases_per_day = clin[clin['tb_status'].isin(['diagnosed', 'presumptive'])].groupby('date').size()
tb_summary = tb_cases_per_day.describe(percentiles=[0.25, 0.75])
tb_median = round(tb_summary['50%'])
tb_q25 = round(tb_summary['25%'])
tb_q75 = round(tb_summary['75%'])
print(f"\nMedian TB cases per day: {tb_median}")
print(f"25th percentile (Q25) TB cases per day: {tb_q25}")
print(f"75th percentile (Q75) TB cases per day: {tb_q75}")

TB Status Counts:
 tb_status
presumptive       984
infectious         70
not infectious      4
Name: count, dtype: int64

TB Status Percentages:
 tb_status
presumptive       93.0
infectious         7.0
not infectious     0.0
Name: proportion, dtype: float64

Count of all TB patients: 1058

Count of presumptive TB cases: 984
Percentage of presumptive TB cases: 93%

Count of newly diagnosed TB cases: 69
Percentage of newly diagnosed TB cases: 7%

Count of TB patients already on treatment: 9
Percentage of TB patients already on treatment: 1%

Median TB cases per day: 11
25th percentile (Q25) TB cases per day: 8
75th percentile (Q75) TB cases per day: 17


In [9]:
# 3. Median, Q25, and Q75 of age overall
age_summary = clin['age'].describe(percentiles=[0.25, 0.75])
age_median = round(age_summary['50%'])
age_q25 = round(age_summary['25%'])
age_q75 = round(age_summary['75%'])
print(f"\nMedian age: {age_median}")
print(f"25th percentile (Q25) age: {age_q25}")
print(f"75th percentile (Q75) age: {age_q75}")

# 4. Count and percentage of each gender overall
gender_counts = clin['gender'].value_counts()
gender_percentages = (clin['gender'].value_counts(normalize=True) * 100).round()
print("\nGender Counts:\n", gender_counts)
print("\nGender Percentages:\n", gender_percentages)

# 5. Count and percentage of each HIV status overall
hiv_status_counts = clin['hiv_status'].value_counts()
hiv_status_percentages = (clin['hiv_status'].value_counts(normalize=True) * 100).round()
print("\nHIV Status Counts:\n", hiv_status_counts)
print("\nHIV Status Percentages:\n", hiv_status_percentages)


Median age: 32
25th percentile (Q25) age: 24
75th percentile (Q75) age: 44

Gender Counts:
 gender
Man      588
Woman    470
Name: count, dtype: int64

Gender Percentages:
 gender
Man      56.0
Woman    44.0
Name: proportion, dtype: float64

HIV Status Counts:
 hiv_status
Negative    743
Positive    165
Unknown     144
Name: count, dtype: int64

HIV Status Percentages:
 hiv_status
Negative    71.0
Positive    16.0
Unknown     14.0
Name: proportion, dtype: float64
