In [None]:
# Show the kernel's current working directory; the relative data path used
# when loading the CSV is resolved against it.
# The explicit `%` form does not depend on IPython's automagic setting and
# keeps working if a variable named `pwd` is ever defined.
%pwd

In [None]:
# Install optional dependencies into the environment of the running kernel.
# `%pip` (unlike `!pip`) is guaranteed to target the interpreter this notebook
# is executing in, not whichever `pip` happens to be first on the shell PATH.
# NOTE(review): versions are unpinned - consider pinning for reproducibility.
%pip install xlrd
%pip install plotly

In [None]:
# import pandas for structuring the data
import pandas as pd

# import numpy for numerical analysis
import numpy as np

# import libs for diagrams inline with the text
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# other utilities
from sklearn import datasets, preprocessing, metrics

In [None]:
# Load the TB burden CSV from the data folder into a DataFrame.
# index_col=None keeps the default integer row index.
# NOTE(review): 'NA' is already one of pandas' default missing-value markers,
# so na_values=['NA'] looks redundant; any literal "NA" code in the file
# (e.g. a two-letter country code) would be read as missing - confirm intent.
df = pd.read_csv(
    '../data/TB_Burden_Country.csv',
    index_col=None,
    na_values=['NA'],
)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Rank all rows of the loaded frame by estimated TB prevalence per 100 000
# population (highest first) and preview the ten highest-ranked rows.
df_sorted = df.sort_values(
    by='Estimated prevalence of TB (all forms) per 100 000 population',
    ascending=False,
)
df_sorted.head(10)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print the column labels as plain text - presumably to copy the exact names
# into the drop list that follows.
print(df.columns)

In [None]:
# Columns to discard: every entry is a 'low bound' / 'high bound' companion
# column; the matching point-estimate columns are not listed and are kept.
columns_to_drop = [
    'Estimated mortality of TB cases who are HIV-positive, per 100 000 population, high bound',
    'Estimated number of deaths from TB in people who are HIV-positive, low bound',
    'Estimated number of deaths from TB in people who are HIV-positive, high bound',
    'Estimated number of incident cases (all forms), low bound',
    'Estimated number of incident cases (all forms), high bound',
    'Estimated HIV in incident TB (percent), low bound',
    'Estimated HIV in incident TB (percent), high bound',
    'Estimated incidence of TB cases who are HIV-positive per 100 000 population, low bound',
    'Estimated incidence of TB cases who are HIV-positive per 100 000 population, high bound',
    'Estimated incidence of TB cases who are HIV-positive, low bound',
    'Estimated incidence of TB cases who are HIV-positive, high bound',
    'Case detection rate (all forms), percent, low bound',
    'Case detection rate (all forms), percent, high bound',
    'Estimated mortality of TB cases (all forms, excluding HIV), per 100 000 population, low bound',
    'Estimated mortality of TB cases (all forms, excluding HIV), per 100 000 population, high bound',
    'Estimated number of deaths from TB (all forms, excluding HIV), low bound',
    'Estimated number of deaths from TB (all forms, excluding HIV), high bound',
    'Estimated mortality of TB cases who are HIV-positive, per 100 000 population, low bound',
    'Estimated incidence (all forms) per 100 000 population, low bound',
    'Estimated incidence (all forms) per 100 000 population, high bound',
    'Estimated prevalence of TB (all forms) per 100 000 population, low bound',
    'Estimated prevalence of TB (all forms) per 100 000 population, high bound',
    'Estimated prevalence of TB (all forms), low bound',
    'Estimated prevalence of TB (all forms), high bound',
]

# Rebind `df` to the frame without those columns.
# NOTE: a missing label raises KeyError, so this cell fails if it is run a
# second time without re-loading `df` (the columns are already gone).
df = df.drop(columns=columns_to_drop)

In [None]:
# Number of non-missing values in each remaining column.
df.count()

In [None]:
# Convert the 'Year' column to datetime values, parsing it with the '%Y' format.
df['Year'] = pd.to_datetime(df['Year'], format='%Y')

# Order rows by country and then by year, and keep the last row of each
# country group - i.e. the most recent year available for that country.
latest_data = (
    df
    .sort_values(by=['Country or territory name', 'Year'], ignore_index=True)
    .groupby('Country or territory name')
    .tail(1)
)

In [None]:
# Order the latest per-country rows by estimated TB prevalence per 100 000
# population, highest first (the result is stored here, not displayed).
sorted_by_prevalence = latest_data.sort_values(
    by='Estimated prevalence of TB (all forms) per 100 000 population',
    ascending=False,
)

In [None]:
# Top 50 countries by estimated prevalence of TB per 100,000 population
sorted_by_prevalence.head(50)

In [None]:
# Summarise the latest per-country rows by region:
#   - total population is summed over the countries of the region;
#   - the three per-100 000 rates are averaged over those countries.
# NOTE(review): 'mean' is a simple average of country-level rates, i.e. it is
# not weighted by population - confirm this is the intended regional figure.
# All numeric results are rounded to two decimals.
aggregated_data_by_region = (
    latest_data
    .groupby('Region')
    .agg({
        'Estimated total population number': 'sum',
        'Estimated prevalence of TB (all forms) per 100 000 population': 'mean',
        'Estimated mortality of TB cases (all forms, excluding HIV) per 100 000 population': 'mean',
        'Estimated mortality of TB cases who are HIV-positive, per 100 000 population': 'mean',
    })
    .reset_index()
    .round(2)
)

# Regions ordered from highest to lowest average prevalence.
sorted_by_prevalence_by_region = aggregated_data_by_region.sort_values(
    by='Estimated prevalence of TB (all forms) per 100 000 population',
    ascending=False,
)

In [None]:
# Top 6 regions by estimated prevalence of TB per 100,000 population
sorted_by_prevalence_by_region.head(6)

In [None]:
# Series to plot, taken from the per-region summary (rows are already ordered
# by descending prevalence, which fixes the left-to-right order of the bars).
regions = sorted_by_prevalence_by_region['Region']
prevalence_data = sorted_by_prevalence_by_region['Estimated prevalence of TB (all forms) per 100 000 population']
mortality_data = sorted_by_prevalence_by_region['Estimated mortality of TB cases (all forms, excluding HIV) per 100 000 population']
hiv_mortality_data = sorted_by_prevalence_by_region['Estimated mortality of TB cases who are HIV-positive, per 100 000 population']

# Bar chart settings
bar_width = 0.4
opacity = 0.7

# One numeric slot per region. All three bar series are placed on this same
# numeric axis; the region names are attached as tick labels afterwards.
# This avoids mixing string categories and numeric offsets on one axis, which
# only lines up by relying on matplotlib's implicit category -> index mapping.
x_positions = np.arange(len(regions))

fig, ax = plt.subplots(figsize=(10, 6))

# Wide, semi-transparent light-blue bar: prevalence (drawn first, in the back).
rects1 = ax.bar(x_positions, prevalence_data, bar_width, alpha=opacity, color='lightblue', label='Prevalence')

# The two mortality bars are half as wide as the prevalence bar and sit side
# by side on top of it, shifted a quarter of the bar width left and right.
bar_positions = [x_positions - bar_width / 4, x_positions + bar_width / 4]

# Red bar: TB mortality excluding HIV.
rects2 = ax.bar(bar_positions[0], mortality_data, bar_width / 2, alpha=opacity, color='red', label='Mortality (TB, excluding HIV)')

# Green bar: TB mortality among HIV-positive people.
rects3 = ax.bar(bar_positions[1], hiv_mortality_data, bar_width / 2, alpha=opacity, color='green', label='Mortality (TB, HIV-positive)')

# Label each slot with its region name.
ax.set_xticks(x_positions)
ax.set_xticklabels(regions.tolist())

# Axis labels, title, and legend
ax.set_xlabel('Region')
ax.set_ylabel('Values per 100,000 population')
ax.set_title('TB Metrics by Region')
ax.legend()

# Display the plot
plt.show()