# TB Burden Country — Data Visualization

This notebook performs exploratory data analysis and visualization of the TB burden by country dataset (`TB_Burden_Country.csv`). The code is intentionally simple and uses only pandas and matplotlib.

In [None]:
import os

import pandas as pd
import matplotlib.pyplot as plt
# Location of the TB burden CSV. Set the TB_DATA_PATH environment variable to
# point at a local copy of the file; when it is unset, the original default
# location is used, so existing setups keep working unchanged.
DATA_PATH = os.environ.get('TB_DATA_PATH', '/mnt/data/TB_Burden_Country.csv')
df = pd.read_csv(DATA_PATH)
# Last expression of the cell: shows the first rows as the cell output.
df.head()

## Data inspection
Check data types, missing values and basic statistics.

In [None]:
# Column dtypes and non-null counts; info() writes its report directly.
df.info()
# Per-column missing-value counts followed by summary statistics, one after
# the other on separate lines.
print(df.isna().sum(), df.describe(), sep='\n')

## Visualizations
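1. Missing values per column (only columns that have any)
2. Histograms of the first three numeric columns
3. Scatter plot of the first two numeric columns
4. Trend lines over time for the five countries with the highest mean of the chosen metric (only if year and country columns are detected)
5. Bar chart of the ten countries with the highest mean of the chosen metric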

In [None]:
# Count missing values per column and chart only the columns that have any,
# ordered from most to fewest missing.
missing = df.isnull().sum()
missing_nonzero = missing[missing > 0].sort_values(ascending=False)
if missing_nonzero.empty:
    # A bar plot of an empty Series fails in pandas, so a complete dataset is
    # reported in text instead of aborting the notebook run.
    print('No missing values found in any column.')
else:
    ax = missing_nonzero.plot.bar(figsize=(8,4))
    ax.set_title('Missing values per column (non-zero only)')
    ax.set_ylabel('Count missing')
    plt.tight_layout()
    plt.show()

In [None]:
# Draw one 30-bin histogram per column for the first three numeric columns.
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
for c in numeric_cols[:3]:
    values = df[c].dropna()
    if values.empty:
        # Every value in this column is missing, so there is nothing to bin.
        continue
    fig, ax = plt.subplots(figsize=(6,4))
    ax.hist(values, bins=30)
    ax.set_title(f'Distribution of {c}')
    ax.set_xlabel(c)
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()

In [None]:
# Scatter the first numeric column (x axis) against the second (y axis).
# The cell draws nothing when fewer than two numeric columns exist.
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
if len(numeric_cols) >= 2:
    x, y = numeric_cols[:2]
    fig, ax = plt.subplots(figsize=(6,6))
    ax.scatter(df[x], df[y], alpha=0.6)
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.set_title(f'Scatter: {x} vs {y}')
    fig.tight_layout()
    plt.show()

In [None]:
# Trend lines: one metric over time for the five countries whose mean value
# of that metric is highest.
cols = df.columns.tolist()
# Column roles are detected by name: the first column containing 'year', and
# the first column containing 'country', 'territory' or 'name'.
year_col = next((c for c in cols if 'year' in c.lower()), None)
country_col = next((c for c in cols if any(k in c.lower() for k in ['country','territory','name'])), None)
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
# The metric must not be the year column itself (that would plot year against
# year).
# NOTE(review): numeric columns with 'code' in their name are presumably
# identifiers (e.g. ISO numeric codes) rather than measurements, so they are
# skipped as well -- confirm against the CSV header.
metric_cols = [c for c in numeric_cols if c != year_col and 'code' not in c.lower()]
if year_col and country_col and metric_cols:
    metric = metric_cols[0]
    top_countries = df.groupby(country_col)[metric].mean().nlargest(5).index.tolist()
    fig, ax = plt.subplots()
    for country in top_countries:
        # Sort by year so each line is drawn in chronological order.
        sub = df[df[country_col]==country].sort_values(year_col)
        ax.plot(sub[year_col], sub[metric], marker='o', label=country)
    ax.set_title(f'{metric} over time for top countries')
    ax.set_xlabel(year_col)
    ax.set_ylabel(metric)
    ax.legend()
    fig.tight_layout()
    plt.show()

In [None]:
# Bar chart of the ten countries with the highest mean value of one metric.
# Column detection is repeated here so this cell runs on its own instead of
# depending on names left behind by an earlier cell.
cols = df.columns.tolist()
year_col = next((c for c in cols if 'year' in c.lower()), None)
country_col = next((c for c in cols if any(k in c.lower() for k in ['country','territory','name'])), None)
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
# Ranking countries by their mean year is meaningless, so the year column is
# not a metric candidate.
# NOTE(review): numeric columns with 'code' in their name are presumably
# identifiers (e.g. ISO numeric codes) rather than measurements, so they are
# skipped as well -- confirm against the CSV header.
metric_cols = [c for c in numeric_cols if c != year_col and 'code' not in c.lower()]
if country_col and metric_cols:
    metric = metric_cols[0]
    top = df.groupby(country_col)[metric].mean().nlargest(10)
    ax = top.plot.bar(figsize=(8,5))
    ax.set_title(f'Top 10 countries by mean {metric}')
    ax.set_ylabel(metric)
    ax.set_xlabel(country_col)
    plt.tight_layout()
    plt.show()

## Conclusions
- Visualizations help identify missing data, distributions, relationships and temporal trends.
- Next steps: handle missing values, normalize metrics where needed, and build more advanced visualizations (e.g., maps, treemaps) if geographic data is available.