# Task 2: Exploratory Data Analysis (Revised)
Analyze the data to understand patterns and factors influencing financial inclusion in Ethiopia.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Notebook-wide plotting defaults: seaborn's white-grid theme plus a default
# figure size of 12x6, applied in a single call via the theme's rc overrides.
sns.set_theme(style="whitegrid", rc={'figure.figsize': [12, 6]})

In [None]:
# Load Data
# The path is relative to the notebook's working directory.
data_path = "../data/raw/ethiopia_fi_unified_data.csv"
df = pd.read_csv(data_path)

# --- 1. Dataset Overview ---
print("--- Dataset Overview ---")
# dropna=False keeps rows whose record_type or pillar is missing. With the
# groupby default (dropna=True) those rows are silently left out of the tally,
# so the printed counts would not add up to len(df).
print(df.groupby(['record_type', 'pillar'], dropna=False).size())
print("\nUnique Indicators:\n", df['indicator'].unique())

### Temporal Coverage

In [None]:
# Temporal Coverage
# Observation rows only, as a separate frame carrying a derived `year` column
# (the calendar year of observation_date); `df` itself is left untouched.
obs = (
    df.loc[df['record_type'].eq('observation')]
    .assign(year=lambda frame: pd.to_datetime(frame['observation_date']).dt.year)
)

plt.figure(figsize=(12, 5))
# Each cell counts the non-null value_numeric entries for one indicator/year.
pivot_coverage = obs.pivot_table(
    values='value_numeric',
    index='indicator',
    columns='year',
    aggfunc='count',
)
# Indicator/year pairs with no entry in the pivot are drawn as 0.
sns.heatmap(pivot_coverage.fillna(0), linewidths=.5, cbar=False, cmap="Blues")
plt.title("Data Coverage by Indicator and Year")
plt.tight_layout()
plt.show()

### Data Quality Assessment & Limitations
Analysis of confidence levels and identification of coverage gaps.

In [None]:
# Confidence level distribution
plt.figure(figsize=(8, 5))
# seaborn 0.13 deprecates passing `palette` without `hue` (FutureWarning), so
# hue is mapped to the same column to keep one colour per confidence level.
# dodge=False keeps each bar centred on its tick in versions that would
# otherwise offset bars per hue level.
confidence_ax = sns.countplot(
    data=df,
    x='confidence',
    hue='confidence',
    palette='viridis',
    dodge=False,
)
# The hue legend only repeats the x-axis labels; drop it if this seaborn
# version drew one.
confidence_legend = confidence_ax.get_legend()
if confidence_legend is not None:
    confidence_legend.remove()
plt.title("Distribution of Data Confidence Levels")
plt.show()

# Gaps in Schema
# Null counts per column across all record types in df.
print("Missing Value Analysis (Gaps):")
print(df[['pillar', 'indicator', 'value_numeric']].isnull().sum())

### Access Analysis: Account Ownership Trajectory (2011-2024)
Growth rates and an investigation of the 2021-2024 slowdown.

In [None]:
# Account Ownership Trajectory
def _indicator_rows(code):
    """Return the rows of `obs` for one indicator_code in chronological order.

    Sorting uses the parsed dates rather than the raw strings, so the order is
    chronological even if observation_date is not stored in ISO format.
    """
    subset = obs[obs['indicator_code'] == code]
    return subset.sort_values('observation_date', key=pd.to_datetime)


acc = _indicator_rows('account_ownership')
acc_men = _indicator_rows('account_ownership_men')
acc_women = _indicator_rows('account_ownership_women')

plt.figure(figsize=(10, 6))
plt.plot(pd.to_datetime(acc['observation_date']), acc['value_numeric'], marker='o', linewidth=2, label='Overall')
# The gender series are optional: plot them only when the data contains them.
if not acc_men.empty:
    plt.scatter(pd.to_datetime(acc_men['observation_date']), acc_men['value_numeric'], color='blue', marker='s', label='Men')
if not acc_women.empty:
    plt.scatter(pd.to_datetime(acc_women['observation_date']), acc_women['value_numeric'], color='green', marker='^', label='Women')

# Annotate Growth
# Label every point after the first with its change from the previous point.
# The `+` format flag emits the correct sign, so a decline reads "-1.5pp"
# instead of the "+-1.5pp" a hard-coded plus sign would produce.
# NOTE(review): the *100 assumes value_numeric is stored as a fraction (in line
# with the 0-0.8 y-limit below) - confirm against the dataset.
acc_dates = pd.to_datetime(acc['observation_date'])
acc_changes = acc['value_numeric'].diff()
for when, level, change in zip(acc_dates.iloc[1:], acc['value_numeric'].iloc[1:], acc_changes.iloc[1:]):
    if pd.isna(change):
        # A missing value on either side leaves no meaningful change to label.
        continue
    plt.annotate(f"{change*100:+.1f}pp",
                 (when, level),
                 xytext=(0, 10), textcoords='offset points', ha='center')

plt.title("Account Ownership Trajectory (2011-2024)")
plt.ylabel("Ownership Rate")
plt.ylim(0, 0.8)
plt.legend()
plt.show()

### Registered vs. Active Gap (The +3pp Mystery)
Comparing Telebirr's 54M registered user base with the mere 3-percentage-point increase in formal account ownership between 2021 and 2024.

In [None]:
# Static summary of the headline figures discussed in this section, emitted
# one entry per line (the blank line before "Limitation" comes from its
# leading newline).
print(
    "Growth Check:",
    "2017 to 2021: +11pp (35% to 46%)",
    "2021 to 2024: +3pp (46% to 49%)",
    "Telebirr Users (2024): 54 Million",
    "\nLimitation: High 'Registration' does not reflect 'New-to-Institution' users.",
    "Hypothesis: Multi-banking / Overlap. Many new MM users already had accounts, or MM accounts are not transacting.",
    sep="\n",
)

### Event Timeline Overlay

In [None]:
# Event Timeline Overlay
# Draws the overall account-ownership line (`acc`, built in the trajectory
# cell) and marks every event record with a dashed vertical line.
events = df[df['record_type'] == 'event'].sort_values('observation_date')

plt.figure(figsize=(14, 7))
plt.plot(pd.to_datetime(acc['observation_date']), acc['value_numeric'], marker='o', linewidth=2, color='black', label='Account Ownership')

# Overlay Events
# Known categories get a fixed colour; anything else falls back to gray.
colors = {'policy': 'red', 'product_launch': 'blue', 'infrastructure': 'green'}
labelled_categories = set()
for _, event in events.iterrows():
    ev_date = pd.to_datetime(event['observation_date'])
    if pd.isna(ev_date):
        # An event without a date has no position on the time axis.
        continue
    category = event['category']
    legend_key = category if category in colors else 'other'
    # Only the first line of each category carries a legend label, so the
    # legend explains the colour coding with one entry per category.
    line_label = legend_key if legend_key not in labelled_categories else '_nolegend_'
    labelled_categories.add(legend_key)
    plt.axvline(ev_date, color=colors.get(category, 'gray'), linestyle='--', alpha=0.7, label=line_label)
    plt.text(ev_date, 0.05, event['indicator'], rotation=90, verticalalignment='bottom', fontsize=9)

plt.title("Timeline of Events vs Account Ownership Transition")
plt.ylabel("Account Ownership Rate")
# No explicit label list: the legend picks up the ownership line and one entry
# per event category from the labels set above.
plt.legend()
plt.show()