In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Settings
# Apply seaborn's "whitegrid" theme to every figure drawn after this point.
sns.set_theme(style="whitegrid")
# NOTE(review): this installs a blanket "ignore" filter, so every warning category
# (including deprecation notices from pandas/seaborn) is hidden for the rest of
# the session. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# File Definitions
def find_csv_files(keyword, directory=None):
    """Return the CSV files in a directory whose name contains ``keyword``.

    Parameters
    ----------
    keyword : str
        Substring that must appear in the file name (case-sensitive).
    directory : str or None, optional
        Directory to scan. ``None`` (the default) scans the current working
        directory and returns bare file names; otherwise each match is
        returned joined onto ``directory``.

    Returns
    -------
    list of str
        Matching names in sorted order. ``os.listdir`` returns entries in
        arbitrary order, so sorting keeps the load order reproducible.
    """
    names = sorted(os.listdir(directory))
    matches = [name for name in names if keyword in name and name.endswith('.csv')]
    if directory is None:
        return matches
    return [os.path.join(directory, name) for name in matches]


# One list of input files per data stream, discovered in the working directory.
ENROL_FILES = find_csv_files('enrolment')
DEMO_FILES = find_csv_files('demographic')
BIO_FILES = find_csv_files('biometric')

In [None]:
def load_and_preprocess(files, val_cols):
    """Load several CSVs and aggregate the value columns per state and month.

    Every file is read with ``pd.read_csv`` and the frames are concatenated.
    The ``date`` column is parsed day-first, reduced to a monthly period
    (``month_year``), and ``val_cols`` are summed for each
    ``(state, month_year)`` pair.

    Parameters
    ----------
    files : list of str
        Paths of the CSV files to load. Must not be empty.
    val_cols : list of str
        Names of the columns to sum.

    Returns
    -------
    pandas.DataFrame
        One row per ``(state, month_year)`` with the columns ``state``,
        ``month_year`` and the summed ``val_cols``.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    KeyError
        If the combined data lacks ``date``, ``state`` or any of ``val_cols``.
    """
    print(f"Loading {len(files)} files...")
    if len(files) == 0:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(
            "No input files to load; check that the CSV files are in the working directory."
        )

    df_list = [pd.read_csv(f) for f in files]
    df = pd.concat(df_list, ignore_index=True)

    # Fail early, naming the missing columns, instead of deep inside groupby.
    wanted = [val_cols] if isinstance(val_cols, str) else list(val_cols)
    missing = [col for col in ['date', 'state', *wanted] if col not in df.columns]
    if missing:
        raise KeyError(f"Input files are missing required column(s): {missing}")

    # Date processing
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)
    df['month_year'] = df['date'].dt.to_period('M')

    # Aggregation to State/Month level
    agg_df = df.groupby(['state', 'month_year'])[val_cols].sum().reset_index()
    return agg_df

# Build the state/month aggregate for each of the three data streams
enrol_agg = load_and_preprocess(
    files=ENROL_FILES,
    val_cols=['age_0_5', 'age_5_17', 'age_18_greater'],
)
demo_agg = load_and_preprocess(
    files=DEMO_FILES,
    val_cols=['demo_age_5_17', 'demo_age_17_'],
)
bio_agg = load_and_preprocess(
    files=BIO_FILES,
    val_cols=['bio_age_5_17', 'bio_age_17_'],
)

In [None]:
# Merging all streams
# Each aggregate holds one row per (state, month); validate='one_to_one' makes a
# duplicated key raise instead of silently multiplying rows and inflating totals.
merge_keys = ['state', 'month_year']
master = pd.merge(enrol_agg, demo_agg, on=merge_keys, how='outer', validate='one_to_one')
master = pd.merge(master, bio_agg, on=merge_keys, how='outer', validate='one_to_one')

# A state/month absent from one stream means zero activity there. Fill only the
# non-key columns so the key columns (state, Period-typed month_year) are never
# handed an integer fill value.
value_cols = master.columns.difference(merge_keys)
master[value_cols] = master[value_cols].fillna(0)

# Feature Engineering: Totals
master['total_enrolment'] = master[['age_0_5', 'age_5_17', 'age_18_greater']].sum(axis=1)
master['total_demographic'] = master[['demo_age_5_17', 'demo_age_17_']].sum(axis=1)
master['total_biometric'] = master[['bio_age_5_17', 'bio_age_17_']].sum(axis=1)
master['total_activity'] = master['total_enrolment'] + master['total_demographic'] + master['total_biometric']

print("Master dataset ready. Total records:", len(master))
# Persist the merged summary next to the notebook for reuse outside this session.
master.to_csv('processed_aadhaar_summary.csv', index=False)

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Sum each activity stream across all states for every month
trend_df = master.groupby('month_year')[['total_enrolment', 'total_demographic', 'total_biometric']].sum()
# Convert the Period index to strings for plotting
trend_df.index = trend_df.index.astype(str)

# (column, marker, legend label) for each line, drawn in this order
line_specs = [
    ('total_enrolment', 'o', 'New Enrolments'),
    ('total_demographic', 's', 'Demographic Updates'),
    ('total_biometric', '^', 'Biometric Updates'),
]
for column, marker, label in line_specs:
    ax.plot(trend_df.index, trend_df[column], marker=marker, label=label, linewidth=2)

ax.set_title('National Aadhaar Activity Trends (2025)', fontsize=15)
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Transaction Count', fontsize=12)
ax.legend()
plt.xticks(rotation=45)
plt.show()

In [None]:
# Ten states with the highest cumulative activity, largest first.
# Selecting 'total_activity' after the groupby already excludes every other
# column, so no explicit drop of 'month_year' is needed.
state_totals = (
    master.groupby('state')['total_activity']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

plt.figure(figsize=(10, 6))
# seaborn 0.13 deprecates `palette` without `hue`. Assigning the state to `hue`
# keeps one viridis colour per bar; dodge=False keeps full-width bars on older
# seaborn versions, which would otherwise shrink them to make room for hue levels.
ax = sns.barplot(
    x=state_totals.values,
    y=state_totals.index,
    hue=state_totals.index,
    palette='viridis',
    dodge=False,
)
# The hue legend only repeats the y-axis labels; remove it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title('Top 10 States by Total Aadhaar Activity (2025)', fontsize=14)
plt.xlabel('Cumulative Transactions')
plt.show()