# UIDAI Data Hackathon - Initial Exploration
## Aadhaar Enrollment and Update Analysis

**Objective:** Explore the Aadhaar enrolment, demographic-update, and biometric-update datasets to surface temporal, geographic, and age-group patterns that merit deeper analysis.

---

## Setup & Imports

In [1]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Custom utilities
import sys
sys.path.append('../src')
from data_loader import DataLoader
from eda_utils import EDAAnalyzer, quick_eda
from visualization_utils import VisualizationTools, save_figure

# Display settings
# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)
# Cap rendered DataFrames at 100 rows.
pd.set_option('display.max_rows', 100)
# Matplotlib style sheet; the 'seaborn-v0_8-*' names only exist in newer
# Matplotlib releases — NOTE(review): confirm the pinned version provides it.
plt.style.use('seaborn-v0_8-darkgrid')

# Confirmation that the setup cell ran to completion.
print("All libraries loaded successfully!")

All libraries loaded successfully!


## 1. Load All Datasets

In [2]:
# Load the enrolment, demographic-update and biometric-update tables
loader = DataLoader()
datasets = loader.load_all_data()

# Bind each table to its own name; the keys are the ones the loader returns
enrolment_df, demographic_df, biometric_df = (
    datasets[key] for key in ('enrolment', 'demographic', 'biometric')
)

UIDAI Data Loader - Loading All Datasets
Loading Enrolment Data...
   Found 3 CSV files
   Loading api_data_aadhar_enrolment_0_500000.csv...
   Loading api_data_aadhar_enrolment_1000000_1006029.csv...
   Loading api_data_aadhar_enrolment_500000_1000000.csv...
   Records before dedup: 1,006,029
   Duplicates found: 386,095
   Records after dedup: 619,912
   Dedup loss: 38.38%
Loaded 619,912 records
Date range: 2025-01-04 00:00:00 to 2025-12-11 00:00:00

 Loading Demographic Update Data...
   Found 5 CSV files
   Loading api_data_aadhar_demographic_0_500000.csv...
   Loading api_data_aadhar_demographic_1000000_1500000.csv...
   Loading api_data_aadhar_demographic_1500000_2000000.csv...
   Loading api_data_aadhar_demographic_2000000_2071700.csv...
   Loading api_data_aadhar_demographic_500000_1000000.csv...
   Records before dedup: 2,071,700
   Duplicates found: 824,910
   Records after dedup: 1,246,788
   Dedup loss: 39.82%
   Loaded 1,246,788 records
   Date range: 2025-01-03 00:00:00 t

In [3]:
# Quick summary: have the loader report per-dataset rows, columns, memory,
# null counts and unique state/district/pincode counts.
loader.get_summary_stats(datasets)


 DATASET SUMMARY

ENROLMENT:
  Rows: 619,912
  Columns: 7
  Memory: 97.25 MB
  Null values: 300,359
  Unique states: 36
  Unique districts: 984
  Unique pincodes: 19462

DEMOGRAPHIC:
  Rows: 1,246,788
  Columns: 6
  Memory: 186.41 MB
  Null values: 570,694
  Unique states: 46
  Unique districts: 982
  Unique pincodes: 19741

BIOMETRIC:
  Rows: 1,527,796
  Columns: 6
  Memory: 228.27 MB
  Null values: 656,066
  Unique states: 39
  Unique districts: 974
  Unique pincodes: 19707


## 2. Initial Data Inspection

In [4]:
# Preview the first rows of the enrolment table.
# The None guard skips the preview when no enrolment frame is available.
if enrolment_df is not None:
    print("ENROLMENT DATA:")
    display(enrolment_df.head())

ENROLMENT DATA:


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,2025-02-03,Meghalaya,East Khasi Hills,793121,11,61,37
1,2025-09-03,Karnataka,Bengaluru Urban,560043,14,33,39
2,2025-09-03,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,2025-09-03,Uttar Pradesh,Aligarh,202133,62,29,15
4,2025-09-03,Karnataka,Bengaluru Urban,560016,14,16,21


In [5]:
# Preview the first rows of the demographic-update table.
# The None guard skips the preview when no demographic frame is available.
if demographic_df is not None:
    print("DEMOGRAPHIC UPDATE DATA:")
    display(demographic_df.head())

DEMOGRAPHIC UPDATE DATA:


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,2025-01-03,Uttar Pradesh,Gorakhpur,273213,49,529
1,2025-01-03,Andhra Pradesh,Chittoor,517132,22,375
2,2025-01-03,Gujarat,Rajkot,360006,65,765
3,2025-01-03,Andhra Pradesh,Srikakulam,532484,24,314
4,2025-01-03,Rajasthan,Udaipur,313801,45,785


In [6]:
# Preview the first rows of the biometric-update table.
# The None guard skips the preview when no biometric frame is available.
if biometric_df is not None:
    print("BIOMETRIC UPDATE DATA:")
    display(biometric_df.head())

BIOMETRIC UPDATE DATA:


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,2025-01-03,Haryana,Mahendragarh,123029,280,577
1,2025-01-03,Bihar,Madhepura,852121,144,369
2,2025-01-03,Jammu And Kashmir,Punch,185101,643,1091
3,2025-01-03,Bihar,Bhojpur,802158,256,980
4,2025-01-03,Tamil Nadu,Madurai,625514,271,815


## 3. Automated EDA for Each Dataset

In [7]:
# Run comprehensive EDA on Enrolment data
# quick_eda prints its report and its return value is kept in enrol_analyzer
# (presumably an EDAAnalyzer — TODO confirm against eda_utils).
if enrolment_df is not None:
    enrol_analyzer = quick_eda(enrolment_df, "Enrolment Dataset")


 BASIC INFO - Enrolment Dataset
Shape: 619,912 rows × 7 columns
Memory usage: 97.25 MB

Columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

Data types:
date              datetime64[ns]
state                     object
district                  object
pincode                    int64
age_0_5                    int64
age_5_17                   int64
age_18_greater             int64
dtype: object

 MISSING DATA ANALYSIS - Enrolment Dataset
Column  Missing Count  Missing %
  date         300359  48.451877

 NUMERIC SUMMARY - Enrolment Dataset
             pincode        age_0_5      age_5_17  age_18_greater
count  619912.000000  619912.000000  619912.00000   619912.000000
mean   516643.035900       4.646342       2.50089        0.265641
std    211156.190829      22.215468      18.23966        4.091559
min    110001.000000       0.000000       0.00000        0.000000
25%    335801.000000       1.000000       0.00000        0.000000
50%    508377.0000

In [8]:
# Run comprehensive EDA on Demographic Update data
# quick_eda prints its report and its return value is kept in demo_analyzer
# (presumably an EDAAnalyzer — TODO confirm against eda_utils).
if demographic_df is not None:
    demo_analyzer = quick_eda(demographic_df, "Demographic Update Dataset")


 BASIC INFO - Demographic Update Dataset
Shape: 1,246,788 rows × 6 columns
Memory usage: 186.41 MB

Columns: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

Data types:
date             datetime64[ns]
state                    object
district                 object
pincode                   int64
demo_age_5_17             int64
demo_age_17_              int64
dtype: object

 MISSING DATA ANALYSIS - Demographic Update Dataset
Column  Missing Count  Missing %
  date         570694  45.773139

 NUMERIC SUMMARY - Demographic Update Dataset
            pincode  demo_age_5_17  demo_age_17_
count  1.246788e+06   1.246788e+06  1.246788e+06
mean   5.235808e+05   2.795253e+00  2.530569e+01
std    1.998919e+05   1.571075e+01  1.328572e+02
min    1.100010e+05   0.000000e+00  0.000000e+00
25%    3.911250e+05   0.000000e+00  3.000000e+00
50%    5.213240e+05   1.000000e+00  8.000000e+00
75%    6.955010e+05   2.000000e+00  1.900000e+01
max    8.554560e+05   2.690000e+03  1.6

In [9]:
# Run comprehensive EDA on Biometric Update data
# quick_eda prints its report and its return value is kept in bio_analyzer
# (presumably an EDAAnalyzer — TODO confirm against eda_utils).
if biometric_df is not None:
    bio_analyzer = quick_eda(biometric_df, "Biometric Update Dataset")


 BASIC INFO - Biometric Update Dataset
Shape: 1,527,796 rows × 6 columns
Memory usage: 228.27 MB

Columns: ['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']

Data types:
date            datetime64[ns]
state                   object
district                object
pincode                  int64
bio_age_5_17             int64
bio_age_17_              int64
dtype: object

 MISSING DATA ANALYSIS - Biometric Update Dataset
Column  Missing Count  Missing %
  date         656066   42.94199

 NUMERIC SUMMARY - Biometric Update Dataset
            pincode  bio_age_5_17   bio_age_17_
count  1.527796e+06  1.527796e+06  1.527796e+06
mean   5.175921e+05  2.165102e+01  2.248184e+01
std    1.994521e+05  9.196934e+01  9.679626e+01
min    1.100010e+05  0.000000e+00  0.000000e+00
25%    3.873600e+05  1.000000e+00  2.000000e+00
50%    5.182200e+05  4.000000e+00  5.000000e+00
75%    6.855150e+05  1.300000e+01  1.200000e+01
max    8.554560e+05  8.002000e+03  7.625000e+03

 CATEGORICAL

## 4. Temporal Analysis - Looking for Trends

In [10]:
# Create visualization helper
viz = VisualizationTools()

# Analyze enrolment trends
if enrolment_df is not None:
    # Per-row total across the three age buckets. Stored as a column on
    # enrolment_df because the state and district plots aggregate
    # 'total_enrolments' from this same frame.
    enrolment_df['total_enrolments'] = enrolment_df[['age_0_5', 'age_5_17', 'age_18_greater']].sum(axis=1)

    # groupby() discards rows whose key is missing (NaT), so undated records
    # never reach the daily series. Report how much is being left out so the
    # trend is not mistaken for the full dataset.
    # NOTE(review): the EDA report shows a large share of enrolment dates as
    # missing — confirm whether that originates in the loader's date parsing
    # before drawing conclusions from this chart.
    undated_mask = enrolment_df['date'].isna()
    if undated_mask.any():
        undated_rows = int(undated_mask.sum())
        undated_total = int(enrolment_df.loc[undated_mask, 'total_enrolments'].sum())
        print(f"Note: {undated_rows:,} of {len(enrolment_df):,} rows "
              f"({undated_rows / len(enrolment_df):.1%}) have no date; their "
              f"{undated_total:,} enrolments are excluded from the daily trend.")

    # Sum the row totals per calendar date to get one point per day.
    daily_enrolments = enrolment_df.groupby('date')['total_enrolments'].sum().reset_index()

    fig = viz.plot_time_series(daily_enrolments, 'date', ['total_enrolments'],
                               title='Daily Aadhaar Enrollments Over Time')
    fig.show()

In [11]:
# Daily enrolment totals, drawn as one line per age bucket
if enrolment_df is not None:
    # Column name -> legend label, in the order the lines are added
    age_series = {
        'age_0_5': '0-5 years',
        'age_5_17': '5-17 years',
        'age_18_greater': '18+ years',
    }

    # Rows without a date are dropped by groupby and do not appear in the chart
    age_trends = enrolment_df.groupby('date')[list(age_series)].sum().reset_index()

    fig = go.Figure()
    for column, label in age_series.items():
        fig.add_trace(go.Scatter(x=age_trends['date'], y=age_trends[column],
                                 mode='lines', name=label))

    fig.update_layout(
        title='Enrollment Trends by Age Group',
        xaxis_title='Date',
        yaxis_title='Enrollments',
        template='plotly_white',
        hovermode='x unified',
    )
    fig.show()

## 5. Geographic Analysis - Regional Patterns

In [12]:
# Top states by enrollment
# Relies on the 'total_enrolments' column that the temporal-analysis cell adds
# to enrolment_df, so that cell must have run first.
if enrolment_df is not None:
    fig = viz.plot_state_comparison(enrolment_df, 'state', 'total_enrolments')
    fig.show()

In [13]:
# Top districts by enrollment (limited to 25 via top_n)
# Relies on the 'total_enrolments' column that the temporal-analysis cell adds
# to enrolment_df, so that cell must have run first.
if enrolment_df is not None:
    fig = viz.plot_top_districts(enrolment_df, 'district', 'total_enrolments', top_n=25)
    fig.show()

## 6. Age Distribution Analysis

In [14]:
# Overall age distribution in enrollments
if enrolment_df is not None:
    # The three age-bucket count columns of the enrolment table
    age_cols = ['age_0_5', 'age_5_17', 'age_18_greater']
    fig = viz.plot_age_distribution(enrolment_df, age_cols, 
                                   title='Total Enrollments by Age Group')
    fig.show()

## 7. Cross-Dataset Comparisons

In [15]:
# Daily 5-17 update counts: demographic vs biometric, on one chart
if demographic_df is not None and biometric_df is not None:
    # Per-date sums; rows without a date are dropped by groupby
    demo_child = demographic_df.groupby('date')['demo_age_5_17'].sum().reset_index()
    bio_child = biometric_df.groupby('date')['bio_age_5_17'].sum().reset_index()

    # (frame, value column, legend label) for each line, in drawing order
    child_series = [
        (demo_child, 'demo_age_5_17', 'Demographic Updates (5-17)'),
        (bio_child, 'bio_age_5_17', 'Biometric Updates (5-17)'),
    ]

    fig = go.Figure(data=[
        go.Scatter(x=frame['date'], y=frame[value_col], mode='lines', name=label)
        for frame, value_col, label in child_series
    ])

    fig.update_layout(
        title='Demographic vs Biometric Updates for Children (5-17 years)',
        xaxis_title='Date',
        yaxis_title='Update Count',
        template='plotly_white',
        hovermode='x unified',
    )
    fig.show()