# UIDAI Data Hackathon - Initial Exploration
## Aadhaar Enrollment and Update Analysis

**Objective:** Discover patterns in the data that can lead to meaningful insights

---

## Setup & Imports

In [6]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Custom utilities
# The project helper modules are imported from ../src, resolved relative to the
# directory the notebook kernel was started in.
import sys
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate path entries
    sys.path.append(SRC_DIR)
from data_loader import DataLoader
from eda_utils import EDAAnalyzer, quick_eda
from visualization_utils import VisualizationTools, save_figure

# Display settings
# Never truncate columns when rendering a frame; cap rendered rows at 100.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
# Matplotlib theme used by any static charts in this notebook
plt.style.use('seaborn-v0_8-darkgrid')

print("All libraries loaded successfully!")

All libraries loaded successfully!


## 1. Load All Datasets

In [7]:
# Load all three datasets through the project loader
loader = DataLoader()
datasets = loader.load_all_data()

# Pull the individual frames out of the returned mapping in one step.
# A missing key raises KeyError, exactly as direct indexing would.
enrolment_df, demographic_df, biometric_df = (
    datasets[dataset_key]
    for dataset_key in ('enrolment', 'demographic', 'biometric')
)

🚀 UIDAI Data Loader - Loading All Datasets
📥 Loading Enrolment Data...
   Found 3 CSV files
   Loading api_data_aadhar_enrolment_0_500000.csv...
   Loading api_data_aadhar_enrolment_1000000_1006029.csv...
   Loading api_data_aadhar_enrolment_500000_1000000.csv...
   ✅ Loaded 620,911 records
   Date range: 2025-01-04 00:00:00 to 2025-12-11 00:00:00

📥 Loading Demographic Update Data...
   Found 5 CSV files
   Loading api_data_aadhar_demographic_0_500000.csv...
   Loading api_data_aadhar_demographic_1000000_1500000.csv...
   Loading api_data_aadhar_demographic_1500000_2000000.csv...
   Loading api_data_aadhar_demographic_2000000_2071700.csv...
   Loading api_data_aadhar_demographic_500000_1000000.csv...
   ✅ Loaded 1,248,473 records
   Date range: 2025-01-03 00:00:00 to 2025-12-12 00:00:00

📥 Loading Biometric Update Data...
   Found 4 CSV files
   Loading api_data_aadhar_biometric_0_500000.csv...
   Loading api_data_aadhar_biometric_1000000_1500000.csv...
   Loading api_

In [8]:
# Quick summary: pass the loaded `datasets` mapping back to the loader, which reports
# rows, columns, memory, null count and unique state/district/pincode counts for each
# dataset (see this cell's output).
loader.get_summary_stats(datasets)


📊 DATASET SUMMARY

ENROLMENT:
  Rows: 620,911
  Columns: 7
  Memory: 97.40 MB
  Null values: 301,193
  Unique states: 55
  Unique districts: 985
  Unique pincodes: 19463

DEMOGRAPHIC:
  Rows: 1,248,473
  Columns: 6
  Memory: 186.65 MB
  Null values: 572,106
  Unique states: 65
  Unique districts: 983
  Unique pincodes: 19742

BIOMETRIC:
  Rows: 1,529,485
  Columns: 6
  Memory: 228.50 MB
  Null values: 657,521
  Unique states: 57
  Unique districts: 974
  Unique pincodes: 19707


## 2. Initial Data Inspection

In [None]:
# Peek at each dataset, starting with enrolment.
# The None guard skips the preview when the enrolment frame is unavailable.
if enrolment_df is not None:
    print("ENROLMENT DATA:")
    # display() renders the first five rows as a rich table rather than plain text
    display(enrolment_df.head())

ENROLMENT DATA:


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,2025-02-03,Meghalaya,East Khasi Hills,793121,11,61,37
1,2025-09-03,Karnataka,Bengaluru Urban,560043,14,33,39
2,2025-09-03,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,2025-09-03,Uttar Pradesh,Aligarh,202133,62,29,15
4,2025-09-03,Karnataka,Bengaluru Urban,560016,14,16,21






In [10]:
# Preview the demographic-update frame; skipped when the frame is unavailable.
if demographic_df is not None:
    print("DEMOGRAPHIC UPDATE DATA:")
    # display() renders the first five rows as a rich table rather than plain text
    display(demographic_df.head())

DEMOGRAPHIC UPDATE DATA:


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,2025-01-03,Uttar Pradesh,Gorakhpur,273213,49,529
1,2025-01-03,Andhra Pradesh,Chittoor,517132,22,375
2,2025-01-03,Gujarat,Rajkot,360006,65,765
3,2025-01-03,Andhra Pradesh,Srikakulam,532484,24,314
4,2025-01-03,Rajasthan,Udaipur,313801,45,785


In [11]:
# Preview the biometric-update frame; skipped when the frame is unavailable.
if biometric_df is not None:
    print("BIOMETRIC UPDATE DATA:")
    # display() renders the first five rows as a rich table rather than plain text
    display(biometric_df.head())

BIOMETRIC UPDATE DATA:


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,2025-01-03,Haryana,Mahendragarh,123029,280,577
1,2025-01-03,Bihar,Madhepura,852121,144,369
2,2025-01-03,Jammu and Kashmir,Punch,185101,643,1091
3,2025-01-03,Bihar,Bhojpur,802158,256,980
4,2025-01-03,Tamil Nadu,Madurai,625514,271,815


## 3. Automated EDA for Each Dataset

In [12]:
# Run comprehensive EDA on Enrolment data.
# quick_eda prints the basic-info, missing-data and numeric-summary reports shown in
# this cell's output; its return value is kept in `enrol_analyzer`
# (presumably an EDAAnalyzer instance - confirm in eda_utils).
if enrolment_df is not None:
    enrol_analyzer = quick_eda(enrolment_df, "Enrolment Dataset")


📊 BASIC INFO - Enrolment Dataset
Shape: 620,911 rows × 7 columns
Memory usage: 97.40 MB

Columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

Data types:
date              datetime64[ns]
state                     object
district                  object
pincode                    int64
age_0_5                    int64
age_5_17                   int64
age_18_greater             int64
dtype: object

❓ MISSING DATA ANALYSIS - Enrolment Dataset
Column  Missing Count  Missing %
  date         301193   48.50824

🔢 NUMERIC SUMMARY - Enrolment Dataset
             pincode        age_0_5       age_5_17  age_18_greater
count  620911.000000  620911.000000  620911.000000   620911.000000
mean   516949.831773       4.640549       2.497202        0.265526
std    211214.478947      22.198077      18.225221        4.093390
min    100000.000000       0.000000       0.000000        0.000000
25%    335804.000000       1.000000       0.000000        0.000000


In [13]:
# Run comprehensive EDA on Demographic Update data.
# quick_eda prints the basic-info, missing-data and numeric-summary reports shown in
# this cell's output; its return value is kept in `demo_analyzer`
# (presumably an EDAAnalyzer instance - confirm in eda_utils).
if demographic_df is not None:
    demo_analyzer = quick_eda(demographic_df, "Demographic Update Dataset")


📊 BASIC INFO - Demographic Update Dataset
Shape: 1,248,473 rows × 6 columns
Memory usage: 186.65 MB

Columns: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

Data types:
date             datetime64[ns]
state                    object
district                 object
pincode                   int64
demo_age_5_17             int64
demo_age_17_              int64
dtype: object

❓ MISSING DATA ANALYSIS - Demographic Update Dataset
Column  Missing Count  Missing %
  date         572106  45.824459

🔢 NUMERIC SUMMARY - Demographic Update Dataset
            pincode  demo_age_5_17  demo_age_17_
count  1.248473e+06   1.248473e+06  1.248473e+06
mean   5.238341e+05   2.791729e+00  2.527436e+01
std    1.999359e+05   1.570044e+01  1.327703e+02
min    1.000000e+05   0.000000e+00  0.000000e+00
25%    3.911450e+05   0.000000e+00  3.000000e+00
50%    5.213440e+05   1.000000e+00  8.000000e+00
75%    6.955640e+05   2.000000e+00  1.900000e+01
max    8.554560e+05   2.690

In [14]:
# Run comprehensive EDA on Biometric Update data.
# quick_eda prints the basic-info, missing-data and numeric-summary reports shown in
# this cell's output; its return value is kept in `bio_analyzer`
# (presumably an EDAAnalyzer instance - confirm in eda_utils).
if biometric_df is not None:
    bio_analyzer = quick_eda(biometric_df, "Biometric Update Dataset")


📊 BASIC INFO - Biometric Update Dataset
Shape: 1,529,485 rows × 6 columns
Memory usage: 228.50 MB

Columns: ['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']

Data types:
date            datetime64[ns]
state                   object
district                object
pincode                  int64
bio_age_5_17             int64
bio_age_17_              int64
dtype: object

❓ MISSING DATA ANALYSIS - Biometric Update Dataset
Column  Missing Count  Missing %
  date         657521  42.989699

🔢 NUMERIC SUMMARY - Biometric Update Dataset
            pincode  bio_age_5_17   bio_age_17_
count  1.529485e+06  1.529485e+06  1.529485e+06
mean   5.178163e+05  2.162773e+01  2.245846e+01
std    1.994971e+05  9.192122e+01  9.674536e+01
min    1.100010e+05  0.000000e+00  0.000000e+00
25%    3.873800e+05  1.000000e+00  2.000000e+00
50%    5.183190e+05  4.000000e+00  5.000000e+00
75%    6.855870e+05  1.300000e+01  1.200000e+01
max    8.554560e+05  8.002000e+03  7.625000e+03



## 4. Temporal Analysis - Looking for Trends

In [16]:
# Create visualization helper (reused by the plotting cells further down)
viz = VisualizationTools()

# Analyze enrolment trends
if enrolment_df is not None:
    # Per-record total across the three age buckets. The column is stored on the frame
    # because the state and district ranking cells read it; the assignment is derived
    # only from the age columns, so re-running this cell gives the same result.
    enrolment_df['total_enrolments'] = enrolment_df[['age_0_5', 'age_5_17', 'age_18_greater']].sum(axis=1)

    # groupby() drops rows whose key is missing, so records without a parsed date never
    # reach the chart. Report how many are excluded instead of losing them silently.
    undated_mask = enrolment_df['date'].isna()
    if undated_mask.any():
        print(f"Warning: {undated_mask.sum():,} of {len(enrolment_df):,} enrolment records "
              f"({undated_mask.mean():.1%}) have no date and are excluded from the daily trend.")

    # Collapse to one total per calendar date
    daily_enrolments = enrolment_df.groupby('date')['total_enrolments'].sum().reset_index()

    fig = viz.plot_time_series(daily_enrolments, 'date', ['total_enrolments'],
                               title='Daily Aadhaar Enrollments Over Time')
    fig.show()

In [17]:
# Compare enrollment trends by age group
if enrolment_df is not None:
    # Column name -> legend label, in the order the traces are drawn
    age_series_labels = {
        'age_0_5': '0-5 years',
        'age_5_17': '5-17 years',
        'age_18_greater': '18+ years',
    }

    # Daily totals for each age bucket
    age_trends = (
        enrolment_df
        .groupby('date')[list(age_series_labels)]
        .sum()
        .reset_index()
    )

    # One line per age bucket on a shared date axis
    fig = go.Figure()
    for age_col, legend_name in age_series_labels.items():
        fig.add_trace(go.Scatter(x=age_trends['date'], y=age_trends[age_col],
                                 mode='lines', name=legend_name))

    fig.update_layout(
        title='Enrollment Trends by Age Group',
        xaxis_title='Date',
        yaxis_title='Enrollments',
        template='plotly_white',
        hovermode='x unified',
    )
    fig.show()

## 5. Geographic Analysis - Regional Patterns

In [18]:
# Top states by enrollment.
# NOTE: relies on the 'total_enrolments' column that the temporal-analysis cell
# (section 4) adds to enrolment_df - run that cell first.
if enrolment_df is not None:
    fig = viz.plot_state_comparison(enrolment_df, 'state', 'total_enrolments')
    fig.show()

In [19]:
# Top districts by enrollment (limited to 25 via top_n).
# NOTE: relies on the 'total_enrolments' column that the temporal-analysis cell
# (section 4) adds to enrolment_df - run that cell first.
if enrolment_df is not None:
    fig = viz.plot_top_districts(enrolment_df, 'district', 'total_enrolments', top_n=25)
    fig.show()

## 6. Age Distribution Analysis

In [20]:
# Overall age distribution in enrollments
if enrolment_df is not None:
    # The three age-bucket count columns of the enrolment frame
    age_cols = ['age_0_5', 'age_5_17', 'age_18_greater']
    # Plotting is delegated to the shared viz helper created in the temporal-analysis cell
    fig = viz.plot_age_distribution(enrolment_df, age_cols, 
                                   title='Total Enrollments by Age Group')
    fig.show()

## 7. Cross-Dataset Comparisons

In [21]:
# Compare demographic vs biometric updates for children (5-17)
if demographic_df is not None and biometric_df is not None:
    # groupby() drops rows whose date is missing, so each curve only covers the dated
    # records. Report the excluded share per source so the lines are not read as
    # complete totals.
    for source_name, source_df in (('demographic', demographic_df), ('biometric', biometric_df)):
        undated = source_df['date'].isna()
        if undated.any():
            print(f"Warning: {undated.sum():,} of {len(source_df):,} {source_name} records "
                  f"({undated.mean():.1%}) have no date and are excluded from this comparison.")

    # Daily totals of 5-17 updates for each source
    demo_child = demographic_df.groupby('date')['demo_age_5_17'].sum().reset_index()
    bio_child = biometric_df.groupby('date')['bio_age_5_17'].sum().reset_index()

    # Overlay both series on a shared date axis
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=demo_child['date'], y=demo_child['demo_age_5_17'],
                            mode='lines', name='Demographic Updates (5-17)'))
    fig.add_trace(go.Scatter(x=bio_child['date'], y=bio_child['bio_age_5_17'],
                            mode='lines', name='Biometric Updates (5-17)'))

    fig.update_layout(title='Demographic vs Biometric Updates for Children (5-17 years)',
                     xaxis_title='Date', yaxis_title='Update Count',
                     template='plotly_white', hovermode='x unified')
    fig.show()

## 8. Pattern Discovery - Your Analysis Space

Use the cells below to explore specific patterns you've noticed:

In [None]:
# TODO: Investigate interesting patterns you discovered above
# Ideas:
# - States with unusual enrollment patterns
# - Seasonal trends
# - Outlier detection
# - Regional disparities

In [None]:
# Your custom analysis here

## 9. Document Your Findings

### Key Patterns Discovered:
1. 
2. 
3. 

### Potential Problem Statements:
1. 
2. 
3. 

### Next Steps:
1. 
2. 
3. 