# UIDAI Aadhaar Data Analysis

This notebook analyzes Aadhaar enrolment data together with demographic-update and biometric-update data.

In [None]:
# Install Prophet if not already installed
!pip install prophet

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from prophet import Prophet
import numpy as np

# Global plotting defaults: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

In [None]:
# Locations of the three source CSV extracts (relative to the notebook's working directory)
enrolment_path = 'api_data_aadhar_enrolment/api_data_aadhar_enrolment_0_500000.csv'
demographic_path = 'api_data_aadhar_demographic/api_data_aadhar_demographic_0_500000.csv'
biometric_path = 'api_data_aadhar_biometric/api_data_aadhar_biometric_0_500000.csv'

# Read all three files in one pass; the tuple order fixes which frame gets which file
df_enrolment, df_demographic, df_biometric = (
    pd.read_csv(csv_path)
    for csv_path in (enrolment_path, demographic_path, biometric_path)
)

print("Data loaded successfully.")

## 2. Data Preprocessing & Basic Info

In [None]:
# Convert date columns to datetime (in place on each frame).
# errors='coerce' turns any value that does not match YYYYMMDD into NaT; those
# rows are counted under 'date' in the "Missing Values" report printed below.
# NOTE(review): confirm the raw files really store dates as YYYYMMDD - a different
# layout would silently turn every date into NaT.
for frame in (df_enrolment, df_demographic, df_biometric):
    frame['date'] = pd.to_datetime(frame['date'], format='%Y%m%d', errors='coerce')

# Display basic info for each dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after every report.
datasets = (
    ("--- Enrolment Data ---", df_enrolment),
    ("\n--- Demographic Data ---", df_demographic),
    ("\n--- Biometric Data ---", df_biometric),
)
for banner, frame in datasets:
    print(banner)
    print(f"Shape: {frame.shape}")
    frame.info()
    print("\nMissing Values:\n", frame.isnull().sum())

## 3. Aggregation to Monthly Totals

In [None]:
# Tag every record with its calendar month (a pandas Period); this is the time key
# used by all aggregations in the notebook
for frame in (df_enrolment, df_demographic, df_biometric):
    frame['month'] = frame['date'].dt.to_period('M')

# Every aggregate is keyed on location + month
group_keys = ['state', 'district', 'pincode', 'month']

# Enrolment: sum each age band per key, then add the bands into one total
enrolment_cols = ['age_0_5', 'age_5_17', 'age_18_greater']
enrolment_agg = df_enrolment.groupby(group_keys)[enrolment_cols].sum().reset_index()
enrolment_agg['total_enrolment'] = (
    enrolment_agg['age_0_5'] + enrolment_agg['age_5_17'] + enrolment_agg['age_18_greater']
)

# Demographic updates: same pattern with the two demographic age bands
demographic_cols = ['demo_age_5_17', 'demo_age_17_']
demographic_agg = df_demographic.groupby(group_keys)[demographic_cols].sum().reset_index()
demographic_agg['total_demographic'] = (
    demographic_agg['demo_age_5_17'] + demographic_agg['demo_age_17_']
)

# Biometric updates: same pattern with the two biometric age bands
biometric_cols = ['bio_age_5_17', 'bio_age_17_']
biometric_agg = df_biometric.groupby(group_keys)[biometric_cols].sum().reset_index()
biometric_agg['total_biometric'] = biometric_agg['bio_age_5_17'] + biometric_agg['bio_age_17_']

print("Aggregation complete.")

## 4. Sample Rows (Uttar Pradesh & Bihar)

In [None]:
# Show the first enrolment rows for two states as a quick sanity check of the raw data
is_uttar_pradesh = df_enrolment['state'].eq('Uttar Pradesh')
print("--- Sample: Uttar Pradesh (Enrolment) ---")
display(df_enrolment.loc[is_uttar_pradesh].head())

is_bihar = df_enrolment['state'].eq('Bihar')
print("\n--- Sample: Bihar (Enrolment) ---")
display(df_enrolment.loc[is_bihar].head())

## 5. Visualizations

In [None]:
# Country-wide monthly totals: sum every age-band column per month, then add the bands
enrolment_bands = ['age_0_5', 'age_5_17', 'age_18_greater']
demographic_bands = ['demo_age_5_17', 'demo_age_17_']
biometric_bands = ['bio_age_5_17', 'bio_age_17_']

monthly_enrolment = df_enrolment.groupby('month')[enrolment_bands].sum().sum(axis=1)
monthly_demographic = df_demographic.groupby('month')[demographic_bands].sum().sum(axis=1)
monthly_biometric = df_biometric.groupby('month')[biometric_bands].sum().sum(axis=1)

# One column per activity type, aligned on month
series_by_label = {
    'Enrolment': monthly_enrolment,
    'Demographic Update': monthly_demographic,
    'Biometric Update': monthly_biometric
}
plot_df = pd.DataFrame(series_by_label)

# Line chart of the three monthly series on shared axes
trend_ax = plot_df.plot.line(marker='o')
trend_ax.set_title('Monthly Trends: Enrolment vs Updates')
trend_ax.set_xlabel('Month')
trend_ax.set_ylabel('Count')
trend_ax.legend()
plt.show()

## 6. State-wise Summary Statistics

In [None]:
# Build per-state totals for each dataset, then line them up side by side.
# The three total columns are aligned on the state index, so a state missing from
# one dataset shows NaN in that column.

state_enrolment = df_enrolment.groupby('state')[['age_0_5', 'age_5_17', 'age_18_greater']].sum()
state_enrolment['Total Enrolment'] = state_enrolment.sum(axis=1)

state_demographic = df_demographic.groupby('state')[['demo_age_5_17', 'demo_age_17_']].sum()
state_demographic['Total Demographic'] = state_demographic.sum(axis=1)

state_biometric = df_biometric.groupby('state')[['bio_age_5_17', 'bio_age_17_']].sum()
state_biometric['Total Biometric'] = state_biometric.sum(axis=1)

# Combine the three total columns into one summary frame
state_totals = [
    state_enrolment['Total Enrolment'],
    state_demographic['Total Demographic'],
    state_biometric['Total Biometric'],
]
state_summary = pd.concat(state_totals, axis=1)

# Ten states with the highest enrolment, reused for both the table and the chart
top_10_states = state_summary.sort_values('Total Enrolment', ascending=False).head(10)

print("--- Top 10 States by Enrolment ---")
display(top_10_states)

# Grouped (non-stacked) bars comparing the three totals per state
summary_ax = top_10_states.plot.bar(stacked=False)
summary_ax.set_title('Top 10 States: Enrolment vs Updates')
summary_ax.set_xlabel('State')
summary_ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()

## 7. MBU Risk Command Centre

In [None]:
# 1. Data Preparation
# Outer-join the monthly enrolment and biometric aggregates on location + month.
# A key present in only one source gets 0 for the other source's counts.
merge_keys = ['state', 'district', 'pincode', 'month']
mbu_data = enrolment_agg.merge(biometric_agg, on=merge_keys, how='outer').fillna(0)

# Short aliases for the 5-17 age band: E = enrolments, B = biometric updates
mbu_data['E_5_17'] = mbu_data['age_5_17']
mbu_data['B_5_17'] = mbu_data['bio_age_5_17']

# 2. Risk Calculation
# Biometric-update ratio for the 5-17 band; the +1 keeps the denominator non-zero
# when a key has no enrolments
mbu_data['R_bio_5_17'] = mbu_data['B_5_17'].div(mbu_data['E_5_17'] + 1)

# Min-max scaler applied to one group of values at a time (used with groupby().transform)
def normalize_group(group):
    """Rescale a Series linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        group: pandas Series of numeric values.

    Returns:
        A Series with the same index as ``group``. When every value is identical
        (max equals min) the result is all zeros, avoiding a division by zero.
    """
    low, high = group.min(), group.max()
    span = high - low
    if span == 0:
        return pd.Series([0] * len(group), index=group.index)
    return (group - low) / span

# Scale the biometric ratio to 0-1 separately within each state
ratio_by_state = mbu_data.groupby('state')['R_bio_5_17']
mbu_data['normalized_R_bio_5_17'] = ratio_by_state.transform(normalize_group)

# Risk is the complement of the normalised ratio: the lowest ratio in a state scores 1
mbu_data['MBU_Risk'] = 1 - mbu_data['normalized_R_bio_5_17']

# mbu_risk_score carries the same values as MBU_Risk under the reporting column name
mbu_data['mbu_risk_score'] = mbu_data['MBU_Risk']

def get_risk_category(score):
    """Map a numeric risk score to a traffic-light label.

    Args:
        score: numeric risk score.

    Returns:
        'Green' for scores below 0.3, 'Yellow' for scores from 0.3 up to and
        including 0.6, and 'Red' for anything else.
    """
    if score < 0.3:
        return 'Green'
    if score <= 0.6:
        return 'Yellow'
    return 'Red'

# Label each row Green / Yellow / Red from its score
mbu_data['mbu_risk_category'] = mbu_data['mbu_risk_score'].map(get_risk_category)
# Weight the 5-17 enrolment count by the risk score
mbu_data['children_at_risk'] = mbu_data['E_5_17'].mul(mbu_data['mbu_risk_score'])

print("MBU Risk Calculation Complete.")
display(mbu_data.head())

In [None]:
# 3. Visualization & Reporting

# 1. Top 10 highest risk pincodes (one row per pincode-month, highest MBU_Risk first)
print("--- Top 10 Highest Risk Pincodes ---")
risk_report_columns = [
    'state', 'district', 'pincode', 'month',
    'MBU_Risk', 'mbu_risk_category', 'children_at_risk',
]
top_10_risk = mbu_data.sort_values('MBU_Risk', ascending=False).head(10)
display(top_10_risk[risk_report_columns])

# 2. Heatmap for Uttar Pradesh: mean risk per district (rows) and month (columns)
print("\n--- Heatmap: Uttar Pradesh MBU Risk by District ---")
up_data = mbu_data[mbu_data['state'] == 'Uttar Pradesh']
if up_data.empty:
    print("No data found for Uttar Pradesh.")
else:
    heatmap_data = up_data.pivot_table(
        index='district', columns='month', values='MBU_Risk', aggfunc='mean'
    )
    plt.figure(figsize=(12, 10))
    # Reversed Red-Yellow-Green palette: high risk is drawn red
    sns.heatmap(heatmap_data, cmap='RdYlGn_r', annot=False)
    plt.title('Uttar Pradesh: Average MBU Risk by District and Month')
    plt.show()

# 3. Time trend for top 3 risky districts
print("\n--- Time Trend: Top 3 Risky Districts ---")
# Rank districts by their mean risk across all months and pincodes.
# The ranking is keyed on (state, district) because a district name on its own is
# not a unique identifier: grouping on the name alone would average together
# same-named districts that belong to different states.
district_risk = (
    mbu_data.groupby(['state', 'district'])['MBU_Risk']
    .mean()
    .sort_values(ascending=False)
)
top_3_keys = district_risk.head(3).index.tolist()  # list of (state, district) tuples
# Human-readable labels, used for the legend and the title
top_3_districts = [f'{district} ({state})' for state, district in top_3_keys]

plt.figure(figsize=(12, 6))
for (state, district), label in zip(top_3_keys, top_3_districts):
    in_district = (mbu_data['state'] == state) & (mbu_data['district'] == district)
    # Average over the district's pincodes so there is one point per month
    monthly_risk = mbu_data.loc[in_district].groupby('month')['MBU_Risk'].mean()
    monthly_risk.plot(label=label, marker='o')

plt.title(f'MBU Risk Trend for Top 3 Risky Districts: {top_3_districts}')
plt.xlabel('Month')
plt.ylabel('Average MBU Risk')
plt.legend()
plt.show()

# 4. State-wise risk distribution: one box per state over all of its pincode-month rows
print("\n--- State-wise Risk Distribution ---")
plt.figure(figsize=(15, 8))
risk_ax = sns.boxplot(data=mbu_data, x='state', y='MBU_Risk')
risk_ax.set_title('Distribution of MBU Risk Scores by State')
# State names are long; rotate them so the labels do not overlap
plt.xticks(rotation=90)
plt.show()

In [None]:
# Persist the full MBU risk table (row index omitted) to the working directory
mbu_data.to_csv(path_or_buf='mbu_risk_data.csv', index=False)
print("Results saved to 'mbu_risk_data.csv'.")

## 8. Fraud Radar

In [None]:
# 1. Feature Engineering

# Outer-join all three monthly aggregates on location + month; counts missing from
# a source are filled with 0 after each join
merge_keys = ['state', 'district', 'pincode', 'month']
fraud_data = (
    enrolment_agg
    .merge(demographic_agg, on=merge_keys, how='outer')
    .fillna(0)
    .merge(biometric_agg, on=merge_keys, how='outer')
    .fillna(0)
)

# Short aliases for the three totals: Enrolment, Demographic update, Biometric update
fraud_data['E_total'] = fraud_data['total_enrolment']
fraud_data['D_total'] = fraud_data['total_demographic']
fraud_data['B_total'] = fraud_data['total_biometric']

# Ratios; the +1 in each denominator avoids division by zero
total_updates = fraud_data['D_total'] + fraud_data['B_total']
fraud_data['update_to_enrol'] = total_updates / (fraud_data['E_total'] + 1)
fraud_data['bio_to_demo'] = fraud_data['B_total'] / (fraud_data['D_total'] + 1)

# Calculate Volume Z-Score (applied to one group at a time via groupby().transform)
def calculate_zscore(group):
    """Standardise a Series to z-scores using its own mean and sample std.

    Args:
        group: pandas Series of numeric values.

    Returns:
        A Series with the same index as ``group`` holding
        ``(value - mean) / std``. When the spread is undefined or zero the
        result is all zeros instead of NaN/inf.
    """
    spread = group.std()
    # std() is NaN for groups with fewer than two values and 0 for constant groups.
    # Neither can be divided by, and NaN == 0 is False, so both must be tested.
    if pd.isna(spread) or spread == 0:
        return pd.Series(0.0, index=group.index)
    return (group - group.mean()) / spread

# Z-score of enrolment volume relative to the other pincode-months of the same district.
# Grouping on (state, district) rather than the district name alone keeps same-named
# districts in different states from being pooled into one baseline.
fraud_data['volume_zscore'] = (
    fraud_data.groupby(['state', 'district'])['E_total'].transform(calculate_zscore)
)

print("Feature Engineering Complete.")
display(fraud_data.head())

In [None]:
# 2. Model Training

features = ['E_total', 'D_total', 'B_total', 'update_to_enrol', 'bio_to_demo']

# Replace +/-inf with 0 as a safeguard (the +1 denominators should already prevent them)
X = fraud_data[features].replace([np.inf, -np.inf], 0)

# KMeans Clustering
# KMeans is distance based, so the raw count columns would dominate the ratio columns.
# Standardise every feature to zero mean / unit variance before clustering.
X_scaled = StandardScaler().fit_transform(X)
# n_init is set explicitly because its default differs between scikit-learn versions
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
fraud_data['cluster_label'] = kmeans.fit_predict(X_scaled)

# Isolation Forest for Anomaly Detection
# Fitted on the unscaled features: tree splits depend only on value ordering, and
# keeping X unscaled leaves the anomaly scores as they were before scaling was added.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(X)

# decision_function returns negative values for anomalies and positive for normal rows
scores = iso_forest.decision_function(X)
fraud_data['raw_anomaly_score'] = scores

# Invert and min-max scale to 0-1 so that 1 is the most anomalous row
min_score = scores.min()
max_score = scores.max()
score_range = max_score - min_score
if score_range > 0:
    fraud_data['anomaly_score'] = 1 - ((scores - min_score) / score_range)
else:
    # All rows scored identically: nothing stands out, and dividing by 0 would give NaN
    fraud_data['anomaly_score'] = 0.0

print("Model Training Complete.")

In [None]:
# 3. Rule-based Flagging

# Enrolment volume more than 3 standard deviations above the group mean
fraud_data['HIGH_VOLUME'] = fraud_data['volume_zscore'].gt(3)

# Biometric-to-demographic ratio outside the 0.1-0.9 band (either extreme)
bio_demo_ratio = fraud_data['bio_to_demo']
fraud_data['MIX_ANOMALY'] = bio_demo_ratio.gt(0.9) | bio_demo_ratio.lt(0.1)

# More than two updates per enrolment
fraud_data['BULK_EVENT'] = fraud_data['update_to_enrol'].gt(2.0)

# Rows strictly above the 99th percentile of anomaly score are hotspots for audit
threshold = fraud_data['anomaly_score'].quantile(0.99)
fraud_data['HOTSPOT_AUDIT'] = fraud_data['anomaly_score'].gt(threshold)

print(f"Hotspot Threshold (Top 1%): {threshold:.4f}")
print(f"Number of Hotspots: {fraud_data['HOTSPOT_AUDIT'].sum()}")

In [None]:
# 4. Visualization & Reporting

# 1. Histogram (with density curve) of the normalised anomaly scores
plt.figure(figsize=(10, 6))
sns.histplot(data=fraud_data, x='anomaly_score', bins=50, kde=True)
plt.title('Distribution of Anomaly Scores')
plt.xlabel('Anomaly Score (Higher = More Anomalous)')
plt.show()

# 2. The 20 pincode-month rows with the highest anomaly score, with their rule flags
print("--- Top 20 Anomalous Pincodes ---")
anomaly_report_columns = [
    'state', 'district', 'pincode', 'month', 'anomaly_score',
    'HIGH_VOLUME', 'MIX_ANOMALY', 'BULK_EVENT', 'HOTSPOT_AUDIT',
]
top_20_anomalies = fraud_data.sort_values('anomaly_score', ascending=False).head(20)
display(top_20_anomalies[anomaly_report_columns])

# 3. Ratio-vs-ratio scatter: colour encodes anomaly score, marker size encodes enrolment volume
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=fraud_data,
    x='update_to_enrol',
    y='bio_to_demo',
    hue='anomaly_score',
    size='E_total',
    sizes=(20, 200),
    palette='viridis',
)
plt.title('Fraud Radar: Update Ratio vs Bio-Demo Mix (Colored by Anomaly Score)')
plt.xlabel('Update to Enrolment Ratio')
plt.ylabel('Biometric to Demographic Ratio')
plt.show()

In [None]:
# Persist the fraud feature / score / flag table (row index omitted) to the working directory
fraud_data.to_csv(path_or_buf='fraud_radar_data.csv', index=False)
print("Results saved to 'fraud_radar_data.csv'.")

## 9. Migration Planner

In [None]:
# 1. Identify Top 20 Urban Pincodes

# Rank pincodes by their mean monthly demographic-update total, highest first
pincode_avg_updates = (
    demographic_agg.groupby('pincode')['total_demographic']
    .mean()
    .sort_values(ascending=False)
)
top_20_pincodes = pincode_avg_updates.head(20).index.tolist()

print(f"Top 20 Urban Pincodes identified: {top_20_pincodes}")

# Keep only those pincodes and reshape to the ds / y column names Prophet expects:
# ds = timestamp of the month Period, y = monthly demographic-update total
in_top_20 = demographic_agg['pincode'].isin(top_20_pincodes)
migration_data = (
    demographic_agg[in_top_20]
    .copy()
    .assign(ds=lambda frame: frame['month'].dt.to_timestamp())
    .rename(columns={'total_demographic': 'y'})
)

# 2. Forecasting Loop
# For each top pincode: fit Prophet on its monthly history, forecast 6 months ahead,
# and derive a surge percentage and a machine-count recommendation.

forecast_columns = ['pincode', 'historical_avg', 'forecast_demand', 'surge_pct', 'rec_machines']
forecast_results = []

forecast_horizon = 6               # months to forecast
working_hours_per_month = 22 * 8   # 22 working days x 8 hours
service_rate_mu = 2                # updates/hour handled by one machine

plt.figure(figsize=(15, 10))
plot_count = 0

for pincode in top_20_pincodes:
    # Prepare data for this pincode
    df_prophet = migration_data[migration_data['pincode'] == pincode][['ds', 'y']].sort_values('ds')

    if len(df_prophet) < 2:
        continue  # Not enough data points to fit a model

    # Fit Prophet Model
    # NOTE(review): yearly_seasonality=True is kept as-is; confirm each pincode has
    # enough history for a yearly component to be meaningful.
    m = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
    m.fit(df_prophet)

    # Forecast 6 months ahead.
    # The history is stamped at month START (Period -> timestamp), so the future
    # frame must use 'MS'. With month-end frequency the first "future" date landed
    # at the end of the last observed month, shifting the whole horizon.
    future = m.make_future_dataframe(periods=forecast_horizon, freq='MS')
    forecast = m.predict(future)

    # 3. Capacity Planning
    historical_avg = df_prophet['y'].mean()
    # Get only the future forecast part (last 6 rows)
    future_forecast = forecast.tail(forecast_horizon)
    forecast_avg = future_forecast['yhat'].mean()
    forecast_monthly = forecast_avg  # Using average of next 6 months as the monthly demand

    # Surge Score: relative change of forecast vs history (0 when there is no history volume)
    surge_score = (forecast_avg - historical_avg) / historical_avg if historical_avg > 0 else 0

    # Capacity estimate from offered load:
    # λ = forecast_monthly / working hours per month (arrivals per hour)
    arrival_rate_lambda = forecast_monthly / working_hours_per_month

    # Recommended machines = ceil(λ / μ), the smallest count whose combined service
    # rate covers the arrival rate. This is a utilisation bound only, not a queueing
    # model with a waiting-time target. A negative forecast is clamped to 0 machines.
    recommended_machines = max(0, int(np.ceil(arrival_rate_lambda / service_rate_mu)))

    forecast_results.append({
        'pincode': pincode,
        'historical_avg': historical_avg,
        'forecast_demand': forecast_monthly,
        'surge_pct': surge_score * 100,
        'rec_machines': recommended_machines
    })

    # Plot the first 5 forecast pincodes on a 3x2 grid
    if plot_count < 5:
        plt.subplot(3, 2, plot_count + 1)
        plt.plot(df_prophet['ds'], df_prophet['y'], label='Actual')
        plt.plot(forecast['ds'], forecast['yhat'], label='Forecast', linestyle='--')
        plt.title(f'Pincode: {pincode}')
        plt.legend()
        plot_count += 1

plt.tight_layout()
plt.show()

# Passing the column list keeps the expected schema even when no pincode was forecast
forecast_df = pd.DataFrame(forecast_results, columns=forecast_columns)
print("Forecasting Complete.")

In [None]:
# 4. Visualization & Reporting

if forecast_df.empty:
    # No pincode produced a forecast, so there is no 'surge_pct' data to sort or plot;
    # sorting or drawing a heatmap on an empty frame would raise.
    print("No forecast results available to report.")
else:
    # 1. Summary Table, largest forecast surge first
    print("--- Migration Forecast Summary ---")
    display(forecast_df.sort_values('surge_pct', ascending=False))

    # 2. Heatmap: Pincodes by Forecasted Surge
    plt.figure(figsize=(12, 8))
    # Single-column frame (pincode index x surge_pct) so the 1D data can be drawn as a heatmap
    surge_data = forecast_df.set_index('pincode')[['surge_pct']].sort_values('surge_pct', ascending=False)
    sns.heatmap(surge_data, cmap='coolwarm', annot=True, fmt='.1f')
    plt.title('Forecasted Demand Surge (%) by Pincode')
    plt.show()

In [None]:
# Persist the per-pincode forecast summary (row index omitted) to the working directory
forecast_df.to_csv(path_or_buf='migration_forecast_data.csv', index=False)
print("Results saved to 'migration_forecast_data.csv'.")