## Install packages for Data Manipulation

In [None]:
%pip install pandas numpy pyarrow

## Install packages for visualization

In [None]:
%pip install matplotlib seaborn plotly

## Scikit-learn for ML models

In [None]:
%pip install scikit-learn

## Statsmodels for time series analysis

In [None]:
%pip install statsmodels

## Geopandas for geographic analysis

In [None]:
%pip install geopandas

## TASK 1: Loading and Cleaning Dataset

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Configure pandas to show more data
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

### Define File Paths

In [3]:
ENROLMENT_FILES = [
    'api_data_aadhar_enrolment/api_data_aadhar_enrolment_0_500000.csv',
    'api_data_aadhar_enrolment/api_data_aadhar_enrolment_500000_1000000.csv',
    'api_data_aadhar_enrolment/api_data_aadhar_enrolment_1000000_1006029.csv'
]

DEMOGRAPHIC_FILES = [
    'api_data_aadhar_demographic/api_data_aadhar_demographic_0_500000.csv',
    'api_data_aadhar_demographic/api_data_aadhar_demographic_500000_1000000.csv',
    'api_data_aadhar_demographic/api_data_aadhar_demographic_1000000_1500000.csv',
    'api_data_aadhar_demographic/api_data_aadhar_demographic_1500000_2000000.csv',
    'api_data_aadhar_demographic/api_data_aadhar_demographic_2000000_2071700.csv'
]

BIOMETRIC_FILES = [
    'api_data_aadhar_biometric/api_data_aadhar_biometric_0_500000.csv',
    'api_data_aadhar_biometric/api_data_aadhar_biometric_500000_1000000.csv',
    'api_data_aadhar_biometric/api_data_aadhar_biometric_1000000_1500000.csv',
    'api_data_aadhar_biometric/api_data_aadhar_biometric_1500000_1861108.csv'
]

CHUNK_SIZE = 100000 # To read 100k rows at a time

### To Read One Sample File

In [4]:
sample_file = ENROLMENT_FILES[0]
df_sample = pd.read_csv(sample_file, nrows=1000) # Just read the first 1000 rows of the file.

In [5]:
df_sample.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [6]:
print("="*80)
print("DATA TYPES:")
print("="*80)
print(df_sample.dtypes)

print("\n" + "="*80)
print("MISSING VALUES:")
print("="*80)
print(df_sample.isnull().sum())

print("\n" + "="*80)
print("BASIC STATISTICS:")
print("="*80)
df_sample.describe()

DATA TYPES:
date              object
state             object
district          object
pincode            int64
age_0_5            int64
age_5_17           int64
age_18_greater     int64
dtype: object

MISSING VALUES:
date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64

BASIC STATISTICS:


Unnamed: 0,pincode,age_0_5,age_5_17,age_18_greater
count,1000.0,1000.0,1000.0,1000.0
mean,530680.44,143.711,96.94,28.223
std,237349.207787,162.210986,127.390921,40.458079
min,110003.0,10.0,10.0,10.0
25%,342301.75,40.0,24.75,12.0
50%,474001.0,90.0,50.5,17.0
75%,785630.75,188.0,120.25,27.0
max,854304.0,1519.0,1812.0,642.0


### Create a Loading Function

In [7]:
def load_and_merge_files(file_list, dataset_name):
    """
    Load multiple CSV files in chunks and merge them.

    Parameters:
    - file_list: List of file paths
    - dataset_name: Name for display (e.g., "ENROLMENT")

    Returns:
    - Merged DataFrame
    """
    all_data = []
    total_rows = 0

    for i, file in enumerate(file_list, 1):
        print(f"File {i}/{len(file_list)}: {file}")

        try:
            chunks = []

            # Read in chunks
            for chunk in pd.read_csv(file, chunksize=CHUNK_SIZE):
                chunks.append(chunk)
                total_rows += len(chunk)
                print(f"Loaded {total_rows:,} rows so far...", end='\r')

            df = pd.concat(chunks, ignore_index=True)
            all_data.append(df)

            print(f"Loaded {len(df):,} rows from this file")

        except FileNotFoundError:
            print(f"Error: File not found: {file}")
            continue
        except Exception as e:
            print(f"Error: {e}")
            continue

    # Merge all files
    if all_data:
        print(f"\nMerging all {dataset_name} files...", end=" ")
        merged_df = pd.concat(all_data, ignore_index=True)
        memory_mb = merged_df.memory_usage(deep=True).sum() / (1024**2)
        print(f"Total: {len(merged_df):,} rows")
        print(f"Memory: {memory_mb:.2f} MB")
        print("\n")
        return merged_df
    else:
        print(f"No data loaded for {dataset_name}")
        print("\n")
        return None

### Loading all the datasets

In [8]:
df_enrolment = load_and_merge_files(ENROLMENT_FILES, "ENROLMENT")
df_demographic = load_and_merge_files(DEMOGRAPHIC_FILES, "DEMOGRAPHIC UPDATE")
df_biometric = load_and_merge_files(BIOMETRIC_FILES, "BIOMETRIC UPDATE")

File 1/3: api_data_aadhar_enrolment/api_data_aadhar_enrolment_0_500000.csv
Loaded 500,000 rows from this file
File 2/3: api_data_aadhar_enrolment/api_data_aadhar_enrolment_500000_1000000.csv
Loaded 500,000 rows from this file
File 3/3: api_data_aadhar_enrolment/api_data_aadhar_enrolment_1000000_1006029.csv
Loaded 6,029 rows from this file

Merging all ENROLMENT files... Total: 1,006,029 rows
Memory: 199.12 MB


File 1/5: api_data_aadhar_demographic/api_data_aadhar_demographic_0_500000.csv
Loaded 500,000 rows from this file
File 2/5: api_data_aadhar_demographic/api_data_aadhar_demographic_500000_1000000.csv
Loaded 500,000 rows from this file
File 3/5: api_data_aadhar_demographic/api_data_aadhar_demographic_1000000_1500000.csv
Loaded 500,000 rows from this file
File 4/5: api_data_aadhar_demographic/api_data_aadhar_demographic_1500000_2000000.csv
Loaded 500,000 rows from this file
File 5/5: api_data_aadhar_demographic/api_data_aadhar_demographic_2000000_2071700.csv
Loaded 71,700 rows from

### Verifying all Dataset

In [9]:
print("\n" + "="*80)
print("DATA LOADING SUMMARY")
print("="*80)
print()

if df_enrolment is not None:
    print(f"Enrolment: {len(df_enrolment):,} rows × {df_enrolment.shape[1]} columns")
else:
    print("Enrolment: Failed to load")

if df_demographic is not None:
    print(f"Demographic: {len(df_demographic):,} rows × {df_demographic.shape[1]} columns")
else:
    print("Demographic: Failed to load")

if df_biometric is not None:
    print(f"Biometric: {len(df_biometric):,} rows × {df_biometric.shape[1]} columns")
else:
    print("Biometric: Failed to load")

total_rows = sum([
    len(df_enrolment) if df_enrolment is not None else 0,
    len(df_demographic) if df_demographic is not None else 0,
    len(df_biometric) if df_biometric is not None else 0
])

print(f"\nTotal rows loaded: {total_rows:,}")
print("\nAll data loaded successfully!")


DATA LOADING SUMMARY

Enrolment: 1,006,029 rows × 7 columns
Demographic: 2,071,700 rows × 6 columns
Biometric: 1,861,108 rows × 6 columns

Total rows loaded: 4,938,837

All data loaded successfully!


In [10]:
df_enrolment.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [11]:
df_demographic.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314
4,01-03-2025,Rajasthan,Udaipur,313801,45,785


In [12]:
df_biometric.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


### Cleaning the Datasets - Enrolment Dataset

#### Parsing Dates

In [13]:
df_enrolment['date'] = pd.to_datetime(
    df_enrolment['date'], 
    format='%d-%m-%Y',
    errors='coerce'  # Invalid dates become NaT
)

# Check for invalid dates
invalid_dates = df_enrolment['date'].isna().sum()
print(f"Invalid dates found: {invalid_dates:,}")
print(f"Data type: {df_enrolment['date'].dtype}")
print(f"Date range: {df_enrolment['date'].min()} to {df_enrolment['date'].max()}")
print(f"Shape: {df_enrolment.shape}")

Invalid dates found: 0
Data type: datetime64[ns]
Date range: 2025-03-02 00:00:00 to 2025-12-31 00:00:00
Shape: (1006029, 7)


#### Standardize Text Fields

In [14]:
df_enrolment['state'] = df_enrolment['state'].str.strip().str.title()
df_enrolment['district'] = df_enrolment['district'].str.strip().str.title()
print(df_enrolment['state'].value_counts())

state
Uttar Pradesh                                   110369
Tamil Nadu                                       92552
Maharashtra                                      77191
West Bengal                                      76530
Karnataka                                        70198
Andhra Pradesh                                   65663
Bihar                                            60567
Rajasthan                                        56159
Madhya Pradesh                                   50225
Gujarat                                          46624
Odisha                                           43692
Telangana                                        42774
Kerala                                           39145
Assam                                            31827
Jharkhand                                        23218
Punjab                                           20439
Chhattisgarh                                     18550
Haryana                                          15997
Jamm

In [15]:
df_enrolment.loc[df_enrolment['state'] == '100000']

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
23108,2025-09-02,100000,100000,100000,0,0,3
46946,2025-09-03,100000,100000,100000,0,0,1
97816,2025-09-08,100000,100000,100000,0,0,1
115798,2025-09-09,100000,100000,100000,0,0,1
153156,2025-09-11,100000,100000,100000,0,0,2
160195,2025-09-12,100000,100000,100000,0,0,2
261778,2025-09-19,100000,100000,100000,0,0,1
272731,2025-09-20,100000,100000,100000,0,0,1
470934,2025-10-24,100000,100000,100000,0,1,0
762744,2025-11-15,100000,100000,100000,0,0,3


In [16]:
state_corrections = {
    # Typos & Spacing
    "West  Bengal": "West Bengal",
    "West Bangal": "West Bengal",
    "Westbengal": "West Bengal",
    
    # Old Names -> New Names
    "Orissa": "Odisha",
    "Pondicherry": "Puducherry",
    
    # Standardization (& -> And)
    "Jammu & Kashmir": "Jammu And Kashmir",
    "Andaman & Nicobar Islands": "Andaman And Nicobar Islands",
    
    # Merging Daman & Diu, Dadra & Nagar Haveli as they are now a single union territory
    "Dadra And Nagar Haveli": "Dadra and Nagar Haveli and Daman and Diu",
    "Daman And Diu": "Dadra and Nagar Haveli and Daman and Diu",
    "Dadra & Nagar Haveli": "Dadra and Nagar Haveli and Daman and Diu",
    "Daman & Diu": "Dadra and Nagar Haveli and Daman and Diu",
    "Dadra And Nagar Haveli And Daman And Diu": "Dadra and Nagar Haveli and Daman and Diu",
    "The Dadra And Nagar Haveli And Daman And Diu": "Dadra and Nagar Haveli and Daman and Diu",

    # 100000 is the internal code for Delhi
    '100000': "Delhi"
}
df_enrolment['state'] = df_enrolment['state'].replace(state_corrections)
df_enrolment['district'] = df_enrolment['district'].replace('100000', 'Delhi')
print(df_enrolment['state'].value_counts())

state
Uttar Pradesh                               110369
Tamil Nadu                                   92552
Maharashtra                                  77191
West Bengal                                  76561
Karnataka                                    70198
Andhra Pradesh                               65663
Bihar                                        60567
Rajasthan                                    56159
Madhya Pradesh                               50225
Odisha                                       47011
Gujarat                                      46624
Telangana                                    42774
Kerala                                       39145
Assam                                        31827
Jharkhand                                    23218
Punjab                                       20439
Chhattisgarh                                 18550
Haryana                                      15997
Jammu And Kashmir                            11455
Himachal Pradesh         

### Validating Pincodes

In [17]:
df_enrolment['pincode'] = df_enrolment['pincode'].astype(str).str.strip()
df_enrolment['pincode_valid'] = df_enrolment['pincode'].str.match(r'^\d{6}$')
invalid_count = (~df_enrolment['pincode_valid']).sum()
print(f"({invalid_count:,} invalid)")

(0 invalid)


### Cleaning Numeric Fields

In [18]:
age_cols = ['age_0_5', 'age_5_17', 'age_18_greater']
for col in age_cols:
    print(f"{col}:")
    # Convert to numeric
    df_enrolment[col] = pd.to_numeric(df_enrolment[col], errors='coerce')

    # Count and fill NaN
    nan_count = df_enrolment[col].isna().sum()
    print(nan_count)

    # Count and fix negative values
    neg_count = (df_enrolment[col] < 0).sum()
    print(neg_count)

    # Convert to integer
    df_enrolment[col] = df_enrolment[col].astype(int)

    # Show stats
    print(f"Range: {df_enrolment[col].min()} to {df_enrolment[col].max()}, Mean: {df_enrolment[col].mean():.2f}")
    print()

age_0_5:
0
0
Range: 0 to 2688, Mean: 3.53

age_5_17:
0
0
Range: 0 to 1812, Mean: 1.71

age_18_greater:
0
0
Range: 0 to 855, Mean: 0.17



### Creating Derived Columns

In [19]:
df_enrolment['total_enrolments'] = df_enrolment[age_cols].sum(axis=1)

df_enrolment['year'] = df_enrolment['date'].dt.year

df_enrolment['month'] = df_enrolment['date'].dt.month

df_enrolment['day_of_week'] = df_enrolment['date'].dt.dayofweek

df_enrolment['week_of_year'] = df_enrolment['date'].dt.isocalendar().week
print(f"Weeks: {df_enrolment['week_of_year'].min()}-{df_enrolment['week_of_year'].max()}")
df_enrolment.head()

Weeks: 1-52


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,pincode_valid,total_enrolments,year,month,day_of_week,week_of_year
0,2025-03-02,Meghalaya,East Khasi Hills,793121,11,61,37,True,109,2025,3,6,9
1,2025-03-09,Karnataka,Bengaluru Urban,560043,14,33,39,True,86,2025,3,6,10
2,2025-03-09,Uttar Pradesh,Kanpur Nagar,208001,29,82,12,True,123,2025,3,6,10
3,2025-03-09,Uttar Pradesh,Aligarh,202133,62,29,15,True,106,2025,3,6,10
4,2025-03-09,Karnataka,Bengaluru Urban,560016,14,16,21,True,51,2025,3,6,10


### Checking for Duplicates

In [20]:
# Define the columns that define a "unique" entry
subset_cols = ['state', 'district', 'date', 'pincode']

# Create a filter for duplicates
# keep=False ensures you see ALL copies of the duplicate data
duplicate_rows = df_enrolment[df_enrolment.duplicated(subset=subset_cols, keep=False)]

# Sort them so identical rows appear next to each other
duplicate_rows = duplicate_rows.sort_values(by=subset_cols)
dup_count = df_enrolment.duplicated(subset=subset_cols).sum()

print(duplicate_rows)
print(f"Duplicate records: {dup_count:,}")
print(f"Percentage: {dup_count/len(df_enrolment)*100:.2f}%")

             date                        state        district pincode  \
529382 2025-10-29  Andaman And Nicobar Islands        Andamans  744105   
530289 2025-10-29  Andaman And Nicobar Islands        Andamans  744105   
699380 2025-11-11  Andaman And Nicobar Islands        Andamans  744103   
701138 2025-11-11  Andaman And Nicobar Islands        Andamans  744103   
699381 2025-11-11  Andaman And Nicobar Islands         Nicobar  744301   
...           ...                          ...             ...     ...   
882031 2025-12-22                  West Bengal  West Midnapore  721166   
879950 2025-12-22                  West Bengal  West Midnapore  721306   
882032 2025-12-22                  West Bengal  West Midnapore  721306   
879951 2025-12-22                  West Bengal  West Midnapore  721506   
882033 2025-12-22                  West Bengal  West Midnapore  721506   

        age_0_5  age_5_17  age_18_greater  pincode_valid  total_enrolments  \
529382        1         0        

#### Dropping Duplicates
We are only keeping the first occurrence and deleting the rest and since the percentage is very less, it won't affect the overall result

In [21]:
df_enrolment = df_enrolment.drop_duplicates(subset=subset_cols, keep='first')
df_enrolment.shape

(980128, 13)

### Sorting the data

In [22]:
df_enrolment = df_enrolment.sort_values(
    ['date', 'state', 'district', 'pincode']
).reset_index(drop=True)
df_enrolment.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,pincode_valid,total_enrolments,year,month,day_of_week,week_of_year
0,2025-03-02,Meghalaya,East Khasi Hills,793121,11,61,37,True,109,2025,3,6,9
1,2025-03-09,Bihar,Bhagalpur,812005,13,40,18,True,71,2025,3,6,10
2,2025-03-09,Bihar,Madhubani,847108,18,120,22,True,160,2025,3,6,10
3,2025-03-09,Bihar,Purbi Champaran,845304,18,72,12,True,102,2025,3,6,10
4,2025-03-09,Bihar,Purbi Champaran,845418,30,48,10,True,88,2025,3,6,10


### Final Processing

In [23]:
def clean_dataset(df, dataset_name, age_columns):
    """
    Clean and validate a dataset.

    Parameters:
    - df: DataFrame to clean
    - dataset_name: Name for display (e.g., "Demographic")
    - age_columns: List of age column names

    Returns:
    - cleaned DataFrame
    - statistics dictionary
    """

    initial_rows = len(df)

    # 1. Parse dates
    df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y', errors='coerce')
    invalid_dates = df['date'].isna().sum()
    df = df.dropna(subset=['date'])
    print(f"({invalid_dates:,} invalid removed)")

    # 2. Standardize text
    df['state'] = df['state'].str.strip().str.title()
    df['district'] = df['district'].str.strip().str.title()
    
    state_corrections = {
        # Typos & Spacing
        "West  Bengal": "West Bengal",
        "West Bangal": "West Bengal",
        "Westbengal": "West Bengal",
        
        # Old Names -> New Names
        "Orissa": "Odisha",
        "Pondicherry": "Puducherry",
        
        # Standardization (& -> And)
        "Jammu & Kashmir": "Jammu And Kashmir",
        "Andaman & Nicobar Islands": "Andaman And Nicobar Islands",
        
        # Merging Daman & Diu, Dadra & Nagar Haveli as they are now a single union territory
        "Dadra And Nagar Haveli": "Dadra and Nagar Haveli and Daman and Diu",
        "Daman And Diu": "Dadra and Nagar Haveli and Daman and Diu",
        "Dadra & Nagar Haveli": "Dadra and Nagar Haveli and Daman and Diu",
        "Daman & Diu": "Dadra and Nagar Haveli and Daman and Diu",
        "Dadra And Nagar Haveli And Daman And Diu": "Dadra and Nagar Haveli and Daman and Diu",
        "The Dadra And Nagar Haveli And Daman And Diu": "Dadra and Nagar Haveli and Daman and Diu",
    
        # 100000 is the internal code for Delhi
        '100000': "Delhi"
    }
    
    df['state'] = df['state'].replace(state_corrections)
    df['district'] = df['district'].replace('100000', 'Delhi')

    # 3. Validate pincodes
    df['pincode'] = df['pincode'].astype(str).str.strip()
    df['pincode_valid'] = df['pincode'].str.match(r'^\d{6}$')
    invalid_pins = (~df['pincode_valid']).sum()
    print(f"({invalid_pins:,} invalid)")

    # 4. Clean numeric columns
    for col in age_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
        df.loc[df[col] < 0, col] = 0

    # 5. Create derived columns
    df['total'] = df[age_columns].sum(axis=1)
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day_of_week'] = df['date'].dt.dayofweek
    df['week_of_year'] = df['date'].dt.isocalendar().week

    # 6. Remove duplicates
    duplicate_rows = df[df.duplicated(subset=['date', 'state', 'district', 'pincode'], keep=False)]
    duplicate_rows = duplicate_rows.sort_values(by=['date', 'state', 'district', 'pincode'])
    print(duplicate_rows)
    
    dup_count = df.duplicated(subset=['date', 'state', 'district', 'pincode']).sum()

    print(duplicate_rows)
    print(f"Duplicate records: {dup_count:,}")
    print(f"Percentage: {dup_count/len(df)*100:.2f}%")
    
    df = df.drop_duplicates(subset=['date', 'state', 'district', 'pincode'], keep='first')
    

    # 7. Sort
    df = df.sort_values(['date', 'state', 'district', 'pincode']).reset_index(drop=True)
    
    return df

In [24]:
# Define demographic age columns
demographic_age_cols = ['demo_age_5_17', 'demo_age_17_']

# Clean the dataset
df_demographic_clean = clean_dataset(
    df_demographic,
    "Demographic Update",
    demographic_age_cols
)

# Rename 'total' column for clarity
df_demographic_clean = df_demographic_clean.rename(columns={'total': 'total_demo_updates'})


(0 invalid removed)
(0 invalid)
              date                        state                  district  \
831438  2025-03-01  Andaman And Nicobar Islands                   Nicobar   
1658910 2025-03-01  Andaman And Nicobar Islands                   Nicobar   
418053  2025-03-01  Andaman And Nicobar Islands  North And Middle Andaman   
1245525 2025-03-01  Andaman And Nicobar Islands  North And Middle Andaman   
416304  2025-03-01               Andhra Pradesh                  Adilabad   
...            ...                          ...                       ...   
1241380 2025-12-29                  West Bengal                    Howrah   
413984  2025-12-29                  West Bengal                     Malda   
2071515 2025-12-29                  West Bengal                     Malda   
827568  2025-12-29                  West Bengal         South 24 Parganas   
827914  2025-12-29                  West Bengal         South 24 Parganas   

        pincode  demo_age_5_17  demo_age_17

In [25]:
biometric_age_cols = ['bio_age_5_17', 'bio_age_17_']

# Clean the dataset
df_biometric_clean = clean_dataset(
    df_biometric,
    "Biometric Update",
    biometric_age_cols
)

# Rename 'total' column for clarity
df_biometric_clean = df_biometric_clean.rename(columns={'total': 'total_bio_updates'})

(0 invalid removed)
(0 invalid)
              date                                     state  \
12711   2025-03-01  Dadra and Nagar Haveli and Daman and Diu   
21641   2025-03-01  Dadra and Nagar Haveli and Daman and Diu   
12770   2025-03-01  Dadra and Nagar Haveli and Daman and Diu   
18587   2025-03-01  Dadra and Nagar Haveli and Daman and Diu   
2277    2025-03-01  Dadra and Nagar Haveli and Daman and Diu   
...            ...                                       ...   
1859288 2025-12-29                                Puducherry   
1851565 2025-12-29                               West Bengal   
1851649 2025-12-29                               West Bengal   
1842093 2025-12-29                               West Bengal   
1856476 2025-12-29                               West Bengal   

                       district pincode  bio_age_5_17  bio_age_17_  \
12711    Dadra And Nagar Haveli  396230            88          261   
21641    Dadra And Nagar Haveli  396230           732      

### Saving Data in Paraquet format

In [27]:
# Save enrolment
df_enrolment.to_parquet('enrolment_cleaned.parquet', index=False, compression='snappy')
size_mb = pd.read_parquet('enrolment_cleaned.parquet').memory_usage(deep=True).sum() / (1024**2)
print(f"({size_mb:.1f} MB)")

# Save demographic
df_demographic_clean.to_parquet('demographic_cleaned.parquet', index=False, compression='snappy')
size_mb = pd.read_parquet('demographic_cleaned.parquet').memory_usage(deep=True).sum() / (1024**2)
print(f"({size_mb:.1f} MB)")

# Save biometric
df_biometric_clean.to_parquet('biometric_cleaned.parquet', index=False, compression='snappy')
size_mb = pd.read_parquet('biometric_cleaned.parquet').memory_usage(deep=True).sum() / (1024**2)
print(f"({size_mb:.1f} MB)")

(214.6 MB)
(335.9 MB)
(370.5 MB)


### Saving Data in CSV Format

In [29]:
# Save enrolment
df_enrolment.to_csv('enrolment_cleaned.csv', index=False)
size_mb = pd.read_csv('enrolment_cleaned.csv').memory_usage(deep=True).sum() / (1024**2)
print(f"({size_mb:.1f} MB)")


# Save demographic
df_demographic_clean.to_csv('demographic_cleaned.csv', index=False)
size_mb = pd.read_csv('demographic_cleaned.csv').memory_usage(deep=True).sum() / (1024**2)
print(f"({size_mb:.1f} MB)")

# Save biometric
df_biometric_clean.to_csv('biometric_cleaned.csv', index=False)
size_mb = pd.read_csv('biometric_cleaned.csv').memory_usage(deep=True).sum() / (1024**2)
print(f"({size_mb:.1f} MB)")

(232.3 MB)
(364.6 MB)
(402.2 MB)
