In [1]:
import pandas as pd

In [2]:
# Read the two input CSV extracts; both are combined into one frame below.
first_csv = '/content/first_false_enrol.csv'
second_csv = '/content/second_false_enrol.csv'
df1, df2 = pd.read_csv(first_csv), pd.read_csv(second_csv)

In [3]:
# (rows, columns) of the first CSV.
df1.shape

(23337, 9)

In [4]:
# (rows, columns) of the second CSV.
df2.shape

(134795, 9)

In [5]:
# Stack the two extracts row-wise. Each file was read with its own 0-based
# index, so ignore_index=True is needed to give the combined frame unique
# row labels; otherwise every label of the shorter file appears twice, which
# makes label-based selection ambiguous and index-aligned assignment fragile.
df = pd.concat([df1, df2], ignore_index=True)

In [6]:
# (rows, columns) of the combined frame. concat keeps the union of the input
# columns, so a column count above that of either input means the two files
# do not have identical column names.
df.shape

(158132, 10)

In [7]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0.1,Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,match,match_district
0,123.0,27-03-2025,chhattisgarh,bijapur,494444,43,15,42,False,
1,127.0,27-03-2025,chhattisgarh,bijapur,494447,23,16,21,False,
2,290.0,01-04-2025,chhattisgarh,bijapur,494447,83,66,66,False,
3,391.0,01-04-2025,gujarat,surat,394230,66,25,25,False,
4,393.0,01-04-2025,punjab,amritsar,143110,71,23,14,False,


In [8]:
# Remove the leftover 'Unnamed: 0' column and the two match-flag columns,
# which are not used by the pincode check below.
# errors='ignore' keeps the cell re-runnable: the previous in-place drop
# raised KeyError on a second execution because the columns were already gone.
df = df.drop(columns=['Unnamed: 0', 'match', 'match_district'], errors='ignore')

In [9]:
# Preview after dropping the unused columns.
df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,27-03-2025,chhattisgarh,bijapur,494444,43,15,42
1,27-03-2025,chhattisgarh,bijapur,494447,23,16,21
2,01-04-2025,chhattisgarh,bijapur,494447,83,66,66
3,01-04-2025,gujarat,surat,394230,66,25,25
4,01-04-2025,punjab,amritsar,143110,71,23,14


In [10]:
# Checking
import pandas as pd
import json

# -----------------------------
# 1. Load JSON (true data)
# -----------------------------
# Load the reference data used by the mapping loop below, which reads it as a
# dict of pincode string -> record with a "district" key.
# JSON is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default.
with open("/content/pin-code-data.json", "r", encoding="utf-8") as f:
    pin_data = json.load(f)

# -----------------------------
# 2. Helper: clean text
# -----------------------------
def clean_text(x):
    """Return ``x`` as an upper-cased string without surrounding whitespace.

    Any value is accepted; it is converted with ``str()`` first, so
    non-string inputs (numbers, ``None``, NaN) yield their string form.
    """
    trimmed = str(x).strip()
    return trimmed.upper()

# -----------------------------
# 3. Build DISTRICT → PINCODES mapping
# -----------------------------
# Normalised district name -> list of pincodes, in the order the pincodes
# appear in the JSON. The order matters: check_and_fix() uses the first
# entry of the list as the replacement value.
district_to_pins = {}

for pin, info in pin_data.items():
    district = clean_text(info["district"])
    district_to_pins.setdefault(district, []).append(int(pin))

# The same pincodes as hash sets, so the per-row membership test is a
# constant-time lookup instead of a linear scan of the district's list.
district_to_pin_sets = {
    name: frozenset(pins) for name, pins in district_to_pins.items()
}

# -----------------------------
# 4. Validation + correction logic
# -----------------------------
def check_and_fix(row):
    """Validate one row's pincode against the reference data for its district.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"district"`` and ``"pincode"``.

    Returns
    -------
    tuple
        ``(pincode, status)`` where ``status`` is one of:

        * ``"unknown"`` - the normalised district is not in the reference
          data; the row's pincode is returned unchanged.
        * ``"yes"`` - the pincode is listed for the district; returned
          unchanged.
        * ``"no"`` - the pincode is not listed for the district; the first
          pincode recorded for that district is returned instead.
    """
    df_district = clean_text(row["district"])
    df_pin = row["pincode"]

    valid_pins = district_to_pins.get(df_district)

    # District not found in JSON → cannot validate
    if valid_pins is None:
        return df_pin, "unknown"

    # Pincode correct
    if df_pin in district_to_pin_sets[df_district]:
        return df_pin, "yes"

    # Pincode wrong → replace.
    # NOTE(review): the replacement is simply the first pincode listed for the
    # district, not the closest match to the original value — confirm this is
    # the intended correction.
    return valid_pins[0], "no"


# -----------------------------
# 5. Apply logic
# -----------------------------
# Run check_and_fix on every row. result_type="expand" spreads each returned
# (pincode, status) tuple over two columns, so no pd.Series has to be
# constructed per row.
# NOTE: `pincode` is overwritten with the checked/replaced value, so this cell
# is not idempotent — running it again re-checks the already replaced
# pincodes, and rows previously marked "no" come back as "yes".
df[["pincode", "same"]] = df.apply(
    check_and_fix,
    axis=1,
    result_type="expand",
)

# -----------------------------
# 6. Result
# -----------------------------
print(df.head())


         date         state  district  pincode  age_0_5  age_5_17  \
0  27-03-2025  chhattisgarh   bijapur   494450       43        15   
1  27-03-2025  chhattisgarh   bijapur   494450       23        16   
2  01-04-2025  chhattisgarh   bijapur   494450       83        66   
3  01-04-2025       gujarat     surat   396510       66        25   
4  01-04-2025        punjab  amritsar   143102       71        23   

   age_18_greater same  
0              42   no  
1              21   no  
2              66   no  
3              25   no  
4              14   no  


In [11]:
# Size of the subset whose district was not found in the reference data.
df.loc[df['same'] == 'unknown'].shape

(65940, 8)

In [12]:
# Preview after the check: `pincode` holds the value returned by
# check_and_fix and `same` holds its status string.
df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,same
0,27-03-2025,chhattisgarh,bijapur,494450,43,15,42,no
1,27-03-2025,chhattisgarh,bijapur,494450,23,16,21,no
2,01-04-2025,chhattisgarh,bijapur,494450,83,66,66,no
3,01-04-2025,gujarat,surat,396510,66,25,25,no
4,01-04-2025,punjab,amritsar,143102,71,23,14,no


In [None]:
# Number of missing values in the `pincode` column.
df['pincode'].isna().sum()

np.int64(0)

In [None]:
# Write the current frame to check_bio.csv (relative to the working
# directory) without the index column.
df.to_csv('check_bio.csv',index=False)

In [13]:
import numpy as np

# Copy the district name into "not found" for rows whose district had no
# entry in the reference data; every other row gets NaN.
district_unmatched = df["same"] == "unknown"
df["not found"] = np.where(district_unmatched, df["district"], np.nan)


In [14]:
# Preview with the new "not found" column (NaN wherever `same` is not
# "unknown").
df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,same,not found
0,27-03-2025,chhattisgarh,bijapur,494450,43,15,42,no,
1,27-03-2025,chhattisgarh,bijapur,494450,23,16,21,no,
2,01-04-2025,chhattisgarh,bijapur,494450,83,66,66,no,
3,01-04-2025,gujarat,surat,396510,66,25,25,no,
4,01-04-2025,punjab,amritsar,143102,71,23,14,no,


In [18]:
# Distinct district names that had no match in the reference data.
df['not found'].dropna().unique()

array(['ahmedabad', 'east singhbhum', 'yanam', 'kancheepuram',
       'viluppuram', 'bara banki', 'sant kabir nagar', 'cooch behar',
       'malda', 'south 24 parganas', 'narayanpur', 'paschim medinipur',
       'puducherry', 'angul', 'dakshin bastar dantewada', 'davanagere',
       'purba medinipur', 'uttar dinajpur', 'udham singh nagar',
       'dakshin dinajpur', 'north 24 parganas', 'kamle',
       'uttar bastar kanker', 'mohla-manpur-ambagarh chouki',
       'sri potti sriramulu nellore', 'ananthapuramu', 'lohardaga',
       'narsimhapur', 'leh', 'saitual', 'dahod', 'shrawasti',
       'siddharthnagar', 'kushinagar', 'visakhapatnam', 'sribhumi',
       'sakti', 'sarangarh-bilaigarh', 'charkhi dadri', 'chamarajanagar',
       'vijayanagara', 'maihar', 'narmadapuram',
       'chhatrapati sambhajinagar', 'nicobar', 'south andaman', 'jajpur',
       'ferozepur', 'thoothukkudi', 'rudraprayag', 'uttarkashi',
       'kurung kumey', 'gaurela-pendra-marwahi', 'kabeerdham',
       'lakshadw