In [25]:
import pandas as pd

In [26]:
# Load the two CSV extracts that are concatenated into a single frame further below.
# NOTE(review): the absolute '/content/...' paths look like a Colab runtime layout —
# confirm the files are present there before running on a fresh kernel.
df1 = pd.read_csv('/content/first_false_bio.csv')
df2 = pd.read_csv('/content/second_false_bio.csv')

In [27]:
df1.shape

(49211, 7)

In [28]:
df2.shape

(269559, 8)

In [29]:
# Stack both extracts into one frame. ignore_index=True rebuilds a unique
# 0..n-1 row index; without it the labels of df1 and df2 overlap, so any later
# label-based lookup or index-aligned assignment sees duplicate labels.
df = pd.concat([df1, df2], ignore_index=True)

In [30]:
df.shape

(318770, 8)

In [31]:
df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,match,match_district
0,01-03-2025,uttar pradesh,bara banki,225408,17,3,False,
1,01-03-2025,bihar,saharsa,852210,58,118,False,
2,01-03-2025,bihar,patna,800018,21,20,False,
3,01-03-2025,west bengal,uttar dinajpur,733252,13,14,False,
4,01-03-2025,uttar pradesh,unnao,241501,1265,369,False,


In [32]:
# Remove the helper columns that are not needed for the pincode check.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['match', 'match_district'], errors='ignore')

In [33]:
df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,uttar pradesh,bara banki,225408,17,3
1,01-03-2025,bihar,saharsa,852210,58,118
2,01-03-2025,bihar,patna,800018,21,20
3,01-03-2025,west bengal,uttar dinajpur,733252,13,14
4,01-03-2025,uttar pradesh,unnao,241501,1265,369


In [34]:
# Checking
import pandas as pd
import json

# -----------------------------
# 1. Load JSON (true data)
# -----------------------------
# Read the reference pincode JSON. The encoding is pinned to UTF-8 so that the
# result does not depend on the platform's default text encoding.
with open("/content/pin-code-data.json", "r", encoding="utf-8") as f:
    pin_data = json.load(f)

# -----------------------------
# 2. Helper: clean text
# -----------------------------
def clean_text(x):
    """Return ``x`` as a string with surrounding whitespace removed, upper-cased.

    Any value is accepted because it is passed through ``str()`` first, so
    non-string inputs (numbers, ``None``) are normalised by their string form.
    """
    text = str(x)
    trimmed = text.strip()
    return trimmed.upper()

# -----------------------------
# 3. Build DISTRICT → PINCODES mapping
# -----------------------------
# Maps each normalised district name to the list of integer pincodes that the
# reference JSON files under that district, in the JSON's iteration order.
district_to_pins = {}

for raw_pin, record in pin_data.items():
    district_key = clean_text(record["district"])
    if district_key not in district_to_pins:
        district_to_pins[district_key] = []
    # JSON object keys are strings, so the pincode is converted to int here.
    district_to_pins[district_key].append(int(raw_pin))

# -----------------------------
# 4. Validation + correction logic
# -----------------------------
def check_and_fix(row):
    """Validate one row's pincode against the pincodes known for its district.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row["district"]`` and ``row["pincode"]``.

    Returns
    -------
    tuple
        ``(pincode, status)`` where ``status`` is one of:

        * ``"unknown"`` - the normalised district is not a key of
          ``district_to_pins``; the original pincode is returned unchanged.
        * ``"yes"`` - the pincode is listed for that district; returned unchanged.
        * ``"no"`` - the pincode is not listed; the first pincode stored for the
          district is returned in its place.

    NOTE(review): the lookup is keyed on district name only, so districts that
    share a name would share one pincode list - confirm this is acceptable.
    """
    district_key = clean_text(row["district"])
    current_pin = row["pincode"]

    known_pins = district_to_pins.get(district_key)

    # No reference entry for this district: nothing to validate against.
    if known_pins is None:
        return current_pin, "unknown"

    # Pincode does not belong to the district: substitute the first known one.
    if current_pin not in known_pins:
        return known_pins[0], "no"

    return current_pin, "yes"


# -----------------------------
# 5. Apply logic
# -----------------------------
# check_and_fix returns a (pincode, status) tuple for each row;
# result_type="expand" turns that tuple into two columns directly, so there is
# no need to wrap every row's result in a pd.Series first.
# NOTE: this overwrites "pincode" with the corrected value, so running the cell
# a second time reports previously corrected rows as "yes" instead of "no".
df[["pincode", "same"]] = df.apply(
    check_and_fix,
    axis=1,
    result_type="expand",
)

# -----------------------------
# 6. Result
# -----------------------------
print(df.head())


         date          state        district  pincode  bio_age_5_17  \
0  01-03-2025  uttar pradesh      bara banki   225408            17   
1  01-03-2025          bihar         saharsa   852212            58   
2  01-03-2025          bihar           patna   803211            21   
3  01-03-2025    west bengal  uttar dinajpur   733252            13   
4  01-03-2025  uttar pradesh           unnao   209870          1265   

   bio_age_17_     same  
0            3  unknown  
1          118       no  
2           20       no  
3           14  unknown  
4          369       no  


In [37]:
df[df['same']=='unknown'].shape

(113763, 7)

In [36]:
df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,same
0,01-03-2025,uttar pradesh,bara banki,225408,17,3,unknown
1,01-03-2025,bihar,saharsa,852212,58,118,no
2,01-03-2025,bihar,patna,803211,21,20,no
3,01-03-2025,west bengal,uttar dinajpur,733252,13,14,unknown
4,01-03-2025,uttar pradesh,unnao,209870,1265,369,no


In [23]:
df['pincode'].isna().sum()

np.int64(0)

In [20]:
# Write the current frame to check_bio.csv in the working directory;
# index=False keeps the row index out of the file.
df.to_csv('check_bio.csv',index=False)

In [38]:
import numpy as np

# Copy the district name into "not found" for rows whose status is "unknown";
# every other row is left as NaN (the default fill of Series.where).
df["not found"] = df["district"].where(df["same"] == "unknown")


In [39]:
df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,same,not found
0,01-03-2025,uttar pradesh,bara banki,225408,17,3,unknown,bara banki
1,01-03-2025,bihar,saharsa,852212,58,118,no,
2,01-03-2025,bihar,patna,803211,21,20,no,
3,01-03-2025,west bengal,uttar dinajpur,733252,13,14,unknown,uttar dinajpur
4,01-03-2025,uttar pradesh,unnao,209870,1265,369,no,


In [52]:
# Distinct district names that had no entry in the reference data,
# in order of first appearance.
df['not found'].dropna().unique()

array(['bara banki', 'uttar dinajpur', 'dakshin bastar dantewada',
       'south 24 parganas', 'paschim medinipur', 'narayanpur', 'malda',
       'uttar bastar kanker', 'viluppuram', 'puducherry', 'cooch behar',
       'north 24 parganas', 'sant kabir nagar', 'ahmedabad',
       'dakshin dinajpur', 'purba medinipur', 'ananthapuramu',
       'kancheepuram', 'east singhbhum', 'angul',
       'mohla-manpur-ambagarh chouki', 'kamle', 'yanam', 'davanagere',
       'lohardaga', 'leh', 'visakhapatnam', 'sri potti sriramulu nellore',
       'udham singh nagar', 'thoothukkudi', 'narsimhapur', 'saitual',
       'khairthal-tijara', 'dahod', 'siddharthnagar', 'kushinagar',
       'narmadapuram', 'charkhi dadri', 'jajpur', 'ferozepur',
       'rudraprayag', 'vijayanagara', 'shrawasti', 'uttarkashi',
       'nicobar', 'south andaman', 'chhatrapati sambhajinagar',
       'kabeerdham', 'lakshadweep', 'sakti', 'siang', 'kurung kumey',
       'gaurela-pendra-marwahi', 'chamarajanagar', 'maihar', 'dharas