In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/content/bio_anomalies_final.csv')

In [None]:
df.shape

(359359, 7)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,0,01-03-2025,jammu and kashmir,punch,185101,643,1091
1,1,01-03-2025,gujarat,sabarkantha,383440,56,91
2,2,01-03-2025,odisha,nabarangapur,764075,438,605
3,3,01-03-2025,punjab,shaheed bhagat singh nagar,144514,134,249
4,4,01-03-2025,karnataka,davangere,577002,219,386


In [None]:
df['state'].unique()

array(['jammu and kashmir', 'gujarat', 'odisha', 'punjab', 'karnataka',
       'andhra pradesh', 'madhya pradesh', 'nagaland', 'assam',
       'west bengal', 'delhi', 'maharashtra', 'tamil nadu',
       'chhattisgarh', 'telangana', 'uttar pradesh', 'meghalaya',
       'rajasthan', 'haryana', 'bihar', 'puducherry', 'manipur', 'sikkim',
       'jharkhand', 'tripura', 'mizoram', 'arunachal pradesh',
       'andaman and nicobar islands', 'kerala',
       'dadra and nagar haveli and daman and diu', 'himachal pradesh',
       'uttarakhand', 'goa', 'chandigarh', 'tamilnadu'], dtype=object)

In [None]:
pip install rapidfuzz




In [None]:
import json
import numpy as np
from rapidfuzz import process, fuzz

# 1. Load state-district JSON
with open("/content/state_district_reference.json", "r", encoding="utf-8") as f:
    state_district_map = json.load(f)  # :contentReference[oaicite:0]{index=0}

# normalize JSON keys (safety)
state_district_map = {
    k.strip().lower(): [d.strip().lower() for d in v["districts"]]
    for k, v in state_district_map.items()
}

# 2. Normalise df columns
df["state_norm"] = df["state"].str.strip().str.lower()
df["district_norm"] = df["district"].str.strip().str.lower()

# 3. Fuzzy matching function
def match_district(row, threshold=80):
    state = row["state_norm"]
    district = row["district_norm"]

    # state not found in JSON
    if state not in state_district_map or pd.isna(district):
        return np.nan

    district_list = state_district_map[state]

    match = process.extractOne(
        district,
        district_list,
        scorer=fuzz.token_sort_ratio
    )

    if match and match[1] >= threshold:
        return match[0]   # best matched district name
    else:
        return np.nan

# 4. Apply to dataframe
df["district_new"] = df.apply(match_district, axis=1)

# (optional) drop helper columns
df.drop(columns=["state_norm", "district_norm"], inplace=True)


In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,district_new
0,0,01-03-2025,jammu and kashmir,punch,185101,643,1091,
1,1,01-03-2025,gujarat,sabarkantha,383440,56,91,
2,2,01-03-2025,odisha,nabarangapur,764075,438,605,nabarangpur
3,3,01-03-2025,punjab,shaheed bhagat singh nagar,144514,134,249,shahid bhagat singh nagar
4,4,01-03-2025,karnataka,davangere,577002,219,386,davanagere


In [None]:
df[~df['district_new'].isna()].shape

(116070, 8)

In [None]:
df[~df['district_new'].isna()]['district_new'].unique()

array(['nabarangpur', 'shahid bhagat singh nagar', 'davanagere',
       'sundargarh', 'ananthapuramu', 'harda', 'tumakuru',
       'south salmara mancachar', 'banas kantha', 'khordha', 'jajpur',
       'malda', 'viluppuram', 'thiruvallur', 'tirupathur', 'firozabad',
       'jhunjhunu', 'bagalkote', 'jalore', 'sonepur', 'kanniyakumari',
       'budgam', 'gondia', 'chamarajanagar', 'baramulla', 'shivamogga',
       'hassan', 'boudh', 'agar-malwa', 'chikkamagaluru', 'ranga reddy',
       'ferozepur', 'dr. b.r. ambedkar konaseema', 'buldhana', 'keonjhar',
       'purnia', 'pakur', 'ahmedabad', 'purulia', 'angul', 'mamit',
       'jagatsinghapur', 'sahebganj', 'ashoknagar', 'haveri', 'gadag',
       'darjeeling', 'narsimhapur', 'mahrajganj', 'bandipora', 'shopian',
       'jayashankar bhupalapally', 'shi yomi', 'bengaluru rural',
       'garhwa', 'chikkaballapura', 'hazaribagh', 'sivasagar',
       'kasaragod', 'bargarh', 'chittorgarh', 'koderma', 'palamu',
       'siaha', 'dadra and nagar 

In [None]:
df_final = df[~df['district_new'].isna()]

In [None]:
df_final.head()

Unnamed: 0.1,Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,district_new
2,2,01-03-2025,odisha,nabarangapur,764075,438,605,nabarangpur
3,3,01-03-2025,punjab,shaheed bhagat singh nagar,144514,134,249,shahid bhagat singh nagar
4,4,01-03-2025,karnataka,davangere,577002,219,386,davanagere
5,5,01-03-2025,odisha,sundergarh,769004,152,186,sundargarh
7,7,01-03-2025,andhra pradesh,anantapur,515301,83,105,ananthapuramu


In [None]:
df_final.drop(['district'],axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.drop(['district'],axis=1, inplace=True)


In [None]:
df_final.head()

Unnamed: 0.1,Unnamed: 0,date,state,pincode,bio_age_5_17,bio_age_17_,district_new
2,2,01-03-2025,odisha,764075,438,605,nabarangpur
3,3,01-03-2025,punjab,144514,134,249,shahid bhagat singh nagar
4,4,01-03-2025,karnataka,577002,219,386,davanagere
5,5,01-03-2025,odisha,769004,152,186,sundargarh
7,7,01-03-2025,andhra pradesh,515301,83,105,ananthapuramu


In [None]:
df_final.shape

(116070, 7)

In [None]:
# 1. Rename the column
df_final = df_final.rename(columns={"district_new": "district"})

# 2. Reorder columns
cols = list(df_final.columns)

# Remove district from current position
cols.remove("district")

# Find index of state
state_index = cols.index("state")

# Insert district right after state
cols.insert(state_index + 1, "district")

# Apply new order
df_final = df_final[cols]


In [None]:
df_final.head()

Unnamed: 0.1,Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
2,2,01-03-2025,odisha,nabarangpur,764075,438,605
3,3,01-03-2025,punjab,shahid bhagat singh nagar,144514,134,249
4,4,01-03-2025,karnataka,davanagere,577002,219,386
5,5,01-03-2025,odisha,sundargarh,769004,152,186
7,7,01-03-2025,andhra pradesh,ananthapuramu,515301,83,105


In [None]:
df_final['district'].isna().sum()

np.int64(0)

In [None]:
df_final.drop(['Unnamed: 0'],axis=1, inplace=True)

In [None]:
df_final.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
2,01-03-2025,odisha,nabarangpur,764075,438,605
3,01-03-2025,punjab,shahid bhagat singh nagar,144514,134,249
4,01-03-2025,karnataka,davanagere,577002,219,386
5,01-03-2025,odisha,sundargarh,769004,152,186
7,01-03-2025,andhra pradesh,ananthapuramu,515301,83,105


In [None]:
df_final.shape

(116070, 6)

In [None]:
df_final.to_csv('bio_anomalies_cleaned.csv',index=False)

In [None]:
#now saving False data
df_final_false = df[df['district_new'].isna()]

In [None]:
df_final_false.head()

Unnamed: 0.1,Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,district_new
0,0,01-03-2025,jammu and kashmir,punch,185101,643,1091,
1,1,01-03-2025,gujarat,sabarkantha,383440,56,91,
6,6,01-03-2025,andhra pradesh,warangal,506163,39,92,
9,9,01-03-2025,nagaland,mokokchung,798613,18,70,
12,12,01-03-2025,jammu and kashmir,leh,194401,52,43,


In [None]:
# Simple check
df_final_false[~df_final_false['district_new'].isna()]

Unnamed: 0.1,Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,district_new


In [None]:
df_final_false.drop(['district_new'],axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_false.drop(['district_new'],axis=1, inplace=True)


In [None]:
df_final_false.head()

Unnamed: 0.1,Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,0,01-03-2025,jammu and kashmir,punch,185101,643,1091
1,1,01-03-2025,gujarat,sabarkantha,383440,56,91
6,6,01-03-2025,andhra pradesh,warangal,506163,39,92
9,9,01-03-2025,nagaland,mokokchung,798613,18,70
12,12,01-03-2025,jammu and kashmir,leh,194401,52,43


In [None]:
df_final_false.shape

(243289, 7)

In [None]:
df_final_false.drop(['Unnamed: 0'],axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_false.drop(['Unnamed: 0'],axis=1, inplace=True)


In [None]:
df_final_false.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,jammu and kashmir,punch,185101,643,1091
1,01-03-2025,gujarat,sabarkantha,383440,56,91
6,01-03-2025,andhra pradesh,warangal,506163,39,92
9,01-03-2025,nagaland,mokokchung,798613,18,70
12,01-03-2025,jammu and kashmir,leh,194401,52,43


In [None]:
df_final_false.to_csv('bio_anomalies_false.csv',index=False)