In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/content/enrol_anomalies_final.csv')

In [4]:
df.shape

(176844, 7)

In [5]:
df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,meghalaya,east khasi hills,793121,11,61,37
1,09-03-2025,uttar pradesh,maharajganj,273164,31,70,13
2,09-03-2025,maharashtra,aurangabad,431001,42,46,12
3,09-03-2025,delhi,west delhi,110018,29,11,15
4,09-03-2025,delhi,west delhi,110059,93,42,42


In [6]:
df['state'].unique()

array(['meghalaya', 'uttar pradesh', 'maharashtra', 'delhi',
       'west bengal', 'andhra pradesh', 'tamil nadu', 'gujarat',
       'madhya pradesh', 'assam', 'karnataka', 'nagaland', 'rajasthan',
       'manipur', 'bihar', 'haryana', 'telangana', 'tripura',
       'jammu and kashmir', 'sikkim', 'punjab', 'odisha', 'mizoram',
       'chhattisgarh', 'the dadra and nagar haveli and daman and diu',
       'jharkhand', 'kerala', 'puducherry', 'uttarakhand',
       'andaman and nicobar islands',
       'dadra and nagar haveli and daman and diu', '100000',
       'arunachal pradesh', 'himachal pradesh', 'chandigarh', 'goa'],
      dtype=object)

In [7]:
pip install rapidfuzz


Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.2 MB[0m [31m88.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.3


In [8]:
import json
import numpy as np
from rapidfuzz import process, fuzz

# -----------------------------
# 1. Load state-district JSON
# -----------------------------
with open("/content/state_district_reference.json", "r", encoding="utf-8") as f:
    state_district_map = json.load(f)  # :contentReference[oaicite:0]{index=0}

# normalize JSON keys (safety)
state_district_map = {
    k.strip().lower(): [d.strip().lower() for d in v["districts"]]
    for k, v in state_district_map.items()
}

# -----------------------------
# 2. Normalise df columns
# -----------------------------
df["state_norm"] = df["state"].str.strip().str.lower()
df["district_norm"] = df["district"].str.strip().str.lower()

# -----------------------------
# 3. Fuzzy matching function
# -----------------------------
def match_district(row, threshold=80):
    state = row["state_norm"]
    district = row["district_norm"]

    # state not found in JSON
    if state not in state_district_map or pd.isna(district):
        return np.nan

    district_list = state_district_map[state]

    match = process.extractOne(
        district,
        district_list,
        scorer=fuzz.token_sort_ratio
    )

    if match and match[1] >= threshold:
        return match[0]   # best matched district name
    else:
        return np.nan

# -----------------------------
# 4. Apply to dataframe
# -----------------------------
df["district_new"] = df.apply(match_district, axis=1)

# (optional) drop helper columns
df.drop(columns=["state_norm", "district_norm"], inplace=True)


In [9]:
df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,district_new
0,02-03-2025,meghalaya,east khasi hills,793121,11,61,37,
1,09-03-2025,uttar pradesh,maharajganj,273164,31,70,13,mahrajganj
2,09-03-2025,maharashtra,aurangabad,431001,42,46,12,
3,09-03-2025,delhi,west delhi,110018,29,11,15,
4,09-03-2025,delhi,west delhi,110059,93,42,42,


In [10]:
df[~df['district_new'].isna()].shape

(56423, 8)

In [11]:
df[~df['district_new'].isna()]['state'].unique()

array(['uttar pradesh', 'west bengal', 'tamil nadu', 'madhya pradesh',
       'rajasthan', 'gujarat', 'assam', 'odisha', 'mizoram', 'karnataka',
       'jammu and kashmir', 'andhra pradesh', 'jharkhand', 'chhattisgarh',
       'kerala', 'maharashtra', 'punjab', 'telangana', 'bihar',
       'dadra and nagar haveli and daman and diu', 'uttarakhand',
       'arunachal pradesh', 'himachal pradesh',
       'andaman and nicobar islands', 'haryana', 'delhi'], dtype=object)

In [12]:
df[~df['district_new'].isna()]['district_new'].unique()

array(['mahrajganj', 'uttar dinajpur', 'kancheepuram', 'ashoknagar',
       'jalore', 'banas kantha', 'south salmara mancachar', 'sivasagar',
       'nabarangpur', 'siaha', 'bengaluru rural', 'baramulla',
       'ahmedabad', 'north 24 parganas', 'dangs', 'shrawasti',
       'visakhapatnam', 'south 24 parganas', 'east singhbhum',
       'dakshin dinajpur', 'narsimhapur', 'kushinagar',
       'manendragarh-chirmiri-bharatpur (m c b)', 'budgam', 'bandipora',
       'garhwa', 'hazaribagh', 'palamu', 'bagalkote', 'chamarajanagar',
       'chikkamagaluru', 'chikkaballapura', 'davanagere', 'gadag',
       'shivamogga', 'tumakuru', 'kasaragod', 'agar-malwa', 'buldhana',
       'gondia', 'ananthapuramu', 'dr. b.r. ambedkar konaseema', 'angul',
       'boudh', 'bargarh', 'jagatsinghapur', 'jajpur', 'keonjhar',
       'khordha', 'sonepur', 'sundargarh', 'ferozepur',
       'shahid bhagat singh nagar', 'chittorgarh', 'jhunjhunu',
       'kanniyakumari', 'tirupathur', 'thiruvallur', 'viluppuram',
 

In [13]:
df_final_cleaned = df[~df['district_new'].isna()]

In [14]:
df_final_cleaned.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,district_new
1,09-03-2025,uttar pradesh,maharajganj,273164,31,70,13,mahrajganj
6,09-03-2025,west bengal,dinajpur uttar,733129,26,18,27,uttar dinajpur
8,15-03-2025,uttar pradesh,maharajganj,273164,12,55,12,mahrajganj
14,20-03-2025,tamil nadu,kanchipuram,600069,12,12,10,kancheepuram
22,27-03-2025,uttar pradesh,maharajganj,273164,11,56,22,mahrajganj


In [15]:
df_final_cleaned.drop(['district'],axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_cleaned.drop(['district'],axis=1, inplace=True)


In [16]:
df_final_cleaned.head()

Unnamed: 0,date,state,pincode,age_0_5,age_5_17,age_18_greater,district_new
1,09-03-2025,uttar pradesh,273164,31,70,13,mahrajganj
6,09-03-2025,west bengal,733129,26,18,27,uttar dinajpur
8,15-03-2025,uttar pradesh,273164,12,55,12,mahrajganj
14,20-03-2025,tamil nadu,600069,12,12,10,kancheepuram
22,27-03-2025,uttar pradesh,273164,11,56,22,mahrajganj


In [17]:
df_final_cleaned.shape

(56423, 7)

In [18]:
# 1. Rename the column
df_final_cleaned = df_final_cleaned.rename(columns={"district_new": "district"})

# 2. Reorder columns
cols = list(df_final_cleaned.columns)

# Remove district from current position
cols.remove("district")

# Find index of state
state_index = cols.index("state")

# Insert district right after state
cols.insert(state_index + 1, "district")

# Apply new order
df_final_cleaned = df_final_cleaned[cols]


In [19]:
df_final_cleaned.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
1,09-03-2025,uttar pradesh,mahrajganj,273164,31,70,13
6,09-03-2025,west bengal,uttar dinajpur,733129,26,18,27
8,15-03-2025,uttar pradesh,mahrajganj,273164,12,55,12
14,20-03-2025,tamil nadu,kancheepuram,600069,12,12,10
22,27-03-2025,uttar pradesh,mahrajganj,273164,11,56,22


In [20]:
df_final_cleaned['district'].isna().sum()

np.int64(0)

In [None]:
df_final_cleaned.drop(['Unnamed: 0'],axis=1, inplace=True)

In [21]:
df_final_cleaned.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
1,09-03-2025,uttar pradesh,mahrajganj,273164,31,70,13
6,09-03-2025,west bengal,uttar dinajpur,733129,26,18,27
8,15-03-2025,uttar pradesh,mahrajganj,273164,12,55,12
14,20-03-2025,tamil nadu,kancheepuram,600069,12,12,10
22,27-03-2025,uttar pradesh,mahrajganj,273164,11,56,22


In [22]:
df_final_cleaned.shape

(56423, 7)

In [23]:
df_final_cleaned.to_csv('enrol_anomalies_cleaned.csv',index=False)

In [24]:
#now saving False data
df_final_false = df[df['district_new'].isna()]

In [25]:
df_final_false.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,district_new
0,02-03-2025,meghalaya,east khasi hills,793121,11,61,37,
2,09-03-2025,maharashtra,aurangabad,431001,42,46,12,
3,09-03-2025,delhi,west delhi,110018,29,11,15,
4,09-03-2025,delhi,west delhi,110059,93,42,42,
5,09-03-2025,west bengal,coochbehar,736135,19,12,19,


In [26]:
# Simple check
df_final_false[~df_final_false['district_new'].isna()]

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,district_new


In [27]:
df_final_false.drop(['district_new'],axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_false.drop(['district_new'],axis=1, inplace=True)


In [28]:
df_final_false.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,meghalaya,east khasi hills,793121,11,61,37
2,09-03-2025,maharashtra,aurangabad,431001,42,46,12
3,09-03-2025,delhi,west delhi,110018,29,11,15
4,09-03-2025,delhi,west delhi,110059,93,42,42
5,09-03-2025,west bengal,coochbehar,736135,19,12,19


In [29]:
df_final_false.shape

(120421, 7)

In [None]:
df_final_false.drop(['Unnamed: 0'],axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_false.drop(['Unnamed: 0'],axis=1, inplace=True)


In [30]:
df_final_false.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,meghalaya,east khasi hills,793121,11,61,37
2,09-03-2025,maharashtra,aurangabad,431001,42,46,12
3,09-03-2025,delhi,west delhi,110018,29,11,15
4,09-03-2025,delhi,west delhi,110059,93,42,42
5,09-03-2025,west bengal,coochbehar,736135,19,12,19


In [31]:
df_final_false.to_csv('enrol_anomalies_false.csv',index=False)