In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/content/demo_anomalies_final.csv')

In [5]:
df.shape

(401930, 6)

In [6]:
df.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,karnataka,belgaum,591118,27,56
1,01-03-2025,karnataka,bengaluru,560072,92,438
2,01-03-2025,tripura,dhalai,799285,18,517
3,01-03-2025,odisha,nabarangapur,764077,37,465
4,01-03-2025,rajasthan,jalor,343021,20,355


In [7]:
df['state'].unique()

array(['karnataka', 'tripura', 'odisha', 'rajasthan', 'tamil nadu',
       'delhi', 'andhra pradesh', 'gujarat', 'chhattisgarh',
       'west bengal', 'punjab', 'madhya pradesh', 'nagaland', 'telangana',
       'jammu and kashmir', 'meghalaya', 'maharashtra', 'bihar', 'assam',
       'haryana', 'uttar pradesh', 'manipur', 'sikkim', 'jharkhand',
       'mizoram', 'puducherry', 'andaman and nicobar islands',
       'uttarakhand', 'himachal pradesh', 'kerala',
       'dadra and nagar haveli and daman and diu', 'chandigarh',
       'arunachal pradesh', 'goa', 'darbhanga', 'puttenahalli',
       'balanagar', '100000', 'jaipur', 'madanapalle', 'nagpur',
       'raja annamalai puram'], dtype=object)

In [8]:
pip install rapidfuzz


Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.3


In [9]:
import json
import numpy as np
from rapidfuzz import process, fuzz

# -----------------------------
# 1. Load state-district JSON
# -----------------------------
with open("/content/state_district_reference.json", "r", encoding="utf-8") as f:
    state_district_map = json.load(f)  # :contentReference[oaicite:0]{index=0}

# normalize JSON keys (safety)
state_district_map = {
    k.strip().lower(): [d.strip().lower() for d in v["districts"]]
    for k, v in state_district_map.items()
}

# -----------------------------
# 2. Normalise df columns
# -----------------------------
df["state_norm"] = df["state"].str.strip().str.lower()
df["district_norm"] = df["district"].str.strip().str.lower()

# -----------------------------
# 3. Fuzzy matching function
# -----------------------------
def match_district(row, threshold=80):
    state = row["state_norm"]
    district = row["district_norm"]

    # state not found in JSON
    if state not in state_district_map or pd.isna(district):
        return np.nan

    district_list = state_district_map[state]

    match = process.extractOne(
        district,
        district_list,
        scorer=fuzz.token_sort_ratio
    )

    if match and match[1] >= threshold:
        return match[0]   # best matched district name
    else:
        return np.nan

# -----------------------------
# 4. Apply to dataframe
# -----------------------------
df["district_new"] = df.apply(match_district, axis=1)

# (optional) drop helper columns
df.drop(columns=["state_norm", "district_norm"], inplace=True)


In [10]:
df.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_,district_new
0,01-03-2025,karnataka,belgaum,591118,27,56,
1,01-03-2025,karnataka,bengaluru,560072,92,438,
2,01-03-2025,tripura,dhalai,799285,18,517,
3,01-03-2025,odisha,nabarangapur,764077,37,465,nabarangpur
4,01-03-2025,rajasthan,jalor,343021,20,355,jalore


In [11]:
df[~df['district_new'].isna()].shape

(128123, 7)

In [13]:
df[~df['district_new'].isna()]['state'].unique()

array(['odisha', 'rajasthan', 'tamil nadu', 'west bengal', 'punjab',
       'jammu and kashmir', 'karnataka', 'madhya pradesh',
       'uttar pradesh', 'andhra pradesh', 'gujarat', 'assam', 'jharkhand',
       'maharashtra', 'mizoram', 'telangana', 'bihar', 'chhattisgarh',
       'uttarakhand', 'himachal pradesh', 'kerala',
       'dadra and nagar haveli and daman and diu', 'arunachal pradesh',
       'andaman and nicobar islands'], dtype=object)

In [14]:
df[~df['district_new'].isna()]['district_new'].unique()

array(['nabarangpur', 'jalore', 'kanniyakumari', 'purulia', 'ferozepur',
       'thiruvallur', 'keonjhar', 'baramulla', 'davanagere',
       'chamarajanagar', 'jajpur', 'agar-malwa', 'chikkaballapura',
       'mahrajganj', 'budgam', 'khordha', 'ananthapuramu', 'bagalkote',
       'viluppuram', 'tumakuru', 'narsimhapur', 'firozabad',
       'dr. b.r. ambedkar konaseema', 'tirupathur', 'jhunjhunu',
       'banas kantha', 'gadag', 'sivasagar', 'bandipora', 'ashoknagar',
       'bengaluru rural', 'chikkamagaluru', 'sundargarh', 'koderma',
       'bargarh', 'sonepur', 'gondia', 'hassan', 'shivamogga',
       'south salmara mancachar', 'siaha', 'jayashankar bhupalapally',
       'harda', 'buldhana', 'purbi champaran', 'purnia', 'sheikhpura',
       'manendragarh-chirmiri-bharatpur (m c b)', 'ahmedabad', 'garhwa',
       'hazaribagh', 'pakur', 'palamu', 'mumbai suburban', 'mamit',
       'angul', 'boudh', 'jagatsinghapur', 'shahid bhagat singh nagar',
       'chittorgarh', 'bulandshahr', 'kus

In [15]:
df_final_cleaned = df[~df['district_new'].isna()]

In [16]:
df_final_cleaned.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_,district_new
3,01-03-2025,odisha,nabarangapur,764077,37,465,nabarangpur
4,01-03-2025,rajasthan,jalor,343021,20,355,jalore
6,01-03-2025,tamil nadu,kanyakumari,629803,10,19,kanniyakumari
13,01-03-2025,west bengal,puruliya,723151,34,490,purulia
17,01-03-2025,punjab,firozpur,152022,14,325,ferozepur


In [17]:
df_final_cleaned.drop(['district'],axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_cleaned.drop(['district'],axis=1, inplace=True)


In [18]:
df_final_cleaned.head()

Unnamed: 0,date,state,pincode,demo_age_5_17,demo_age_17_,district_new
3,01-03-2025,odisha,764077,37,465,nabarangpur
4,01-03-2025,rajasthan,343021,20,355,jalore
6,01-03-2025,tamil nadu,629803,10,19,kanniyakumari
13,01-03-2025,west bengal,723151,34,490,purulia
17,01-03-2025,punjab,152022,14,325,ferozepur


In [19]:
df_final_cleaned.shape

(128123, 6)

In [20]:
# 1. Rename the column
df_final_cleaned = df_final_cleaned.rename(columns={"district_new": "district"})

# 2. Reorder columns
cols = list(df_final_cleaned.columns)

# Remove district from current position
cols.remove("district")

# Find index of state
state_index = cols.index("state")

# Insert district right after state
cols.insert(state_index + 1, "district")

# Apply new order
df_final_cleaned = df_final_cleaned[cols]


In [21]:
df_final_cleaned.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
3,01-03-2025,odisha,nabarangpur,764077,37,465
4,01-03-2025,rajasthan,jalore,343021,20,355
6,01-03-2025,tamil nadu,kanniyakumari,629803,10,19
13,01-03-2025,west bengal,purulia,723151,34,490
17,01-03-2025,punjab,ferozepur,152022,14,325


In [22]:
df_final_cleaned['district'].isna().sum()

np.int64(0)

In [None]:
df_final_cleaned.drop(['Unnamed: 0'],axis=1, inplace=True)

In [23]:
df_final_cleaned.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
3,01-03-2025,odisha,nabarangpur,764077,37,465
4,01-03-2025,rajasthan,jalore,343021,20,355
6,01-03-2025,tamil nadu,kanniyakumari,629803,10,19
13,01-03-2025,west bengal,purulia,723151,34,490
17,01-03-2025,punjab,ferozepur,152022,14,325


In [26]:
df_final_cleaned.shape

(128123, 6)

In [25]:
df_final_cleaned.to_csv('demo_anomalies_cleaned.csv',index=False)

In [27]:
#now saving False data
df_final_false = df[df['district_new'].isna()]

In [28]:
df_final_false.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_,district_new
0,01-03-2025,karnataka,belgaum,591118,27,56,
1,01-03-2025,karnataka,bengaluru,560072,92,438,
2,01-03-2025,tripura,dhalai,799285,18,517,
5,01-03-2025,karnataka,mysore,571121,10,104,
7,01-03-2025,delhi,central delhi,110055,109,1458,


In [29]:
# Simple check
df_final_false[~df_final_false['district_new'].isna()]

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_,district_new


In [30]:
df_final_false.drop(['district_new'],axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_false.drop(['district_new'],axis=1, inplace=True)


In [31]:
df_final_false.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,karnataka,belgaum,591118,27,56
1,01-03-2025,karnataka,bengaluru,560072,92,438
2,01-03-2025,tripura,dhalai,799285,18,517
5,01-03-2025,karnataka,mysore,571121,10,104
7,01-03-2025,delhi,central delhi,110055,109,1458


In [32]:
df_final_false.shape

(273807, 6)

In [None]:
df_final_false.drop(['Unnamed: 0'],axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_false.drop(['Unnamed: 0'],axis=1, inplace=True)


In [33]:
df_final_false.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,karnataka,belgaum,591118,27,56
1,01-03-2025,karnataka,bengaluru,560072,92,438
2,01-03-2025,tripura,dhalai,799285,18,517
5,01-03-2025,karnataka,mysore,571121,10,104
7,01-03-2025,delhi,central delhi,110055,109,1458


In [34]:
df_final_false.to_csv('demo_anomalies_false.csv',index=False)