# Getting to the bottom of the name discrepancies

In [42]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import re

# 1. Are all the names the same in all of the data files from the Ministry of Agriculture?

In [2]:
#############################################################
# Read in the raw files and convert it to a pandas DataFrame
#############################################################
def read_file(filename):
    # Read the file as text
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()

    # Parse the HTML
    soup = BeautifulSoup(content, 'html.parser')
    table = soup.find('table')

    # Convert to pandas DataFrame
    df = pd.read_html(str(table))[0]

    # Clean up any potential messy data
    df = df.replace({r'\s+$': ''}, regex=True)  # Remove trailing whitespace
    df = df.replace({r'^\s+': ''}, regex=True)  # Remove leading whitespace

    return df

In [3]:
#############################################################
# Clean the DataFrame by removing rows with all NaN values and resetting the index
#############################################################
def restack(df):
    # Keep the first three columns as they are
    id_cols = df.iloc[:, :3].copy()  # State, District, Year
    id_cols.columns = ['State', 'District', 'Year']  # Flatten these column names

    # Get the measurement columns (everything except first three)
    measure_cols = df.iloc[:, 3:]

    # Stack only crop and season levels, keeping indicators as columns
    stacked = (measure_cols
        .stack(level=0, dropna=False)  # Stack first level (Crop)
        .stack(level=0, dropna=False)  # Stack second level (Season)
    )

    # Convert to DataFrame and reset index
    stacked_df = stacked.reset_index()

    # Rename the columns appropriately
    stacked_df = stacked_df.rename(columns={
        'level_1': 'Crop',
        'level_2': 'Season',
        0: 'Area (Hectare)',
        1: 'Production (Tonnes)',
        2: 'Yield (Tonne/Hectare)'
    })

    # Create final dataframe
    final_df = pd.concat([
        id_cols.loc[stacked_df['level_0']].reset_index(drop=True),
        stacked_df.drop('level_0', axis=1)
    ], axis=1)

    # Sort the data
    final_df = final_df.sort_values(['State', 'District', 'Year', 'Crop', 'Season'])

    # Reset the index for clean output
    final_df = final_df.reset_index(drop=True)

    return final_df

In [4]:
#############################################################
# Add state abbreviations to the District column
#############################################################
def add_state_abbr(final_df):
    #Add state abbreviations
    abbr_map = {
    'Andhra Pradesh': '(AP)',
    'Arunachal Pradesh': '(AR)',
    'Assam': '(AS)',
    'Bihar': '(BR)',
    'Chhattisgarh': '(CG)',
    'Goa': '(GA)',
    'Gujarat': '(GJ)',
    'Haryana': '(HR)',
    'Himachal Pradesh': '(HP)',
    'Jharkhand': '(JH)',
    'Jammu and Kashmir': '(JK)',
    'Karnataka': '(KA)',
    'Kerala': '(KL)',
    'Madhya Pradesh': '(MP)',
    'Maharashtra': '(MH)',
    'Manipur': '(MN)',
    'Meghalaya': '(ML)',
    'Mizoram': '(MZ)',
    'Nagaland': '(NL)',
    'Odisha': '(OD)',
    'Punjab': '(PB)',
    'Rajasthan': '(RJ)',
    'Sikkim': '(SK)',
    'Tamil Nadu': '(TN)',
    'Telangana': '(TG)',
    'Tripura': '(TR)',
    'Uttar Pradesh': '(UP)',
    'Uttarakhand': '(UK)',
    'West Bengal': '(WB)',
    'Andaman and Nicobar Islands': '(AN)',
    'Chandigarh': '(CH)',
    'The Dadra & Nagar Haveli and Daman and Diu': '(DD)',
    'The Dadra and Nagar Haveli and Daman and Diu': '(DD)',
    'Dadra and Nagar Haveli and Daman and Diu': '(DD)',
    'Dadra and Nagar Haveli': '(DN)',
    'Daman and Diu': '(DA)',
    'Delhi': '(DL)',
    'Ladakh': '(LA)',
    'Lakshadweep': '(LD)',
    'Puducherry': '(PY)'
    }

    final_df['District'] = final_df['District'] + ' ' + final_df['State'].map(abbr_map)

    return final_df

In [30]:
#########################################################################################
# Fix district names by changing ag stats names to match the names in the hybrid boundary
#########################################################################################
def district_fix(final_df):
    district_mapping = {
    # Format: 'Old name': 'New name'
    '24 Paraganas North (WB)': 'North Twenty-Four Paraganas (WB)',
    '24 Paraganas South (WB)': 'South Twenty-Four Paraganas (WB)',
    '24 paraganas north (WB)': 'North Twenty-Four Paraganas (WB)',
    '24 paraganas south (WB)': 'South Twenty-Four Paraganas (WB)',
    'Ahmadabad (GJ)': 'Ahmedabad (GJ)',
    'Allahabad (UP)': 'Prayagraj (UP)',
    'Anugul (OD)': 'Angul (OD)',
    'Baghpat (UP)': 'Bagpat (UP)',
    'Bellary (KA)': 'Ballari (KA)',
    'Badgam (JK)': 'Budgam (JK)',
    'Baleshwar (OD)': 'Balasore (OD)',
    'Balrampur (CG)': 'Balrampur-Ramanujganj (CG)',
    'Bandipora (JK)': 'Bandipore (JK)',
    'Banas Kantha (GJ)': 'Banaskantha (GJ)',
    'Banas kantha (GJ)': 'Banaskantha (GJ)',
    'Bangalore Rural (KA)': 'Bengaluru Rural (KA)',
    'Barabanki (UP)': 'Bara Banki (UP)',
    'Baramulla (JK)': 'Baramula (JK)',
    'Belgaum (KA)': 'Belagavi (KA)',
    'Bhadradri (TG)': 'Bhadradri Kothagudem (TG)',
    'Charki Dadri (HR)': 'Charkhi Dadri (HR)',
    'Chhotaudepur (GJ)': 'Chhota Udaipur (GJ)',
    'Coochbehar (WB)': 'Koch Bihar (WB)',
    'Dakshin Kannad (KA)': 'Dakshina Kannada (KA)',
    'Dang (GJ)': 'Dangs (GJ)',
    'Davangere (KA)': 'Davanagere (KA)',
    'Deogarh (OD)': 'Debagarh (OD)',
    'Devbhumi dwarka (GJ)': 'Devbhumi Dwarka (GJ)',
    'Dinajpur Dakshin (WB)': 'Dakshin Dinajpur (WB)',
    'Dinajpur Uttar (WB)': 'Uttar Dinajpur (WB)',
    'Dohad (GJ)': 'Dahod (GJ)',
    'East Singhbum (JH)': 'Purba Singhbhum (JH)',
    'Faizabad (UP)': 'Ayodhya (UP)',
    'Firozepur (PB)': 'Firozpur (PB)',
    'Ganganagar (RJ)': 'Sri Ganganagar (RJ)',
    'Gariyaband (CG)': 'Gariaband (CG)',
    'Geyzing (SK)': 'Gyalshing (SK)',
    'Gondia (MH)': 'Gondiya (MH)',
    'Gulbarga (KA)': 'Kalaburagi (KA)',
    'Jagatsinghapur (OD)': 'Jagatsinghpur (OD)',
    'Jagitial (TG)': 'Jagtial (TG)',
    'Jajapur (OD)': 'Jajpur (OD)',
    'Jangoan (TG)': 'Jangaon (TG)',
    'Jayashankar (TG)': 'Jayashankar Bhupalpally (TG)',
    'Jogulamba (TG)': 'Jogulamba Gadwal (TG)',
    'Kadapa (AP)': 'YSR Kadapa (AP)',
    'Kaimur (Bhabua) (BR)': 'Kaimur (BR)',
    'Kamrup Rural (AS)': 'Kamrup (AS)',
    'Kamrup Metro (AS)': 'Kamrup Metropolitan (AS)',
    'Kanniyakumari (TN)': 'Kanyakumari (TN)',
    'Kanpur nagar (UP)': 'Kanpur Nagar (UP)',
    'Kasaragod (KL)': 'Kasargod (KL)',
    'Khairgarh Chhuikhadan Gandai (CG)': 'Khairagarh (CG)',
    'Kheri (UP)': 'Lakhimpur Kheri (UP)',
    'Komaram Bheem Asifabad (TG)': 'Kumuram Bheem Asifabad (TG)',
    'Korea (CG)': 'Koriya (CG)',
    'Kushi Nagar (UP)': 'Kushinagar (UP)',
    'Lahul And Spiti (HP)': 'Lahaul and Spiti (HP)',
    'Lahul and spiti (HP)': 'Lahaul and Spiti (HP)',
    'Leh Ladakh (LA)': 'Leh (LA)',
    'Leh Ladakh (JK)': 'Leh (LA)',
    'Leparada (AR)': 'Lepa Rada (AR)',
    'Maldah (WB)': 'Malda (WB)',
    'Manendragarh Chirimiri Bharatpur (CG)': 'Manendragarh (CG)',
    'Medchal (TG)': 'Medchal-Malkajgiri (TG)',
    'Medinipur East (WB)': 'Purba Medinipur (WB)',
    'Medinipur West (WB)': 'Paschim Medinipur (WB)',
    'Mewat (HR)': 'Nuh (HR)',
    'Mohla Manpur Ambagarh Chouki (CG)': 'Mohla-Manpur (CG)',
    'Muktsar (PB)': 'Sri Muktsar Sahib (PB)',
    'Mumbai (MH)': 'Mumbai City (MH)',
    'Mumbai suburban (MH)': 'Mumbai Suburban (MH)',
    'Muzaffarpur (BR)': 'Muzzafarpur (BR)',
    'Narayanapet (TG)': 'Narayanpet (TG)',
    'Nawanshahr (PB)': 'Shahid Bhagat Singh Nagar (PB)',
    'North And Middle Andaman (AN)': 'North and Middle Andaman (AN)',
    'North Twenty-four Paraganas (WB)': 'North Twenty-Four Parganas (WB)',
    'Ntr (AP)': 'NT Rama Rao (AP)',
    'Nuiland (NL)': 'Niuland (NL)',
    'Pakke Kessang (AR)': 'Pakke Kesang (AR)',
    'Panch Mahals (GJ)': 'Panchmahal (GJ)',
    'Pashchim Champaran (BR)': 'Paschim Champaran (BR)',
    'Pondicherry (PY)': 'Puducherry (PY)',
    'Purbi Champaran (BR)': 'Purba Champaran (BR)',
    'Rae Bareli (UP)': 'Raebareli (UP)',
    'Rajanna (TG)': 'Rajanna Sircilla (TG)',
    'Rajauri (JK)': 'Rajouri (JK)',
    'Rangareddi (TG)': 'Ranga Reddy (TG)',
    'Ri Bhoi (ML)': 'Ri-Bhoi (ML)',
    'S.A.S Nagar (PB)': 'Sahibzada Ajit Singh Nagar (PB)',
    'Sabar Kantha (GJ)': 'Sabarkantha (GJ)',
    'Sahebganj (JH)': 'Sahibganj (JH)',
    'Siaha (MZ)': 'Saiha (MZ)',
    'Sakti (CG)': 'Shakti (CG)',
    'Sant Kabeer Nagar (UP)': 'Sant Kabir Nagar (UP)',
    'Sant Ravidas Nagar (UP)': 'Bhadohi (UP)',
    'Saraikela Kharsawan (JH)': 'Seraikela Kharsawan (JH)',
    'Sarangarh Bilaigarh (CG)': 'Sarangarh-Bilaigarh (CG)',
    'Sepahijala (TR)': 'Sipahijala (TR)',
    'Shimoga (KA)': 'Shivamogga (KA)',
    'Siaha (MZ)': 'Saiha (MZ)',
    'Siddharth nagar (UP)': 'Siddharth Nagar (UP)',
    'Sonepur (OD)': 'Subarnapur (OD)',
    'South Salmara Mancachar (AS)': 'South Salmara-Mankachar (AS)',
    'South Twenty-four Paraganas (WB)': 'South Twenty-Four Parganas (WB)',
    'Spsr Nellore (AP)': 'Nellore (AP)',
    'The Nilgiris (TN)': 'Nilgiris (TN)',
    'Thenkasi (TN)': 'Tenkasi (TN)',
    'Thiruvallur (TN)': 'Tiruvallur (TN)',
    'Tirupathur (TN)': 'Tirupattur (TN)',
    'Udam Singh Nagar (UK)': 'Udham Singh Nagar (UK)',
    'Uttar Kashi (UK)': 'Uttarkashi (UK)',
    'Vijayanagar (KA)': 'Vijayanagara (KA)',
    'Visakhapatanam (AP)': 'Visakhapatnam (AP)',
    'West Singhbhum (JH)': 'Paschim Singhbhum (JH)',
    'Yadadri (TG)': 'Yadadri Bhuvanagiri (TG)',
    'Yadgir (KA)': 'Yadagiri (KA)',
    'Yamunanagar (HR)': 'Yamuna Nagar (HR)',
    'nan': 'NA'
    }
    
    final_df['District'] = final_df['District'].replace(district_mapping)

    return final_df

In [31]:
#Grab all files in the directory except the banana one which we already did
os.chdir('/Users/michaelfoley/Google Drive/My Drive/Subnational_Yield_Database/data/raw/IND/')

all_files = glob.glob('*.xls')

In [32]:
district_list = {}

for file in all_files:
    current_crops = file.split('.')[0].split('/')[-1]
    print(f'\nWorking on {current_crops}')

    #Read in
    df = read_file(file)
    final_df = restack(df)

    #Fix numbers in state and district columns
    final_df['State'] = final_df['State'].str.split('.', n=1).str[1].str.strip()
    final_df['District'] = final_df['District'].str.split('.', n=1).str[1].str.strip()
    final_df['District'] = final_df['District'].str.title()

    #Add state abbreviations
    final_df = add_state_abbr(final_df)

    #Change district names to common spellings from Gary's file
    final_df = district_fix(final_df)

    # Add to district list
    districts = final_df['District'].unique().tolist()
    print(f'Found {len(districts)} districts in {current_crops}')
    if 'Siaha (MZ)' in districts:
        print('Found Siaha (MZ), changing to Saiha (MZ)')
        
    district_list[current_crops] = districts



Working on ragi_barley_wheat_smallmillets


  df = pd.read_html(str(table))[0]
  .stack(level=0, dropna=False)  # Stack first level (Crop)
  .stack(level=0, dropna=False)  # Stack second level (Season)


Found 721 districts in ragi_barley_wheat_smallmillets

Working on blackpepper_drychillies_ginger_cashewnut


  df = pd.read_html(str(table))[0]
  .stack(level=0, dropna=False)  # Stack first level (Crop)
  .stack(level=0, dropna=False)  # Stack second level (Season)


Found 620 districts in blackpepper_drychillies_ginger_cashewnut

Working on rice_jowar_bajra_maize


  df = pd.read_html(str(table))[0]
  .stack(level=0, dropna=False)  # Stack first level (Crop)
  .stack(level=0, dropna=False)  # Stack second level (Season)


Found 746 districts in rice_jowar_bajra_maize

Working on moth_khesari_horsegram_cowpea


  df = pd.read_html(str(table))[0]
  .stack(level=0, dropna=False)  # Stack first level (Crop)
  .stack(level=0, dropna=False)  # Stack second level (Season)


Found 560 districts in moth_khesari_horsegram_cowpea

Working on masoor_peasbeans_linseed_castorseed


  df = pd.read_html(str(table))[0]
  .stack(level=0, dropna=False)  # Stack first level (Crop)
  .stack(level=0, dropna=False)  # Stack second level (Season)


Found 681 districts in masoor_peasbeans_linseed_castorseed

Working on soyabean_safflower_cotton_jute


  df = pd.read_html(str(table))[0]
  .stack(level=0, dropna=False)  # Stack first level (Crop)
  .stack(level=0, dropna=False)  # Stack second level (Season)


Found 664 districts in soyabean_safflower_cotton_jute

Working on onion_potato_sweetpotato_tumeric_guarseed


  df = pd.read_html(str(table))[0]
  .stack(level=0, dropna=False)  # Stack first level (Crop)
  .stack(level=0, dropna=False)  # Stack second level (Season)


Found 707 districts in onion_potato_sweetpotato_tumeric_guarseed

Working on tobacco_garlic_tapioca


  df = pd.read_html(str(table))[0]
  .stack(level=0, dropna=False)  # Stack first level (Crop)
  .stack(level=0, dropna=False)  # Stack second level (Season)


Found 601 districts in tobacco_garlic_tapioca

Working on mesta_sannhamp_coconut_sugarcane


  df = pd.read_html(str(table))[0]
  .stack(level=0, dropna=False)  # Stack first level (Crop)
  .stack(level=0, dropna=False)  # Stack second level (Season)


Found 686 districts in mesta_sannhamp_coconut_sugarcane

Working on banana_arecanut_cardamom_coriander


  df = pd.read_html(str(table))[0]
  .stack(level=0, dropna=False)  # Stack first level (Crop)
  .stack(level=0, dropna=False)  # Stack second level (Season)


Found 580 districts in banana_arecanut_cardamom_coriander

Working on othercereals_arhar_moong_urad


  df = pd.read_html(str(table))[0]
  .stack(level=0, dropna=False)  # Stack first level (Crop)
  .stack(level=0, dropna=False)  # Stack second level (Season)


Found 727 districts in othercereals_arhar_moong_urad

Working on rapeseed_nigerseed_sesamum_groundnut


  df = pd.read_html(str(table))[0]
  .stack(level=0, dropna=False)  # Stack first level (Crop)
  .stack(level=0, dropna=False)  # Stack second level (Season)


Found 737 districts in rapeseed_nigerseed_sesamum_groundnut


In [34]:
# Convert all lists to sets for efficient set operations
sets_dict = {key: set(value_list) for key, value_list in district_list.items()}

# Find names that appear in ALL entries (intersection of all sets)
common_names = set.intersection(*sets_dict.values())

# Find ALL unique names across any entry (union of all sets)
all_names = set.union(*sets_dict.values())

# Names that are distinct (not in all entries)
distinct_names = all_names - common_names

In [35]:
len(all_names), len(distinct_names)

(747, 341)

Seems like almost all of the districts are represented (as of 2022-2023) in at least one file, but a number of them don't have data for all crops. That is totally fine. We just need to make sure that all of these names are represented in the hybrid boundary.

# How many of the unique names are represented in the hybrid boundary?

In [36]:
shp_path = "/Users/michaelfoley/Google Drive/My Drive/Subnational_Yield_Database/boundaries/country/IND/2016_2023_hybrid_boundary.shp" 
hybrid = gpd.read_file(shp_path)

In [37]:
merged_list = hybrid['merged_dis'].tolist()
merges = [name.split(', ') for name in merged_list if name is not None]
flat_merges = [item for sublist in merges for item in sublist]

In [38]:
flat_merges

['Chhindwara (MP)',
 'Pandhurna (MP)',
 'Maihar (MP)',
 'Satna (MP)',
 'Didwana-Kuchaman (RJ)',
 'Nagaur (RJ)',
 'Mauganj (MP)',
 'Rewa (MP)',
 'Balotra (RJ)',
 'Barmer (RJ)',
 'Bharatpur (RJ)',
 'Deeg (RJ)',
 'Salumbar (RJ)',
 'Udaipur (RJ)',
 'Alwar (RJ)',
 'Dudu (RJ)',
 'Jaipur (RJ)',
 'Jaipur(Rural) (RJ)',
 'Kherthal-Tijara (RJ)',
 'Kotputli-Behror (RJ)',
 'Ajmer (RJ)',
 'Beaware (RJ)',
 'Kekri (RJ)',
 'Pali (RJ)',
 'Tonk (RJ)',
 'Jalora (RJ)',
 'Sanchore (RJ)',
 'Bhilwara (RJ)',
 'Shahpura (RJ)',
 'Gangapur City (RJ)',
 'Karauli (RJ)',
 'Sawai Madhopur (RJ)',
 'Jodhpur (RJ)',
 'Jodhpur(Rural) (RJ)',
 'Phalodi (RJ)',
 'Jhunjhunu (RJ)',
 'Neem Ka Thana (RJ)',
 'Sikar (RJ)',
 'Anupgarh (RJ)',
 'Bikaner (RJ)',
 'Sri Ganganagar (RJ)',
 'Parvathipuram Manyam (AP)',
 'Srikakulam (AP)',
 'Vizianagaram (AP)',
 'Annamayya (AP)',
 'Chittoor (AP)',
 'Nellore (AP)',
 'Tirupati (AP)',
 'YSR Kadapa (AP)',
 'Eluru (AP)',
 'West Godavari (AP)',
 'Kurnool (AP)',
 'Nandyal (AP)',
 'Krishna (AP)',
 '

In [39]:
#Get all standalone names as well as merged names
test = set(hybrid['name_state'].unique().tolist() + flat_merges)

# Remove everything from ' (' to the end of the string
#cleaned_names = set(re.sub(r' \(.*\)$', '', name) for name in test if name is not None)

In [40]:
missing = all_names - test
len(missing)
missing

{'Aurangabad (MH)',
 'Dadra And Nagar Haveli (DD)',
 'Dadra And Nagar Haveli (DN)',
 'Daman (DA)',
 'Daman (DD)',
 'Delhi_Total (DL)',
 'Diu (DA)',
 'Diu (DD)',
 'Kargil (JK)',
 'Siddharth Nagar (UP)'}

In [15]:
len(all_names)

748

In [16]:
len(test)

810

Seems like all but 25 are represented in the hybrid boundary. I need to fix up those names and also figure out what to do with the Daman, Diu, Dadar, and Nagar Haveli ones.

In [41]:
test

{'Adilabad (TG)',
 'Agar Malwa (MP)',
 'Agra (UP)',
 'Ahmedabad (GJ)',
 'Ahmednagar (MH)',
 'Aizawl (MZ)',
 'Ajmer (RJ)',
 'Ajmer (RJ), Pali (RJ), Tonk (RJ)',
 'Akola (MH)',
 'Alappuzha (KL)',
 'Aligarh (UP)',
 'Alipurduar (WB)',
 'Alirajpur (MP)',
 'Alluri Sitharama Raju (AP)',
 'Almora (UK)',
 'Alwar (RJ)',
 'Ambala (HR)',
 'Ambedkar Nagar (UP)',
 'Amethi (UP)',
 'Amravati (MH)',
 'Amreli (GJ)',
 'Amritsar (PB)',
 'Amroha (UP)',
 'Anakapalli (AP)',
 'Anand (GJ)',
 'Anantapur (AP)',
 'Anantnag (JK)',
 'Angul (OD)',
 'Anjaw (AR)',
 'Annamayya (AP)',
 'Anupgarh (RJ)',
 'Anuppur (MP)',
 'Araria (BR)',
 'Aravalli (GJ)',
 'Ariyalur (TN)',
 'Arwal (BR)',
 'Ashoknagar (MP)',
 'Auraiya (UP)',
 'Aurangabad (BR)',
 'Aurangabad (MH) (MH)',
 'Ayodhya (UP)',
 'Azamgarh (UP)',
 'Bagalkot (KA)',
 'Bageshwar (UK)',
 'Bagpat (UP)',
 'Bahraich (UP)',
 'Bajali (AS)',
 'Baksa (AS)',
 'Balaghat (MP)',
 'Balangir (OD)',
 'Balasore (OD)',
 'Ballari (KA)',
 'Ballia (UP)',
 'Balod (CG)',
 'Baloda Bazar (CG)',