Packages Used

In [1]:
import sys
import os 
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process
from indic_transliteration import sanscript
from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate
from collections import OrderedDict

### Set directory paths and variables

In [2]:
#change per block
block = "गौरा"
block_no = 448
all_matches_path = r"C:\Users\Madhavan\Dropbox\electoral_rolls_matching\up_matchfiles\include_left\all_matches_60_7_gaura.xlsx"

#change for every new AC
electoral_roll_names_hin_path = r"C:\Users\Madhavan\Dropbox\electoral_rolls_matching\up_ocr\include_left\processed\68_1.xlsx"

#change for every new census district
census_data_path = r"C:\Users\Madhavan\Dropbox\electoral_rolls_matching\up_directory\district\Pratapgarh.csv"

#### Reading Datafiles - Census Data and Form 20 Data

In [3]:
census = pd.read_csv(census_data_path)
electoral_roll_names_hin = pd.read_excel(electoral_roll_names_hin_path)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
hindi_village_names_by_block = None
grouped = electoral_roll_names_hin.groupby('block')
for name, group in grouped:
    print(name)
    if(name == block):
        hindi_village_names_by_block = group.reset_index(drop = True)
        print("Using ocr data " + name + ' block')

ऊन
काँधला
कैराना


#### Filtering the Census Data for only villages and towns

In [29]:
# change subdistrict based on AC
# census = census.dropna().reset_index(drop = True)
census = census.where((census["CD Block Code"] == block_no)).dropna(how = 'all').reset_index(drop = True)

In [30]:
census_codes_to_names = {int(census['Village Code'][i]):census['Village Name'][i] for i in census.index}

In [31]:
print('Number of Entries in Census Data:',len(census))
print('Number of Unique Village Names in Census Data:',len(set(census['Village Name'])))

Number of Entries in Census Data: 125
Number of Unique Village Names in Census Data: 124


### Transliterate Hindi names to English using OPTITRANS scheme and then transliterating from OPTITRANS to lay indian

In [32]:
#transliterate hindi name from electoral rolls to english and save along with the polling station number(s) in a dict

transliterated_names_to_ps_id = OrderedDict()
for i in hindi_village_names_by_block.index:
    only_main = -1
    if pd.isna(hindi_village_names_by_block["processed"][i]):
        eng = transliterate(hindi_village_names_by_block["mukhya_village"][i], sanscript.DEVANAGARI, sanscript.OPTITRANS)
        only_main = 1
    elif hindi_village_names_by_block["processed"][i] == "(ignore)":
        eng = transliterate(hindi_village_names_by_block["mukhya_village"][i], sanscript.DEVANAGARI, sanscript.OPTITRANS)
        only_main = 2
    elif hindi_village_names_by_block["processed"][i] == "failure":
        eng = "chief_village_fail"
        only_main = 1
    else:
        eng = transliterate(hindi_village_names_by_block["processed"][i], sanscript.DEVANAGARI, sanscript.OPTITRANS)
        only_main = 0
    these_ps_ids = transliterated_names_to_ps_id.get((eng, only_main), -1)
    if these_ps_ids == -1:
        transliterated_names_to_ps_id[(eng, only_main)] = [hindi_village_names_by_block["ps_id"][i]]
    elif hindi_village_names_by_block["ps_id"][i] not in these_ps_ids:
        transliterated_names_to_ps_id[(eng, only_main)].append(hindi_village_names_by_block["ps_id"][i])
    print(eng)

mahotharI
rAjApura devApaTaTI
chaubepaTTI
gokulA
muAraadhAragaMja
muAraadhAragaMja
muAraadhAragaMja
pUrerAmasahAya
kaulApura nandapaTaTI
kaulApura nandapaTaTI
bhAnapura dasiyA
narasiMhagaDha
bhAnapura dasiyA
shAhapura AMsika
shAhapura
shAhapura AMsika
shAhapura
shAhapura AMsika
shAhapura
nidhIpaTTI
saMDilA
devagaDha़kamAsina
devagaDha़kamAsina
bhUsalapura
saMDilA
saMDilA
bAbUpaTaTI
sarAyasetarAya
mathurA
pUre bhaIyA jI
vishambharapura
sigAMhI
AmApura bera
AmApura bera
AmApura bera
AmApura bera
sarAya sultAnI
sarAya sultAnI
barahadA
khushahAlagaDha
rAIpura
phatehapura
mIrapura
koDaraDIha
thariyA
thariyA
kaulApura
dhanuhAM Ashika
dhanuhA
dhanuhAM Ashika
dhanuhA
pUrebasahU
narI
shekhUpura
shekhUpura
lapakana
rAmanagara
rAmanagara
saramAnapura
TikaitA
risAlagaDha
atarI
sujahA
jAjApura
damadama
damadama
DighavaTa
DighavaTa
khAkhApura
khAkhApura
borrA
borrA
sultAnapura
sultAnapura
sultAnapura
bhavAnIgaDha़
biraIpura
rAmApura
rAmApura
rAmApura
rAmApura
raheTuA parasarAmapura
raheTuA parasarAm

#### create a list of 5 lists where each sublist contains the kth best match for each transliterated village name.

In [33]:
all_matches = [[], []
               , [], [], []]
all_codes = [[], [], [], [], []]
for name, num in transliterated_names_to_ps_id.keys():
    matches = process.extract(name, census_codes_to_names)
    for i in range(0, len(all_matches)):
        all_matches[i].append(matches[i][0])
        all_codes[i].append(matches[i][2])

#### Create an excel file with transliterated village name and top 5 matches with the last column 'correct_match_number' to be manually filled

In [34]:
transliterated_names = [] 
bools = []
for name, boolean in transliterated_names_to_ps_id.keys():
    transliterated_names.append(name)
    bools.append(boolean)
allMatchesDf = pd.DataFrame({"transliterated_village_name": transliterated_names,
                             "ps_id": list(transliterated_names_to_ps_id.values()),
                             "match 1": all_matches[0], 
                             "match 2": all_matches[1], 
                             "match 3": all_matches[2],
                             "match 4": all_matches[3],
                             "match 5": all_matches[4]
                            })
correct_match_numbers = []
for name in transliterated_names:
    correct_match_numbers.append(0)
    
allMatchesDf['correct_match_number'] = correct_match_numbers
allMatchesDf['code 1'] = all_codes[0]
allMatchesDf['code 2'] = all_codes[1]
allMatchesDf['code 3'] = all_codes[2]
allMatchesDf['code 4'] = all_codes[3]
allMatchesDf['code 5'] = all_codes[4]
allMatchesDf['only_main'] = bools
allMatchesDf.to_excel(all_matches_path)