<h2>Installing tesseract and pyPDF4:</h2>

In [0]:
!apt-get install python-dev libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \
flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig libpulse-dev

In [0]:
!pip install textract

In [0]:
!pip install pyPDF4

<h2>Final Model </h2>

In [0]:
## The packages below need to be installed before running this cell
import requests
import pandas as pd
import re
import textract
import PyPDF4
from PyPDF4 import PdfFileReader, PdfFileWriter

<h2>1.Downloading the pdf from target url:</h2>

In [0]:
### downloading the pdf

url = 'http://ceo.karnataka.gov.in/finalrolls_2020/English/MR/AC163/S10A163P1.pdf'

## sending the http request; the timeout keeps the cell from hanging forever
## when the server does not answer
response = requests.get(url, timeout=60)

## fail loudly on a 4xx/5xx answer so that an error page is never saved as the pdf
response.raise_for_status()
res = response.content

## writing the response body to a file
## give your own path below
with open("/content/downloaded.pdf","wb") as f:
    f.write(res)

<h2>2.Splitting the pdf into pages (individual pdfs):</h2>

In [0]:
## Reason for splitting the main pdf into individual pdfs:
## if the main pdf is split into multiple pdfs (one per page), the output is much easier to debug,
## because any single page can then be analyzed on its own.

In [0]:
## Splitting the main pdf into one pdf per page (page_0.pdf, page_1.pdf, ...).
## Below path should be same as the path the pdf was downloaded to
pdf_file_path = "/content/downloaded.pdf"

## Directory the per-page pdfs are written to. It is spelled out explicitly because
## the extraction step reads '/content/page_<n>.pdf'; a bare file name would only
## end up there when the notebook happens to run with /content as working directory.
pages_output_dir = "/content"

## Creating PdfFileReader object
pdf_reader =  PdfFileReader(pdf_file_path)

## Getting total number of pages in main-pdf
total_num_of_pages = pdf_reader.getNumPages()

## iterating over all pages in main-pdf
for each_page in range(total_num_of_pages):
    ## a fresh writer per page, so every output file holds exactly one page
    pdf_writer = PdfFileWriter()
    pg = pdf_reader.getPage(each_page)
    pdf_writer.addPage(pg)

    ## output path
    output_file_name = f'{pages_output_dir}/page_{each_page}.pdf'

    ## writing each page as pdf file(page_0,page_1,page_2.......)
    with open(output_file_name, 'wb') as fl:
        pdf_writer.write(fl)


<h2>3.Extracting Names from pdfs:</h2>

In [20]:
## Extracting both the person's name and the relative's name from each page_<n>.pdf
total_pages = total_num_of_pages
final_names_list = []
final_father_names_list = []
final_page_numbers_list = []

## Labels that can start the line holding the relative's name. "Other's Name" is
## listed too, so that the line check below agrees with the split pattern built from it.
relation_labels = ("Father's Name", "Mother's Name", "Husband's Name", "Other's Name")
relation_pattern = "|".join(relation_labels)


def _strip_separators(raw_line):
  """Return raw_line with every ':', '=' and '-' character removed."""
  return raw_line.replace(':', '').replace('=', '').replace('-', '')


## iterating over all the per-page pdfs
for each_page in range(total_pages):

  names_list = []
  father_names_list = []
  page_numbers_list = []
  ## Below input_path should be same as output path in above module(i.e splitting pdf)
  input_path = '/content/page_'+str(each_page)+'.pdf'

  ## textract runs tesseract on the page; its output is a byte-stream
  page_data = textract.process(input_path, method='tesseract' , language='eng')

  ## converting the byte-stream to a string so that string operations can be applied
  total_page_data = page_data.decode('utf-8')

  ## splitting the page text line by line
  total_page = total_page_data.splitlines()

  ## every line is examined together with the line that follows it
  for each_line in range(len(total_page)-1):
    raw_next_line = total_page[each_line+1]

    ## replacing the unwanted characters with empty string
    first_line = _strip_separators(total_page[each_line])
    next_line = _strip_separators(raw_next_line)

    ## cond-1: the first line has to start with "Name"
    if not first_line.startswith('Name'):
      continue

    ## cond-2: the following line has to start with one of the relation labels
    if not raw_next_line.startswith(relation_labels):
      continue

    ## cond-3: both lines must contain the word "Name" equally often,
    ## otherwise persons and relatives cannot be matched up
    if first_line.count('Name') != next_line.count('Name'):
      continue

    ## person names are the non-empty pieces between the "Name" labels
    line_names = [piece.strip() for piece in first_line.split("Name") if piece != '']

    ## relative names are the non-empty pieces between the relation labels
    line_relatives = [piece.strip() for piece in re.split(relation_pattern, next_line) if piece != '']

    ## A garbled label can still leave a different number of pieces on the two lines.
    ## Such a pair is skipped: appending it would shift every later name against its
    ## relative and leave the three result lists with different lengths.
    if len(line_names) != len(line_relatives):
      continue

    names_list.extend(line_names)
    father_names_list.extend(line_relatives)
    page_numbers_list.extend([each_page] * len(line_relatives))

  final_names_list.append(names_list)
  final_father_names_list.append(father_names_list)
  final_page_numbers_list.append(page_numbers_list)
  print("Number of pages completed {0}".format(each_page))

Number of pages completed 0
Number of pages completed 1
Number of pages completed 2
Number of pages completed 3
Number of pages completed 4
Number of pages completed 5
Number of pages completed 6
Number of pages completed 7
Number of pages completed 8
Number of pages completed 9
Number of pages completed 10
Number of pages completed 11
Number of pages completed 12
Number of pages completed 13
Number of pages completed 14
Number of pages completed 15
Number of pages completed 16
Number of pages completed 17
Number of pages completed 18
Number of pages completed 19
Number of pages completed 20
Number of pages completed 21
Number of pages completed 22
Number of pages completed 23
Number of pages completed 24
Number of pages completed 25
Number of pages completed 26
Number of pages completed 27
Number of pages completed 28
Number of pages completed 29
Number of pages completed 30
Number of pages completed 31
Number of pages completed 32
Number of pages completed 33
Number of pages complete

In [0]:
### Blank out non-ascii values (illegal characters)
def non_ascii(text):
  """Return text with every character whose code point is 128 or above replaced by a space."""
  cleaned_chars = []
  for char in text:
    if ord(char) < 128:
      cleaned_chars.append(char)
    else:
      cleaned_chars.append(" ")
  return "".join(cleaned_chars)

In [0]:
def preprocess(str1):
  """Normalise one OCR-extracted name.

  The value is converted to a lower-case string, the titles "mr.", "mrs." and
  "dr." are removed, dots become spaces, the characters \\ ' $ _ | ; and all
  digits are dropped, non-ascii characters are blanked out via non_ascii(),
  and finally all whitespace is normalised.

  Parameters
  ----------
  str1 : any value; it is passed through str() first.

  Returns
  -------
  str
      The cleaned name with single spaces between words and no leading or
      trailing whitespace, e.g. "Mr. John  D'souza 12" -> "john dsouza".
  """
  str1 = str(str1).lower()

  ## removing titles such as mr., mrs., dr. -- only where they start a word, so that
  ## a name which merely ends in those letters (e.g. "chandr.") keeps its tail
  str1 = re.sub(r"\b(?:mrs|mr|dr)\.", '', str1)

  ## remaining dots separate initials, so they become spaces
  str1 = str1.replace(".", ' ')

  ## removing some special chars
  for unwanted_char in ("\\", "'", "$", "_", "|", ";"):
    str1 = str1.replace(unwanted_char, '')

  ## removing digits if any present in names
  str1 = ''.join(char for char in str1 if not char.isdigit())

  ## Converting to ascii format(removing illegal characters if any present )
  str1 = non_ascii(str1)

  ## collapsing every run of whitespace to one space and trimming both ends;
  ## a single replace("  ", " ") would leave double spaces behind in longer runs
  return " ".join(str1.split())

In [0]:
## Flattening the per-page lists into one list each; names are cleaned on the way.
all_names_list = []
for names_on_page in final_names_list:
  all_names_list.extend(preprocess(raw_name) for raw_name in names_on_page)

all_father_names_list = []
for relatives_on_page in final_father_names_list:
  all_father_names_list.extend(preprocess(raw_name) for raw_name in relatives_on_page)

all_page_numbers_list = []
for numbers_on_page in final_page_numbers_list:
  all_page_numbers_list.extend(numbers_on_page)

<h3> Splitting the name into first, middle and last name:</h3>

In [0]:
## *NOTE* : The names do not all follow one format: some are written as first name, last name and
## some are written as last name, first name.
## Here every name is read as first name, middle name, last name.

def split_name(all_names_list):
  """Split every full name into first, middle and last name.

  Each name is split on whitespace and its words are assigned by position:

  * 3 words -> first, middle, last
  * 2 words -> first, last    (middle is 'na')
  * 1 word  -> first          (middle and last are 'na')
  * any other word count (no words at all, or more than 3) -> all three are 'na'

  Parameters
  ----------
  all_names_list : iterable of str
      The full names to split.

  Returns
  -------
  list of dict
      One dict per input name, in input order, with the keys
      "first_name", "middle_name" and "last_name".
  """
  final_all_names_list = []

  ## iterating over all the names
  for each_name in all_names_list:
    each_name_list = each_name.split()

    ## 'na' marks every part that cannot be filled in
    first_name = middle_name = last_name = 'na'

    if len(each_name_list) == 3:
      first_name, middle_name, last_name = each_name_list
    elif len(each_name_list) == 2:
      first_name, last_name = each_name_list
    elif len(each_name_list) == 1:
      first_name = each_name_list[0]
    ## empty names and names with more than 3 words keep 'na' everywhere

    final_all_names_list.append({
        "first_name": first_name,
        "middle_name": middle_name,
        "last_name": last_name,
    })
  return final_all_names_list

<h2>4.Write data into CSV file:</h2>

In [45]:
## column name -> values, one entry per extracted person
d = {
    'Full_name': all_names_list,
    'Relative_name': all_father_names_list,
    "Page_number": all_page_numbers_list,
}

## df1 holds the columns Full_name, Relative_name and Page_number
df1 = pd.DataFrame(d)

## df2 holds the columns first_name, middle_name and last_name
name_parts = split_name(all_names_list)
df2 = pd.DataFrame(name_parts)

## df3 puts both frames side by side (matched on the row index), which gives all six columns:
## first_name, middle_name, last_name, Full_name, Relative_name, Page_number
df3 = df2.join(df1)
print(df3)

## give your path to save the csv file
df3.to_csv('/content/Final_data.csv')

    first_name middle_name  ...     Relative_name Page_number
0        meera       mohan  ...   mohan bharavani           2
1      brijaya           n  ...  narain das kalro           2
2           cg        arun  ...   c g gunashekarn           2
3        henna          na  ...    brijay n kalro           2
4         yash           b  ...    brijay n kalro           2
..         ...         ...  ...               ...         ...
992      tarun          na  ...        roy mammen          42
993    sandhya          na  ...            ananth          42
994         kb          na  ...        kk bopanna          42
995     miriam          na  ...        roy mammen          42
996     aaliya          na  ...     nazir hussain          42

[997 rows x 6 columns]
