In [1]:
%pip install --upgrade pymupdf PyPDF2

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
from PyPDF2 import PdfReader
import re,fitz
import pandas as pd
import os

In [2]:
ocr = True
if ocr:
    from tempfile import TemporaryDirectory
    import pytesseract
    from pdf2image import convert_from_path
    from PIL import Image

ModuleNotFoundError: No module named 'pytesseract'

In [6]:
#CHANGE FILENAME TO THE NEW ONE
FILENAME = 'ProvasSemImagem/ENEM_2021_P1_CAD_11_DIA_2_LARANJA_LEDOR.pdf'
OUTNAME = 'Data/ENEM_2021_P1_CAD_11_DIA_2_LARANJA_LEDOR.csv'

In [7]:
class EnemAutomata:
    def __init__(self):
        self.state = 0
        self.question = {}
        self.state_dict = {
            0:'header',
            1:'body',
            'A':'alternative A',
            'B':'alternative B',
            'C':'alternative C',
            'D':'alternative D',
            'E':'alternative E',
            'R':'Essay'
        }
    def clear_memory(self):
        self.state = 0
        self.question = {}
        
    def letter_state(self,current_state,next_state,part):
        if part.strip() != next_state:
            if current_state not in self.question:
                self.question[current_state] =''
            self.question[current_state]+=(part.strip('\n')+' ').replace('  ',' ')
        else:
            self.state = next_state
        
    def read(self,part):
        #final state
        if self.state=='E' and (re.search('questão [\d]+',part.lower()) or re.search('ENDOFENEM',part) or re.search('\*.*\*',part)):
            ret = self.question.copy()
            self.clear_memory()
            return ret
        elif re.search('\*.*\*',part.lower()) and self.state == 'R':
            ret = self.question.copy()
            self.clear_memory()
            return ret
        
        if not part:
            return False
        elif re.search('questão [\d]+',part.lower()) and self.state == 0:
            self.question['question'] = part.strip()
            self.state = 1
        elif re.search('instruções para a redação',part.lower()) and self.state == 0:
            self.question['question'] = 'redação'
            self.question['body'] = ''
            self.question['A'] = ''
            self.question['B'] = ''
            self.question['C'] = ''
            self.question['D'] = ''
            self.question['E'] = ''
            self.state = 'R'
        elif self.state =='R':
            self.question['body']+=(part.strip('\n')+' ').replace('  ',' ')
        elif self.state == 1 and part.strip() != 'A':
            if 'body' not in self.question:
                self.question['body'] = ''
            self.question['body']+=(part.strip('\n')+' ').replace('  ',' ')
        elif self.state==1:
            self.state = 'A'
        elif self.state != 0 and self.state != 1:
            self.letter_state(self.state,chr(ord(self.state)+1),part)
        return False            

In [8]:
class OCRAutomata:
    def __init__(self):
        self.state = 0
        self.question = {}
        self.state_dict = {
            0:'header',
            1:'body',
            'A':'alternative A',
            'B':'alternative B',
            'C':'alternative C',
            'D':'alternative D',
            'E':'alternative E',
            'R':'Essay'
        }
    def clear_memory(self):
        self.state = 0
        self.question = {}
        self.question['question'] = ''
        self.question['body']=''
        self.question['A'] = None
        self.question['B'] = None
        self.question['C'] = None
        self.question['D'] = None
        self.question['E'] = None
        
    def read(self,part):
        #final state
        if self.state==1 and (re.search('questão [\d]+',part.lower()) or re.search('ENDOFENEM',part) or re.search('\*.*\*',part) or re.search('instruções para a redação',part.lower())):
            ret = self.question.copy()
            self.clear_memory()
            return ret
        elif re.search('\*.*\*',part.lower()) and self.state == 'R':
            ret = self.question.copy()
            self.clear_memory()
            return ret
        if not part:
            return False
        elif re.search('questão [\d]+',part.lower()) and self.state == 0:
            self.question['question'] = re.search('questão [\d]+',part.lower()).group()
            self.state = 1
        elif re.search('instruções para a redação',part.lower()) and self.state == 0:
            self.question['question'] = 'redação'
            self.question['body'] = ''
            self.question['A'] = None
            self.question['B'] = None
            self.question['C'] = None
            self.question['D'] = None
            self.question['E'] = None
            self.state = 'R'
        elif self.state =='R':
            self.question['body']+=(part.strip('\n')+' ').replace('  ',' ')
        elif self.state == 1:
            if 'body' not in self.question:
                self.question['body'] = ''
            self.question['body']+=(part.strip('\n')+' ').replace('  ',' ')
        return False            

In [9]:
class PhysicalEnemParser:
    def __init__(self,enem_object,engine='pypdf2'):
        self.enem_object = enem_object
        self.engine=engine
        parts = []
        if engine=='pymupdf':
            for page_num in range(1,len(enem_object)):
                page = enem_object[page_num]
                image_list = page.get_images(full=True)
                to_remove = []
                for image in image_list:
                    bbox = page.get_image_bbox(image)
                    tb = page.get_textbox(bbox)
                    to_remove.extend(tb.split('\n'))
                page_text = page.get_text().split('\n')
                for text in page_text:
                    if text not in to_remove:
                        parts.append(text)
        if engine =='pypdf2':
            def visitor_body(text, cm, tm, fontDict, fontSize):
                parts.append(text)

            for page in enem_object.pages:
                page.extract_text(visitor_text=visitor_body)
        #LINUX only
        if engine =='OCR':
            language_config = r'-l por --psm 1'
            # Path of the Input pdf
            PDF_file = enem_object
            
            # Store all the pages of the PDF in a variable
            image_file_list = []
            
            with TemporaryDirectory() as tempdir:
                # Create a temporary directory to hold our temporary images.
                pdf_pages = convert_from_path(PDF_file, 500)
                
                # Iterate through all the pages stored above
                for page_enumeration, page in enumerate(pdf_pages, start=1):        
                    # Create a file name to store the image
                    filename = f"{tempdir}\page_{page_enumeration:03}.jpg"
                    # Save the image of the page in system
                    page.save(filename, "JPEG")
                    image_file_list.append(filename)
                parsed = ''
                # Iterate from 1 to total number of pages
                for image_file in image_file_list:
                # Recognize the text as string in image using pytesserct
                    text = str(((pytesseract.image_to_string(Image.open(image_file),config=language_config))))
                    parsed+=text
                parts = parsed.split('\n')
        self.parts=parts

    def parse_questions(self):
        self.automata = EnemAutomata()
        if self.engine=='OCR':
            self.automata = OCRAutomata()
        questions=[]
        for part in self.parts:
            accept = self.automata.read(part)
            while accept:
                questions.append(accept)
                accept = self.automata.read(part)
        return questions

In [21]:
files = list(os.listdir('ProvasPre2016'))
files = [f for f in files if not '2009' in f]
for fileno in files:
    FILENAME = 'ProvasPre2016/'+fileno
    OUTNAME = 'Data/'+fileno.strip('.pdf')+'.csv'
    enem = PdfReader(FILENAME)
    parser = PhysicalEnemParser(enem,engine='pypdf2')
    questions = parser.parse_questions()
    df = pd.DataFrame(questions)
    essay_instructions = df.loc[df['question']=='redação','body']
    if not essay_instructions.empty:
        essay_instructions = essay_instructions.iloc[0]
        try:
            df.loc[df['question']=='redação','body'] = re.sub(r'PROPOSTA DE REDAÇÃO.*?(?!(TEXTO))','',essay_instructions) + re.search(r'PROPOSTA DE REDAÇÃO.*?(?=TEXTO)',essay_instructions).group(0)
        except:
            pass
    for column in df.columns:
        df[column] = df[column].apply(lambda x: x.replace('\t',' ').replace('  ',' ').strip())
    df.to_csv(OUTNAME,index=False)

In [19]:
fileno

'dia1_caderno3_branco_ledor_2009.pdf'

In [20]:
parser.parts

['',
 '',
 '',
 ' \n',
 '   \n',
 ' \n',
 ' \n',
 'LEDOR',
 '  ',
 ' \n   \n \n \nLEDOR  ',
 '',
 '',
 '',
 '',
 '',
 '',
 'EXAME NACIONAL DO ENSINO MÉDIO\n',
 '2009',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '\n',
 'Caderno',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '\n',
 