# In [1]:  (Jupyter notebook cell marker)
import os
import subprocess
import uuid
import requests
import json
import time
import datetime
import csv
from copy import deepcopy
from cStringIO import StringIO

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

from tabula import read_pdf


class CIBILReportRawData(object):
    """Obtain the raw data from a (possibly password-protected) CIBIL report PDF.

    On construction the PDF at ``pdf_path`` is processed three ways and the
    results are stored on the instance:

    * ``pdf_json``       -- tabula's JSON output for all pages.
    * ``raw_table_data`` -- list of rows, each a list of the non-empty cell
      texts found in ``pdf_json``.
    * ``pdf_text``       -- plain text extracted with pdfminer from a copy of
      the PDF decrypted by the ``qpdf`` command-line tool.

    Every step that needs the password tries, in order, the password as
    given, an empty password, and the upper-cased password.
    """

    # Password variants, in the order they are attempted. The spelling of
    # 'capilatized' is kept as-is because it is the key used throughout.
    _PASSWORD_TYPES = ('original', 'empty', 'capilatized')

    def __init__(self, pdf_path, password=''):
        """Read the PDF and populate ``pdf_json``, ``raw_table_data`` and ``pdf_text``.

        :param pdf_path: path of the PDF report to read.
        :param password: password protecting the PDF ('' when there is none).
        """
        self.pdf_path = pdf_path
        self.password = password
        self.tabula_params = {
            'pages': 'all',
            'guess': True,
            'pandas_options': {
                'error_bad_lines': False
            },
            'password': self.password,
            'output_format': 'json',
        }
        # Order matters: raw_table_data is derived from pdf_json.
        self.pdf_json = self.__get_pdf_json()
        self.raw_table_data = self.__get_raw_table_data()
        self.pdf_text = self.__get_pdf_text()

    def __resolve_password(self, password_type):
        """Return the password variant selected by ``password_type``.

        'empty' gives '', 'capilatized' gives the upper-cased password and
        anything else (including 'original') gives the password unchanged.
        """
        if password_type == 'empty':
            return ''
        if password_type == 'capilatized':
            return self.password.upper()
        return self.password

    def __get_tabula_params(self, password_type='original'):
        """Return the keyword arguments for ``read_pdf`` for a password variant.

        For 'empty' and 'capilatized' a deep copy of ``self.tabula_params``
        with the password replaced is returned; for any other value
        ``self.tabula_params`` itself is returned.
        """
        if password_type not in ('empty', 'capilatized'):
            return self.tabula_params
        tabula_params = deepcopy(self.tabula_params)
        tabula_params['password'] = self.__resolve_password(password_type)
        return tabula_params

    def __get_pdf_json(self):
        """Return tabula's JSON for the PDF, falling back through password variants.

        Failures with the original and the empty password are swallowed so
        the next variant can be tried; a failure with the last variant
        (upper-cased password) propagates to the caller.
        """
        for password_type in self._PASSWORD_TYPES[:-1]:
            try:
                return read_pdf(
                    self.pdf_path, **self.__get_tabula_params(password_type))
            except Exception:
                # Deliberate best-effort: any failure just moves on to the
                # next password variant.
                continue
        return read_pdf(
            self.pdf_path,
            **self.__get_tabula_params(self._PASSWORD_TYPES[-1]))

    def __get_decrypted_pdf_path(self):
        """Return the path where the decrypted copy of the PDF is written.

        A trailing '.pdf' extension (any letter case) is replaced by
        '_decrypted.pdf'; otherwise '_decrypted.pdf' is appended to the path.
        """
        root, extension = os.path.splitext(self.pdf_path)
        if extension.lower() == '.pdf':
            return root + '_decrypted.pdf'
        return self.pdf_path + '_decrypted.pdf'

    def __pdf_decryption(self, password_type='original'):
        """Decrypt the PDF with qpdf using the selected password variant.

        :param password_type: 'original', 'empty' or 'capilatized'.
        :return: True when qpdf exits with status 0, False otherwise
            (including when qpdf cannot be started).
        """
        # Argument list without a shell: the password and the paths are
        # passed verbatim, so spaces or shell metacharacters in them can
        # neither break nor inject into the command.
        decrypt_command = [
            'qpdf',
            '--password={password}'.format(
                password=self.__resolve_password(password_type)),
            '--decrypt',
            self.pdf_path,
            self.__get_decrypted_pdf_path(),
        ]
        try:
            return subprocess.call(decrypt_command) == 0
        except (OSError, ValueError, TypeError):
            # qpdf missing / not executable, or an argument that cannot be
            # passed to a process: treated as a failed decryption attempt.
            return False

    def __get_pdf_text(self):
        """Return the text of the PDF, extracted from a decrypted temporary copy.

        Each password variant is tried until qpdf succeeds. Text extraction
        is attempted even when no variant reported success, so a missing
        decrypted file surfaces as the error raised by ``open``. The
        decrypted copy is removed afterwards, also when extraction fails.
        """
        pdf_path_decrypt = self.__get_decrypted_pdf_path()
        try:
            for password_type in self._PASSWORD_TYPES:
                if self.__pdf_decryption(password_type):
                    break
            return self.__pdf_to_text(pdf_path_decrypt)
        finally:
            try:
                os.remove(pdf_path_decrypt)
            except OSError:
                # Nothing to clean up (e.g. decryption never produced a file).
                pass

    def __get_raw_table_data(self):
        """Return the non-empty cell texts of ``self.pdf_json``, grouped by row.

        Each entry of ``pdf_json`` is read through its 'data' key (a list of
        rows, each a list of cell dicts); cells with a falsy 'text' and rows
        left without any text are dropped.
        """
        rows_data_list = []
        for data_dict in self.pdf_json:
            for rows_data in data_dict.get('data', []):
                row_data_list = [
                    cell['text'] for cell in rows_data if cell.get('text')]
                if row_data_list:
                    rows_data_list.append(row_data_list)
        return rows_data_list

    def __pdf_to_text(self, pdf_path_decrypt):
        """Return the text pdfminer extracts from the PDF at ``pdf_path_decrypt``.

        :param pdf_path_decrypt: path of an unencrypted PDF file.
        :raises IOError: when the file cannot be opened.
        """
        output = StringIO()
        try:
            manager = PDFResourceManager()
            converter = TextConverter(manager, output, laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(manager, converter)
                with open(pdf_path_decrypt, 'rb') as infile:
                    # Empty page-number set: no page filter is requested
                    # (per pdfminer's get_pages, every page is yielded).
                    for page in PDFPage.get_pages(infile, set()):
                        interpreter.process_page(page)
            finally:
                converter.close()
            return output.getvalue()
        finally:
            output.close()