In [1]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

In [2]:
# Open the source PDF in binary mode.
# The handle has to stay open after this cell: a later cell passes `pdf_file`
# to PDFPage.get_pages to read pages from it.
# NOTE(review): no visible cell closes this handle — consider closing it once
# page extraction is finished.
pdf_file = open('./finance_data/pdf/2014', 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(pdf_file)
# Create a PDF document object that stores the document structure.
# NOTE(review): the second argument is presumably the document password
# (empty string here) — confirm against the installed pdfminer version.
document = PDFDocument(parser, '')
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()

In [3]:
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTTextBoxHorizontal, LTRect
from pdfminer.converter import PDFPageAggregator
from pdfminer.converter import TextConverter
import operator,re
import pandas as pd

# layout analysis

In [4]:
# Build the layout-analysis pipeline: the page aggregator device collects the
# laid-out objects of each page that the interpreter processes.
device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Reset `layout` first so a value left over from an earlier run of this cell
# can never be mistaken for the result of this run.
layout = None
# `[9]` restricts extraction to that single page number
# (pdfminer counts pages from zero).
for page in PDFPage.get_pages(pdf_file,[9]):
    interpreter.process_page(page)
    # Receive the LTPage object for the page.
    layout = device.get_result()
# Fail here, with a clear message, instead of with a NameError (or silently
# stale data) in the cells that consume `layout`.
if layout is None:
    raise ValueError('page 9 was not found in the PDF; no layout to analyse')

In [5]:
# Split the page's layout objects into two raw collections:
#   rawBoxAxis - the bbox of every rectangle (LTRect)
#   rawText    - [centre_x, centre_y, stripped text] for every horizontal text box
rawBoxAxis =[]
rawText = []
# Iterate the layout container itself instead of reaching into its private
# `_objs` attribute; the container yields the same child objects.
for obj in layout:
    if isinstance(obj,LTRect):
        rawBoxAxis.append(obj.bbox)
    elif isinstance(obj,LTTextBoxHorizontal):
        text_content = obj.get_text()
        text_bang = text_content.strip()
        # Centre point of the text box. Each corner coordinate is rounded to a
        # whole number before averaging, exactly as the original expression did.
        center_x = (int(round(obj.x0))+int(round(obj.x1)))/2
        center_y = (int(round(obj.y0))+int(round(obj.y1)))/2
        rawText.append([center_x,center_y,text_bang])

In [6]:
# Column names for the rectangle bounding boxes collected in rawBoxAxis.
labels = ['x0', 'y0', 'x1', 'y1']
# One row per collected rectangle; each record supplies the four columns above.
df = pd.DataFrame.from_records(rawBoxAxis, columns=labels)

In [7]:
# Keep only the first occurrence of rows that are identical in every column.
df = df.drop_duplicates()

In [8]:
# Preview the boxes ordered by y1 descending, then x0 ascending, with a fresh
# 0..n-1 index. The sorted copy is only displayed: it is not assigned, so `df`
# itself keeps its current row order and index.
# NOTE(review): presumably this reads as top-to-bottom, left-to-right on the
# page — confirm the PDF coordinate orientation.
df.sort_values(['y1', 'x0'], ascending=[False, True]).reset_index(drop = True)

Unnamed: 0,x0,y0,x1,y1
0,50.00,660.0,178.12,706.0
1,178.12,691.0,453.49,706.0
2,453.49,660.0,499.25,706.0
3,499.24,660.0,545.00,706.0
4,178.12,660.0,224.29,691.0
5,224.29,660.0,270.46,691.0
6,270.46,660.0,316.22,691.0
7,316.22,660.0,361.98,691.0
8,361.97,660.0,407.73,691.0
9,407.73,660.0,453.49,691.0


In [9]:
# Column names for the text records collected in rawText:
# centre x, centre y, and the stripped text of each text box.
labels_text = ['text_x','text_y','txt']
# One row per collected text box.
dfText = pd.DataFrame.from_records(rawText,columns = labels_text)

In [10]:
# Preview the text boxes ordered by text_y descending, then text_x ascending,
# with a fresh index. The sorted copy is only displayed; `dfText` is unchanged.
dfText.sort_values(['text_y','text_x'], ascending=[False, True]).reset_index(drop = True)

Unnamed: 0,text_x,text_y,txt
0,207.5,780.5,연 결 자 본 변 동 표
1,297.5,750.0,제48기 2015년 1월 1일부터 2015년 12월 31일까지
2,297.5,734.0,제47기 2014년 1월 1일부터 2014년 12월 31일까지
3,519.5,718.5,(단위 : 백만원)
4,114.5,718.0,현대자동차주식회사와 그 종속기업
5,315.5,698.5,지배기업 소유주지분
6,114.0,683.5,과 목
7,293.5,683.5,기타자본항
8,339.0,683.5,기타포괄
9,476.5,683.5,비지배지분


In [11]:
def whrAmI(x0,y0,x1,y1,textDfIn,textSrx0,textSry0) :
    """Return the text of the first row whose point lies inside a box.

    Parameters
    ----------
    x0, y0 : lower bounds of the box on the x and y axes (inclusive).
    x1, y1 : upper bounds of the box on the x and y axes (inclusive).
    textDfIn : DataFrame holding the candidate rows; the returned value is
        read from its third column (position 2).
    textSrx0 : Series of x coordinates, aligned row-for-row with ``textDfIn``.
    textSry0 : Series of y coordinates, aligned row-for-row with ``textDfIn``.

    Returns
    -------
    The third-column value of the first row of ``textDfIn`` whose x lies in
    [x0, x1] and whose y lies in [y0, y1], or the string ``'NULL'`` when no
    row matches.
    """
    # `between` includes both end points by default. The explicit
    # `inclusive=True` was deprecated in pandas 1.3 and is rejected by
    # pandas 2.x, so relying on the default keeps this working everywhere.
    inside_box = textSrx0.between(x0, x1) & textSry0.between(y0, y1)
    # Apply the mask once and reuse the filtered frame.
    matches = textDfIn[inside_box]
    if matches.empty :
        return 'NULL'
    return matches.iloc[0,2]

In [12]:
# Label every box row with the text that whrAmI finds inside it
# (whrAmI yields 'NULL' when no text point falls inside the box).
df['Contents'] = df.apply(
    lambda box: whrAmI(
        box['x0'], box['y0'], box['x1'], box['y1'],
        dfText, dfText['text_x'], dfText['text_y'],
    ),
    axis=1,
)

In [13]:
# Show up to the first 20 boxes together with their matched 'Contents' text.
df.head(20)

Unnamed: 0,x0,y0,x1,y1,Contents
0,50.0,660.0,178.12,706.0,과 목
1,178.12,691.0,453.49,706.0,지배기업 소유주지분
2,453.49,660.0,499.25,706.0,비지배지분
3,499.24,660.0,545.0,706.0,총 계
4,178.12,660.0,224.29,691.0,자 본 금
5,224.29,660.0,270.46,691.0,자본잉여금
6,270.46,660.0,316.22,691.0,기타자본항
7,316.22,660.0,361.98,691.0,기타포괄
8,361.97,660.0,407.73,691.0,이익잉여금
9,407.73,660.0,453.49,691.0,소계


# Sessionization

In [None]:
# Put the previous row's y1 next to each row so consecutive boxes can be
# compared; the first row has no predecessor and gets NaN.
df['prv_y1'] = df['y1'].shift(1)

In [None]:
# Flag rows whose y1 is smaller than the previous row's y1 (1 = flagged, 0 = not).
# For the first row prv_y1 is NaN, the comparison is False, and the flag is 0.
df['new_session'] = (df['y1']-df['prv_y1'] < 0).astype(int)

In [None]:
# Inspect the first 10 rows, including the prv_y1 and new_session columns.
df.head(10)

In [None]:
# Running total of the new_session flags: the id goes up by one at every
# flagged row, so all rows between two flags share the same session_id.
df['session_id'] = df['new_session'].cumsum()

In [None]:
# Display a copy of df with a fresh 0..n-1 index.
# NOTE(review): the result is not assigned, so `df` keeps its existing index —
# confirm whether `df = df.reset_index(drop=True)` was intended.
df.reset_index(drop = True)

In [None]:
# Show up to the first 20 rows with the session columns added above.
df.head(20)

In [None]:
# Spot-check a single cell: first row, second column by position
# (y0, given the column order df was created with).
df.iloc[0,1]

In [None]:
# Display the x-centre column of the text boxes.
dfText['text_x']