In [None]:
import io
import glob
import re
import os
import pandas as pd
from tqdm import tqdm
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

The aim of this notebook is to parse the papers' PDF files and extract the text of their abstracts into plain-text files.

**NOTE**: running this notebook requires the installation of the pdfminer2 package. 

In [None]:
# Sample paper used to try out convert_pdf_to_txt (defined just below).
# Alternative sample, kept commented out; swap it in to test another paper:
#pdf_file = 'papers/Zuffi_Lions_and_Tigers_CVPR_2018_paper.pdf'
pdf_file = 'papers/Yu_DoubleFusion_Real-Time_Capture_CVPR_2018_paper.pdf'

def convert_pdf_to_txt(path):
    """Extract the text of a PDF file as a single-line string.

    Every page of the document is run through pdfminer's text converter
    (no password, no page filter, no page limit), and the collected text is
    then flattened.

    Parameters
    ----------
    path : str
        Location of the PDF file to read.

    Returns
    -------
    str
        The extracted text with each newline replaced by a space and every
        occurrence of ``'- '`` removed.

    Raises
    ------
    OSError
        If ``path`` cannot be opened.
    Exception
        Any error raised by pdfminer while parsing is propagated; the file
        handle, the converter and the text buffer are released first.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    # The context managers guarantee the buffer and the file are closed even
    # when pdfminer raises part-way through a document.
    with io.StringIO() as retstr, open(path, 'rb') as fp:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page set and maxpages=0: do not restrict which pages are read.
            pages = PDFPage.get_pages(fp, set(), maxpages=0,
                                      password="",
                                      caching=True,
                                      check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
            # Read the buffer before it is closed on leaving the ``with`` block.
            text = retstr.getvalue()
        finally:
            device.close()

    # Flatten to one line, then drop "- " so that words hyphenated across a
    # line break are joined again. This is a heuristic: it also removes any
    # genuine "- " sequence in the text.
    return text.replace('\n', ' ').replace('- ', '')

# Convert the sample paper to check what the flattened output looks like.
text = convert_pdf_to_txt(pdf_file)

# Bare expression: the notebook shows the extracted string as the cell output.
text

In [None]:
# Papers that are removed by hand before extraction, with the reason for each.
to_delete = [
    # Broken link
    'papers/Chen_Robust_Video_content_cvpr_2018_paper.pdf',
    # Broken link
    'papers/Groueix_A_Papier-Mache_Approach_CVPR_2018_paper.pdf',
    # Broken link
    'papers/Larsson_Beyond_Grobner_Bases_CVPR_2018_paper.pdf',
    # The parser gets stuck
    'papers/Liu_Exploring_Disentangled_Feature_CVPR_2018_paper.pdf',
]

# Delete whichever of those files are currently present on disk.
for stale_pdf in filter(os.path.exists, to_delete):
    os.remove(stale_pdf)

In [None]:
# Extract the raw text from the papers into data/<paper name>.txt.
# Papers whose output file already exists are skipped, so the cell can be
# re-run to resume an interrupted extraction.
papers = sorted(glob.glob('papers/*.pdf'))  # Sorting alphabetically makes
                                            # debugging easier

# Without the output directory every write would fail.
os.makedirs('data', exist_ok=True)

for paper in tqdm(papers):
    # Swap only the extension; a '.pdf' elsewhere in the name is left alone.
    output_file = os.path.join(
        'data/', os.path.splitext(os.path.basename(paper))[0] + '.txt')
    if os.path.exists(output_file):
        continue

    text = convert_pdf_to_txt(paper)
    try:
        # Explicit UTF-8 so the result does not depend on the platform default.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(text)
    except BaseException as err:
        # Never leave a partial file behind: it would be mistaken for a
        # finished extraction and skipped on the next run.
        if os.path.exists(output_file):
            os.remove(output_file)
        if not isinstance(err, Exception):
            # KeyboardInterrupt / SystemExit must still stop the loop.
            raise
        # Best effort: report the failure and carry on with the next paper.
        tqdm.write('Could not write {}: {!r}'.format(output_file, err))

In [None]:
# TODO: create a dataframe with the papers and their text lengths, plot the distribution of lengths and analyse the outliers

In [None]:
# TODO: it should be enough to take what lies between "Abstract" and "1. Introduction", but sometimes,
# especially if there is a figure at the beginning, this may not work.
# For each PDF, extract what lies between the abstract and the introduction, and look at the distribution
# of the abstract lengths to detect odd cases.