# This notebook helps to visualize and label the position of figures in a PDF

For each figure, the following is recorded:

- The page where the Figure can be found
- The x0,x1,y0,y1 position of the figure in the page

All the information above is stored in the `annotation.json` file.
This file has the following structure:
```
     "<ID>": {
        "pdf_path": "<path_to_pdf_on_database>",
        "Watermark": <bool>,
        "<figx>": {
            "p": <page_number>,
            "y0": <position>,
            "y1": <position>,
            "x0": <position>,
            "x1": <position>
        },
        "missing": [<list_of_figures_without_label>
        ],
        "all_figures": [
            <list_of_all_figure_downloaded>
        ]
    },
``` 

Import Cell

In [13]:
# import the necessary packages
import os
from glob import glob
import numpy as np
import fitz
import io
from PIL import Image
import io
import pandas as pd
import cv2
import warnings
warnings.filterwarnings('ignore')
from IPython.display import clear_output
import json
import screeninfo
import pyperclip

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [14]:
# Debug panel
def display_panel(imgs):
    """
    Tile a list of images into one rectangular panel.

    Every image is resized to the smallest shape found in ``imgs``
    (smallest rows + cols), framed with a thin border and placed on a grid
    of ``floor(sqrt(n))`` rows. Grid cells left over are filled with the
    border colour.

    Parameters
    ----------
    imgs: list
        Non-empty list of images in opencv format (numpy arrays).
        NOTE(review): the filler tiles are painted with a 3-value colour,
        so 3-channel images are assumed -- confirm before passing
        grayscale input.

    Return
    ------
    panel: numpy.ndarray
        A single image containing all the input images in a panel layout

    Raises
    ------
    ValueError
        If ``imgs`` is empty.
    """
    total_imgs = len(imgs)
    if total_imgs == 0:
        raise ValueError("display_panel() needs at least one image")

    bordersize = 2
    border_color = [69, 87, 96]

    # The smallest image (by rows + cols) defines the size of every tile
    rows, cols = min((np.sum(im.shape[:2]), im.shape[:2]) for im in imgs)[1]

    # Resize each image to the common size and frame it
    tiles = [
        cv2.copyMakeBorder(
            cv2.resize(im, (cols, rows)),
            top=bordersize,
            bottom=bordersize,
            left=bordersize,
            right=bordersize,
            borderType=cv2.BORDER_CONSTANT,
            value=border_color,
        )
        for im in imgs
    ]

    # Number of images stacked vertically / horizontally in the panel
    nv_imgs = int(np.floor(np.sqrt(total_imgs)))
    nh_imgs = int(np.ceil(total_imgs / nv_imgs))

    # Pad with plain tiles so the grid is a complete rectangle
    for _ in range(nv_imgs * nh_imgs - total_imgs):
        filler = np.zeros_like(tiles[0])
        filler[:] = border_color
        tiles.append(filler)

    # Stack all tiles vertically in one call, then cut the strip into
    # nh_imgs chunks (each nv_imgs tiles tall) and place them side by side
    strip = np.concatenate(tiles, axis=0)
    columns = strip.reshape((nh_imgs, -1) + strip.shape[1:])
    return np.hstack(columns)

def print_pages(doc, output):
    """
    Render every page of ``doc`` and write it to ``<output>/<n>.png``.

    Parameters
    ----------
    doc: iterable of pages
        Each page must provide ``getPixmap(alpha, annots)``; the bytes
        returned by the pixmap's ``getImageData()`` are written verbatim.
    output: str
        Existing directory receiving the files; pages are numbered from 1.
    """
    for page_number, page in enumerate(doc, start=1):
        pixmap = page.getPixmap(alpha=False, annots=False)
        image_bytes = pixmap.getImageData()
        target = f'{output}/{page_number}.png'
        with open(target, 'wb') as png_file:
            png_file.write(image_bytes)

## Checking the figure locations

In [15]:
def generate_imgs_with_bb(pdfpath,fig_state):
    """
    Render every page of a PDF with the annotated figure boxes drawn on it.

    Candidate figure names are the ``fig*`` files (``.txt`` files
    excluded, extension stripped) found in ``<dirname(pdfpath)>/figures/``
    plus every key of ``fig_state`` containing ``'figS'``. Only candidates
    that are keys of ``fig_state`` are drawn, each on the page given by its
    ``'p'`` entry.

    Parameters
    ----------
    pdfpath: str
        Path of the PDF; the extracted/downloaded figures are expected in
        the ``figures`` directory next to it.
    fig_state: dict
        Maps a figure name to a dict with the keys ``'p'`` (1-based page
        number), ``'x0'``, ``'y0'``, ``'x1'`` and ``'y1'``. Keys that are
        not figure names are ignored.

    Return
    ------
    page_figures: list
        One numpy image per page, in page order, with a rectangle and a
        label drawn for every annotated figure of that page
    """
    # os.path.join keeps the path relative when the PDF sits in the working
    # directory (plain string concatenation would produce "/figures/")
    figs_dir = os.path.join(os.path.dirname(pdfpath), "figures")
    # Filter on the real extension: a "[!.txt]" glob class only tests the
    # last character and would also drop names ending in '.', 't' or 'x'
    fig_paths = [p for p in glob(os.path.join(figs_dir, "fig*"))
                 if not p.endswith(".txt")]
    fig_paths.sort()
    # splitext leaves names without an extension untouched
    figs = [os.path.splitext(os.path.basename(p))[0] for p in fig_paths]
    # Adding supplementary figures into figs
    figs.extend(name for name in fig_state if 'figS' in name)
    # A name may come from several files or from both sources; draw it once
    figs = list(dict.fromkeys(figs))

    # Open the document and draw the bounding boxes on each page
    page_figures = []
    doc = fitz.open(pdfpath)
    try:
        for page in doc:
            pn = page.number + 1  # annotations use 1-based page numbers
            page_bytes = page.getPixmap(alpha=False,annots=False).getImageData()
            img_fig = np.array(Image.open(io.BytesIO(page_bytes)))

            for name in figs:
                if name not in fig_state:
                    continue
                fig = fig_state[name]
                if fig['p'] != pn:
                    continue

                # OpenCV needs integer pixel coordinates
                start_point = (int(fig['x0']), int(fig['y0']))
                end_point = (int(fig['x1']), int(fig['y1']))
                # Supplementary figures get their own colour
                if 'figS' in name:
                    color = (40,150,40)
                else:
                    color = (150, 150, 0)
                thickness = 40
                cv2.rectangle(img_fig, start_point, end_point, color, thickness)
                # Label the box at its centre with the name minus "fig"
                label = name.replace("fig","")
                center = (int((fig['x0'] + fig['x1'])/2),
                          int((fig['y0'] + fig['y1'])/2))
                cv2.putText(img_fig, label, center,
                            cv2.FONT_HERSHEY_SIMPLEX, 5, color, 8)
            page_figures.append(img_fig)
    finally:
        # Release the PDF even when rendering or drawing fails
        doc.close()

    return page_figures


# Global variables
# Index into screeninfo.get_monitors() used by pdf_annotation()
screen_id = 0
# Not referenced in the visible part of the notebook
is_color = True
# NOTE(review): "COORECT" looks like a typo of "CORRECT"; left unchanged
# because code outside the visible cells may compare against this string
CLASSES = ["COORECT", "WRONG"]
# Row of annotation.csv / key of annotation.json displayed first by
# pdf_annotation(); the function updates it while navigating
pdf_index = 228

#Functions
def save_df(df):
    """
    Write ``df`` to ``annotation.csv`` in the working directory,
    leaving the index column out.
    """
    csv_path = "annotation.csv"
    df.to_csv(csv_path, index=False)
    
def load_files():
    """
    Read both annotation files from the working directory.

    Return
    ------
    annotation_json: object
        Parsed content of ``annotation.json``
    data: pandas.DataFrame
        Content of ``annotation.csv`` (read with ``index_col=False``)
    """
    with open('annotation.json', 'r') as json_file:
        figure_annotations = json.loads(json_file.read())
    annotation_table = pd.read_csv("annotation.csv", index_col=False)
    return figure_annotations, annotation_table

def _report_current_pdf(annotation_json, index):
    """Print path and missing figures of entry ``index``; copy the path to the clipboard."""
    entry = annotation_json[str(index)]
    print(f"PDF: {entry['pdf_path']}")
    print(f"MISSING: {entry['missing']}")
    pyperclip.copy(entry['pdf_path'])


def pdf_annotation():
    """
    Uses the information on annotation.json to visualize a pdf

    Opens an OpenCV window showing every page of the PDF selected by the
    global ``pdf_index`` with its annotated figure boxes, and reacts to
    the keyboard:

    - ``s``: save annotation.csv, show "SAVED" and wait for any key
    - space: toggle the CHECKED flag of the current PDF and save
    - ``d`` / ``a``: go to the next / previous PDF (wraps around), print
      its path and missing figures and copy the path to the clipboard
    - Esc: leave the loop

    Both annotation files are reloaded and the pages re-rendered on every
    loop iteration. NOTE(review): presumably so that edits made to
    annotation.json in another program show up immediately -- confirm
    before caching anything here.

    The window is destroyed when the loop ends, including when an
    exception escapes it.
    """
    global pdf_index
    global refPt
    global cont

    screen = screeninfo.get_monitors()[screen_id]
    window_name = 'Class Annotation'

    try:
        cv2.namedWindow(window_name, cv2.WND_PROP_FULLSCREEN)
        cv2.moveWindow(window_name, screen.x - 1, screen.y - 1)
        cv2.setWindowProperty(window_name, int(cv2.WND_PROP_FULLSCREEN/2),int(cv2.WINDOW_FULLSCREEN/2))

        # Not read in this function; kept because they are module globals
        # that code outside this block may rely on
        cont = 1
        refPt = [None,None]

        while True:
            annotation_json, annotated_data = load_files()
            pdf_path = annotated_data.loc[pdf_index]['PDFPATH']
            figures_state = annotation_json[str(pdf_index)]
            checked = annotated_data.loc[pdf_index]['CHECKED']

            page_list = generate_imgs_with_bb(pdf_path,figures_state)
            image = display_panel(page_list)
            # Pages are converted to OpenCV's BGR channel order for imshow
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

            # Status banner: green when checked, red otherwise (BGR)
            text = "Checked = %s "% str(checked)
            color = (20,200,20) if checked else (20,20,200)
            cv2.putText(image, "{} {}/{} ".format(text, pdf_index,len(annotated_data)), (100, 200),
                        cv2.FONT_HERSHEY_TRIPLEX, 5, color, 10)
            cv2.imshow(window_name, image)

            key = cv2.waitKey(5)  # -1 when no key was pressed

            if key == ord('s'):
                save_df(annotated_data)
                text = "SAVED"
                color = (0,0,255)
                cv2.putText(image, "{} {}/{} ".format(text, pdf_index,len(annotated_data)), (100, 200),
                            cv2.FONT_HERSHEY_DUPLEX, 0.8, color, 2)
                cv2.imshow(window_name, image)
                # Hold the confirmation on screen until any key is pressed
                cv2.waitKey(0)

            # Change label
            elif key == ord(' '):
                print("CHANGE")
                annotated_data.iloc[pdf_index, annotated_data.columns.get_loc('CHECKED')] = not checked
                save_df(annotated_data)

            # Keymap for forward and backward
            elif key in (ord('d'), ord('a')):
                step = 1 if key == ord('d') else -1
                pdf_index = (pdf_index + step) % len(annotated_data)
                save_df(annotated_data)
                clear_output()
                _report_current_pdf(annotation_json, pdf_index)

            elif key == 27:
                print("ESCAPE")
                break
    finally:
        # Never leave the window behind, even when an error is raised
        cv2.destroyAllWindows()

In [16]:
# Run annotation tool (opens the OpenCV window; press Esc to leave the loop)
pdf_annotation()

PDF: pdf_image_extraction_dataset/fazlul_sarkar/other_papers/wang_2012_b/wang_2012.pdf
MISSING: []
ESCAPE


In [18]:
# Load the annotation table (annotation.csv) into a DataFrame
# NOTE(review): the next cell iterates over `annotation`, not `data` --
# confirm which name is intended
data = pd.read_csv("annotation.csv",index_col = False)

----

In [18]:
# Rebuild the annotation dictionary with one entry per row of `annotation`.
# NOTE(review): `annotation` and `figs_json` are not defined in the visible
# cells; they must exist in the notebook namespace before running this cell.
new_dict = {}
for index, row in annotation.iterrows():

    # List the figure files stored next to the PDF
    pdfpath = row['PDFPATH']
    # os.path.join keeps the path relative when the PDF sits in the working
    # directory (plain string concatenation would produce "/figures/")
    figs_path = os.path.join(os.path.dirname(pdfpath), "figures")
    # Filter on the real extension: a "[!.txt]" glob class only tests the
    # last character and would also drop names ending in '.', 't' or 'x'
    figs = [f for f in glob(os.path.join(figs_path, "fig*"))
            if not f.endswith(".txt")]
    figs.sort()
    fig_files = [os.path.basename(f) for f in figs]
    # Figure names without extension (names lacking one stay untouched)
    all_figs = [os.path.splitext(f)[0] for f in fig_files]

    # figs_json is keyed by the zero-padded, three-digit ID
    json_id = "%03d" % row['ID']

    if json_id in figs_json:
        # Reuse the existing annotation (same dict object) and refresh
        # its list of figures
        new_dict[row['ID']] = figs_json[json_id]
        new_dict[row['ID']]['all_figures'] = all_figs
    else:
        # No annotation yet: every figure file is reported as missing.
        # NOTE(review): 'missing' keeps the file extensions while
        # 'all_figures' does not -- confirm this is intended.
        new_dict[row['ID']] = {
            'pdf_path': pdfpath,
            'Watermark': row['Retraction watermark on paper'],
            'missing': fig_files,
            'all_figures': all_figs
        }

In [21]:
import json

# Serialise before opening the file: if new_dict holds something json cannot
# encode, the error is raised while annotation.json is still intact instead
# of leaving it truncated
serialized = json.dumps(new_dict, indent=4)
with open('annotation.json', 'w') as fp:
    fp.write(serialized)