In [1]:
import glob
import os

import numpy as np
import pandas as pd
import PyPDF2 as pypdf2
import tabula as tb

In [2]:
def pdf_to_df(path, group, society, month, from_page=2):
    """Extract the tables of every matching PDF in ``path`` into one DataFrame.

    A PDF matches when its file name contains ``group + '_' + society + month``
    as a plain substring (no regular-expression semantics, directory names are
    ignored). For each match, pages ``from_page`` through the last page are
    handed to tabula in stream mode and every table found is concatenated.

    Parameters
    ----------
    path : str
        Directory that holds the PDF files.
    group, society, month : str
        Pieces of the file-name pattern.
    from_page : int, default 2
        First page (counted from 1) to read in each PDF.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all extracted tables. The per-table indices
        are kept, so index labels may repeat.

    Raises
    ------
    ValueError
        If no file matches the pattern, if a matching file has fewer than
        ``from_page`` pages, or if tabula returns no table for a file.
    """
    # os.path.join keeps the glob portable; sorting makes the row order
    # independent of the order in which the OS lists the directory.
    all_files = sorted(glob.glob(os.path.join(path, "*.pdf")))

    pattern = group + '_' + society + month
    files_list = [f for f in all_files if pattern in os.path.basename(f)]
    print(files_list)

    if not files_list:
        # Same exception type pd.concat([]) used to raise here, with a message
        # that says what actually went wrong.
        raise ValueError("no PDF matching %r found in %r" % (pattern, path))

    df_list = []
    for file in files_list:

        with open(file, "rb") as pdf_file:
            # PyPDF2 >= 3.0 removed PdfFileReader/numPages in favour of
            # PdfReader/len(reader.pages); support both generations.
            if hasattr(pypdf2, "PdfReader"):
                to_page = len(pypdf2.PdfReader(pdf_file).pages)
            else:
                to_page = pypdf2.PdfFileReader(pdf_file).numPages

        if to_page < from_page:
            raise ValueError(
                "%r has %d page(s); cannot start reading at page %d"
                % (file, to_page, from_page))

        # One code path for any page count, always honouring from_page.
        dfs = tb.read_pdf(
            file,
            pages=list(range(from_page, to_page + 1)),
            multiple_tables=True,
            stream=True)

        df_list.append(pd.concat(dfs))

    return pd.concat(df_list)
    
    
def anti_join(x, y, on):
    """Return the rows of ``x`` whose ``on`` key(s) do not appear in ``y``.

    Parameters
    ----------
    x, y : pandas.DataFrame
        Left and right tables.
    on : label or list of labels
        Key column(s) present in both tables; forwarded to ``pd.merge``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``x`` without a match in ``y``. The index is the positional
        index produced by the merge, filtered but not reset.
    """
    keys = list(on) if isinstance(on, (list, tuple)) else [on]

    # Only the keys of y matter for an anti-join. Merging the full frame would
    # add y's other columns (all NaN) and rename overlapping columns of x with
    # _x/_y suffixes. If a key is not a plain column (e.g. an index level),
    # y is passed through unchanged and pd.merge resolves it as before.
    if all(key in y.columns for key in keys):
        y = y[keys]

    merged = pd.merge(left=x, right=y, how='left', indicator=True, on=on)
    return merged.loc[merged['_merge'] == 'left_only', :].drop(columns='_merge')

In [3]:
# Directory that holds the source PDFs. The original location stays the
# default; set PDF_MINING_DATA_DIR to run the notebook from another machine.
path = os.environ.get("PDF_MINING_DATA_DIR", "C:/projects/pdf_mining/data")

# Every (group, society, month) combination below is tried by the extraction
# loop; combinations without a PDF are reported and skipped there.
groups = ['at', 'co', 'hc', 'la', 'li', 'po', 'pu']
societies = ['cli', 'diag', 'lab', 'cem', 'hos', 'ima', 'rad', 'res', 'reg']
months = [str(month_number) for month_number in range(1, 9)]  # '1' .. '8'

In [4]:
def _clean_table(raw_df, group, society, month):
    """Turn one raw extraction into a tidy table tagged with its origin.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Output of ``pdf_to_df``; its first row is used as the header.
    group, society, month : str
        Values stored in the added ``grupo``, ``sociedad`` and ``mes`` columns.

    Returns
    -------
    pandas.DataFrame
        Rows with a usable ``RUT``, re-indexed from 0, with the thousands dots
        removed from ``RUT`` and ``Remuneración``.
    """
    # Promote the first row to column names (copy so the raw frame is untouched).
    table = raw_df.iloc[1:].copy()
    table.columns = raw_df.iloc[0]

    # Keep rows that have a RUT and whose RUT cell is not the literal 'RUT'
    # (presumably header lines repeated inside the data).
    has_rut = table['RUT'].notna() & (table['RUT'] != 'RUT')
    table = table[has_rut].reset_index(drop=True)

    # Add the group, month and society columns.
    table['grupo'] = group
    table['mes'] = month
    table['sociedad'] = society

    # Strip the '.' separators. regex=False makes '.' a literal dot, and the
    # .str accessor leaves missing values missing instead of raising.
    for column in ('RUT', 'Remuneración'):
        table[column] = table[column].str.replace('.', '', regex=False)

    return table


df_list = []

for group in groups:
    for society in societies:
        for month in months:

            label = '%s_%s%s' % (group, society, month)

            try:
                raw_df = pdf_to_df(path, group, society, month)
            except ValueError:
                # pdf_to_df raises ValueError when there is nothing to
                # concatenate, i.e. no PDF (or no table) for this combination.
                print('no existe el archivo %s' % label)
                continue
            except Exception as exc:
                # Still best-effort, but a read failure is no longer reported
                # as a missing file, and Ctrl-C is no longer swallowed.
                print('error al leer %s: %r' % (label, exc))
                continue

            try:
                df_list.append(_clean_table(raw_df, group, society, month))
            except Exception as exc:
                print('error al procesar %s: %r' % (label, exc))
                continue

            # Progress marker for combinations that were loaded successfully.
            print(label)


# Per-file indices are kept (index labels repeat across files).
df = pd.concat(df_list)

['C:/projects/pdf_mining/data\\at_cli1a.pdf']
at_cli1
['C:/projects/pdf_mining/data\\at_cli2a.pdf']
at_cli2
['C:/projects/pdf_mining/data\\at_cli3a.pdf']
at_cli3
['C:/projects/pdf_mining/data\\at_cli4a.pdf']
at_cli4
['C:/projects/pdf_mining/data\\at_cli5a.pdf']
at_cli5
['C:/projects/pdf_mining/data\\at_cli6a.pdf']
at_cli6
['C:/projects/pdf_mining/data\\at_cli7a.pdf']
at_cli7
['C:/projects/pdf_mining/data\\at_cli8a.pdf']
at_cli8
['C:/projects/pdf_mining/data\\at_diag1a.pdf']
at_diag1
['C:/projects/pdf_mining/data\\at_diag2a.pdf']
at_diag2
['C:/projects/pdf_mining/data\\at_diag3a.pdf']
at_diag3
['C:/projects/pdf_mining/data\\at_diag4a.pdf']
at_diag4
['C:/projects/pdf_mining/data\\at_diag5a.pdf']
at_diag5
['C:/projects/pdf_mining/data\\at_diag6a.pdf']
at_diag6
['C:/projects/pdf_mining/data\\at_diag7a.pdf']
at_diag7
['C:/projects/pdf_mining/data\\at_diag8a.pdf']
at_diag8
['C:/projects/pdf_mining/data\\at_lab1a.pdf']
at_lab1
['C:/projects/pdf_mining/data\\at_lab2a.pdf']
at_lab2
['C:/project

In [5]:
# Save the combined table as a ';'-separated CSV, without the index column.
df.to_csv('raw_data2.csv', index=False, sep=';')

In [6]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,RUT,Apellido Paterno,Apellido Materno,Nombres,Remuneración,Movimiento,Fecha Inicio,Fecha Termino,grupo,mes,sociedad
0,14102553-8,AGUILERA,CORTES,ANA ANYELINA,1478682,0,,,at,1,cli
1,16248551-2,AHUMADA,BALBONTIN,MARIA INES,559112,2,30/01/2020,31/01/2020,at,1,cli
2,11617329-8,ALARCON,COLLAO,MARCELA DEL CARMEN,442534,0,,,at,1,cli
3,15458350-5,ALARCON,RIOS,FERNANDA CECILIA,977201,0,,,at,1,cli
4,16013018-0,ALDAY,CORTÉS,JOSELYN NATALY,1263135,0,,,at,1,cli


In [7]:
# Sanity check: cast 'Remuneración' to integers and total it.
# The cast is not assigned back, so the column in `df` is left unchanged.
df['Remuneración'].astype('int').sum()

10668601763

In [8]:
# (rows, columns) of the combined table.
df.shape

(15029, 11)

In [9]:
# Distinct group codes that made it into the combined table.
df['grupo'].unique()

array(['at', 'co', 'hc', 'la', 'li', 'po', 'pu'], dtype=object)

In [10]:
# Inspect the rows of group 'la' only.
df[df['grupo']=='la']

Unnamed: 0,RUT,Apellido Paterno,Apellido Materno,Nombres,Remuneración,Movimiento,Fecha Inicio,Fecha Termino,grupo,mes,sociedad
0,18535994-8,ACEVEDO,RIVAS,SUSAN SCARLETT,420799,0,,,la,1,cli
1,10326456-1,ACOSTA,VALENZUELA,MONICA DEL CARMEN,419566,0,,,la,1,cli
2,16395578-4,ACUÑA,ACUÑA,ELISA ANDREA,519135,0,,,la,1,cli
3,10219126-9,ACUÑA,MELO,JUAN BERNARDO,514703,0,,,la,1,cli
4,11243904-8,AEDO,MEJIAS,ANTONELLA MARISOL,528308,0,,,la,1,cli
...,...,...,...,...,...,...,...,...,...,...,...
3,15695576-0,REYES,LLANTEN,BORIS LEONARDO,1353301,2,12/07/2020,14/07/2020,la,7,res
0,10057512-4,CARRASCO,CORDOVA,ALEJANDRO MARIO,2300092,0,,,la,8,res
1,17592626-7,CHEUQUELAF,OPAZO,KAREN CECILIA,413753,0,,,la,8,res
2,12922847-4,GATICA,ROLDAN,ARTURO ESTEBAN,560898,0,,,la,8,res


In [11]:
# Save the combined table again under a second file name, with the same
# ';' separator and no index column.
# NOTE(review): no cell shown between the two exports reassigns `df` —
# confirm whether both files are needed.
df.to_csv('raw_data3.csv', index=False, sep=';')