In [351]:
# Some utils
from datetime import datetime
import re


def now_in_string():
    """Return the current local time formatted as ``HH:MM:SS``."""
    current = datetime.now()
    return f"{current:%H:%M:%S}"


def get_type_and_tribunal(long_name):
    """
    Split a tribunal listing name of the form "<type> - <tribunal>".

    Parameters:
        long_name: text containing the " - " separator,
            e.g. "EDUCACIO PRIMARIA - V16".

    Returns:
        A 2-tuple ``(type, tribunal)``. The type part may contain letters,
        digits, underscores, whitespace and dots; the tribunal part only
        word characters.

    Raises:
        ValueError: if ``long_name`` does not contain the expected pattern.
    """
    # Raw string: '\s' and '\w' are regex classes, not Python string escapes.
    result = re.search(r'([\s\w.]*) - ([\w]*)', long_name)
    if result is None:
        raise ValueError(
            f"Cannot parse type and tribunal from name: {long_name!r}"
        )
    return result.groups()

In [352]:
import urllib.parse

import requests
from bs4 import BeautifulSoup


def get_all_specialties():
    """
    Scrape the root index page and return one entry per listed specialty.

    Returns:
        A list of dicts with the keys:
          - 'name': link text of the row, stripped, with every '.' and '/'
            removed
          - 'link': absolute URL built from the row's href
          - 'modified': stripped text of the row's third cell

    Raises:
        requests.HTTPError: if the index page answers with an error status.
        RuntimeError: if the page has no table with id 'indexlist'.
    """
    DOCENTES_URL = "https://ceice.gva.es/auto/Actas/"
    r = requests.get(DOCENTES_URL)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()
    bs_content = BeautifulSoup(r.text, 'lxml')
    table = bs_content.find('table', attrs={'id': 'indexlist'})
    if table is None:
        raise RuntimeError(f"No 'indexlist' table found at {DOCENTES_URL}")

    structured_rows = []
    for row in table.find_all('tr'):
        # Keep only rows whose first CSS class is 'even' or 'odd'. A row
        # without a class attribute is skipped instead of raising TypeError.
        row_classes = row.get('class') or []
        if not row_classes or row_classes[0] not in ('even', 'odd'):
            continue
        cols = row.find_all('td')
        anchor = cols[1].a
        name = anchor.get_text().strip().replace('.', '').replace('/', '')
        structured_rows.append({
            'name': name,
            'link': urllib.parse.urljoin(DOCENTES_URL, anchor.get('href')).strip(),
            'modified': cols[2].get_text().strip(),
        })

    return structured_rows

# Fetch the specialty index when this cell runs (performs an HTTP request);
# the download loop further down iterates over this list.
specialties = get_all_specialties()

In [353]:
import os
from pathlib import Path


def get_all_tribunals(speciality_row):
    """
    Scrape the index page of one specialty and return its tribunal entries.

    Parameters:
        speciality_row: dict with at least a 'link' key holding the URL of
            the specialty index page (as produced by get_all_specialties).

    Returns:
        A list of dicts with the keys:
          - 'type': first part of the parsed name, stripped
          - 'tribunal': second part of the parsed name
          - 'name': link text of the row, stripped, with every '.' removed
          - 'link': absolute URL built from the row's href
          - 'modified': stripped text of the row's third cell

    Raises:
        requests.HTTPError: if the page answers with an error status.
        RuntimeError: if the page has no table with id 'indexlist'.
    """
    speciality_url = speciality_row['link']

    r = requests.get(speciality_url)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()
    bs_content = BeautifulSoup(r.text, 'lxml')

    table = bs_content.find('table', attrs={'id': 'indexlist'})
    if table is None:
        raise RuntimeError(f"No 'indexlist' table found at {speciality_url}")

    structured_rows = []
    for row in table.find_all('tr'):
        # Keep only rows whose first CSS class is 'even' or 'odd'. A row
        # without a class attribute is skipped instead of raising TypeError.
        row_classes = row.get('class') or []
        if not row_classes or row_classes[0] not in ('even', 'odd'):
            continue
        cols = row.find_all('td')
        anchor = cols[1].a
        name = anchor.get_text().strip().replace('.', '')

        # Parse the name once and reuse both parts.
        tribunal_type, tribunal = get_type_and_tribunal(name)

        structured_rows.append({
            'type': tribunal_type.strip(),
            'tribunal': tribunal,
            'name': name,
            'link': urllib.parse.urljoin(speciality_url, anchor.get('href')).strip(),
            'modified': cols[2].get_text().strip(),
        })

    return structured_rows


def get_all_files(structured_row, exclude_pattern=None):
    """
    Scrape the index page of one tribunal and return its file entries.

    Parameters:
        structured_row: tribunal dict with at least a 'link' key (as
            produced by get_all_tribunals). It is stored unchanged under
            '_tribunal' in every returned entry.
        exclude_pattern: optional plain substring; files whose name contains
            it are skipped (e.g. "Prov" to leave out provisional files).
            None or an empty string disables the filter.

    Returns:
        A list of dicts with the keys '_tribunal', 'name', 'link' and
        'modified'.

    Raises:
        requests.HTTPError: if the page answers with an error status.
        RuntimeError: if the page has no table with id 'indexlist'.
    """
    url_files = structured_row['link']
    r = requests.get(url_files)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()
    bs_content = BeautifulSoup(r.text, 'lxml')
    table = bs_content.find('table', attrs={'id': 'indexlist'})
    if table is None:
        raise RuntimeError(f"No 'indexlist' table found at {url_files}")

    structured_rows = []
    for row in table.find_all('tr'):
        # Keep only rows whose first CSS class is 'even' or 'odd'. A row
        # without a class attribute is skipped instead of raising TypeError.
        row_classes = row.get('class') or []
        if not row_classes or row_classes[0] not in ('even', 'odd'):
            continue
        cols = row.find_all('td')
        anchor = cols[1].a
        name = anchor.get_text().strip()

        # Exclude provisional files, in the second round
        if exclude_pattern and exclude_pattern in name:
            continue

        structured_rows.append({
            '_tribunal': structured_row,
            'name': name,
            'link': urllib.parse.urljoin(url_files, anchor.get('href')),
            'modified': cols[2].get_text().strip(),
        })

    return structured_rows

def download_file(structured_row):
    """
    Download one file entry and save it under its tribunal type directory.

    Parameters:
        structured_row: file dict as produced by get_all_files; the keys
            'name', 'link', 'modified' and '_tribunal' (with 'type' and
            'tribunal') are read.

    Returns:
        The relative path written, of the form
        "<type>/<tribunal> - <name without extension> <modified><extension>".

    Raises:
        requests.HTTPError: if the download answers with an error status;
            nothing is written in that case.
    """
    tribunal_info = structured_row['_tribunal']
    tribunal_type = tribunal_info['type']
    tribunal = tribunal_info['tribunal']
    # ':' and '-' in the timestamp become '_' (presumably to keep the file
    # name portable across filesystems).
    modified = structured_row['modified'].replace(':', '_').replace('-', '_')
    url = structured_row['link']
    name = Path(structured_row['name'])
    name_wo_ext = name.with_suffix('')
    name_ext = name.suffix
    file_name = f"{tribunal} - {name_wo_ext} {modified}{name_ext}"
    final_name = f"{tribunal_type}/{file_name}"

    r_doc = requests.get(url)
    # Do not save an HTTP error body as if it were the requested document.
    r_doc.raise_for_status()

    # Make the function usable on its own: create the target directory.
    Path(tribunal_type).mkdir(parents=True, exist_ok=True)
    # Context manager guarantees the handle is flushed and closed.
    with open(final_name, 'wb') as out_file:
        out_file.write(r_doc.content)
    print(f"\t{file_name}")

    return final_name


# Walk every specialty, then every tribunal of that specialty, and download
# all the files each tribunal lists.
for speciality in specialties:
    tribunals = get_all_tribunals(speciality)
    print(f"===== START {speciality['name']}")

    # 1-based counter so progress reads "1/N" ... "N/N".
    for idx, tribunal in enumerate(tribunals, start=1):
        # Create the directory for storing the PDFs (no-op if it exists)
        os.makedirs(tribunal['type'], exist_ok=True)

        counter_str = f"{idx}/{len(tribunals)}"
        print(f"{now_in_string()} ({counter_str}) {tribunal['type']} {tribunal['tribunal']}")

        # Uncomment this line if you only want to download the final files
        # files = get_all_files(tribunal, exclude_pattern="Prov")
        files = get_all_files(tribunal, exclude_pattern=None)

        for file in files:
            download_file(file)

    print(f"===== END {speciality['name']}")


===== START 128_EDUCACIO PRIMARIA
22:28:57 (0/135) EDUCACIO PRIMARIA V16
	V16 - DataConvocatoria2DID 2022_07_04 19_15.pdf
	V16 - ActaNotes1Aprovats 2022_07_01 10_22.pdf
	V16 - ActaNotes1Definitiva 2022_07_01 10_22.pdf
	V16 - ActaNotes1AprovatsProv 2022_06_29 11_39.pdf
	V16 - ActaNotes1Provisional 2022_06_29 11_38.pdf
22:28:58 (1/135) EDUCACIO PRIMARIA V26
	V26 - DataConvocatoria2DID 2022_07_04 18_57.pdf
	V26 - ActaNotes1Aprovats 2022_07_01 10_13.pdf
	V26 - ActaNotes1Definitiva 2022_07_01 10_13.pdf
	V26 - ActaNotes1Provisional 2022_06_29 11_31.pdf
	V26 - ActaNotes1AprovatsProv 2022_06_29 11_07.pdf
22:28:59 (2/135) EDUCACIO PRIMARIA V17
	V17 - DataConvocatoria2DID 2022_07_04 18_52.pdf
	V17 - ActaNotes1Aprovats 2022_07_01 10_25.pdf
	V17 - ActaNotes1Definitiva 2022_07_01 10_25.pdf
	V17 - ActaNotes1AprovatsProv 2022_06_29 12_42.pdf
	V17 - ActaNotes1Provisional 2022_06_29 12_41.pdf
22:29:00 (3/135) EDUCACIO PRIMARIA A43
	A43 - DataConvocatoria2DID 2022_07_04 18_42.pdf
	A43 - ActaNotes1Aprova

KeyboardInterrupt: 

In [340]:
import tabula

def extract_data_from_pdf_to_csv(file, suffix):
    """
    Extract every table of a PDF and write each one as a CSV under ``csv/``.

    Parameters:
        file: path object of the PDF (its ``.name`` and ``.parent`` are
            read). The file name must look like "<tribunal> - <document>...".
        suffix: label embedded in the output file names.

    Each table gets two extra columns, 'specialty' (the PDF's parent
    directory) and 'tribunal' (first part of the file name), and is saved as
    "csv/<specialty>_<tribunal>_<suffix>_<table index>.csv".

    Raises:
        ValueError: if the file name does not match the expected pattern.
    """
    # Raw string: '\w' is a regex class, not a Python string escape.
    result = re.search(r'([\w]*) - ([\w]*)', file.name)
    if result is None:
        raise ValueError(
            f"Cannot parse the tribunal from file name: {file.name!r}"
        )
    specialty = str(file.parent).strip()
    tribunal = result[1]

    tables = tabula.read_pdf(file, pages="all")
    print(f"({len(tables)} tables on this file)")

    # to_csv does not create missing directories.
    Path('csv').mkdir(exist_ok=True)

    for idx, table in enumerate(tables):
        # A scalar is broadcast to every row; no per-row list needed.
        table['specialty'] = specialty
        table['tribunal'] = tribunal
        table.to_csv(f'csv/{specialty}_{tribunal}_{suffix}_{idx}.csv', index=False)

def extract_data_from_pattern(glob_pattern, suffix='definitiva'):
    """
    Convert every PDF matching a glob pattern into CSV files.

    Parameters:
        glob_pattern: pattern evaluated relative to the current directory
            with ``Path.glob`` (``**`` matches recursively).
        suffix: label passed to extract_data_from_pdf_to_csv and embedded in
            the CSV file names. Defaults to 'definitiva', the value that used
            to be hard-coded, so existing calls behave the same.
    """
    files = [x for x in Path('.').glob(glob_pattern) if x.is_file()]
    total = len(files)
    # 1-based counter so progress reads "1/N" ... "N/N".
    for idx, file in enumerate(files, start=1):
        counter_str = f"{idx}/{total}"
        print(f"{now_in_string()} ({counter_str}) Reading '{file}'...", end=' ')
        extract_data_from_pdf_to_csv(file, suffix=suffix)

    print("FINISH")
    
# Convert every "Notes1Definitiva" PDF found under the current directory
# (searched recursively) into CSV files.
extract_data_from_pattern('**/*Notes1Definitiva*.pdf')

22:22:01 (0/200) Reading 'AUDICIO I LLENGUATGE/A6 - ActaNotes1Definitiva 2022_07_01 11_48.pdf'... (2 tables on this file)
22:22:04 (1/200) Reading 'AUDICIO I LLENGUATGE/A4 - ActaNotes1Definitiva 2022_07_01 12_12.pdf'... 

KeyboardInterrupt: 

In [None]:
import pandas as pd
import glob

# This step requires running a couple of shell commands manually first
# (they could be included in this notebook, but sometimes the shell is faster :)
#
#
# cat *prov*.csv | grep "\*\*\*" > PROVISIONAL.csv
# cat *definitiva*.csv | grep "\*\*\*" > DEFINITIVE.csv

def load_csv_file(filename):
    """Read a header-less results CSV and label its seven columns."""
    column_names = [
        'DNI',
        'nombre',
        'tema',
        'caso_practico',
        'total',
        'especialidad',
        'tribunal',
    ]
    frame = pd.read_csv(filename, names=column_names)
    return frame

def compare_valuations(row):
    """
    Tell whether the provisional and definitive marks of a row coincide.

    Both the 'tema' and the 'caso_practico' marks must be equal between the
    '_provisional' and '_definitivo' columns.
    """
    same_topic = row['tema_provisional'] == row['tema_definitivo']
    if not same_topic:
        # Short-circuit: the practical-case marks are not looked at.
        return same_topic
    return row['caso_practico_provisional'] == row['caso_practico_definitivo']

provisional = load_csv_file('csv/PROVISIONAL.csv')
definitive = load_csv_file('csv/DEFINITIVE.csv')

# Merge both files (as columns), and add a new column 'iguales' that compares provisional and definitive results.
# The outer join keeps rows that appear in only one of the two files.
merged = pd.merge(provisional, definitive, on=['DNI', 'nombre', 'especialidad', 'tribunal'], suffixes=('_provisional', '_definitivo'), how="outer")
merged['iguales'] = merged.apply(compare_valuations, axis=1)

# Sort the columns to look better on the spreadsheet.
# reindex returns a new frame, so the result has to be assigned back.
merged = merged.reindex(columns=['DNI', 'nombre', 'tema_provisional', 'caso_practico_provisional',
       'total_provisional', 'iguales', 'tema_definitivo',
       'caso_practico_definitivo', 'total_definitivo', 'especialidad', 'tribunal'])

# Delete the name column (for sharing):
merged = merged.drop(columns=['nombre'])

# One sheet per specialty in a single workbook.
with pd.ExcelWriter('results.xlsx') as writer:
    for speciality in merged['especialidad'].unique():
        temp_df = merged.loc[merged['especialidad'] == speciality]
        temp_df.to_excel(writer, sheet_name=speciality)
