In [1]:
# Some utils
from datetime import datetime
import re
import hashlib

def hash_for_obj(obj):
    return hashlib.md5(str(obj).encode('utf-8')).hexdigest()


def now_in_string():
    return datetime.now().strftime("%H:%M:%S")


def get_type_and_tribunal(long_name):
    """
    Parses the name of each tribunal, and returns the name and the speciality
    """
    result = re.search('([\s\w.]*) - ([\w]*)', long_name)
    return result.groups()

In [2]:
import urllib.parse
import json

import requests
from bs4 import BeautifulSoup


def load_json_file(path):
    try:
        with open(path) as json_file:
            return json.load(json_file)
    except FileNotFoundError:
        return None
    
def save_to_json_file(path, dictionary):
    with open(path, 'w') as outfile:
        json.dump(dictionary, outfile)


def get_all_specialties():
    CACHE_FILE = 'cache/specialities.json'
    cache = load_json_file(CACHE_FILE)
    if cache:
        return cache
    
    DOCENTES_URL = "https://ceice.gva.es/auto/Actas/"
    r = requests.get(DOCENTES_URL)
    bs_content = BeautifulSoup(r.text, 'lxml')
    table = bs_content.find('table', attrs={'id': 'indexlist'})
    rows = table.findChildren('tr')
    
    structured_rows = []
    for row in rows:
        if not row.get('class')[0] in ['even', 'odd']:
            continue
        cols = row.findChildren('td')
        name = cols[1].a.get_text().strip().replace('.', '').replace('/', '')
        structured = {
            'name': name,
            'link': urllib.parse.urljoin(DOCENTES_URL, cols[1].a.get('href')).strip(),
            'modified': cols[2].get_text().strip(),
        }
        structured_rows.append(structured)

    save_to_json_file(CACHE_FILE, structured_rows)
    return structured_rows

specialties = get_all_specialties()

In [3]:
import os
from pathlib import Path
from slugify import slugify


def get_all_tribunals(speciality_row):        
    slug = slugify(speciality_row['name'])
    CACHE_FILE = f'cache/tribunals{slug}.json'
    
    cache = load_json_file(CACHE_FILE)
    if cache:
        return cache
    
    print(f"Haciendo la peticion: {speciality_row['name']}")
    
    speciality_url = speciality_row['link']
    
    r = requests.get(speciality_url)
    bs_content = BeautifulSoup(r.text, 'lxml')

    table = bs_content.find('table', attrs={'id': 'indexlist'})
    rows = table.findChildren('tr')
    
    structured_rows = []
    for row in rows:
        if not row.get('class')[0] in ['even', 'odd']:
            continue
        cols = row.findChildren('td')
        name = cols[1].a.get_text().strip().replace('.', '')
        
        structured = {
            'type': get_type_and_tribunal(name)[0].strip(),
            'tribunal': get_type_and_tribunal(name)[1],
            'name': name,
            'link': urllib.parse.urljoin(speciality_url, cols[1].a.get('href')).strip(),
            'modified': cols[2].get_text().strip(),
        }
        structured_rows.append(structured)

    save_to_json_file(CACHE_FILE, structured_rows)    
    return structured_rows


def get_all_files(structured_row, include_patterns=['ActaNotesProva2Provisional.pdf']):
    slug = slugify(f"files_tribunal_{structured_row['name']}{hash_for_obj(include_patterns)}")
    CACHE_FILE = f'cache/{slug}a.json'
    
    cache = load_json_file(CACHE_FILE)
    if cache:
        return cache
    
    print(f"Haciendo la peticion: {CACHE_FILE}")
    
    url_files = structured_row['link']
    r = requests.get(url_files)
    bs_content = BeautifulSoup(r.text, 'lxml')
    table = bs_content.find('table', attrs={'id': 'indexlist'})
    rows = table.findChildren('tr')
    
    structured_rows = []
    for row in rows:
        if not row.get('class')[0] in ['even', 'odd']:
            continue
        cols = row.findChildren('td')
        name = cols[1].a.get_text().strip()
        
        # Only include expected files
        included = False
        for included_pattern in include_patterns:
            if included_pattern in name:
                included = True

        if not included:
            continue

        structured = {
            '_tribunal': structured_row,
            'name': name,
            'link': urllib.parse.urljoin(url_files, cols[1].a.get('href')),
            'modified': cols[2].get_text().strip(),
        }
        structured_rows.append(structured)

    save_to_json_file(CACHE_FILE, structured_rows)    
    return structured_rows

def download_file(structured_row):
    tribunal_type = structured_row['_tribunal']['type']
    tribunal = structured_row['_tribunal']['tribunal']
    modified = structured_row['modified'].replace(':', '_').replace('-', '_')
    url = structured_row['link']
    name = Path(structured_row['name'])
    name_wo_ext = name.with_suffix('')
    name_ext = name.suffix
    file_name = f"{tribunal} - {name_wo_ext} {modified}{name_ext}"
    final_name = f"{tribunal_type}/{file_name}"
    
    path = Path(final_name)
    if path.is_file():
        # Skip the download if the file exists
        return final_name
    
    r_doc = requests.get(url)
    open(final_name, 'wb').write(r_doc.content)
    print(f"\t{file_name}")

    return final_name


for speciality in specialties:
    tribunals = get_all_tribunals(speciality)
    print(f"===== START {speciality['name']}")
    
    for idx, tribunal in enumerate(tribunals):
        # Create the directory for storing the PDFs
        tribunal_type = tribunal['type']
        exists = os.path.exists(tribunal_type)
        if not exists:
            os.makedirs(tribunal_type)
        
        counter_str = f"{idx}/{len(tribunals) - 1}"
        print(f"{now_in_string()} ({counter_str}) {tribunal['type']} {tribunal['tribunal']}")
        
        include_patterns=[
            'ActaNotes1Provisional.pdf',
            'ActaNotes1Definitiva.pdf',
            'ActaNotesProva2Provisional.pdf',
            'ActaNotesProva2Definitiva.pdf'
        ]
        files = get_all_files(tribunal, include_patterns=include_patterns)
        
        for file in files:
            download_file(file)
        
    print(f"===== END {speciality['name']}")


===== START 128_EDUCACIO PRIMARIA
23:53:28 (0/134) EDUCACIO PRIMARIA V60
23:53:28 (1/134) EDUCACIO PRIMARIA A48
23:53:28 (2/134) EDUCACIO PRIMARIA A47
23:53:28 (3/134) EDUCACIO PRIMARIA V59
23:53:28 (4/134) EDUCACIO PRIMARIA V58
23:53:28 (5/134) EDUCACIO PRIMARIA V21
23:53:28 (6/134) EDUCACIO PRIMARIA V23
23:53:28 (7/134) EDUCACIO PRIMARIA V9
23:53:28 (8/134) EDUCACIO PRIMARIA A36
23:53:28 (9/134) EDUCACIO PRIMARIA A19
23:53:28 (10/134) EDUCACIO PRIMARIA A53
23:53:28 (11/134) EDUCACIO PRIMARIA C4
23:53:28 (12/134) EDUCACIO PRIMARIA A34
23:53:28 (13/134) EDUCACIO PRIMARIA V31
23:53:28 (14/134) EDUCACIO PRIMARIA A33
23:53:28 (15/134) EDUCACIO PRIMARIA V20
23:53:28 (16/134) EDUCACIO PRIMARIA A28
23:53:28 (17/134) EDUCACIO PRIMARIA V19
23:53:28 (18/134) EDUCACIO PRIMARIA V43
23:53:28 (19/134) EDUCACIO PRIMARIA A41
23:53:28 (20/134) EDUCACIO PRIMARIA A51
23:53:28 (21/134) EDUCACIO PRIMARIA V33
23:53:28 (22/134) EDUCACIO PRIMARIA A23
23:53:28 (23/134) EDUCACIO PRIMARIA V29
23:53:28 (24/134) 

23:53:28 (50/146) EDUCACIO INFANTIL V67
23:53:28 (51/146) EDUCACIO INFANTIL A38
23:53:28 (52/146) EDUCACIO INFANTIL C7
23:53:28 (53/146) EDUCACIO INFANTIL V38
23:53:28 (54/146) EDUCACIO INFANTIL V47
23:53:28 (55/146) EDUCACIO INFANTIL C1
23:53:28 (56/146) EDUCACIO INFANTIL A8
23:53:28 (57/146) EDUCACIO INFANTIL A24
23:53:28 (58/146) EDUCACIO INFANTIL A25
23:53:28 (59/146) EDUCACIO INFANTIL A42
23:53:28 (60/146) EDUCACIO INFANTIL V17
23:53:28 (61/146) EDUCACIO INFANTIL V71
23:53:28 (62/146) EDUCACIO INFANTIL A33
23:53:28 (63/146) EDUCACIO INFANTIL V29
23:53:28 (64/146) EDUCACIO INFANTIL V53
23:53:28 (65/146) EDUCACIO INFANTIL V37
23:53:28 (66/146) EDUCACIO INFANTIL C14
23:53:28 (67/146) EDUCACIO INFANTIL A39
23:53:28 (68/146) EDUCACIO INFANTIL A14
23:53:28 (69/146) EDUCACIO INFANTIL V40
23:53:28 (70/146) EDUCACIO INFANTIL V66
23:53:28 (71/146) EDUCACIO INFANTIL A31
23:53:28 (72/146) EDUCACIO INFANTIL C16
23:53:28 (73/146) EDUCACIO INFANTIL V36
23:53:28 (74/146) EDUCACIO INFANTIL V34
23:

In [None]:
import os
import tabula

CSV_PATH = "./csv/tmp/"
CACHE_FILES = "./cache/tabula/" 

def extract_data_from_pdf_to_csv(file, suffix):
    result = re.search('([\w]*) - ([\w]*)', file.name)
    specialty = str(file.parent).strip()
    tribunal = result[1]    
    
    CACHE_FILE = Path().joinpath(CACHE_FILES, f"{specialty}_{tribunal}_{hash_for_obj(file.name)}.csv")
    path = Path(CACHE_FILE)
    if path.is_file():
        # Skip the download if the file exists
        print(f"Skipping tabula for {file}")
        return
    
    tables = tabula.read_pdf(file,pages="all")
    print(f"({len(tables)} tables on this file)")
    
    for idx, table in enumerate(tables):
        table['specialty'] = [specialty for x in range(len(table))]
        table['tribunal'] = [tribunal for x in range(len(table))]
        filename = Path().joinpath(CSV_PATH, f"{specialty}_{tribunal}_{suffix}_{idx}.csv")
        table.to_csv(filename, index=False)
    
    save_to_json_file(CACHE_FILE, {'date': now_in_string()})

def extract_data_from_pattern(glob_pattern, suffix):
    p = Path('.').glob(glob_pattern)
    files = [x for x in p if x.is_file()]
    for idx, file in enumerate(files):
        counter_str = f"{idx}/{len(files)}"
        print(f"{now_in_string()} ({counter_str}) Reading '{file}'...")
        extract_data_from_pdf_to_csv(file, suffix)

    print("FINISH")


exists = os.path.exists(CACHE_FILES)
if not exists:
    os.makedirs(CACHE_FILES)


# Just to load the methods on this cell without running it
# Create the directory for storing the CSVs
exists = os.path.exists(CSV_PATH)
if not exists:
    os.makedirs(CSV_PATH)

    
include_patterns=[
    ('**/*ActaNotes1Provisional*.pdf', 'provisional-1'),
    ('**/*ActaNotes1Definitiva*.pdf', 'definitiva1-1'),
    ('**/*ActaNotesProva2Provisional*.pdf', 'provisional-2'),
    ('**/*ActaNotesProva2Definitiva*.pdf', 'definitiva-2')
]

    
start_time_str = now_in_string()
print(f"===== START {start_time_str}")
for pattern in include_patterns:
    extract_data_from_pattern(pattern[0], suffix=pattern[1])
print(f"===== END ({start_time_str} -> {now_in_string()}")

===== START 23:53:30
23:53:30 (0/395) Reading 'EDUCACIO INFANTIL/V36 - ActaNotes1Provisional 2022_06_29 10_13.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V36 - ActaNotes1Provisional 2022_06_29 10_13.pdf
23:53:30 (1/395) Reading 'EDUCACIO INFANTIL/V12 - ActaNotes1Provisional 2022_06_29 09_58.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V12 - ActaNotes1Provisional 2022_06_29 09_58.pdf
23:53:30 (2/395) Reading 'EDUCACIO INFANTIL/C1 - ActaNotes1Provisional 2022_06_28 11_56.pdf'...
Skipping tabula for EDUCACIO INFANTIL/C1 - ActaNotes1Provisional 2022_06_28 11_56.pdf
23:53:30 (3/395) Reading 'EDUCACIO INFANTIL/V24 - ActaNotes1Provisional 2022_06_29 10_59.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V24 - ActaNotes1Provisional 2022_06_29 10_59.pdf
23:53:30 (4/395) Reading 'EDUCACIO INFANTIL/V28 - ActaNotes1Provisional 2022_06_29 11_14.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V28 - ActaNotes1Provisional 2022_06_29 11_14.pdf
23:53:30 (5/395) Reading 'EDUCACIO INFANTIL/A33 - ActaNotes1P

(2 tables on this file)
23:53:38 (203/395) Reading 'ANGLES/V7 - ActaNotes1Provisional 2022_06_29 07_45.pdf'...
(2 tables on this file)
23:53:45 (204/395) Reading 'ANGLES/V13 - ActaNotes1Provisional 2022_06_29 08_57.pdf'...
(2 tables on this file)
23:53:51 (205/395) Reading 'ANGLES/V4 - ActaNotes1Provisional 2022_06_29 08_42.pdf'...
(2 tables on this file)
23:53:59 (206/395) Reading 'ANGLES/A2 - ActaNotes1Provisional 2022_06_29 09_19.pdf'...


In [None]:
import pandas as pd
import glob

# This step requires running a couple of commands manually:
# (it could be included on this jupyter, but sometimes the shell is faster :)
#
#
# cat csv/tmp/*provisional*.csv | grep "\*\*\*" > csv/PROVISIONAL.csv
# cat csv/tmp/*definitiva*.csv | grep "\*\*\*" > csv/DEFINITIVE.csv

def load_csv_file(filename):
    return pd.read_csv(filename,
                     names=['DNI', 'nombre', 'tema', 'caso_practico', 'total', 'especialidad', 'tribunal'])

def compare_valuations(row):
    return (
        row['tema_provisional'] == row['tema_definitivo']
        and
        row['caso_practico_provisional'] == row['caso_practico_definitivo']
    )


################

run = False

if run:
    start_time_str = now_in_string()
    print(f"===== START {start_time_str}")

    provisional = load_csv_file('csv/PROVISIONAL.csv')
    definitive = load_csv_file('csv/DEFINITIVE.csv')

    # Merge both files (as columns), and add a new column 'iguales' that compares provisional and definitive results
    merged = pd.merge(provisional, definitive, on=['DNI', 'nombre', 'especialidad', 'tribunal'], suffixes=('_provisional', '_definitivo'), how="outer")
    merged['iguales'] = concatenated.apply(compare_valuations, axis=1)

    # Sort the columns to look better on the spreedsheet
    merged = merged.reindex(columns=['especialidad', 'tribunal', 'DNI', 'nombre',
                                     'tema_provisional', 'caso_practico_provisional', 'total_provisional',
                                     'iguales',
                                     'tema_definitivo', 'caso_practico_definitivo', 'total_definitivo'])

    # Delete the name column (for sharing):
    del merged['nombre']

    with pd.ExcelWriter('results.xlsx') as writter:
        for speciality in merged['especialidad'].unique():
            temp_df = merged.loc[concatenated['especialidad'] == speciality]
            del temp_df['especialidad']
            temp_df.to_excel(writter, sheet_name=speciality)

    print(f"===== END ({start_time_str} -> {now_in_string()}")


In [None]:
# 14/Jul/2022
# Downloading the qualifications from the last part ()
# Download:
# - ActaNotesProva2Provisional.pdf
# - ActaNotesProva2Definitiva.pdf

for speciality in specialties:
    tribunals = get_all_tribunals(speciality)
    print(f"===== START {speciality['name']}")
    
    for idx, tribunal in enumerate(tribunals):
        # Create the directory for storing the PDFs
        tribunal_type = tribunal['type']
        exists = os.path.exists(tribunal_type)
        if not exists:
            os.makedirs(tribunal_type)
        
        counter_str = f"{idx}/{len(tribunals)}"
        print(f"{now_in_string()} ({counter_str}) {tribunal['type']} {tribunal['tribunal']}")
        
        files = get_all_files(tribunal, include_patterns=['ActaNotesProva2Provisional.pdf', 'ActaNotesProva2Definitiva.pdf'])
        
        for file in files:
            download_file(file)
        
    print(f"===== END {speciality['name']}")

In [None]:
CSV_PATH = "./csv/tmp/"

# Create the directory for storing the CSVs
exists = os.path.exists(CSV_PATH)
if not exists:
    os.makedirs(CSV_PATH)
exit()

start_time_str = now_in_string()
print(f"===== START {start_time_str}")
extract_data_from_pattern('*ANGLES*/*Prova2Provisional*.pdf', suffix='provisional-2')
extract_data_from_pattern('*ANGLES*/*Prova2Definitiva*.pdf', suffix='definitiva-2')
print(f"===== END ({start_time_str} -> {now_in_string()}")

In [None]:
import pandas as pd
import glob

# This step requires running a couple of commands manually:
# (it could be included on this jupyter, but sometimes the shell is faster :)
#
#
# cat csv/tmp/*provisional-2*.csv | grep "\*\*\*" > csv/PROVISIONAL2.csv
# cat csv/tmp/*definitiva-2*.csv | grep "\*\*\*" > csv/DEFINITIVE2.csv

def load_csv_file_prueba_2(filename):
    return pd.read_csv(filename,
                     names=['DNI', 'nombre', 'tema', 'caso_practico', 'total', 'especialidad', 'tribunal'])

def compare_valuations(row):
    return (
        row['tema_provisional'] == row['tema_definitivo']
        and
        row['caso_practico_provisional'] == row['caso_practico_definitivo']
    )



run = True
if run:
    start_time_str = now_in_string()
    print(f"===== START {start_time_str}")

    provisional = load_csv_file('csv/PROVISIONAL.csv')
    definitive = load_csv_file('csv/DEFINITIVE.csv')
    provisional2 = load_csv_file('csv/PROVISIONAL2.csv')
    definitive2 = load_csv_file('csv/DEFINITIVE2.csv')

    
    # Merge both files (as columns), and add a new column 'iguales' that compares provisional and definitive results
    merged = pd.merge(provisional, definitive, on=['DNI', 'nombre', 'especialidad', 'tribunal'], suffixes=('_provisional', '_definitivo'), how="outer")
    merged['iguales'] = merged.apply(compare_valuations, axis=1)
    
    merged2 = pd.merge(merged, provisional2, on=['DNI', 'nombre', 'especialidad', 'tribunal'], suffixes=('_provisional1', '_fase2'), how="outer")

provisional2
    
