In [321]:
# Some utils
from datetime import datetime
import re
import hashlib

def hash_for_obj(obj):
    """Return the hexadecimal MD5 digest of ``str(obj)`` encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(str(obj).encode('utf-8'))
    return digest.hexdigest()


def now_in_string():
    """Return the current local time formatted as ``HH:MM:SS``."""
    current = datetime.now()
    return format(current, "%H:%M:%S")


def get_type_and_tribunal(long_name):
    """
    Parse the name of a tribunal and return its speciality and tribunal code.

    The name is expected to contain a ``" - "`` separator, e.g.
    ``"EDUCACIO PRIMARIA - V60"``.

    Returns:
        A ``(speciality, tribunal)`` tuple with the two captured groups.
        The speciality is returned as matched (it may carry surrounding
        whitespace; callers strip it themselves).

    Raises:
        ValueError: if ``long_name`` does not contain the ``" - "`` separator.
    """
    # Raw string: '\s' and '\w' are regex escapes, not Python string escapes.
    result = re.search(r'([\s\w.]*) - ([\w]*)', long_name)
    if result is None:
        raise ValueError(f"Unexpected tribunal name (no ' - ' separator): {long_name!r}")
    return result.groups()

In [322]:
import urllib.parse
import json

import requests
from bs4 import BeautifulSoup


def load_json_file(path):
    """Return the parsed JSON content of ``path``, or ``None`` if the file does not exist."""
    try:
        json_file = open(path)
    except FileNotFoundError:
        # A missing file is the "no cache yet" case, not an error.
        return None
    with json_file:
        return json.load(json_file)
    
def save_to_json_file(path, dictionary):
    """
    Serialize ``dictionary`` as JSON into ``path``, overwriting any existing file.

    The parent directory is created when missing, so the first write into a
    fresh cache directory does not fail with ``FileNotFoundError``.

    Args:
        path: destination file (``str`` or path-like object).
        dictionary: any JSON-serializable object (dict, list, ...).
    """
    import os  # local import: this cell does not import os at module level

    parent = os.path.dirname(os.fspath(path))
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w') as outfile:
        json.dump(dictionary, outfile)


def get_all_specialties():
    """
    Return the entries of the index page published at ``DOCENTES_URL``.

    The result is read from ``cache/specialities.json`` when that file exists
    and is non-empty; otherwise the page is requested, parsed and the result
    is written to that cache file.

    Returns:
        A list of dicts with the keys ``name`` (link text without ``.`` and
        ``/``), ``link`` (absolute URL) and ``modified`` (text of the third
        column).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no ``<table id="indexlist">``.
    """
    CACHE_FILE = 'cache/specialities.json'
    cache = load_json_file(CACHE_FILE)
    if cache:
        return cache

    DOCENTES_URL = "https://ceice.gva.es/auto/Actas/"
    r = requests.get(DOCENTES_URL)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    bs_content = BeautifulSoup(r.text, 'lxml')
    table = bs_content.find('table', attrs={'id': 'indexlist'})
    if table is None:
        raise ValueError(f"No 'indexlist' table found at {DOCENTES_URL}")

    structured_rows = []
    for row in table.find_all('tr'):
        # Only rows whose first CSS class is 'even' or 'odd' are treated as
        # entries; a <tr> without any class is skipped instead of crashing.
        row_classes = row.get('class') or []
        if not row_classes or row_classes[0] not in ('even', 'odd'):
            continue
        cols = row.find_all('td')
        anchor = cols[1].a
        name = anchor.get_text().strip().replace('.', '').replace('/', '')
        structured_rows.append({
            'name': name,
            'link': urllib.parse.urljoin(DOCENTES_URL, anchor.get('href')).strip(),
            'modified': cols[2].get_text().strip(),
        })

    save_to_json_file(CACHE_FILE, structured_rows)
    return structured_rows

# Load the speciality index once (from the JSON cache when available);
# the download loop in a later cell iterates over this list.
specialties = get_all_specialties()

In [323]:
import os
from pathlib import Path
from slugify import slugify


def get_all_tribunals(speciality_row):
    """
    Return the tribunal entries listed on the page of one speciality.

    The result is read from ``cache/tribunals<slug>.json`` when that file
    exists and is non-empty; otherwise the speciality page is requested,
    parsed and the result is written to that cache file.

    Args:
        speciality_row: dict with at least the keys ``name`` and ``link``,
            as produced by ``get_all_specialties``.

    Returns:
        A list of dicts with the keys ``type``, ``tribunal``, ``name``,
        ``link`` (absolute URL) and ``modified``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no ``<table id="indexlist">``.
    """
    slug = slugify(speciality_row['name'])
    CACHE_FILE = f'cache/tribunals{slug}.json'

    cache = load_json_file(CACHE_FILE)
    if cache:
        return cache

    print(f"Haciendo la peticion: {speciality_row['name']}")

    speciality_url = speciality_row['link']

    r = requests.get(speciality_url)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    bs_content = BeautifulSoup(r.text, 'lxml')

    table = bs_content.find('table', attrs={'id': 'indexlist'})
    if table is None:
        raise ValueError(f"No 'indexlist' table found at {speciality_url}")

    structured_rows = []
    for row in table.find_all('tr'):
        # Only rows whose first CSS class is 'even' or 'odd' are treated as
        # entries; a <tr> without any class is skipped instead of crashing.
        row_classes = row.get('class') or []
        if not row_classes or row_classes[0] not in ('even', 'odd'):
            continue
        cols = row.find_all('td')
        anchor = cols[1].a
        name = anchor.get_text().strip().replace('.', '')

        # Parse the name once and reuse both parts.
        tribunal_type, tribunal_code = get_type_and_tribunal(name)

        structured_rows.append({
            'type': tribunal_type.strip(),
            'tribunal': tribunal_code,
            'name': name,
            'link': urllib.parse.urljoin(speciality_url, anchor.get('href')).strip(),
            'modified': cols[2].get_text().strip(),
        })

    save_to_json_file(CACHE_FILE, structured_rows)
    return structured_rows


def get_all_files(structured_row, include_patterns=None):
    """
    Return the files of one tribunal whose name contains any wanted pattern.

    The result is cached in a JSON file whose name is derived from the
    tribunal name and from a hash of ``include_patterns``, so each distinct
    pattern list has its own cache entry. An empty cached list is falsy and
    therefore triggers a fresh request on the next call.

    Args:
        structured_row: tribunal dict with at least the keys ``name`` and
            ``link``, as produced by ``get_all_tribunals``.
        include_patterns: list of substrings; a file is kept when its name
            contains at least one of them. Defaults to
            ``['ActaNotesProva2Provisional.pdf']``.

    Returns:
        A list of dicts with the keys ``_tribunal`` (the given
        ``structured_row``), ``name``, ``link`` (absolute URL) and
        ``modified``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no ``<table id="indexlist">``.
    """
    if include_patterns is None:
        # Built per call instead of being a shared mutable default. It must
        # stay a list: its str() feeds the hash used in the cache file name.
        include_patterns = ['ActaNotesProva2Provisional.pdf']

    slug = slugify(f"files_tribunal_{structured_row['name']}{hash_for_obj(include_patterns)}")
    CACHE_FILE = f'cache/{slug}a.json'

    cache = load_json_file(CACHE_FILE)
    if cache:
        return cache

    print(f"Haciendo la peticion: {CACHE_FILE}")

    url_files = structured_row['link']
    r = requests.get(url_files)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    bs_content = BeautifulSoup(r.text, 'lxml')
    table = bs_content.find('table', attrs={'id': 'indexlist'})
    if table is None:
        raise ValueError(f"No 'indexlist' table found at {url_files}")

    structured_rows = []
    for row in table.find_all('tr'):
        # Only rows whose first CSS class is 'even' or 'odd' are treated as
        # entries; a <tr> without any class is skipped instead of crashing.
        row_classes = row.get('class') or []
        if not row_classes or row_classes[0] not in ('even', 'odd'):
            continue
        cols = row.find_all('td')
        anchor = cols[1].a
        name = anchor.get_text().strip()

        # Only include expected files
        if not any(pattern in name for pattern in include_patterns):
            continue

        structured_rows.append({
            '_tribunal': structured_row,
            'name': name,
            'link': urllib.parse.urljoin(url_files, anchor.get('href')),
            'modified': cols[2].get_text().strip(),
        })

    save_to_json_file(CACHE_FILE, structured_rows)
    return structured_rows

def download_file(structured_row):
    """
    Download one file into ``<tribunal type>/`` unless it is already there.

    The local file name is ``"<tribunal> - <stem> <modified><ext>"``, where
    ``modified`` has its ``:`` and ``-`` characters replaced by ``_``.

    Args:
        structured_row: file dict as produced by ``get_all_files`` (keys
            ``_tribunal``, ``name``, ``link`` and ``modified``).

    Returns:
        The relative path of the local file as a string, whether it was
        downloaded now or already existed.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    tribunal_info = structured_row['_tribunal']
    tribunal_type = tribunal_info['type']
    tribunal = tribunal_info['tribunal']
    modified = structured_row['modified'].replace(':', '_').replace('-', '_')
    url = structured_row['link']
    name = Path(structured_row['name'])
    name_wo_ext = name.with_suffix('')
    name_ext = name.suffix
    file_name = f"{tribunal} - {name_wo_ext} {modified}{name_ext}"
    final_name = f"{tribunal_type}/{file_name}"

    path = Path(final_name)
    if path.is_file():
        # Skip the download if the file exists
        return final_name

    r_doc = requests.get(url)
    # Never store an HTTP error body as if it were the document: the file
    # would then exist and be skipped on every later run.
    r_doc.raise_for_status()
    path.parent.mkdir(parents=True, exist_ok=True)
    # write_bytes opens, writes and closes the file (no leaked handle).
    path.write_bytes(r_doc.content)
    print(f"\t{file_name}")

    return final_name


# Walk every speciality and tribunal and download the matching result PDFs,
# storing them in one directory per tribunal type.
start_time_str = now_in_string()

# Name fragments of the files to download. Defined once, outside the loops,
# because the list is the same for every tribunal.
include_patterns = [
    'ActaNotes1Provisional.pdf',
    'ActaNotes1Definitiva.pdf',
    'ActaNotesProva2Provisional.pdf',
    'ActaNotesProva2Definitiva.pdf',
]

for speciality in specialties:
    tribunals = get_all_tribunals(speciality)
    print(f"===== START {speciality['name']}")

    for idx, tribunal in enumerate(tribunals):
        # Create the directory for storing the PDFs
        tribunal_type = tribunal['type']
        os.makedirs(tribunal_type, exist_ok=True)

        # idx is zero-based, so it is shown against the last index.
        counter_str = f"{idx}/{len(tribunals) - 1}"
        print(f"{now_in_string()} ({counter_str}) {tribunal['type']} {tribunal['tribunal']}")

        files = get_all_files(tribunal, include_patterns=include_patterns)

        for file in files:
            download_file(file)

    print(f"===== END {speciality['name']}")

print(f"===== END ({start_time_str} -> {now_in_string()})")


===== START 128_EDUCACIO PRIMARIA
02:04:37 (0/134) EDUCACIO PRIMARIA V60
02:04:37 (1/134) EDUCACIO PRIMARIA A48
02:04:37 (2/134) EDUCACIO PRIMARIA A47
02:04:37 (3/134) EDUCACIO PRIMARIA V59
02:04:37 (4/134) EDUCACIO PRIMARIA V58
02:04:37 (5/134) EDUCACIO PRIMARIA V21
02:04:37 (6/134) EDUCACIO PRIMARIA V23
02:04:37 (7/134) EDUCACIO PRIMARIA V9
02:04:37 (8/134) EDUCACIO PRIMARIA A36
02:04:37 (9/134) EDUCACIO PRIMARIA A19
02:04:37 (10/134) EDUCACIO PRIMARIA A53
02:04:37 (11/134) EDUCACIO PRIMARIA C4
02:04:37 (12/134) EDUCACIO PRIMARIA A34
02:04:37 (13/134) EDUCACIO PRIMARIA V31
02:04:37 (14/134) EDUCACIO PRIMARIA A33
02:04:37 (15/134) EDUCACIO PRIMARIA V20
02:04:37 (16/134) EDUCACIO PRIMARIA A28
02:04:37 (17/134) EDUCACIO PRIMARIA V19
02:04:37 (18/134) EDUCACIO PRIMARIA V43
02:04:37 (19/134) EDUCACIO PRIMARIA A41
02:04:37 (20/134) EDUCACIO PRIMARIA A51
02:04:37 (21/134) EDUCACIO PRIMARIA V33
02:04:37 (22/134) EDUCACIO PRIMARIA A23
02:04:37 (23/134) EDUCACIO PRIMARIA V29
02:04:37 (24/134) 

02:04:37 (67/146) EDUCACIO INFANTIL A39
02:04:37 (68/146) EDUCACIO INFANTIL A14
02:04:37 (69/146) EDUCACIO INFANTIL V40
02:04:37 (70/146) EDUCACIO INFANTIL V66
02:04:37 (71/146) EDUCACIO INFANTIL A31
02:04:37 (72/146) EDUCACIO INFANTIL C16
02:04:37 (73/146) EDUCACIO INFANTIL V36
02:04:37 (74/146) EDUCACIO INFANTIL V34
02:04:37 (75/146) EDUCACIO INFANTIL A34
02:04:37 (76/146) EDUCACIO INFANTIL C3
02:04:37 (77/146) EDUCACIO INFANTIL C13
02:04:37 (78/146) EDUCACIO INFANTIL C6
02:04:37 (79/146) EDUCACIO INFANTIL V28
02:04:37 (80/146) EDUCACIO INFANTIL A5
02:04:37 (81/146) EDUCACIO INFANTIL C15
02:04:37 (82/146) EDUCACIO INFANTIL V19
02:04:37 (83/146) EDUCACIO INFANTIL A17
02:04:37 (84/146) EDUCACIO INFANTIL C2
02:04:37 (85/146) EDUCACIO INFANTIL A9
02:04:37 (86/146) EDUCACIO INFANTIL V57
02:04:37 (87/146) EDUCACIO INFANTIL A12
02:04:37 (88/146) EDUCACIO INFANTIL C17
02:04:37 (89/146) EDUCACIO INFANTIL A29
02:04:37 (90/146) EDUCACIO INFANTIL A35
02:04:37 (91/146) EDUCACIO INFANTIL V8
02:04:

In [324]:
import os
import tabula

# Directory where the CSV extracted from each PDF table is written.
CSV_PATH = "./csv/tmp/"
# Directory of per-PDF marker files; a PDF whose marker exists is not parsed again.
CACHE_FILES = "./cache/tabula/" 

def extract_data_from_pdf_to_csv(file, suffix):
    """
    Extract every table of one PDF into CSV files under ``CSV_PATH``.

    The speciality is taken from the PDF's parent directory and the tribunal
    from the part of the file name before ``" - "``. Both are added to every
    table as the columns ``specialty`` and ``tribunal``. Once all tables are
    written, a marker file is stored under ``CACHE_FILES`` so the same PDF is
    skipped on later runs.

    Args:
        file: ``pathlib.Path`` of the PDF.
        suffix: label inserted in the CSV file names
            (``<specialty>_<tribunal>_<suffix>_<table index>.csv``).

    Raises:
        ValueError: if the file name does not contain the ``" - "`` separator.
    """
    # Raw string: '\w' is a regex escape, not a Python string escape.
    result = re.search(r'([\w]*) - ([\w]*)', file.name)
    if result is None:
        raise ValueError(f"Unexpected PDF file name (no ' - ' separator): {file.name!r}")
    specialty = str(file.parent).strip()
    tribunal = result[1]

    CACHE_FILE = Path().joinpath(CACHE_FILES, f"{specialty}_{tribunal}_{hash_for_obj(file.name)}.csv")
    if CACHE_FILE.is_file():
        # Skip the extraction if this PDF was already processed
        print(f"Skipping tabula for {file}")
        return

    tables = tabula.read_pdf(file, pages="all")
    print(f"({len(tables)} tables on this file)")

    for idx, table in enumerate(tables):
        # A scalar assignment is broadcast to every row of the table.
        table['specialty'] = specialty
        table['tribunal'] = tribunal
        filename = Path().joinpath(CSV_PATH, f"{specialty}_{tribunal}_{suffix}_{idx}.csv")
        table.to_csv(filename, index=False)

    # Written last, so an interrupted extraction is retried on the next run.
    save_to_json_file(CACHE_FILE, {'date': now_in_string()})

def extract_data_from_pattern(glob_pattern, suffix):
    """
    Run ``extract_data_from_pdf_to_csv`` on every file matching ``glob_pattern``.

    The pattern is resolved relative to the current directory; only regular
    files are processed. A progress line is printed before each file and
    ``FINISH`` once all of them are done.
    """
    files = list(filter(Path.is_file, Path('.').glob(glob_pattern)))
    total = len(files)
    for position, pdf_file in enumerate(files):
        print(f"{now_in_string()} ({position}/{total}) Reading '{pdf_file}'...")
        extract_data_from_pdf_to_csv(pdf_file, suffix)

    print("FINISH")


# Create the directory for the "already processed" marker files
os.makedirs(CACHE_FILES, exist_ok=True)

# Create the directory for storing the CSVs
os.makedirs(CSV_PATH, exist_ok=True)


# (glob pattern of the PDFs to read, suffix used in the generated CSV names)
include_patterns = [
    ('**/*ActaNotes1Provisional*.pdf', 'provisional-1'),
    ('**/*ActaNotes1Definitiva*.pdf', 'definitiva-1'),
    ('**/*ActaNotesProva2Provisional*.pdf', 'provisional-2'),
    ('**/*ActaNotesProva2Definitiva*.pdf', 'definitiva-2'),
]


start_time_str = now_in_string()
print(f"===== START {start_time_str}")
for glob_pattern, suffix in include_patterns:
    extract_data_from_pattern(glob_pattern, suffix=suffix)
print(f"===== END ({start_time_str} -> {now_in_string()})")

===== START 02:04:38
02:04:38 (0/395) Reading 'EDUCACIO INFANTIL/V36 - ActaNotes1Provisional 2022_06_29 10_13.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V36 - ActaNotes1Provisional 2022_06_29 10_13.pdf
02:04:38 (1/395) Reading 'EDUCACIO INFANTIL/V12 - ActaNotes1Provisional 2022_06_29 09_58.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V12 - ActaNotes1Provisional 2022_06_29 09_58.pdf
02:04:38 (2/395) Reading 'EDUCACIO INFANTIL/C1 - ActaNotes1Provisional 2022_06_28 11_56.pdf'...
Skipping tabula for EDUCACIO INFANTIL/C1 - ActaNotes1Provisional 2022_06_28 11_56.pdf
02:04:38 (3/395) Reading 'EDUCACIO INFANTIL/V24 - ActaNotes1Provisional 2022_06_29 10_59.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V24 - ActaNotes1Provisional 2022_06_29 10_59.pdf
02:04:38 (4/395) Reading 'EDUCACIO INFANTIL/V28 - ActaNotes1Provisional 2022_06_29 11_14.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V28 - ActaNotes1Provisional 2022_06_29 11_14.pdf
02:04:38 (5/395) Reading 'EDUCACIO INFANTIL/A33 - ActaNotes1P

Skipping tabula for MUSICA/A2 - ActaNotes1Provisional 2022_06_29 10_17.pdf
02:04:38 (254/395) Reading 'MUSICA/A3 - ActaNotes1Provisional 2022_06_29 10_18.pdf'...
Skipping tabula for MUSICA/A3 - ActaNotes1Provisional 2022_06_29 10_18.pdf
02:04:38 (255/395) Reading 'MUSICA/V3 - ActaNotes1Provisional 2022_06_29 11_06.pdf'...
Skipping tabula for MUSICA/V3 - ActaNotes1Provisional 2022_06_29 11_06.pdf
02:04:38 (256/395) Reading 'MUSICA/V5 - ActaNotes1Provisional 2022_06_29 11_26.pdf'...
Skipping tabula for MUSICA/V5 - ActaNotes1Provisional 2022_06_29 11_26.pdf
02:04:38 (257/395) Reading 'MUSICA/A1 - ActaNotes1Provisional 2022_06_29 10_18.pdf'...
Skipping tabula for MUSICA/A1 - ActaNotes1Provisional 2022_06_29 10_18.pdf
02:04:38 (258/395) Reading 'MUSICA/C2 - ActaNotes1Provisional 2022_06_28 14_16.pdf'...
Skipping tabula for MUSICA/C2 - ActaNotes1Provisional 2022_06_28 14_16.pdf
02:04:38 (259/395) Reading 'MUSICA/V1 - ActaNotes1Provisional 2022_06_29 12_51.pdf'...
Skipping tabula for MUSICA/V

02:04:38 (0/395) Reading 'EDUCACIO INFANTIL/V78 - ActaNotes1Definitiva 2022_07_01 10_21.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V78 - ActaNotes1Definitiva 2022_07_01 10_21.pdf
02:04:38 (1/395) Reading 'EDUCACIO INFANTIL/A17 - ActaNotes1Definitiva 2022_07_01 10_39.pdf'...
Skipping tabula for EDUCACIO INFANTIL/A17 - ActaNotes1Definitiva 2022_07_01 10_39.pdf
02:04:38 (2/395) Reading 'EDUCACIO INFANTIL/C1 - ActaNotes1Definitiva 2022_07_01 10_59.pdf'...
Skipping tabula for EDUCACIO INFANTIL/C1 - ActaNotes1Definitiva 2022_07_01 10_59.pdf
02:04:38 (3/395) Reading 'EDUCACIO INFANTIL/V35 - ActaNotes1Definitiva 2022_07_01 10_31.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V35 - ActaNotes1Definitiva 2022_07_01 10_31.pdf
02:04:38 (4/395) Reading 'EDUCACIO INFANTIL/C15 - ActaNotes1Definitiva 2022_07_01 11_05.pdf'...
Skipping tabula for EDUCACIO INFANTIL/C15 - ActaNotes1Definitiva 2022_07_01 11_05.pdf
02:04:38 (5/395) Reading 'EDUCACIO INFANTIL/A21 - ActaNotes1Definitiva 2022_07_01 10_21.pdf'

Skipping tabula for ANGLES/V7 - ActaNotes1Definitiva 2022_07_01 10_41.pdf
02:04:38 (215/395) Reading 'ANGLES/A3 - ActaNotes1Definitiva 2022_07_01 10_13.pdf'...
Skipping tabula for ANGLES/A3 - ActaNotes1Definitiva 2022_07_01 10_13.pdf
02:04:38 (216/395) Reading 'PEDAGOGIA TERAPEUTICA/V10 - ActaNotes1Definitiva 2022_07_01 10_42.pdf'...
Skipping tabula for PEDAGOGIA TERAPEUTICA/V10 - ActaNotes1Definitiva 2022_07_01 10_42.pdf
02:04:38 (217/395) Reading 'PEDAGOGIA TERAPEUTICA/A2 - ActaNotes1Definitiva 2022_07_01 11_18.pdf'...
Skipping tabula for PEDAGOGIA TERAPEUTICA/A2 - ActaNotes1Definitiva 2022_07_01 11_18.pdf
02:04:38 (218/395) Reading 'PEDAGOGIA TERAPEUTICA/A9 - ActaNotes1Definitiva 2022_07_01 10_18.pdf'...
Skipping tabula for PEDAGOGIA TERAPEUTICA/A9 - ActaNotes1Definitiva 2022_07_01 10_18.pdf
02:04:38 (219/395) Reading 'PEDAGOGIA TERAPEUTICA/V15 - ActaNotes1Definitiva 2022_07_01 10_42.pdf'...
Skipping tabula for PEDAGOGIA TERAPEUTICA/V15 - ActaNotes1Definitiva 2022_07_01 10_42.pdf
02

Skipping tabula for EDUCACIO INFANTIL/V24 - ActaNotesProva2Provisional 2022_07_11 10_22.pdf
02:04:38 (53/395) Reading 'EDUCACIO INFANTIL/A26 - ActaNotesProva2Provisional 2022_07_11 08_40.pdf'...
Skipping tabula for EDUCACIO INFANTIL/A26 - ActaNotesProva2Provisional 2022_07_11 08_40.pdf
02:04:38 (54/395) Reading 'EDUCACIO INFANTIL/V20 - ActaNotesProva2Provisional 2022_07_11 09_23.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V20 - ActaNotesProva2Provisional 2022_07_11 09_23.pdf
02:04:38 (55/395) Reading 'EDUCACIO INFANTIL/C18 - ActaNotesProva2Provisional 2022_07_11 11_44.pdf'...
Skipping tabula for EDUCACIO INFANTIL/C18 - ActaNotesProva2Provisional 2022_07_11 11_44.pdf
02:04:38 (56/395) Reading 'EDUCACIO INFANTIL/V33 - ActaNotesProva2Provisional 2022_07_11 08_27.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V33 - ActaNotesProva2Provisional 2022_07_11 08_27.pdf
02:04:38 (57/395) Reading 'EDUCACIO INFANTIL/V70 - ActaNotesProva2Provisional 2022_07_11 10_23.pdf'...
Skipping tabula for EDUCA

Skipping tabula for EDUCACIO PRIMARIA/V49 - ActaNotesProva2Provisional 2022_07_11 08_14.pdf
02:04:38 (304/395) Reading 'EDUCACIO PRIMARIA/V14 - ActaNotesProva2Provisional 2022_07_11 12_08.pdf'...
Skipping tabula for EDUCACIO PRIMARIA/V14 - ActaNotesProva2Provisional 2022_07_11 12_08.pdf
02:04:38 (305/395) Reading 'EDUCACIO PRIMARIA/A29 - ActaNotesProva2Provisional 2022_07_11 11_02.pdf'...
Skipping tabula for EDUCACIO PRIMARIA/A29 - ActaNotesProva2Provisional 2022_07_11 11_02.pdf
02:04:38 (306/395) Reading 'EDUCACIO PRIMARIA/A42 - ActaNotesProva2Provisional 2022_07_11 10_33.pdf'...
Skipping tabula for EDUCACIO PRIMARIA/A42 - ActaNotesProva2Provisional 2022_07_11 10_33.pdf
02:04:38 (307/395) Reading 'EDUCACIO PRIMARIA/V24 - ActaNotesProva2Provisional 2022_07_11 10_34.pdf'...
Skipping tabula for EDUCACIO PRIMARIA/V24 - ActaNotesProva2Provisional 2022_07_11 10_34.pdf
02:04:38 (308/395) Reading 'EDUCACIO PRIMARIA/A46 - ActaNotesProva2Provisional 2022_07_11 09_26.pdf'...
Skipping tabula for 

Skipping tabula for EDUCACIO INFANTIL/V19 - ActaNotesProva2Definitiva 2022_07_13 08_23.pdf
02:04:38 (118/395) Reading 'EDUCACIO INFANTIL/V16 - ActaNotesProva2Definitiva 2022_07_13 07_07.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V16 - ActaNotesProva2Definitiva 2022_07_13 07_07.pdf
02:04:38 (119/395) Reading 'EDUCACIO INFANTIL/A47 - ActaNotesProva2Definitiva 2022_07_13 15_39.pdf'...
Skipping tabula for EDUCACIO INFANTIL/A47 - ActaNotesProva2Definitiva 2022_07_13 15_39.pdf
02:04:38 (120/395) Reading 'EDUCACIO INFANTIL/V13 - ActaNotesProva2Definitiva 2022_07_13 01_37.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V13 - ActaNotesProva2Definitiva 2022_07_13 01_37.pdf
02:04:38 (121/395) Reading 'EDUCACIO INFANTIL/V8 - ActaNotesProva2Definitiva 2022_07_13 09_11.pdf'...
Skipping tabula for EDUCACIO INFANTIL/V8 - ActaNotesProva2Definitiva 2022_07_13 09_11.pdf
02:04:38 (122/395) Reading 'EDUCACIO INFANTIL/V52 - ActaNotesProva2Definitiva 2022_07_13 12_16.pdf'...
Skipping tabula for EDUCACIO INF

Skipping tabula for EDUCACIO PRIMARIA/C3 - ActaNotesProva2Definitiva 2022_07_13 09_58.pdf
02:04:38 (371/395) Reading 'EDUCACIO PRIMARIA/C4 - ActaNotesProva2Definitiva 2022_07_13 10_02.pdf'...
Skipping tabula for EDUCACIO PRIMARIA/C4 - ActaNotesProva2Definitiva 2022_07_13 10_02.pdf
02:04:38 (372/395) Reading 'EDUCACIO PRIMARIA/A44 - ActaNotesProva2Definitiva 2022_07_13 09_54.pdf'...
Skipping tabula for EDUCACIO PRIMARIA/A44 - ActaNotesProva2Definitiva 2022_07_13 09_54.pdf
02:04:38 (373/395) Reading 'EDUCACIO PRIMARIA/A30 - ActaNotesProva2Definitiva 2022_07_13 11_17.pdf'...
Skipping tabula for EDUCACIO PRIMARIA/A30 - ActaNotesProva2Definitiva 2022_07_13 11_17.pdf
02:04:38 (374/395) Reading 'EDUCACIO PRIMARIA/V8 - ActaNotesProva2Definitiva 2022_07_13 09_10.pdf'...
Skipping tabula for EDUCACIO PRIMARIA/V8 - ActaNotesProva2Definitiva 2022_07_13 09_10.pdf
02:04:38 (375/395) Reading 'EDUCACIO PRIMARIA/C2 - ActaNotesProva2Definitiva 2022_07_13 10_10.pdf'...
Skipping tabula for EDUCACIO PRIMARI

In [325]:
import pandas as pd
import glob

# This step requires running a few shell commands manually first
# (they could be run from this notebook, but sometimes the shell is faster :)
#
# Each glob must match the suffix used when the CSVs were generated
# ('provisional-1', 'provisional-2', 'definitiva-1', 'definitiva-2'):
#
# cat csv/tmp/*provisional-1*.csv | grep "\*\*\*" > csv/PROVISIONAL1.csv
# cat csv/tmp/*provisional-2*.csv | grep "\*\*\*" > csv/PROVISIONAL2.csv
# cat csv/tmp/*definitiva-1*.csv | grep "\*\*\*" > csv/DEFINITIVA1.csv
# cat csv/tmp/*definitiva-2*.csv | grep "\*\*\*" > csv/DEFINITIVA2.csv

# Column names given to pd.read_csv for csv/PROVISIONAL1.csv and csv/DEFINITIVA1.csv.
COLS_PRUEBA1 = ['DNI', 'nombre', 'tema', 'caso', 'total', 'especialidad', 'tribunal']
# Column names given to pd.read_csv for csv/PROVISIONAL2.csv and csv/DEFINITIVA2.csv.
COLS_PRUEBA2 = ['DNI', 'nombre', 'program', 'especialidad', 'tribunal']

def load_csv_file(filename, cols_names):
    """Read a CSV file that has no header row, naming its columns ``cols_names``."""
    # Passing names= already implies header=None; it is spelled out for clarity.
    frame = pd.read_csv(filename, header=None, names=cols_names)
    return frame


def field_is_equal(row, field_name_1, field_name_2):
    """Return True when both fields of ``row`` have the same ``str()`` representation."""
    return str(row[field_name_1]) == str(row[field_name_2])


def are_equal_provisional_and_definitive(field_name):
    """
    Build a row predicate comparing ``<field_name>_prov`` with ``<field_name>_def``.

    The returned callable takes one row and returns the result of
    ``field_is_equal`` for those two columns.
    """
    provisional_column = f'{field_name}_prov'
    definitive_column = f'{field_name}_def'
    return lambda row: field_is_equal(row, provisional_column, definitive_column)

def modifying_approval_part_1(row):
    """
    Classify how the part-1 total changed between provisional and definitive.

    A total counts as missing when its ``str()`` is ``'nan'`` or ``'NP'``.

    Returns:
        ``"aprobado"`` if only the provisional total is missing,
        ``"suspenso"`` if only the definitive total is missing,
        ``"-"`` otherwise.
    """
    missing_marks = ('nan', 'NP')
    provisional_missing = str(row['total_prov']) in missing_marks
    definitive_missing = str(row['total_def']) in missing_marks

    if provisional_missing and not definitive_missing:
        return "aprobado"
    if definitive_missing and not provisional_missing:
        return "suspenso"
    return "-"

def find_duplicated(total_dataset):
    """
    Find names that look like shortened variants of another name.

    Rows sharing the same ``DNI``, ``especialidad`` and ``tribunal`` are
    compared with each other; when the ``nombre`` of one row is contained in
    the different ``nombre`` of another row of the same group, the pair is
    recorded.

    Returns:
        A dict mapping the contained (shorter) name to the name containing it.
    """
    key_columns = ['DNI', 'especialidad', 'tribunal']
    repeated = total_dataset[total_dataset.duplicated(subset=key_columns, keep=False)]

    matches = {}
    for _, current in repeated.iterrows():
        short_name = current['nombre']

        # Other rows of the same DNI/speciality/tribunal with a different name.
        same_key = (
            (repeated['DNI'] == current['DNI'])
            & (repeated['especialidad'] == current['especialidad'])
            & (repeated['tribunal'] == current['tribunal'])
        )
        other_names = repeated.loc[same_key & (repeated['nombre'] != short_name), 'nombre']

        for candidate in other_names:
            if short_name in candidate:
                matches[short_name] = candidate
    return matches

################

# Merge the provisional and definitive results of both tests into one table
# and flag what changed between the two publications.
start_time_str = now_in_string()
print(f"===== START {start_time_str}")

# Test 1 results (columns: COLS_PRUEBA1)
provisional_1 = load_csv_file('csv/PROVISIONAL1.csv', COLS_PRUEBA1)
definitiva_1 = load_csv_file('csv/DEFINITIVA1.csv', COLS_PRUEBA1)

# Test 2 results (columns: COLS_PRUEBA2)
provisional_2 = load_csv_file('csv/PROVISIONAL2.csv', COLS_PRUEBA2)
definitiva_2 = load_csv_file('csv/DEFINITIVA2.csv', COLS_PRUEBA2)


# Outer joins keep rows present in only one of the two lists; columns that
# exist on both sides get the '_prov' / '_def' suffixes.
prueba_1 = pd.merge(provisional_1, definitiva_1, on=['DNI', 'especialidad', 'tribunal', 'nombre'], how="outer", suffixes=('_prov', '_def'))
prueba_2 = pd.merge(provisional_2, definitiva_2, on=['DNI', 'especialidad', 'tribunal', 'nombre'], how="outer", suffixes=('_prov', '_def'))

# First merge of both tests; only used to detect inconsistent names.
total = pd.merge(prueba_1, prueba_2, on=['DNI', 'especialidad', 'tribunal', 'nombre'], how="outer", suffixes=('_1', '_2'))


# Names that appear both in a short and in a longer form for the same
# DNI/speciality/tribunal (short name -> long name).
duplicados = find_duplicated(total)
print(f"Rarezas encontradas: {len(duplicados)}")

# Fix the names with 'rarezas'
# NOTE: replace() is applied to the whole frame, so any cell equal to the
# short name is replaced, not only cells of the 'nombre' column.
for short_name, long_name in duplicados.items():
    prueba_1.replace(short_name, long_name, inplace=True)
    prueba_2.replace(short_name, long_name, inplace=True)

# Merge again with the unified names and check that no inconsistency is left.
total_fixed = pd.merge(prueba_1, prueba_2, on=['DNI', 'especialidad', 'tribunal', 'nombre'], how="outer", suffixes=('_1', '_2'))
duplicados_after_fixing = find_duplicated(total_fixed)
print(f"Rarezas pendientes: {len(duplicados_after_fixing)}")


print(f"prueba 1: {len(prueba_1)}, prueba 2: {len(prueba_2)}, mergeado: {len(total_fixed)}")


# Add '<field>_=' columns that compare the provisional and definitive results
total_fixed['tema_='] = total_fixed.apply(are_equal_provisional_and_definitive('tema'), axis=1)
total_fixed['caso_='] = total_fixed.apply(are_equal_provisional_and_definitive('caso'), axis=1)
total_fixed['total_p1_='] = total_fixed.apply(modifying_approval_part_1, axis=1)
total_fixed['program_='] = total_fixed.apply(are_equal_provisional_and_definitive('program'), axis=1)

# Sort the columns to look better on the spreadsheet
total_fixed = total_fixed.reindex(columns=['especialidad', 'tribunal', 'DNI', 'nombre',
                                           'tema_prov', 'tema_def', 'tema_=',
                                           'caso_prov', 'caso_def', 'caso_=',
                                           'total_prov', 'total_def', 'total_p1_=',
                                           'program_prov', 'program_def', 'program_='
                                          ])

# Rows flagged 'suspenso' by modifying_approval_part_1. The bare expression
# below is not the last statement of the cell, so it displays nothing.
fila = total_fixed[total_fixed['total_p1_='] == 'suspenso']
fila

print(f"===== END ({start_time_str} -> {now_in_string()})")

===== START 02:04:38
Rarezas encontradas: 203
Rarezas pendientes: 0
prueba 1: 19592, prueba 2: 7928, mergeado: 19592
===== END (02:04:38 -> 02:04:44)


In [326]:
# Export the comparison to an Excel workbook with one sheet per speciality,
# without the candidates' names.
start_time_str = now_in_string()
print(f"===== START {start_time_str}")

# Delete the name column (for sharing). errors='ignore' lets this cell be
# re-run after the column has already been removed.
total_fixed.drop(columns=['nombre'], inplace=True, errors='ignore')

with pd.ExcelWriter('results_sin_nombre.xlsx') as writer:
    for speciality in total_fixed['especialidad'].unique():
        temp_df = total_fixed.loc[total_fixed['especialidad'] == speciality]
        # drop() returns a new frame instead of deleting from a slice.
        temp_df = temp_df.drop(columns=['especialidad'])
        temp_df.to_excel(writer, sheet_name=speciality, index=False)


print(f"===== END ({start_time_str} -> {now_in_string()})")

===== START 02:04:44
===== END (02:04:44 -> 02:04:52)
