In [134]:
import pandas as pd
import numpy as np
import json, os, string
from janitor import clean_names
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [127]:
def read_json_dict(path: str) -> dict:
    """
    Reads a json file and returns it as dict object
    """
    
    file = open(path) # Opening JSON file
    return json.load(file) # returns JSON object as a dictionary

def folder_creator(folder_name: string, path: string) -> None:
    """
    Generates a folder in specified path
    
    input: name of root folder, path where you want 
    folder to be created
    output: None
    """
    
    # defining paths
    data_folder_path = path + "/" + folder_name
    data_folder_exists = os.path.exists(data_folder_path)

    # creating folders if don't exist
    if data_folder_exists:
        pass
    else:    
        # create a new directory because it does not exist 
        os.makedirs(data_folder_path)

        # create subfolders
        print(f"The new directory {folder_name} was created!")
        
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    """
    :param df_1: the left table to join
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: how close the matches should be to return a match, based on Levenshtein distance
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: dataframe with boths keys and matches
    """
    s = df_2[key2].tolist()
    
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))    
    df_1['matches'] = m
    
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    
    return df_1

## Reading paths

In [18]:
paths = read_json_dict("paths.json")

In [19]:
paths

{'data_path': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice',
 'code_path': '/Users/brandonmora/GitHub/peru-amag-stats/case_outcomes',
 'data_amag_i': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/01_AMAG',
 'data_cej': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/data_cleaned',
 'data_gender': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/07_Other/02_Raw/names_gender',
 'local_storage': 'D:/Daniel Chen Dropbox/Marco Antonio GutiÃ©rrez ChÃ¡vez/datasets_amag_ii_scrape'}

In [20]:
data_path = paths["data_path"]

In [42]:
folder_creator("data_cleaned", data_path)

The new directory data_cleaned is created!


In [44]:
data_cleaned_path = data_path + "/data_cleaned"

In [45]:
folder_creator("raw", data_cleaned_path)

The new directory raw was created!


In [46]:
dc_raw_path = data_cleaned_path + "/raw"

In [52]:
dc_raw_path

'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/data_cleaned/raw'

# 1. Creating participants list

In [129]:
lab_data = pd.read_stata(data_path + "/data/lab_Data/Clean_Full_Data12.dta")

In [130]:
lab_data["participant_nombre_apellido"] = lab_data["Nombres"] + " " + lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"]
lab_data["participant_nombre_apellido"] = lab_data["participant_nombre_apellido"].str.strip()

  lab_data["participant_nombre_apellido"] = lab_data["Nombres"] + " " + lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"]


In [131]:
lab_data["participant_apellido_nombre"] = lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"] + " " + lab_data["Nombres"]
lab_data["participant_apellido_nombre"] = lab_data["participant_apellido_nombre"].str.strip()

  lab_data["participant_apellido_nombre"] = lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"] + " " + lab_data["Nombres"]


In [132]:
lab_data = lab_data.rename(columns={"DNI": "nrodocumento"})

In [168]:
amag_ii_participants = lab_data[["nrodocumento", "participant_nombre_apellido", "participant_apellido_nombre"]]

In [169]:
amag_ii_participants.to_csv(dc_raw_path + "/amag_ii_participants_list.csv")

# 2. Creating Cases List

### Selecting reporte files

In [138]:
files_reports = pd.read_csv(dc_raw_path + "/DF_file_report_2022.csv")
files_reports = clean_names(files_reports)

### Cleaning the reporte files

Creating lists with characters to be replace

In [139]:
backslash_reps = ["\\(\\*\\)", "\\", "\\([^()]{0,}\\)"]
trailing_and_special_reps = ["^\\s", "\\,", "\\.$", " \\- JUZ$", "\\*"]
other_strs_reps = ["\\- MIXTO Y LIQ", "\\- MIXTO", "\\- JUZ\\. MIXTO", 
                   "- JM", "- INVESTIGACION", "- PAZ LETRADO", "SECOM - ", "- JT"]

In [140]:
empty_reps = backslash_reps + trailing_and_special_reps +  other_strs_reps

In [141]:
name_reps = [["ALFREDO E\\.", "ALFREDO E"], ["BERTHA F\\.", "BERTHA F"], ["CLAUDIO W\\.", "CLAUDIO W"], 
            ["CLAVELITO L\\.", "CLAVELITO L"], ["ELMER L\\.", "ELMER L"], ["ERNESTO A\\.", "ERNESTO A"],
            ["HERBERT M\\.", "HERBERT M"], ["LUZ K\\.", "LUZ K"], ["NANCY S\\.", "NANCY S"], ["JESSICA E\\.", "JESSICA E"],
            ["PATRICIA C\\.", "PATRICIA C"], ["JESSICA P\\.", "JESSICA P"], ["YOLANDA B\\.", "YOLANDA B\\."],
            ["LUZ M\\.", "LUZ M"], ["EDGAR\\.", "EDGAR"], ["C\\. ARTURO", "C ARTURO"], ["ALEXANDER A\\.", "ALEXANDER A"],
            ["RENE G\\.", "RENE G"], ["GUILLERMO S\\.", "GUILLERMO S"], ["FANNY L\\. ",  "FANNY L"], ["ELISA \\(LA", "ELISA"],
            ["JULIA \\(LA", "JULIA"], ["ACEVEDO DIEZ CECILIA", "ACEVEDO DIEZ CECILIA DEL PILAR"], [" J. ", " J "],
            [" K. ", " K "]]

In [142]:
for val in empty_reps:    
    files_reports["juez_"] = files_reports["juez_"].str.replace(val, "")

  files_reports["juez_"] = files_reports["juez_"].str.replace(val, "")
  files_reports["juez_"] = files_reports["juez_"].str.replace(val, "")


In [143]:
for name_rep in name_reps:
    files_reports["juez_"] = files_reports["juez_"].str.replace(name_rep[0], name_rep[1])

  files_reports["juez_"] = files_reports["juez_"].str.replace(name_rep[0], name_rep[1])


In [144]:
files_reports = files_reports[files_reports["juez_"].notna()]

In [145]:
files_reports["juez_splitted"] = files_reports["juez_"].apply(lambda row: row.split("."))

In [146]:
files_reports["n_judges_case"] = files_reports["juez_splitted"].apply(lambda row: len(row))

In [165]:
judge_names = files_reports[files_reports["n_judges_case"] == 1]

In [159]:
multiple_judge_names = files_reports[files_reports["n_judges_case"] != 1]

In [162]:
multiple_judge_names["juez_1"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[0])
multiple_judge_names["juez_2"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[1])
multiple_judge_names["juez_3"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_judge_names["juez_1"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_judge_names["juez_2"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_ju

In [167]:
judge_names

Unnamed: 0,expediente_n°_,organo_jurisdiccional_,distrito_judicial_,juez_,especialista_legal_,fecha_de_inicio_,proceso_,observacion_,especialidad_,materia_s_,estado_,etapa_procesal_,fecha_conclusion_,ubicacion_,motivo_conclusion_,sumilla_,juez_splitted,n_judges_case
0,00001-2022-0-2402-JP-CI-01,JUZGADO DE PAZ LETRADO - SEDE MANANTAY,UCAYALI,DERLINDA VASQUEZ RUIZ,VIRNA LIZET MORENO CHU,07/01/2022,UNICO DE EJECUCION,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,OBLIGACION DE DAR SUMA DE DINERO,EJECUCION,GENERAL,,POOL ASIST. JUDICIAL,-------,INTERPONE DEMANDA,[DERLINDA VASQUEZ RUIZ],1
1,00001-2022-0-2402-JP-CI-03,3°JUZGADO DE PAZ LETRADO - Sede Manco Capac,UCAYALI,NERY VANESA SOSA NAVARRO,ERIKA MARTEL DEL RISCO,05/01/2022,NO CONTENCIOSO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,"CAMBIO DE NOMBRE, SUPRESION DE NOMBRE Y/O ADIC...",ARCHIVO PROVISIONAL,GENERAL,,ARCHIVO GENERAL,-------,DEMANDA DE RECTIFICACION Y MODIFICACIÓN DE PAR...,[NERY VANESA SOSA NAVARRO],1
2,00001-2022-0-2402-JR-CI-02,2°JUZGADO DE PAZ LETRADO - Sede Manco Capac,UCAYALI,ECHEVARRIA POMA SOLIA WILDA,FLORES JOSEPH PAULA DEL ROCIO,05/01/2022,SUMARISIMO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,INEFICACIA DE TITULO VALOR,ARCHIVO DEFINITIVO,GENERAL,,POOL ASIST. JUDICIAL,-------,INTERPONE DEMANDA DE INEFICACIA DE TITULO VALOR,[ECHEVARRIA POMA SOLIA WILDA],1
3,00001-2022-0-2403-JP-CI-01,JUZGADO DE PAZ LETRADO - Sede Atalaya,UCAYALI,BRYAN APAGUEÑO REATEGUI,FELIX ALFREDO RIVERA AROSTEGU,21/01/2022,UNICO DE EJECUCION,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,OBLIGACION DE DAR SUMA DE DINERO,EN EJECUCION,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA DE OBLIGACION DE DAR SUMA DE DINERO.,[BRYAN APAGUEÑO REATEGUI],1
4,00001-2022-0-2405-JP-CI-01,JUZGADO DE PAZ LETRADO - SEDE PURUS,UCAYALI,SADITH VELA TANANTA,NAPOLEON TORRES EDER ISMAEL,16/05/2022,SUMARISIMO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,OBLIGACION DE DAR SUMA DE DINERO,EN EJECUCION,GENERAL,,ESPECIALISTA,-------,OBLIGACION DE DAR SUMA DE DINERO,[SADITH VELA TANANTA],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2988,00006-2022-0-2403-JM-LA-01,1º JUZGADO DE TRABAJO - SEDE CENTRAL,UCAYALI,ZEVALLOS BASUALDO JUAN ANDRES,SEGURA FLORES KARIN,11/04/2022,ESPECIAL,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,ACCION CONTENCIOSA ADMINISTRATIVA,INADMISIBLE,GENERAL,,POOL ASIST. JUDICIAL,-------,INTERPONGO DEMANDA CONTENCIOSO ADMINISTRATIVO ...,[ZEVALLOS BASUALDO JUAN ANDRES],1
2989,00006-2022-0-2403-JP-LA-01,JUZGADO MIXTO - Sede Atalaya,UCAYALI,FELIPE DAVID PALACIOS SANTOS,VALDIVIESO BARDALES SEGUNDO,03/02/2022,EJECUCION,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,OBLIGACION DE DAR SUMA DE DINERO INICIADAS POR...,APELADO,GENERAL,,POOL ASIST. JUDICIAL,-------,INTERPONEMOS DEMANDA,[FELIPE DAVID PALACIOS SANTOS],1
2990,00006-2022-0-2404-JM-LA-01,1º JUZGADO DE TRABAJO - SEDE CENTRAL,UCAYALI,ZEVALLOS BASUALDO JUAN ANDRES,SEGURA FLORES KARIN,28/01/2022,CONTENCIOSO ADMINISTRATIVO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,ACCION CONTENCIOSA ADMINISTRATIVA,SENTENCIADO/ RESUELTO,GENERAL,,POOL ASIST. JUDICIAL,-------,CONTENCIOSO ADMINISTRATIVO,[ZEVALLOS BASUALDO JUAN ANDRES],1
2991,00006-2022-0-2407-JM-LA-01,JUZGADO MIXTO - SEDE CONTAMANA,UCAYALI,CLAVELITO LINDA CUHELLO GUERRA,JULCA ARAUJO S. NOEMI,25/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,ACCION CONTENCIOSA ADMINISTRATIVA,TRAMITE,GENERAL,,ESPECIALISTA,-------,INTERPONE DEMANDA CONTENCIOSA ADMINISTRATIVA,[CLAVELITO LINDA CUHELLO GUERRA],1


In [171]:
judge_names = judge_names.rename(columns={"juez_": "juez"})

In [None]:
matched_judge_name1 = fuzzy_merge(judge_names, amag_ii_participants, "juez", "participant_apellido_nombre", threshold=90, limit=2)

In [None]:
len(matched_judge_name1)