In [127]:
import pickle
import pandas as pd
import numpy as np
import json, os, string
from janitor import clean_names
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [121]:
def read_json_dict(path: str) -> dict:
    """
    Reads a json file and returns it as dict object
    """
    
    file = open(path) # Opening JSON file
    return json.load(file) # returns JSON object as a dictionary

def folder_creator(folder_name: string, path: string) -> None:
    """
    Generates a folder in specified path
    
    input: name of root folder, path where you want 
    folder to be created
    output: None
    """
    
    # defining paths
    data_folder_path = path + "/" + folder_name
    data_folder_exists = os.path.exists(data_folder_path)

    # creating folders if don't exist
    if data_folder_exists:
        pass
    else:    
        # create a new directory because it does not exist 
        os.makedirs(data_folder_path)

        # create subfolders
        print(f"The new directory {folder_name} was created!")
        
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    """
    :param df_1: the left table to join
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: how close the matches should be to return a match, based on Levenshtein distance
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: dataframe with boths keys and matches
    """
    s = df_2[key2].tolist()
    
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))    
    df_1['matches'] = m
    
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    
    return df_1

def create_pickle(object_name, file_name: str, path: str) -> None:
    """
    Creates a pickle file for object. Note: Path should have no slash 
    at the end
    """
    with open(path + f"/{file_name}", "wb") as storing_output:
        pickle.dump(object_name, storing_output)
        storing_output.close()

## Reading paths

In [3]:
paths = read_json_dict("paths.json")

In [4]:
paths

{'data_path': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice',
 'code_path': '/Users/brandonmora/GitHub/peru-amag-stats/case_outcomes',
 'data_amag_i': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/01_AMAG',
 'data_cej': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/data_cleaned_',
 'data_gender': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/07_Other/02_Raw/names_gender',
 'local_storage': 'D:/Daniel Chen Dropbox/Marco Antonio GutiÃ©rrez ChÃ¡vez/datasets_amag_ii_scrape'}

In [5]:
data_path = paths["data_path"]

In [6]:
folder_creator("data_cleaned", data_path)

In [7]:
data_cleaned_path = data_path + "/data_cleaned"

In [8]:
folder_creator("raw", data_cleaned_path)

In [9]:
dc_raw_path = data_cleaned_path + "/raw"

In [122]:
folder_creator("temp", data_cleaned_path)

The new directory temp was created!


In [123]:
dc_temp_path = data_cleaned_path + "/temp"

# 1. Creating participants list

Reading lab data

In [11]:
lab_data = pd.read_stata(data_path + "/data/lab_Data/Clean_Full_Data12.dta")

Creating name variables for future fuzzy merge

In [12]:
lab_data["participant_nombre_apellido"] = lab_data["Nombres"] + " " + lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"]
lab_data["participant_nombre_apellido"] = lab_data["participant_nombre_apellido"].str.strip()

  lab_data["participant_nombre_apellido"] = lab_data["Nombres"] + " " + lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"]


In [13]:
lab_data["participant_apellido_nombre"] = lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"] + " " + lab_data["Nombres"]
lab_data["participant_apellido_nombre"] = lab_data["participant_apellido_nombre"].str.strip()

  lab_data["participant_apellido_nombre"] = lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"] + " " + lab_data["Nombres"]


In [14]:
lab_data = lab_data.rename(columns={"DNI": "nrodocumento"})

In [15]:
amag_ii_participants = lab_data[["nrodocumento", "participant_nombre_apellido", "participant_apellido_nombre"]]

Exporting the list of participants

In [16]:
amag_ii_participants.to_csv(dc_raw_path + "/amag_ii_participants_list.csv")

# 2. Creating Cases List

### 2.0. Selecting reporte files

In [17]:
files_reports = pd.read_csv(dc_raw_path + "/DF_file_report_2022.csv")
files_reports = clean_names(files_reports)

### 2.1. Cleaning the reporte files

Creating lists with characters to be replace

In [18]:
backslash_reps = ["\\(\\*\\)", "\\", "\\([^()]{0,}\\)"]
trailing_and_special_reps = ["^\\s", "\\,", "\\.$", " \\- JUZ$", "\\*"]
other_strs_reps = ["\\- MIXTO Y LIQ", "\\- MIXTO", "\\- JUZ\\. MIXTO", 
                   "- JM", "- INVESTIGACION", "- PAZ LETRADO", "SECOM - ", "- JT"]

### 2.2. Replacing backlashes, special characters and other uninformative characters

In [19]:
empty_reps = backslash_reps + trailing_and_special_reps +  other_strs_reps

In [21]:
for val in empty_reps:    
    files_reports["juez_"] = files_reports["juez_"].str.replace(val, "")

  files_reports["juez_"] = files_reports["juez_"].str.replace(val, "")
  files_reports["juez_"] = files_reports["juez_"].str.replace(val, "")


In [20]:
name_reps = [["ALFREDO E\\.", "ALFREDO E"], ["BERTHA F\\.", "BERTHA F"], ["CLAUDIO W\\.", "CLAUDIO W"], 
            ["CLAVELITO L\\.", "CLAVELITO L"], ["ELMER L\\.", "ELMER L"], ["ERNESTO A\\.", "ERNESTO A"],
            ["HERBERT M\\.", "HERBERT M"], ["LUZ K\\.", "LUZ K"], ["NANCY S\\.", "NANCY S"], ["JESSICA E\\.", "JESSICA E"],
            ["PATRICIA C\\.", "PATRICIA C"], ["JESSICA P\\.", "JESSICA P"], ["YOLANDA B\\.", "YOLANDA B\\."],
            ["LUZ M\\.", "LUZ M"], ["EDGAR\\.", "EDGAR"], ["C\\. ARTURO", "C ARTURO"], ["ALEXANDER A\\.", "ALEXANDER A"],
            ["RENE G\\.", "RENE G"], ["GUILLERMO S\\.", "GUILLERMO S"], ["FANNY L\\. ",  "FANNY L"], ["ELISA \\(LA", "ELISA"],
            ["JULIA \\(LA", "JULIA"], ["ACEVEDO DIEZ CECILIA", "ACEVEDO DIEZ CECILIA DEL PILAR"], [" J. ", " J "],
            [" K. ", " K "]]

### 2.3. Replacing names with issues

In [22]:
for name_rep in name_reps:
    files_reports["juez_"] = files_reports["juez_"].str.replace(name_rep[0], name_rep[1])

  files_reports["juez_"] = files_reports["juez_"].str.replace(name_rep[0], name_rep[1])


### 2.4. Obtaining the names of judges

Some cases have multiple judges assigned to them. As a result, we need to extract these names as we will match the case with the judge information.

In [23]:
files_reports = files_reports[files_reports["juez_"].notna()]

In [24]:
files_reports["juez_splitted"] = files_reports["juez_"].apply(lambda row: row.split("."))

In [25]:
files_reports["n_judges_case"] = files_reports["juez_splitted"].apply(lambda row: len(row))

In [26]:
judge_names = files_reports[files_reports["n_judges_case"] == 1]

In [27]:
multiple_judge_names = files_reports[files_reports["n_judges_case"] != 1]

In [28]:
multiple_judge_names["juez_1"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[0])
multiple_judge_names["juez_2"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[1])
multiple_judge_names["juez_3"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_judge_names["juez_1"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_judge_names["juez_2"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_ju

In [30]:
judge_names = judge_names.rename(columns={"juez_": "juez"})

### 2.5. Fuzzy merge with Lab Experiment Data

Creating debuggings dataset to simulate the fuzzy merge

In [42]:
judge_names.columns

Index(['expediente_n°_', 'organo_jurisdiccional_', 'distrito_judicial_',
       'juez', 'especialista_legal_', 'fecha_de_inicio_', 'proceso_',
       'observacion_', 'especialidad_', 'materia_s_', 'estado_',
       'etapa_procesal_', 'fecha_conclusion_', 'ubicacion_',
       'motivo_conclusion_', 'sumilla_', 'juez_splitted', 'n_judges_case'],
      dtype='object')

In [55]:
debug_judges = judge_names[(judge_names["juez"] == "CRUZADO MEJIA MARTIN VALDEMAR") |
                           (judge_names["juez"] == "APAGUEÑO REATEGUI BRYAN ENRIQUE")]

In [57]:
debug_participants = amag_ii_participants[(amag_ii_participants["participant_apellido_nombre"] == "CRUZADO MEJIA MARTIN VALDEMAR") |
                                          (amag_ii_participants["participant_apellido_nombre"] == "APAGUEÑO REATEGUI BRYAN ENRIQUE")]

Fuzzy match of cases

In [66]:
matched_judge_name1 = fuzzy_merge(debug_judges, debug_participants, "juez", "participant_apellido_nombre", threshold=90, limit=2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['matches'] = m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['matches'] = m2


In [97]:
amag_ii_cases = matched_judge_name1.copy()

# 3. Creating CEJ datasets

A pending task would be to bind the rows of all the dataframes

## Follow up dataframe

In [38]:
files_follow_up = pd.read_csv(dc_raw_path + "/DF_follow_up_cleaner_2022.csv", error_bad_lines=False)
files_follow_up = clean_names(files_follow_up)



  files_follow_up = pd.read_csv(dc_raw_path + "/DF_follow_up_cleaner_2022.csv", error_bad_lines=False)
Skipping line 8128: expected 10 fields, saw 11



# Procedural parts dataframe

In [112]:
files_procedural_parts = pd.read_csv(dc_raw_path + "/DF_procedural_parts_2022.csv")
files_procedural_parts = clean_names(files_procedural_parts)

In [113]:
files_procedural_parts["expediente_n°_"] = files_procedural_parts["expediente_n°_"].apply(lambda row: row.split("\\")[-1])

## Downloads dataframe

In [40]:
files_downloads = pd.read_csv(dc_raw_path + "/DF_DOWNLOADS_2022.csv")
files_downloads = clean_names(files_downloads)

### Merging complementary case data with amag ii cases

In [98]:
amag_ii_cases = pd.merge(amag_ii_cases, files_reports, how="inner", on="expediente_n°_")

In [99]:
amag_ii_cases = pd.merge(amag_ii_cases, files_follow_up, how="inner", on="expediente_n°_")

In [115]:
amag_ii_cases = pd.merge(amag_ii_cases, files_procedural_parts, how="inner", on="expediente_n°_")

In [120]:
amag_ii_cases = pd.merge(amag_ii_cases, files_downloads, how="inner", left_on="expediente_n°_", right_on="expediente_num")

In [125]:
help(create_pickle)

Help on function create_pickle in module __main__:

create_pickle(object_name, file_name: str, path: str) -> None
    Creates a pickle file for object. Note: Path should have no slash 
    at the end



In [128]:
create_pickle(amag_ii_cases, "amag_ii_cases.pkl", dc_temp_path)