In [1]:
import pickle
import regex as re
import nltk
import pandas as pd
import numpy as np
import json, os, string
from janitor import clean_names
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from datetime import datetime
import d6tjoin.top1
import d6tjoin.utils
import d6tjoin



In [2]:
def extract_text(text: str, pattern: str) -> str:
    """Extracts substring from string using a given regex pattern"""
    
    if type(text) is str:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
        else:
            return ""
    else:
        return ""


# This function is meant to be used for extracting specific substrings from a given text based on a regular expression pattern

In [3]:
def read_json_dict(path: str) -> dict:
    """
    Reads a json file and returns it as dict object
    """
    
    file = open(path) # Opening JSON file
    return json.load(file) # returns JSON object as a dictionary

def folder_creator(folder_name: string, path: string) -> None:
    """
    Generates a folder in specified path
    
    input: name of root folder, path where you want 
    folder to be created
    output: None
    """
    
    # defining paths
    data_folder_path = path + "/" + folder_name
    data_folder_exists = os.path.exists(data_folder_path)

    # creating folders if don't exist
    if data_folder_exists:
        pass
    else:    
        # create a new directory because it does not exist 
        os.makedirs(data_folder_path)

        # create subfolders
        print(f"The new directory {folder_name} was created!")
        
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    """
    :param df_1: the left table to join
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: how close the matches should be to return a match, based on Levenshtein distance
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: dataframe with boths keys and matches
    """
    s = df_2[key2].tolist()
    
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))    
    df_1['matches'] = m
    
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    
    return df_1

def create_pickle(object_name, file_name: str, path: str) -> None:
    """
    Creates a pickle file for object. Note: Path should have no slash 
    at the end
    """
    with open(path + f"/{file_name}", "wb") as storing_output:
        pickle.dump(object_name, storing_output)
        storing_output.close()
        
def read_pickle(file_name: str, path: str) -> None:
    """
    Reads pickle file from specified path 
    """
    pickle_file = open(path + f"/{file_name}", "rb")
    output = pickle.load(pickle_file)
    pickle_file.close()
    return output

## Reading paths

In [4]:
paths = read_json_dict("paths.json")

In [5]:
paths

{'data_path': 'C:/Users/PC/Daniel Chen Dropbox/Alexis Malca/Peru_Justice/02_Data/08_CEJ_Web/data_cleaned',
 'code_path': '/Users/brandonmora/GitHub/peru-amag-stats/case_outcomes',
 'data_amag_i': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/01_AMAG',
 'data_cej': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/data_cleaned_',
 'data_gender': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/07_Other/02_Raw/names_gender',
 'local_storage': 'D:/Daniel Chen Dropbox/Marco Antonio GutiÃƒÂ©rrez ChÃƒÂ¡vez/datasets_amag_ii_scrape'}

In [6]:
data_path = paths["data_path"]

In [7]:
data_path

'C:/Users/PC/Daniel Chen Dropbox/Alexis Malca/Peru_Justice/02_Data/08_CEJ_Web/data_cleaned'

In [8]:
data_cleaned_path = data_path + "/data_cleaned_test"

In [9]:
folder_creator("raw", data_cleaned_path)

In [10]:
dc_raw_path = data_cleaned_path + "/raw"

In [11]:
folder_creator("temp", data_cleaned_path)

In [12]:
dc_temp_path = data_cleaned_path + "/temp"

In [13]:
folder_creator("intermediate", data_cleaned_path)

In [14]:
dc_interm_path = data_cleaned_path + "/intermediate"

In [15]:
folder_creator("final", data_cleaned_path)

In [16]:
dc_final_path = data_cleaned_path + "/final"

# 1. Creating participants list

Reading lab data: lab_data has 166 columns. We will use many of them for creating columns with names

In [17]:
lab_data = pd.read_stata(data_path + "/lab_Data/Clean_Full_Data12_jueces.dta")

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  lab_data = pd.read_stata(data_path + "/lab_Data/Clean_Full_Data12_jueces.dta")
One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  lab_data = pd.read_stata(data_path + "/lab_Data/Clean_Full_Data12_jueces.dta")


In [18]:
lab_data['Nombres']

0         JUAN ANTONIO
1         ELVIRA LAURA
2       ELARD FERNANDO
3        GLORIA LUCILA
4      BENJAMIN ISRAEL
            ...       
220       CARMEN JULIA
221          TATIANOVA
222     GIANNY ELEISER
223     NORA ELIZABETH
224       YULEMI PAULA
Name: Nombres, Length: 225, dtype: object

Creating name variables for future fuzzy merge

In [19]:
lab_data["participant_nombre_apellido"] = lab_data["Nombres"] + " " + lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"]
lab_data["participant_nombre_apellido"] = lab_data["participant_nombre_apellido"].str.strip()

  lab_data["participant_nombre_apellido"] = lab_data["Nombres"] + " " + lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"]


In [20]:
lab_data["participant_apellido_nombre"] = lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"] + " " + lab_data["Nombres"]
lab_data["participant_apellido_nombre"] = lab_data["participant_apellido_nombre"].str.strip()

  lab_data["participant_apellido_nombre"] = lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"] + " " + lab_data["Nombres"]


In [21]:
lab_data = lab_data.rename(columns={"DNI": "nrodocumento"})

In [22]:
exp_participants = lab_data[["nrodocumento", "participant_nombre_apellido", "participant_apellido_nombre"]]

In [23]:
exp_participants

Unnamed: 0,nrodocumento,participant_nombre_apellido,participant_apellido_nombre
0,40070874,JUAN ANTONIO ROSAS CASTAÑEDA,ROSAS CASTAÑEDA JUAN ANTONIO
1,4069691,ELVIRA LAURA HUAMAN PORTAL,HUAMAN PORTAL ELVIRA LAURA
2,8254850,ELARD FERNANDO ZAVALAGA VARGAS,ZAVALAGA VARGAS ELARD FERNANDO
3,42135048,GLORIA LUCILA LAIZA ESPINOZA,LAIZA ESPINOZA GLORIA LUCILA
4,21462609,BENJAMIN ISRAEL MORON DOMINGUEZ,MORON DOMINGUEZ BENJAMIN ISRAEL
...,...,...,...
220,17435860,CARMEN JULIA PALMER OLIDEN,PALMER OLIDEN CARMEN JULIA
221,26704681,TATIANOVA ABANTO TAFUR,ABANTO TAFUR TATIANOVA
222,10019679,GIANNY ELEISER MORALES FERNANDEZ,MORALES FERNANDEZ GIANNY ELEISER
223,41332693,NORA ELIZABETH LLANCA VARA,LLANCA VARA NORA ELIZABETH


Exporting the list of participants

In [24]:
exp_participants = exp_participants.dropna() # erasing rows w/ no data

In [25]:
exp_participants #There are no rows w/ no data

Unnamed: 0,nrodocumento,participant_nombre_apellido,participant_apellido_nombre
0,40070874,JUAN ANTONIO ROSAS CASTAÑEDA,ROSAS CASTAÑEDA JUAN ANTONIO
1,4069691,ELVIRA LAURA HUAMAN PORTAL,HUAMAN PORTAL ELVIRA LAURA
2,8254850,ELARD FERNANDO ZAVALAGA VARGAS,ZAVALAGA VARGAS ELARD FERNANDO
3,42135048,GLORIA LUCILA LAIZA ESPINOZA,LAIZA ESPINOZA GLORIA LUCILA
4,21462609,BENJAMIN ISRAEL MORON DOMINGUEZ,MORON DOMINGUEZ BENJAMIN ISRAEL
...,...,...,...
220,17435860,CARMEN JULIA PALMER OLIDEN,PALMER OLIDEN CARMEN JULIA
221,26704681,TATIANOVA ABANTO TAFUR,ABANTO TAFUR TATIANOVA
222,10019679,GIANNY ELEISER MORALES FERNANDEZ,MORALES FERNANDEZ GIANNY ELEISER
223,41332693,NORA ELIZABETH LLANCA VARA,LLANCA VARA NORA ELIZABETH


In [26]:
exp_participants.to_csv(dc_raw_path + "/exp_participants_list.csv")

# 2. Creating Cases List

### 2.0. Selecting reporte files

In [27]:
files_reports = pd.read_csv(data_path + "/DF_file_report_2022.csv")
files_reports = clean_names(files_reports)

#Clean names is a function imported from janitor

In [28]:
files_reports

Unnamed: 0,expediente_n°_,organo_jurisdiccional_,distrito_judicial_,juez_,especialista_legal_,fecha_de_inicio_,proceso_,observacion_,especialidad_,materia_s_,estado_,etapa_procesal_,fecha_conclusion_,ubicacion_,motivo_conclusion_,sumilla_
0,00001-2022-0-0101-JP-CI-01,JUZGADO DE PAZ LETRADO - Leymebamba,AMAZONAS,PISCOYA SOSA ALDO FRANCISCO,ROJAS SILVA EDA,14/01/2022,NO CONTENCIOSO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,SUCESION INTESTADA,ARCHIVO DEFINITIVO,GENERAL,,ARCHIVO GENERAL,-------,SUCESION INTESTADA
1,00001-2022-0-0102-JP-CI-01,JUZGADO PAZ LETRADO - Imaza,AMAZONAS,VEGA BOCANEGRA BETO,GARCIA ODAR JOSE ALFREDO,27/01/2022,SUMARISIMO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,OBLIGACION DE DAR SUMA DE DINERO,TRAMITE,GENERAL,,ESPECIALISTA,-------,DEMANDA DE OBLIGACION DE DAR SUMA DE DINERO
2,00001-2022-0-0103-JP-CI-01,JUZGADO DE PAZ LETRADO - FLORIDA,AMAZONAS,LOLO JARA ESTELA,DAVILA HORNA HERMES WILMAN,10/01/2022,NO CONTENCIOSO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,SUCESION INTESTADA,ARCHIVO PROVISIONAL,GENERAL,,POOL ASIST. JUDICIAL,-------,SUCESION INTESTADA
3,00001-2022-0-0104-JP-CI-01,JUZGADO PAZ LETRADO - Sede Yutupis,AMAZONAS,"HUAMAN CULQUI, IRMA MERCEDES",ELORRIAGA CHAVEZ ROMINA PAOLA,16/03/2022,UNICO DE EJECUCION,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,OBLIGACION DE DAR SUMA DE DINERO,EN PLAZO DE IMPUGNACION,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA OBLIGACION DE DAR SUMA DE DINERO
4,00001-2022-0-0105-JP-CI-01,JUZGADO DE PAZ LETRADO - Sede Luya,AMAZONAS,BARRERA BARDALES GUIMO ALBERTO,ANGULO CULLAMPE LIZETH DEL CARMEN,06/01/2022,NO CONTENCIOSO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,SUCESION INTESTADA,SENTENCIADO/ RESUELTO,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA DE SUCESIÓN INTESTADA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56254,00002-2022-0-3301-SP-FC-01,SALA CIVIL,PUENTE PIEDRA - VENTANILLA,,FLORES AGUILAR ANA VICTORIA,14/01/2022,NO CONTENCIOSO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,FAMILIA CIVIL,EXEQUATUR,RECHAZADO,GENERAL,,OTRO DIST. JUDICIAL,-------,SOLICITA RECONOCIMIENTO DE SENTENCIA JUDICIAL ...
56255,00003-2022-0-3301-SP-FC-01,SALA CIVIL,PUENTE PIEDRA - VENTANILLA,,FLORES AGUILAR ANA VICTORIA,02/03/2022,ESPECIAL,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,FAMILIA CIVIL,QUEJA DE DERECHO,SENTENCIADO/ RESUELTO,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA DE QUEJA DE DERECHO
56256,00004-2022-0-3301-SP-FC-01,SALA CIVIL,PUENTE PIEDRA - VENTANILLA,,FLORES AGUILAR ANA VICTORIA,04/04/2022,SUMARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,FAMILIA CIVIL,QUEJA DE DERECHO,SENTENCIADO/ RESUELTO,GENERAL,,RELATORIA,-------,3° JFAMILIA DE VENTANILLA MI PERU REMITE RAZON...
56257,00005-2022-1-3301-SP-FC-01,SALA CIVIL,PUENTE PIEDRA - VENTANILLA,,FLORES AGUILAR ANA VICTORIA,04/05/2022,ESPECIAL,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,FAMILIA CIVIL,QUEJA DE DERECHO,SENTENCIADO/ RESUELTO,GENERAL,,POOL ASIST. JUDICIAL,-------,QUEJA DERECHO REMITIDA POR EL SNEJ JUZGADO DE ...


### 2.1. Cleaning the reporte files

Creating lists with characters to be replace

In [29]:
backslash_reps = ["\\(\\*\\)", "\\", "\\([^()]{0,}\\)"]
trailing_and_special_reps = ["^\\s", "\\,", "\\.$", " \\- JUZ$", "\\*"]
other_strs_reps = ["\\- MIXTO Y LIQ", "\\- MIXTO", "\\- JUZ\\. MIXTO", 
                   "- JM", "- INVESTIGACION", "- PAZ LETRADO", "SECOM - ", "- JT"] #¿Porque ponemos esto?

### 2.2. Replacing backlashes, special characters and other uninformative characters

In [30]:
empty_reps = backslash_reps + trailing_and_special_reps + other_strs_reps

In [31]:
files_reports["juez_"]

0           PISCOYA SOSA ALDO FRANCISCO
1                   VEGA BOCANEGRA BETO
2                      LOLO JARA ESTELA
3          HUAMAN CULQUI, IRMA MERCEDES
4        BARRERA BARDALES GUIMO ALBERTO
                      ...              
56254                               NaN
56255                               NaN
56256                               NaN
56257                               NaN
56258                               NaN
Name: juez_, Length: 56259, dtype: object

In [32]:
for val in empty_reps:    
    files_reports["juez_"] = files_reports["juez_"].str.replace(val, "")

  files_reports["juez_"] = files_reports["juez_"].str.replace(val, "")
  files_reports["juez_"] = files_reports["juez_"].str.replace(val, "")


In [33]:
files_reports["juez_"]

0           PISCOYA SOSA ALDO FRANCISCO
1                   VEGA BOCANEGRA BETO
2                      LOLO JARA ESTELA
3           HUAMAN CULQUI IRMA MERCEDES
4        BARRERA BARDALES GUIMO ALBERTO
                      ...              
56254                               NaN
56255                               NaN
56256                               NaN
56257                               NaN
56258                               NaN
Name: juez_, Length: 56259, dtype: object

In [34]:
name_reps = [["ALFREDO E\\.", "ALFREDO E"], ["BERTHA F\\.", "BERTHA F"], ["CLAUDIO W\\.", "CLAUDIO W"], 
            ["CLAVELITO L\\.", "CLAVELITO L"], ["ELMER L\\.", "ELMER L"], ["ERNESTO A\\.", "ERNESTO A"],
            ["HERBERT M\\.", "HERBERT M"], ["LUZ K\\.", "LUZ K"], ["NANCY S\\.", "NANCY S"], ["JESSICA E\\.", "JESSICA E"],
            ["PATRICIA C\\.", "PATRICIA C"], ["JESSICA P\\.", "JESSICA P"], ["YOLANDA B\\.", "YOLANDA B\\."],
            ["LUZ M\\.", "LUZ M"], ["EDGAR\\.", "EDGAR"], ["C\\. ARTURO", "C ARTURO"], ["ALEXANDER A\\.", "ALEXANDER A"],
            ["RENE G\\.", "RENE G"], ["GUILLERMO S\\.", "GUILLERMO S"], ["FANNY L\\. ",  "FANNY L"], ["ELISA \\(LA", "ELISA"],
            ["JULIA \\(LA", "JULIA"], ["ACEVEDO DIEZ CECILIA", "ACEVEDO DIEZ CECILIA DEL PILAR"], [" J. ", " J "],
            [" K. ", " K "]] #Como sabes estos nombres?

### 2.3. Replacing names with issues

In [35]:
for name_rep in name_reps:
    files_reports["juez_"] = files_reports["juez_"].str.replace(name_rep[0], name_rep[1])

  files_reports["juez_"] = files_reports["juez_"].str.replace(name_rep[0], name_rep[1])


In [36]:
files_reports["juez_"]

0           PISCOYA SOSA ALDO FRANCISCO
1                   VEGA BOCANEGRA BETO
2                      LOLO JARA ESTELA
3           HUAMAN CULQUI IRMA MERCEDES
4        BARRERA BARDALES GUIMO ALBERTO
                      ...              
56254                               NaN
56255                               NaN
56256                               NaN
56257                               NaN
56258                               NaN
Name: juez_, Length: 56259, dtype: object

### 2.4. Obtaining the names of judges

Some cases have multiple judges assigned to them. As a result, we need to extract these names as we will match the case with the judge information.

In [37]:
files_reports = files_reports[files_reports["juez_"].notna()]

In [38]:
files_reports["juez_splitted"] = files_reports["juez_"].apply(lambda row: row.split("."))

#we use lambda to create very short function

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  files_reports["juez_splitted"] = files_reports["juez_"].apply(lambda row: row.split("."))


In [39]:
files_reports["juez_splitted"] # Por qué lo convertimos en listas?

0           [PISCOYA SOSA ALDO FRANCISCO]
1                   [VEGA BOCANEGRA BETO]
2                      [LOLO JARA ESTELA]
3           [HUAMAN CULQUI IRMA MERCEDES]
4        [BARRERA BARDALES GUIMO ALBERTO]
                       ...               
56241     [POMA ALOSILLA NARDA KATHERINE]
56242     [POMA ALOSILLA NARDA KATHERINE]
56243     [POMA ALOSILLA NARDA KATHERINE]
56244     [POMA ALOSILLA NARDA KATHERINE]
56245     [POMA ALOSILLA NARDA KATHERINE]
Name: juez_splitted, Length: 54533, dtype: object

In [40]:
files_reports["n_judges_case"] = files_reports["juez_splitted"].apply(lambda row: len(row))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  files_reports["n_judges_case"] = files_reports["juez_splitted"].apply(lambda row: len(row))


In [41]:
files_reports["n_judges_case"]

0        1
1        1
2        1
3        1
4        1
        ..
56241    1
56242    1
56243    1
56244    1
56245    1
Name: n_judges_case, Length: 54533, dtype: int64

In [42]:
files_reports["n_judges_case"].value_counts()

1    53590
3      633
2      228
4       75
6        4
5        3
Name: n_judges_case, dtype: int64

In [43]:
exploring_filters = files_reports[files_reports["n_judges_case"] == 6]
exploring_filters

Unnamed: 0,expediente_n°_,organo_jurisdiccional_,distrito_judicial_,juez_,especialista_legal_,fecha_de_inicio_,proceso_,observacion_,especialidad_,materia_s_,estado_,etapa_procesal_,fecha_conclusion_,ubicacion_,motivo_conclusion_,sumilla_,juez_splitted,n_judges_case
2824,00018-2022-22-0101-JR-LA-01,SALA CIVIL - SEDE CENTRAL,AMAZONAS,TAFUR GUPIOC ESPERANZA. HORNA CARPIO ROSA MARL...,VILCARROMERO CULQUI MARIA DOLORES,25/05/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,REINTEGRO DE BENEFICIOS SOCIALES,ARCHIVO DEFINITIVO,GENERAL,,POOL ASIST. JUDICIAL,-------,CUADERNO DE QUEJA POR DENEGATORIA DE APELACION,"[TAFUR GUPIOC ESPERANZA, HORNA CARPIO ROSA MA...",6
2957,00004-2022-61-0101-JR-LA-01,SALA CIVIL - SEDE CENTRAL,AMAZONAS,TAFUR GUPIOC ESPERANZA. CRISPIN QUISPE ALEJAND...,VILCARROMERO CULQUI MARIA DOLORES,07/04/2022,ESPECIAL,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,ACCION CONTENCIOSA ADMINISTRATIVA,ARCHIVO DEFINITIVO,GENERAL,,POOL ASIST. JUDICIAL,-------,CUADERNO DE QUEJA DE DERECHO,"[TAFUR GUPIOC ESPERANZA, CRISPIN QUISPE ALEJA...",6
3576,00004-2022-61-0101-JR-LA-01,SALA CIVIL - SEDE CENTRAL,AMAZONAS,TAFUR GUPIOC ESPERANZA. CRISPIN QUISPE ALEJAND...,VILCARROMERO CULQUI MARIA DOLORES,07/04/2022,ESPECIAL,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,ACCION CONTENCIOSA ADMINISTRATIVA,ARCHIVO DEFINITIVO,GENERAL,,POOL ASIST. JUDICIAL,-------,CUADERNO DE QUEJA DE DERECHO,"[TAFUR GUPIOC ESPERANZA, CRISPIN QUISPE ALEJA...",6
52764,00005-2022-0-1801-SP-LA-01,2° SALA LABORAL PERMANENTE,LIMA,...GASTULO CHAVEZ SILVIA JAENETTE. FUENTES LO...,"HERNANDEZ GARCIA, ANGELA DIONISIA",10/01/2022,ANULACION DE LAUDOS ARBITRALES,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,ANULACION DE LAUDOS ARBITRALES,APELACION,GENERAL,,SALA SUPREMA,-------,INTERPONGO DEMANDA,"[, , , GASTULO CHAVEZ SILVIA JAENETTE, FUENT...",6


In [44]:
judge_names = files_reports[files_reports["n_judges_case"] == 1]

In [45]:
multiple_judge_names = files_reports[files_reports["n_judges_case"] != 1] 
# If there is more than 1 judge, then it's higly likely that it is an appellate judge

In [46]:
multiple_judge_names

Unnamed: 0,expediente_n°_,organo_jurisdiccional_,distrito_judicial_,juez_,especialista_legal_,fecha_de_inicio_,proceso_,observacion_,especialidad_,materia_s_,estado_,etapa_procesal_,fecha_conclusion_,ubicacion_,motivo_conclusion_,sumilla_,juez_splitted,n_judges_case
2540,00001-2022-0-0102-JR-LA-01,SALA CIVIL - SEDE UTCUBAMBA,AMAZONAS,ARTEAGA RAMIREZ FLORMIRA. VIGIL CURO LUZ CAROL...,PILCO MASLUCAN MERCEDES,12/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,REINTEGRO DE BENEFICIOS SOCIALES,SENTENCIADO/ RESUELTO,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA REINTEGRO DE BENEFICIOS SOCIALES,"[ARTEAGA RAMIREZ FLORMIRA, VIGIL CURO LUZ CAR...",3
2548,00101-2022-0-0102-JR-LA-01,SALA CIVIL - SEDE UTCUBAMBA,AMAZONAS,ARTEAGA RAMIREZ FLORMIRA. VIGIL CURO LUZ CAROL...,PILCO MASLUCAN MERCEDES,22/07/2022,ESPECIAL,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,ACCION CONTENCIOSA ADMINISTRATIVA,APELACION,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA CONTENCIOSO ADMINISTRATIVO DE NULIDAD ...,"[ARTEAGA RAMIREZ FLORMIRA, VIGIL CURO LUZ CAR...",3
2565,00106-2022-95-0107-JR-LA-01,SALA CIVIL - SEDE UTCUBAMBA,AMAZONAS,ARTEAGA RAMIREZ FLORMIRA. CRISPIN QUISPE ALEJA...,PILCO MASLUCAN MERCEDES,28/10/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,RECONOCIMIENTO DE LOS DERECHOS COMPRENDIDOS EN...,APELACION,GENERAL,,SECRETARIA - MP,-------,INTERPONGO MEDIDA CAUTELAR GENERICA,"[ARTEAGA RAMIREZ FLORMIRA, CRISPIN QUISPE ALE...",4
2603,00117-2022-0-0101-JR-LA-01,SALA CIVIL - SEDE CENTRAL,AMAZONAS,TAFUR GUPIOC ESPERANZA. MARTINEZ CHASQUERO EDD...,VILCARROMERO CULQUI MARIA DOLORES,17/03/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,ACCION CONTENCIOSA ADMINISTRATIVA,TRAMITE,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA NULIDAD DE RESOLUCIONES ADMINISTRATIVA...,"[TAFUR GUPIOC ESPERANZA, MARTINEZ CHASQUERO E...",3
2619,00121-2022-0-0102-JR-LA-01,SALA CIVIL - SEDE UTCUBAMBA,AMAZONAS,ARTEAGA RAMIREZ FLORMIRA. CRISPIN QUISPE ALEJA...,PILCO MASLUCAN MERCEDES,01/09/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,ACCION CONTENCIOSA ADMINISTRATIVA,APELACION,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA ACCION CONTENCIOSA ADMINISTRATIVA,"[ARTEAGA RAMIREZ FLORMIRA, CRISPIN QUISPE ALE...",4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55968,00003-2022-0-2402-JR-LA-02,SALA LABORAL,UCAYALI,CRUZ COBEÑAS MARLENY. BARBARAN RIOS ASELA ISAB...,SUELLEN Y. POPOLIZIO PANDURO,04/02/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,DESNATURALIZACIÓN DE CONTRATO,SENTENCIADO/ RESUELTO,GENERAL,,RELATORIA,-------,INTERPONE DEMANDA DE DESNATURALIZACION DE CONT...,"[CRUZ COBEÑAS MARLENY, BARBARAN RIOS ASELA IS...",3
56220,00012-2022-0-3301-JR-LA-01,SALA LABORAL,PUENTE PIEDRA - VENTANILLA,CAMPOS MURILLO WALTER EDUARDO. ZUÑIGA HERRERA ...,HINOJOSA GRANADOS LAURA ESTHEFANY,14/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,DESNATURALIZACIÓN DE CONTRATO,EN PLAZO DE IMPUGNACION,GENERAL,,SECRETARIA - MP,-------,RECONOCIMIENTO DE VINCULO LABORAL BAJO LOS ALC...,"[CAMPOS MURILLO WALTER EDUARDO, ZUÑIGA HERRER...",3
56227,00018-2022-0-3301-JR-LA-01,SALA LABORAL,PUENTE PIEDRA - VENTANILLA,CAMPOS MURILLO WALTER EDUARDO. ZUÑIGA HERRERA ...,HINOJOSA GRANADOS LAURA ESTHEFANY,18/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,DESNATURALIZACIÓN DE CONTRATO,EN TRAMITE(Pendiente),GENERAL,,SECRETARIA - MP,-------,DEMANDA SOBRE INCUMPLIMIENTO,"[CAMPOS MURILLO WALTER EDUARDO, ZUÑIGA HERRER...",3
56232,00022-2022-0-3301-JR-LA-01,SALA LABORAL,PUENTE PIEDRA - VENTANILLA,CAMPOS MURILLO WALTER EDUARDO. ZUÑIGA HERRERA ...,HINOJOSA GRANADOS LAURA ESTHEFANY,22/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,INCUMPLIMIENTO DE DISPOSICIONES Y NORMAS LABOR...,EN TRAMITE(Pendiente),GENERAL,,SECRETARIA - MP,-------,DESNATURALIZACION DEL CONTRATO DE TRABAJO Y OTRO,"[CAMPOS MURILLO WALTER EDUARDO, ZUÑIGA HERRER...",3


In [47]:
multiple_judge_names.loc[:, "juez_1"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[0]).copy()
multiple_judge_names["juez_2"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[1] if len(row) > 1 else np.NaN)
multiple_judge_names["juez_3"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[2] if len(row) > 2 else np.NaN)
multiple_judge_names["juez_4"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[3] if len(row) > 3 else np.NaN)
multiple_judge_names["juez_5"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[4] if len(row) > 4 else np.NaN)
multiple_judge_names["juez_6"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[5] if len(row) > 5 else np.NaN)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_judge_names.loc[:, "juez_1"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[0]).copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_judge_names["juez_2"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[1] if len(row) > 1 else np.NaN)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

In [48]:
multiple_judge_names["juez_1"]

2540          ARTEAGA RAMIREZ FLORMIRA
2548          ARTEAGA RAMIREZ FLORMIRA
2565          ARTEAGA RAMIREZ FLORMIRA
2603            TAFUR GUPIOC ESPERANZA
2619          ARTEAGA RAMIREZ FLORMIRA
                     ...              
55968             CRUZ COBEÑAS MARLENY
56220    CAMPOS MURILLO WALTER EDUARDO
56227    CAMPOS MURILLO WALTER EDUARDO
56232    CAMPOS MURILLO WALTER EDUARDO
56236    CAMPOS MURILLO WALTER EDUARDO
Name: juez_1, Length: 943, dtype: object

In [49]:
multiple_judge_names["juez_2"]

2540         VIGIL CURO LUZ CAROLINA
2548         VIGIL CURO LUZ CAROLINA
2565        CRISPIN QUISPE ALEJANDRO
2603         MARTINEZ CHASQUERO EDDY
2619        CRISPIN QUISPE ALEJANDRO
                    ...             
55968     BARBARAN RIOS ASELA ISABEL
56220     ZUÑIGA HERRERA ELICEA INES
56227     ZUÑIGA HERRERA ELICEA INES
56232     ZUÑIGA HERRERA ELICEA INES
56236     ZUÑIGA HERRERA ELICEA INES
Name: juez_2, Length: 943, dtype: object

In [50]:
judge_names = judge_names.rename(columns={"juez_": "juez"})

In [51]:
judge_names

Unnamed: 0,expediente_n°_,organo_jurisdiccional_,distrito_judicial_,juez,especialista_legal_,fecha_de_inicio_,proceso_,observacion_,especialidad_,materia_s_,estado_,etapa_procesal_,fecha_conclusion_,ubicacion_,motivo_conclusion_,sumilla_,juez_splitted,n_judges_case
0,00001-2022-0-0101-JP-CI-01,JUZGADO DE PAZ LETRADO - Leymebamba,AMAZONAS,PISCOYA SOSA ALDO FRANCISCO,ROJAS SILVA EDA,14/01/2022,NO CONTENCIOSO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,SUCESION INTESTADA,ARCHIVO DEFINITIVO,GENERAL,,ARCHIVO GENERAL,-------,SUCESION INTESTADA,[PISCOYA SOSA ALDO FRANCISCO],1
1,00001-2022-0-0102-JP-CI-01,JUZGADO PAZ LETRADO - Imaza,AMAZONAS,VEGA BOCANEGRA BETO,GARCIA ODAR JOSE ALFREDO,27/01/2022,SUMARISIMO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,OBLIGACION DE DAR SUMA DE DINERO,TRAMITE,GENERAL,,ESPECIALISTA,-------,DEMANDA DE OBLIGACION DE DAR SUMA DE DINERO,[VEGA BOCANEGRA BETO],1
2,00001-2022-0-0103-JP-CI-01,JUZGADO DE PAZ LETRADO - FLORIDA,AMAZONAS,LOLO JARA ESTELA,DAVILA HORNA HERMES WILMAN,10/01/2022,NO CONTENCIOSO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,SUCESION INTESTADA,ARCHIVO PROVISIONAL,GENERAL,,POOL ASIST. JUDICIAL,-------,SUCESION INTESTADA,[LOLO JARA ESTELA],1
3,00001-2022-0-0104-JP-CI-01,JUZGADO PAZ LETRADO - Sede Yutupis,AMAZONAS,HUAMAN CULQUI IRMA MERCEDES,ELORRIAGA CHAVEZ ROMINA PAOLA,16/03/2022,UNICO DE EJECUCION,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,OBLIGACION DE DAR SUMA DE DINERO,EN PLAZO DE IMPUGNACION,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA OBLIGACION DE DAR SUMA DE DINERO,[HUAMAN CULQUI IRMA MERCEDES],1
4,00001-2022-0-0105-JP-CI-01,JUZGADO DE PAZ LETRADO - Sede Luya,AMAZONAS,BARRERA BARDALES GUIMO ALBERTO,ANGULO CULLAMPE LIZETH DEL CARMEN,06/01/2022,NO CONTENCIOSO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,SUCESION INTESTADA,SENTENCIADO/ RESUELTO,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA DE SUCESIÓN INTESTADA,[BARRERA BARDALES GUIMO ALBERTO],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56241,00006-2022-0-3301-JR-LA-01,JUZGADO DE TRABAJO,PUENTE PIEDRA - VENTANILLA,POMA ALOSILLA NARDA KATHERINE,SALDAÑA GONZALEZ ADRIANA A.,06/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,PAGO DE BENEFICIOS SOCIALES Y/O INDEMNIZACION ...,EN TRAMITE(Pendiente),GENERAL,,POOL ASIST. JUDICIAL,-------,PAGO DE BENEFICIOS SOCIALES,[POMA ALOSILLA NARDA KATHERINE],1
56242,00007-2022-0-3301-JP-LA-01,JUZGADO DE TRABAJO,PUENTE PIEDRA - VENTANILLA,POMA ALOSILLA NARDA KATHERINE,SALDAÑA GONZALEZ ADRIANA A.,07/01/2022,ABREVIADO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,PAGO DE BENEFICIOS SOCIALES Y/O INDEMNIZACION ...,EN TRAMITE(Pendiente),GENERAL,,POOL ASIST. DE AUDIO,-------,PAGO DE BENEFICIOS LABORALES,[POMA ALOSILLA NARDA KATHERINE],1
56243,00007-2022-0-3301-JR-LA-01,JUZGADO DE TRABAJO,PUENTE PIEDRA - VENTANILLA,POMA ALOSILLA NARDA KATHERINE,SALDAÑA GONZALEZ ADRIANA A.,07/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,DESNATURALIZACIÓN DE CONTRATO,EN TRAMITE(Pendiente),GENERAL,,ESPECIALISTA,-------,INTERPONGO DEMANDA POR DESNATURALIZACION DE CO...,[POMA ALOSILLA NARDA KATHERINE],1
56244,00008-2022-0-3301-JR-LA-01,JUZGADO DE TRABAJO,PUENTE PIEDRA - VENTANILLA,POMA ALOSILLA NARDA KATHERINE,SALDAÑA GONZALEZ ADRIANA A.,11/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,INCUMPLIMIENTO DE DISPOSICIONES Y NORMAS LABOR...,EN TRAMITE(Pendiente),GENERAL,,POOL ASIST. DE AUDIO,-------,DEMANDA SOBRE INCUMPLIMIENTO,[POMA ALOSILLA NARDA KATHERINE],1


### 2.5. Fuzzy merge with Lab Experiment Data

Keeping for the fuzzy merge only obs that can't be matched directly

In [52]:
judge_names_only = judge_names["juez"].reset_index()
judge_names_only = judge_names_only.drop_duplicates(subset=["juez"])

In [53]:
judge_names_only

Unnamed: 0,index,juez
0,0,PISCOYA SOSA ALDO FRANCISCO
1,1,VEGA BOCANEGRA BETO
2,2,LOLO JARA ESTELA
3,3,HUAMAN CULQUI IRMA MERCEDES
4,4,BARRERA BARDALES GUIMO ALBERTO
...,...,...
53490,56140,YOLANDA PETRONILA CAMPOS SOTELO
53493,56143,FLORES CALDERON PAMELA DESIRE
53496,56146,KATHERINE LA ROSA CASTILLO
53499,56150,MIRANDA SARMIENTO LUZ CRISTINA


In [54]:
nombre_apellido_merge = pd.merge(judge_names_only, exp_participants, left_on="juez", right_on="participant_nombre_apellido")

In [55]:
nombre_apellido_merge

Unnamed: 0,index,juez,nrodocumento,participant_nombre_apellido,participant_apellido_nombre
0,26886,LUIS ALVIN QUISPE SANCHEZ,41929968,LUIS ALVIN QUISPE SANCHEZ,QUISPE SANCHEZ LUIS ALVIN
1,45112,AMPARO BEATRIZ RODRIGUEZ CASTILLO,18090278,AMPARO BEATRIZ RODRIGUEZ CASTILLO,RODRIGUEZ CASTILLO AMPARO BEATRIZ
2,47010,FELIX ENRIQUE RAMIREZ SANCHEZ,18135223,FELIX ENRIQUE RAMIREZ SANCHEZ,RAMIREZ SANCHEZ FELIX ENRIQUE
3,52934,JUAN MANUEL FLORES SANCHEZ,1325221,JUAN MANUEL FLORES SANCHEZ,FLORES SANCHEZ JUAN MANUEL


In [56]:
nombre_apellido_merge = nombre_apellido_merge.drop_duplicates(subset=["juez"])

In [57]:
nombre_apellido_merge

Unnamed: 0,index,juez,nrodocumento,participant_nombre_apellido,participant_apellido_nombre
0,26886,LUIS ALVIN QUISPE SANCHEZ,41929968,LUIS ALVIN QUISPE SANCHEZ,QUISPE SANCHEZ LUIS ALVIN
1,45112,AMPARO BEATRIZ RODRIGUEZ CASTILLO,18090278,AMPARO BEATRIZ RODRIGUEZ CASTILLO,RODRIGUEZ CASTILLO AMPARO BEATRIZ
2,47010,FELIX ENRIQUE RAMIREZ SANCHEZ,18135223,FELIX ENRIQUE RAMIREZ SANCHEZ,RAMIREZ SANCHEZ FELIX ENRIQUE
3,52934,JUAN MANUEL FLORES SANCHEZ,1325221,JUAN MANUEL FLORES SANCHEZ,FLORES SANCHEZ JUAN MANUEL


In [58]:
judge_names_only_cleaned = judge_names_only[~judge_names_only["juez"].isin(nombre_apellido_merge["juez"])]

In [59]:
judge_names_only_cleaned

Unnamed: 0,index,juez
0,0,PISCOYA SOSA ALDO FRANCISCO
1,1,VEGA BOCANEGRA BETO
2,2,LOLO JARA ESTELA
3,3,HUAMAN CULQUI IRMA MERCEDES
4,4,BARRERA BARDALES GUIMO ALBERTO
...,...,...
53490,56140,YOLANDA PETRONILA CAMPOS SOTELO
53493,56143,FLORES CALDERON PAMELA DESIRE
53496,56146,KATHERINE LA ROSA CASTILLO
53499,56150,MIRANDA SARMIENTO LUZ CRISTINA


In [60]:
apellido_nombre_merge = pd.merge(judge_names_only, exp_participants, left_on="juez", right_on="participant_apellido_nombre")

In [61]:
apellido_nombre_merge

Unnamed: 0,index,juez,nrodocumento,participant_nombre_apellido,participant_apellido_nombre
0,10,CHIGNE MOZOMBITE MARIA DEL PILAR,41371819,MARIA DEL PILAR CHIGNE MOZOMBITE,CHIGNE MOZOMBITE MARIA DEL PILAR
1,684,OCAMPO VARGAS CLODOMIRA,18138792,CLODOMIRA OCAMPO VARGAS,OCAMPO VARGAS CLODOMIRA
2,3587,DIAZ ALVAN ROBERTO CARLOS,41359917,ROBERTO CARLOS DIAZ ALVAN,DIAZ ALVAN ROBERTO CARLOS
3,5507,BACILIO SALAZAR EMMA CONSUELO,7299339,EMMA CONSUELO BACILIO SALAZAR,BACILIO SALAZAR EMMA CONSUELO
4,5704,ROJAS OBREGON FIORELLA MAGALI,42897314,FIORELLA MAGALI ROJAS OBREGON,ROJAS OBREGON FIORELLA MAGALI
5,6678,GUTIERREZ GALVEZ ZONIA VIRGINIA,28260854,ZONIA VIRGINIA GUTIERREZ GALVEZ,GUTIERREZ GALVEZ ZONIA VIRGINIA
6,12174,CARY CHOQUE CARLOS ALVARO,29224520,CARLOS ALVARO CARY CHOQUE,CARY CHOQUE CARLOS ALVARO
7,12203,CALLE VERA OSCAR FRANCIS,43188285,OSCAR FRANCIS CALLE VERA,CALLE VERA OSCAR FRANCIS
8,12879,BELLIDO ANGULO MARIA SOLEDAD,29424173,MARIA SOLEDAD BELLIDO ANGULO,BELLIDO ANGULO MARIA SOLEDAD
9,13744,YUCA HUARACCALLO FELIPE FEDERICO,29580121,FELIPE FEDERICO YUCA HUARACCALLO,YUCA HUARACCALLO FELIPE FEDERICO


In [62]:
apellido_nombre_merge = apellido_nombre_merge.drop_duplicates(subset=["juez"])

In [63]:
apellido_nombre_merge

Unnamed: 0,index,juez,nrodocumento,participant_nombre_apellido,participant_apellido_nombre
0,10,CHIGNE MOZOMBITE MARIA DEL PILAR,41371819,MARIA DEL PILAR CHIGNE MOZOMBITE,CHIGNE MOZOMBITE MARIA DEL PILAR
1,684,OCAMPO VARGAS CLODOMIRA,18138792,CLODOMIRA OCAMPO VARGAS,OCAMPO VARGAS CLODOMIRA
2,3587,DIAZ ALVAN ROBERTO CARLOS,41359917,ROBERTO CARLOS DIAZ ALVAN,DIAZ ALVAN ROBERTO CARLOS
3,5507,BACILIO SALAZAR EMMA CONSUELO,7299339,EMMA CONSUELO BACILIO SALAZAR,BACILIO SALAZAR EMMA CONSUELO
4,5704,ROJAS OBREGON FIORELLA MAGALI,42897314,FIORELLA MAGALI ROJAS OBREGON,ROJAS OBREGON FIORELLA MAGALI
5,6678,GUTIERREZ GALVEZ ZONIA VIRGINIA,28260854,ZONIA VIRGINIA GUTIERREZ GALVEZ,GUTIERREZ GALVEZ ZONIA VIRGINIA
6,12174,CARY CHOQUE CARLOS ALVARO,29224520,CARLOS ALVARO CARY CHOQUE,CARY CHOQUE CARLOS ALVARO
7,12203,CALLE VERA OSCAR FRANCIS,43188285,OSCAR FRANCIS CALLE VERA,CALLE VERA OSCAR FRANCIS
8,12879,BELLIDO ANGULO MARIA SOLEDAD,29424173,MARIA SOLEDAD BELLIDO ANGULO,BELLIDO ANGULO MARIA SOLEDAD
9,13744,YUCA HUARACCALLO FELIPE FEDERICO,29580121,FELIPE FEDERICO YUCA HUARACCALLO,YUCA HUARACCALLO FELIPE FEDERICO


In [64]:
judge_names_only_cleaned_2 = judge_names_only[~judge_names_only["juez"].isin(apellido_nombre_merge["juez"])]

In [65]:
judge_names_only_cleaned_2

Unnamed: 0,index,juez
0,0,PISCOYA SOSA ALDO FRANCISCO
1,1,VEGA BOCANEGRA BETO
2,2,LOLO JARA ESTELA
3,3,HUAMAN CULQUI IRMA MERCEDES
4,4,BARRERA BARDALES GUIMO ALBERTO
...,...,...
53490,56140,YOLANDA PETRONILA CAMPOS SOTELO
53493,56143,FLORES CALDERON PAMELA DESIRE
53496,56146,KATHERINE LA ROSA CASTILLO
53499,56150,MIRANDA SARMIENTO LUZ CRISTINA


### 2.5.0. Keeping unmatched experiment participants

In [66]:
# Resultado 1:

result_1 = pd.merge(exp_participants, nombre_apellido_merge, on="participant_nombre_apellido", indicator=True, how="outer")

In [67]:
result_1 = result_1[result_1["_merge"] == "left_only"].rename(columns = 
                                                   {"nrodocumento_x": "DNI"})[["DNI","participant_nombre_apellido"]]

#This keeps information only from the exp_participant dataframe

In [68]:
result_1 = pd.merge(result_1, lab_data[["ApellidoPaterno", "ApellidoMaterno", "Nombres", "nrodocumento"]], left_on="DNI", right_on="nrodocumento")

In [69]:
result_1 = result_1.drop(columns = "nrodocumento")

In [70]:
result_1

Unnamed: 0,DNI,participant_nombre_apellido,ApellidoPaterno,ApellidoMaterno,Nombres
0,40070874,JUAN ANTONIO ROSAS CASTAÑEDA,ROSAS,CASTAÑEDA,JUAN ANTONIO
1,4069691,ELVIRA LAURA HUAMAN PORTAL,HUAMAN,PORTAL,ELVIRA LAURA
2,8254850,ELARD FERNANDO ZAVALAGA VARGAS,ZAVALAGA,VARGAS,ELARD FERNANDO
3,42135048,GLORIA LUCILA LAIZA ESPINOZA,LAIZA,ESPINOZA,GLORIA LUCILA
4,21462609,BENJAMIN ISRAEL MORON DOMINGUEZ,MORON,DOMINGUEZ,BENJAMIN ISRAEL
...,...,...,...,...,...
216,17435860,CARMEN JULIA PALMER OLIDEN,PALMER,OLIDEN,CARMEN JULIA
217,26704681,TATIANOVA ABANTO TAFUR,ABANTO,TAFUR,TATIANOVA
218,10019679,GIANNY ELEISER MORALES FERNANDEZ,MORALES,FERNANDEZ,GIANNY ELEISER
219,41332693,NORA ELIZABETH LLANCA VARA,LLANCA,VARA,NORA ELIZABETH


In [71]:
result_2 = pd.merge(exp_participants, apellido_nombre_merge, on="participant_apellido_nombre", indicator=True, how="outer")

result_2

Unnamed: 0,nrodocumento_x,participant_nombre_apellido_x,participant_apellido_nombre,index,juez,nrodocumento_y,participant_nombre_apellido_y,_merge
0,40070874,JUAN ANTONIO ROSAS CASTAÑEDA,ROSAS CASTAÑEDA JUAN ANTONIO,,,,,left_only
1,4069691,ELVIRA LAURA HUAMAN PORTAL,HUAMAN PORTAL ELVIRA LAURA,,,,,left_only
2,8254850,ELARD FERNANDO ZAVALAGA VARGAS,ZAVALAGA VARGAS ELARD FERNANDO,,,,,left_only
3,42135048,GLORIA LUCILA LAIZA ESPINOZA,LAIZA ESPINOZA GLORIA LUCILA,,,,,left_only
4,21462609,BENJAMIN ISRAEL MORON DOMINGUEZ,MORON DOMINGUEZ BENJAMIN ISRAEL,,,,,left_only
...,...,...,...,...,...,...,...,...
220,17435860,CARMEN JULIA PALMER OLIDEN,PALMER OLIDEN CARMEN JULIA,,,,,left_only
221,26704681,TATIANOVA ABANTO TAFUR,ABANTO TAFUR TATIANOVA,,,,,left_only
222,10019679,GIANNY ELEISER MORALES FERNANDEZ,MORALES FERNANDEZ GIANNY ELEISER,,,,,left_only
223,41332693,NORA ELIZABETH LLANCA VARA,LLANCA VARA NORA ELIZABETH,,,,,left_only


In [72]:
result_2 = pd.merge(result_1, apellido_nombre_merge, left_on="DNI", right_on = "nrodocumento", indicator=True, how="outer")

In [73]:
result_2 = result_2[result_2["_merge"] == "left_only"]

In [74]:
result_2

Unnamed: 0,DNI,participant_nombre_apellido_x,ApellidoPaterno,ApellidoMaterno,Nombres,index,juez,nrodocumento,participant_nombre_apellido_y,participant_apellido_nombre,_merge
0,40070874,JUAN ANTONIO ROSAS CASTAÑEDA,ROSAS,CASTAÑEDA,JUAN ANTONIO,,,,,,left_only
1,4069691,ELVIRA LAURA HUAMAN PORTAL,HUAMAN,PORTAL,ELVIRA LAURA,,,,,,left_only
2,8254850,ELARD FERNANDO ZAVALAGA VARGAS,ZAVALAGA,VARGAS,ELARD FERNANDO,,,,,,left_only
3,42135048,GLORIA LUCILA LAIZA ESPINOZA,LAIZA,ESPINOZA,GLORIA LUCILA,,,,,,left_only
4,21462609,BENJAMIN ISRAEL MORON DOMINGUEZ,MORON,DOMINGUEZ,BENJAMIN ISRAEL,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...
215,7242034,CARMEN LEONOR BARRERA UTANO,BARRERA,UTANO,CARMEN LEONOR,,,,,,left_only
216,17435860,CARMEN JULIA PALMER OLIDEN,PALMER,OLIDEN,CARMEN JULIA,,,,,,left_only
217,26704681,TATIANOVA ABANTO TAFUR,ABANTO,TAFUR,TATIANOVA,,,,,,left_only
218,10019679,GIANNY ELEISER MORALES FERNANDEZ,MORALES,FERNANDEZ,GIANNY ELEISER,,,,,,left_only


In [75]:
result_2.to_excel(f'{dc_temp_path}/unmatched_judges.xlsx', index=False)

In [76]:
len(apellido_nombre_merge)

47

In [77]:
len(nombre_apellido_merge)

4

In [78]:
judge_names_only_cleaned = judge_names_only_cleaned[~judge_names_only_cleaned["juez"].isin(apellido_nombre_merge["juez"])]

In [79]:
exp_participants_cleaned = exp_participants[~exp_participants["participant_nombre_apellido"].isin(
                                            nombre_apellido_merge["juez"])]

In [80]:
exp_participants_cleaned = exp_participants_cleaned[~exp_participants_cleaned["participant_apellido_nombre"].isin(
                                            apellido_nombre_merge["juez"])]

In [81]:
multiple_judge_names["juez_1"].nunique()

55

In [82]:
multiple_judge_names

Unnamed: 0,expediente_n°_,organo_jurisdiccional_,distrito_judicial_,juez_,especialista_legal_,fecha_de_inicio_,proceso_,observacion_,especialidad_,materia_s_,...,motivo_conclusion_,sumilla_,juez_splitted,n_judges_case,juez_1,juez_2,juez_3,juez_4,juez_5,juez_6
2540,00001-2022-0-0102-JR-LA-01,SALA CIVIL - SEDE UTCUBAMBA,AMAZONAS,ARTEAGA RAMIREZ FLORMIRA. VIGIL CURO LUZ CAROL...,PILCO MASLUCAN MERCEDES,12/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,REINTEGRO DE BENEFICIOS SOCIALES,...,-------,DEMANDA REINTEGRO DE BENEFICIOS SOCIALES,"[ARTEAGA RAMIREZ FLORMIRA, VIGIL CURO LUZ CAR...",3,ARTEAGA RAMIREZ FLORMIRA,VIGIL CURO LUZ CAROLINA,MOROCHO NUÑEZ GELNER,,,
2548,00101-2022-0-0102-JR-LA-01,SALA CIVIL - SEDE UTCUBAMBA,AMAZONAS,ARTEAGA RAMIREZ FLORMIRA. VIGIL CURO LUZ CAROL...,PILCO MASLUCAN MERCEDES,22/07/2022,ESPECIAL,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,ACCION CONTENCIOSA ADMINISTRATIVA,...,-------,DEMANDA CONTENCIOSO ADMINISTRATIVO DE NULIDAD ...,"[ARTEAGA RAMIREZ FLORMIRA, VIGIL CURO LUZ CAR...",3,ARTEAGA RAMIREZ FLORMIRA,VIGIL CURO LUZ CAROLINA,MOROCHO NUÑEZ GELNER,,,
2565,00106-2022-95-0107-JR-LA-01,SALA CIVIL - SEDE UTCUBAMBA,AMAZONAS,ARTEAGA RAMIREZ FLORMIRA. CRISPIN QUISPE ALEJA...,PILCO MASLUCAN MERCEDES,28/10/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,RECONOCIMIENTO DE LOS DERECHOS COMPRENDIDOS EN...,...,-------,INTERPONGO MEDIDA CAUTELAR GENERICA,"[ARTEAGA RAMIREZ FLORMIRA, CRISPIN QUISPE ALE...",4,ARTEAGA RAMIREZ FLORMIRA,CRISPIN QUISPE ALEJANDRO,MOROCHO NUÑEZ GELNER,VIGIL CURO LUZ CAROLINA,,
2603,00117-2022-0-0101-JR-LA-01,SALA CIVIL - SEDE CENTRAL,AMAZONAS,TAFUR GUPIOC ESPERANZA. MARTINEZ CHASQUERO EDD...,VILCARROMERO CULQUI MARIA DOLORES,17/03/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,ACCION CONTENCIOSA ADMINISTRATIVA,...,-------,DEMANDA NULIDAD DE RESOLUCIONES ADMINISTRATIVA...,"[TAFUR GUPIOC ESPERANZA, MARTINEZ CHASQUERO E...",3,TAFUR GUPIOC ESPERANZA,MARTINEZ CHASQUERO EDDY,TORREJÓN RENGIFO LUIS,,,
2619,00121-2022-0-0102-JR-LA-01,SALA CIVIL - SEDE UTCUBAMBA,AMAZONAS,ARTEAGA RAMIREZ FLORMIRA. CRISPIN QUISPE ALEJA...,PILCO MASLUCAN MERCEDES,01/09/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,ACCION CONTENCIOSA ADMINISTRATIVA,...,-------,DEMANDA ACCION CONTENCIOSA ADMINISTRATIVA,"[ARTEAGA RAMIREZ FLORMIRA, CRISPIN QUISPE ALE...",4,ARTEAGA RAMIREZ FLORMIRA,CRISPIN QUISPE ALEJANDRO,MOROCHO NUÑEZ GELNER,VIGIL CURO LUZ CAROLINA,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55968,00003-2022-0-2402-JR-LA-02,SALA LABORAL,UCAYALI,CRUZ COBEÑAS MARLENY. BARBARAN RIOS ASELA ISAB...,SUELLEN Y. POPOLIZIO PANDURO,04/02/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,DESNATURALIZACIÓN DE CONTRATO,...,-------,INTERPONE DEMANDA DE DESNATURALIZACION DE CONT...,"[CRUZ COBEÑAS MARLENY, BARBARAN RIOS ASELA IS...",3,CRUZ COBEÑAS MARLENY,BARBARAN RIOS ASELA ISABEL,ELIANA TUESTA OYARCE DE CACERES,,,
56220,00012-2022-0-3301-JR-LA-01,SALA LABORAL,PUENTE PIEDRA - VENTANILLA,CAMPOS MURILLO WALTER EDUARDO. ZUÑIGA HERRERA ...,HINOJOSA GRANADOS LAURA ESTHEFANY,14/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,DESNATURALIZACIÓN DE CONTRATO,...,-------,RECONOCIMIENTO DE VINCULO LABORAL BAJO LOS ALC...,"[CAMPOS MURILLO WALTER EDUARDO, ZUÑIGA HERRER...",3,CAMPOS MURILLO WALTER EDUARDO,ZUÑIGA HERRERA ELICEA INES,CARRASCO ALVAREZ BRIZALINA,,,
56227,00018-2022-0-3301-JR-LA-01,SALA LABORAL,PUENTE PIEDRA - VENTANILLA,CAMPOS MURILLO WALTER EDUARDO. ZUÑIGA HERRERA ...,HINOJOSA GRANADOS LAURA ESTHEFANY,18/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,DESNATURALIZACIÓN DE CONTRATO,...,-------,DEMANDA SOBRE INCUMPLIMIENTO,"[CAMPOS MURILLO WALTER EDUARDO, ZUÑIGA HERRER...",3,CAMPOS MURILLO WALTER EDUARDO,ZUÑIGA HERRERA ELICEA INES,CARRASCO ALVAREZ BRIZALINA,,,
56232,00022-2022-0-3301-JR-LA-01,SALA LABORAL,PUENTE PIEDRA - VENTANILLA,CAMPOS MURILLO WALTER EDUARDO. ZUÑIGA HERRERA ...,HINOJOSA GRANADOS LAURA ESTHEFANY,22/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,INCUMPLIMIENTO DE DISPOSICIONES Y NORMAS LABOR...,...,-------,DESNATURALIZACION DEL CONTRATO DE TRABAJO Y OTRO,"[CAMPOS MURILLO WALTER EDUARDO, ZUÑIGA HERRER...",3,CAMPOS MURILLO WALTER EDUARDO,ZUÑIGA HERRERA ELICEA INES,CARRASCO ALVAREZ BRIZALINA,,,


In [83]:
multiple_judge_names["juez_2"].nunique()

67

In [84]:
multiple_judge_names["juez_3"].nunique()

68

In [85]:
multiple_judge_names["juez_4"].nunique()

19

In [86]:
multiple_judge_names["juez_5"].nunique()

4

In [87]:
multiple_judge_names["juez_6"].nunique()

3

**Fuzzy match of cases: Cases with 1 judge**

In [88]:
matched_judge_name1 = d6tjoin.top1.MergeTop1(judge_names, exp_participants, fuzzy_left_on=["juez"], 
                       fuzzy_right_on=["participant_apellido_nombre"]).merge()["top1"]['juez']


# d6tjoin.top1.MergeTop1 es el comando para 

  df_candidates = df_candidates_exact.append(df_candidates_fuzzy, ignore_index=True)


In [95]:
matched_judge_name1 = matched_judge_name1.rename(columns={'__top1left__': 'juez', '__top1right__': 'participant_apellido_nombre'})

matched_judge_name1

Unnamed: 0,juez,participant_apellido_nombre,__matchtype__,__top1diff__
49833,<NO DEFINIDO>,SANTOS PEREZ CARLOS,top1 left,14.0
49918,<NO DEFINIDO>,TELLO PONCE MARLO,top1 left,14.0
130113,ADAUTO PIMENTEL MIRIAN YESSICA,MALQUI MOSCOSO MIRIAM LUISA,top1 left,19.0
130186,ADAUTO PIMENTEL MIRIAN YESSICA,GALLEGOS CANDELA MARIA ESTHER,top1 left,19.0
153079,AGREDA GAITAN KATIA SOLEDAD,BELLIDO ANGULO MARIA SOLEDAD,top1 left,14.0
...,...,...,...,...
160740,ZUÑIGA PORTOCARRERO LINO,ZUÑIGA PORTOCARRERO LINO YSAURO,top1 left,7.0
86944,ZUÑIGA VALLEJOS JULIO CESAR,LEYVA PEREZ JULIO CESAR,top1 left,12.0
135071,ÑAUPA CHACALTANA CARLOS JAVIER,CAYLLAHUA PEÑA MAXIMO JAVIER,top1 left,17.0
135216,ÑAUPA CHACALTANA CARLOS JAVIER,CARY CHOQUE CARLOS ALVARO,top1 left,17.0


In [96]:
# fuzzy_1_judge = matched_judge_name1[matched_judge_name1["__top1diff__"] <= 7.1].copy()
fuzzy_1_judge = matched_judge_name1[(matched_judge_name1["__top1diff__"] >= 0.1) & (matched_judge_name1["__top1diff__"] <= 7.1)].copy()
fuzzy_1_judge


Unnamed: 0,juez,participant_apellido_nombre,__matchtype__,__top1diff__
160740,ZUÑIGA PORTOCARRERO LINO,ZUÑIGA PORTOCARRERO LINO YSAURO,top1 left,7.0


In [90]:
type(matched_judge_name1)

pandas.core.frame.DataFrame

In [98]:
matched_judge_name2 = d6tjoin.top1.MergeTop1(judge_names, exp_participants, fuzzy_left_on=["juez"], 
                       fuzzy_right_on=["participant_nombre_apellido"]).merge()["top1"]['juez']

  df_candidates = df_candidates_exact.append(df_candidates_fuzzy, ignore_index=True)


In [99]:
matched_judge_name2 = matched_judge_name2.rename(columns={'__top1left__': 'juez', '__top1right__': 'participant_nombre_apellido'})
matched_judge_name2

Unnamed: 0,juez,participant_nombre_apellido,__matchtype__,__top1diff__
52238,<NO DEFINIDO>,MARLO TELLO PONCE,top1 left,13.0
137842,ADAUTO PIMENTEL MIRIAN YESSICA,TANIA LIZETT MEDINA SAYRA,top1 left,19.0
137880,ADAUTO PIMENTEL MIRIAN YESSICA,FAUSTINO FIDEL MINAYA HERRERA,top1 left,19.0
161903,AGREDA GAITAN KATIA SOLEDAD,CARMEN LEIVA CASTAÑEDA,top1 left,17.0
161909,AGREDA GAITAN KATIA SOLEDAD,ALEX NILTON CACHI ROJAS,top1 left,17.0
...,...,...,...,...
142476,ÑAUPA CHACALTANA CARLOS JAVIER,ROSA ISABEL VARGAS PEREZ,top1 left,20.0
142528,ÑAUPA CHACALTANA CARLOS JAVIER,TATIANOVA ABANTO TAFUR,top1 left,20.0
142563,ÑAUPA CHACALTANA CARLOS JAVIER,KAREN JACQUELINE ALEJOS JAQUI,top1 left,20.0
142616,ÑAUPA CHACALTANA CARLOS JAVIER,MARCO ANTONIO CELIS VASQUEZ,top1 left,20.0


In [100]:
fuzzy_2_judge = matched_judge_name2[(matched_judge_name2["__top1diff__"] >= 0.1) & (matched_judge_name2["__top1diff__"] <= 7.1)].copy()
fuzzy_2_judge

Unnamed: 0,juez,participant_nombre_apellido,__matchtype__,__top1diff__


**Fuzzy match of cases: Cases with 1+ judge**

Preparing `multiple_judge_names["juez_1"]` for first judge only

In [102]:
matched_judge_name3 = d6tjoin.top1.MergeTop1(multiple_judge_names, exp_participants, fuzzy_left_on=["juez_1"], 
                       fuzzy_right_on=["participant_apellido_nombre"]).merge()["top1"]['juez_1']

  df_candidates = df_candidates_exact.append(df_candidates_fuzzy, ignore_index=True)


In [103]:
matched_judge_name3 = matched_judge_name3.rename(columns={'__top1left__': 'juez', '__top1right__': 'participant_apellido_nombre'})

matched_judge_name3

Unnamed: 0,juez,participant_apellido_nombre,__matchtype__,__top1diff__
125,,AMAYA PAZO ROXANA,top1 left,17.0
150,,TELLO PONCE MARLO,top1 left,17.0
4023,AGUADO SEMINO ALFREDO ALBERTO,ANTICONA LUJAN CARLOS ALBERTO,top1 left,17.0
1281,ALEJANDRO PAUCAR FELIX,CALLE VERA OSCAR FRANCIS,top1 left,15.0
1287,ALEJANDRO PAUCAR FELIX,FERNANDEZ SANCHEZ FREDY,top1 left,15.0
...,...,...,...,...
7334,VIVANCO HERRERA HENRRY GERMAN,VIVANCO HUAMAN GLORIA TERESA,top1 left,15.0
7402,VIVANCO HERRERA HENRRY GERMAN,GALVEZ HERRERA HERIBERTO,top1 left,15.0
11120,YOLANDA B\,ROJAS ZEPEDA ALICIA,top1 left,14.0
11150,YOLANDA B\,AMAYA PAZO ROXANA,top1 left,14.0


In [104]:
fuzzy_3_judge = matched_judge_name3[(matched_judge_name3["__top1diff__"] >= 0.1) & (matched_judge_name3["__top1diff__"] <= 7.1)].copy()
fuzzy_3_judge

Unnamed: 0,juez,participant_apellido_nombre,__matchtype__,__top1diff__


In [105]:
matched_judge_name4 = d6tjoin.top1.MergeTop1(multiple_judge_names, exp_participants, fuzzy_left_on=["juez_1"], 
                       fuzzy_right_on=["participant_nombre_apellido"]).merge()["top1"]['juez_1']

  df_candidates = df_candidates_exact.append(df_candidates_fuzzy, ignore_index=True)


In [106]:
matched_judge_name4 = matched_judge_name4.rename(columns={'__top1left__': 'juez', '__top1right__': 'participant_nombre_apellido'})

matched_judge_name4

Unnamed: 0,juez,participant_nombre_apellido,__matchtype__,__top1diff__
17,,ROXANA AMAYA PAZO,top1 left,17.0
36,,MARLO TELLO PONCE,top1 left,17.0
3864,AGUADO SEMINO ALFREDO ALBERTO,ARTEMIO ORE FLORES,top1 left,18.0
3988,AGUADO SEMINO ALFREDO ALBERTO,ALBERTO RAMIRO CRUZADO ALIAGA,top1 left,18.0
1186,ALEJANDRO PAUCAR FELIX,SUSAN CAMARA TELLO,top1 left,14.0
...,...,...,...,...
10183,VALENCIA BARRIENTOS FAUSTINO,ALICIA VARGAS LEON,top1 left,17.0
10218,VALENCIA BARRIENTOS FAUSTINO,CRISTHIAN BARRANTES DIAZ,top1 left,17.0
10423,VEGA RODRIGUEZ LUIS ALBERTO,PEDRO RUBEN CHIRA TELLO,top1 left,17.0
7598,VIVANCO HERRERA HENRRY GERMAN,MARCO ANTONIO HERRERA GUZMAN,top1 left,16.0


In [107]:
fuzzy_4_judge = matched_judge_name4[(matched_judge_name4["__top1diff__"] >= 0.1) & (matched_judge_name4["__top1diff__"] <= 7.1)].copy()
fuzzy_4_judge

Unnamed: 0,juez,participant_nombre_apellido,__matchtype__,__top1diff__
8838,JULISSA ASEIJAS SILVA,JULISSA ISABEL ASEIJAS SILVA,top1 left,7.0


Preparing `multiple_judge_names["juez_2"]` for second judge only

In [109]:
matched_judge_name5 = d6tjoin.top1.MergeTop1(multiple_judge_names, exp_participants, fuzzy_left_on=["juez_2"], 
                       fuzzy_right_on=["participant_apellido_nombre"]).merge()["top1"]['juez_2']

  df_candidates = df_candidates_exact.append(df_candidates_fuzzy, ignore_index=True)


In [110]:
matched_judge_name5 = matched_judge_name5.rename(columns={'__top1left__': 'juez', '__top1right__': 'participant_apellido_nombre'})

matched_judge_name5

Unnamed: 0,juez,participant_apellido_nombre,__matchtype__,__top1diff__
121,,AMAYA PAZO ROXANA,top1 left,17.0
146,,TELLO PONCE MARLO,top1 left,17.0
7746,IRRAZABAL NUÑEZ WILSON,CACHI ROJAS ALEX NILTON,top1 left,16.0
13777,SULCA QUISPE MARIO ELOY,MEDRANO QUISPE PABLO CESAR,top1 left,14.0
8003,URBANO MENACHO ALEXANDER A,MEDRANO ALIAGA ANA PAULA,top1 left,18.0
...,...,...,...,...
5598,VIVANCO HERRERA HENRRY GERMAN,GALVEZ HERRERA HERIBERTO,top1 left,15.0
9630,ZAMALLOA FLORES,CAMARA TELLO SUSAN,top1 left,13.0
9647,ZAMALLOA FLORES,VARGAS LEON ALICIA,top1 left,13.0
11997,ZUÑIGA HERRERA ELICEA INES,CEPIDA GUERRERO IVAN,top1 left,16.0


In [111]:
fuzzy_5_judge = matched_judge_name5[(matched_judge_name5["__top1diff__"] >= 0.1) & (matched_judge_name5["__top1diff__"] <= 7.1)].copy()
fuzzy_5_judge

Unnamed: 0,juez,participant_apellido_nombre,__matchtype__,__top1diff__
632,HERRERA GUZMAN MARCO ANTONIO,HERRERA GUZMAN MARCO ANTONIO,top1 left,1.0


In [112]:
matched_judge_name6 = d6tjoin.top1.MergeTop1(multiple_judge_names, exp_participants, fuzzy_left_on=["juez_2"], 
                       fuzzy_right_on=["participant_nombre_apellido"]).merge()["top1"]['juez_2']

  df_candidates = df_candidates_exact.append(df_candidates_fuzzy, ignore_index=True)


In [113]:
matched_judge_name6 = matched_judge_name6.rename(columns={'__top1left__': 'juez', '__top1right__': 'participant_nombre_apellido'})

matched_judge_name6

Unnamed: 0,juez,participant_nombre_apellido,__matchtype__,__top1diff__
15,,ROXANA AMAYA PAZO,top1 left,17.0
34,,MARLO TELLO PONCE,top1 left,17.0
7706,IRRAZABAL NUÑEZ WILSON,ALICIA VARGAS LEON,top1 left,17.0
13861,SULCA QUISPE MARIO ELOY,SRI QUISPE PACHECO,top1 left,13.0
7960,URBANO MENACHO ALEXANDER A,JULIO CESAR LEYVA PEREZ,top1 left,18.0
...,...,...,...,...
5571,VIVANCO HERRERA HENRRY GERMAN,MARCO ANTONIO HERRERA GUZMAN,top1 left,17.0
9487,ZAMALLOA FLORES,ARTEMIO ORE FLORES,top1 left,9.0
12010,ZUÑIGA HERRERA ELICEA INES,JULIO CESAR LEYVA PEREZ,top1 left,18.0
12028,ZUÑIGA HERRERA ELICEA INES,JUSTO VERA PAREDES,top1 left,18.0


In [114]:
fuzzy_6_judge = matched_judge_name6[(matched_judge_name6["__top1diff__"] >= 0.1) & (matched_judge_name6["__top1diff__"] <= 7.1)].copy()
fuzzy_6_judge

Unnamed: 0,juez,participant_nombre_apellido,__matchtype__,__top1diff__
14960,ESAU CHANCO CASTILLON,ESAU CHANCO CASTILLON,top1 left,1.0
3910,JULIO CESAR LEYVA PEREZ,JULIO CESAR LEYVA PEREZ,top1 left,1.0


Preparing `multiple_judge_names["juez_3"]` for third judge only

In [115]:
matched_judge_name7 = d6tjoin.top1.MergeTop1(multiple_judge_names, exp_participants, fuzzy_left_on=["juez_3"], 
                       fuzzy_right_on=["participant_apellido_nombre"]).merge()["top1"]['juez_3']

  df_candidates = df_candidates_exact.append(df_candidates_fuzzy, ignore_index=True)


In [116]:
matched_judge_name7 = matched_judge_name7.rename(columns={'__top1left__': 'juez', '__top1right__': 'participant_apellido_nombre'})

matched_judge_name7

Unnamed: 0,juez,participant_apellido_nombre,__matchtype__,__top1diff__
121,,AMAYA PAZO ROXANA,top1 left,17.0
146,,TELLO PONCE MARLO,top1 left,17.0
7674,BARBOZA LUDEÑA MAXIMO SAUL,CAYLLAHUA PEÑA MAXIMO JAVIER,top1 left,16.0
644,AGUADO SEMINO ALFREDO ALBERTO,ANTICONA LUJAN CARLOS ALBERTO,top1 left,18.0
5911,ANGEL GUTIERREZ VALDIVIEZO,SANTOS PEREZ CARLOS,top1 left,18.0
...,...,...,...,...
3974,VENTURA PADILLA WILLIAMS,VICUÑA ZAMORA JESUS,top1 left,17.0
6136,VIGIL CURO LUZ CAROLINA,SANTOS PEREZ CARLOS,top1 left,16.0
6149,VIGIL CURO LUZ CAROLINA,RIVAS VARGAS SARITA,top1 left,16.0
6206,VIGIL CURO LUZ CAROLINA,NOLE LUPU JESUS MARTIN,top1 left,16.0


In [117]:
fuzzy_7_judge = matched_judge_name7[(matched_judge_name7["__top1diff__"] >= 0.1) & (matched_judge_name7["__top1diff__"] <= 7.1)].copy()
fuzzy_7_judge

Unnamed: 0,juez,participant_apellido_nombre,__matchtype__,__top1diff__
3597,CARDENAS ALVARADO BORIS FAUSTO,CARDENAS ALVARADO BORIS FAUSTO,top1 left,1.0
2819,CARHUAPOMA GRANDA EDGARD JESUS,CARHUAPOMA GRANDA EDGARD JESUS,top1 left,1.0
2120,CRUZ COBEÑAS MARLENY,CRUZ COBEÑAS MARLENY,top1 left,1.0
13917,TAPAHUASCO PALOMINO RICHARD,TAPAHUASCO PALOMINO RICHARD,top1 left,1.0


In [119]:
matched_judge_name8 = d6tjoin.top1.MergeTop1(multiple_judge_names, exp_participants, fuzzy_left_on=["juez_3"], 
                       fuzzy_right_on=["participant_nombre_apellido"]).merge()["top1"]['juez_3']

  df_candidates = df_candidates_exact.append(df_candidates_fuzzy, ignore_index=True)


In [120]:
matched_judge_name8 = matched_judge_name8.rename(columns={'__top1left__': 'juez', '__top1right__': 'participant_nombre_apellido'})

matched_judge_name8

Unnamed: 0,juez,participant_nombre_apellido,__matchtype__,__top1diff__
15,,ROXANA AMAYA PAZO,top1 left,17.0
34,,MARLO TELLO PONCE,top1 left,17.0
7788,BARBOZA LUDEÑA MAXIMO SAUL,TANIA LIZETT MEDINA SAYRA,top1 left,18.0
487,AGUADO SEMINO ALFREDO ALBERTO,ARTEMIO ORE FLORES,top1 left,19.0
611,AGUADO SEMINO ALFREDO ALBERTO,ALBERTO RAMIRO CRUZADO ALIAGA,top1 left,19.0
...,...,...,...,...
6145,VIGIL CURO LUZ CAROLINA,VICTOR CALIZAYA COILA,top1 left,16.0
9704,ZABARBURU SAAVEDRA GONZALO,NOLAM ELIAS TALAVERA ZAPANA,top1 left,19.0
9734,ZABARBURU SAAVEDRA GONZALO,SUSAN CAMARA TELLO,top1 left,19.0
9746,ZABARBURU SAAVEDRA GONZALO,PEDRO RUBEN CHIRA TELLO,top1 left,19.0


In [121]:
fuzzy_8_judge = matched_judge_name8[(matched_judge_name8["__top1diff__"] >= 0.1) & (matched_judge_name8["__top1diff__"] <= 7.1)].copy()
fuzzy_8_judge

Unnamed: 0,juez,participant_nombre_apellido,__matchtype__,__top1diff__


In [157]:
type(matched_judge_name8)

pandas.core.frame.DataFrame

In [None]:
# 1. Arreglar los fuzzies 
# 2. Arreglar el dataframe de los que si matchearon para poder crear amag_ii_cases
# 3. Coincidan las columnas de los fuzzies y de los que si matchean
# 4. Luego viene lo de files_reports y amag_ii_cases, usar la variable juez
# 5. Usar el resultado de 4 para hacer un merge con las otras tablas, y luego usar exp_nro


In [145]:
fuzzies_concat = pd.concat([fuzzy_1_judge, fuzzy_2_judge, fuzzy_3_judge, fuzzy_4_judge, fuzzy_5_judge, fuzzy_6_judge, fuzzy_7_judge, fuzzy_8_judge])

fuzzies_concat = fuzzies_concat.drop(columns=['__matchtype__', '__top1diff__'])

# dividir en dos df el fuzzies_concat

fuzzies_concat_apellido = fuzzies_concat.dropna(subset=['participant_apellido_nombre'])
fuzzies_concat_apellido = fuzzies_concat_apellido.drop(columns=['participant_nombre_apellido'])

fuzzies_concat_nombre = fuzzies_concat.dropna(subset=['participant_nombre_apellido'])
fuzzies_concat_nombre = fuzzies_concat_nombre.drop(columns=['participant_apellido_nombre'])

In [150]:
fuzzies_1 = fuzzies_concat_apellido.merge(lab_data[["nrodocumento","ApellidoPaterno", "ApellidoMaterno", "Nombres", 'participant_apellido_nombre']], on='participant_apellido_nombre', how = 'left')

fuzzies_1 = fuzzies_1.drop(columns=['participant_apellido_nombre'])
fuzzies_1
# fuzzies_concat_apellido['participant_nombre_apellido'] = fuzzies_1['participant_nombre_apellido']

Unnamed: 0,juez,nrodocumento,ApellidoPaterno,ApellidoMaterno,Nombres
0,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO
1,HERRERA GUZMAN MARCO ANTONIO,29582436,HERRERA,GUZMAN,MARCO ANTONIO
2,CARDENAS ALVARADO BORIS FAUSTO,18071270,CARDENAS,ALVARADO,BORIS FAUSTO
3,CARHUAPOMA GRANDA EDGARD JESUS,29602737,CARHUAPOMA,GRANDA,EDGARD JESUS
4,CRUZ COBEÑAS MARLENY,10407408,CRUZ,COBEÑAS,MARLENY
5,TAPAHUASCO PALOMINO RICHARD,7460242,TAPAHUASCO,PALOMINO,RICHARD


In [148]:
fuzzies_2 = fuzzies_concat_nombre.merge(lab_data[["nrodocumento","ApellidoPaterno", "ApellidoMaterno", "Nombres", 'participant_nombre_apellido']], on='participant_nombre_apellido', how = 'left')

fuzzies_2 = fuzzies_2.drop(columns=['participant_nombre_apellido'])

fuzzies_2

Unnamed: 0,juez,nrodocumento,ApellidoPaterno,ApellidoMaterno,Nombres
0,JULISSA ASEIJAS SILVA,26729323,ASEIJAS,SILVA,JULISSA ISABEL
1,ESAU CHANCO CASTILLON,20009702,CHANCO,CASTILLON,ESAU
2,JULIO CESAR LEYVA PEREZ,8062416,LEYVA,PEREZ,JULIO CESAR


In [176]:
matched_concat = pd.concat([nombre_apellido_merge, apellido_nombre_merge])
matched_concat

Unnamed: 0,index,juez,nrodocumento,participant_nombre_apellido,participant_apellido_nombre
0,26886,LUIS ALVIN QUISPE SANCHEZ,41929968,LUIS ALVIN QUISPE SANCHEZ,QUISPE SANCHEZ LUIS ALVIN
1,45112,AMPARO BEATRIZ RODRIGUEZ CASTILLO,18090278,AMPARO BEATRIZ RODRIGUEZ CASTILLO,RODRIGUEZ CASTILLO AMPARO BEATRIZ
2,47010,FELIX ENRIQUE RAMIREZ SANCHEZ,18135223,FELIX ENRIQUE RAMIREZ SANCHEZ,RAMIREZ SANCHEZ FELIX ENRIQUE
3,52934,JUAN MANUEL FLORES SANCHEZ,1325221,JUAN MANUEL FLORES SANCHEZ,FLORES SANCHEZ JUAN MANUEL
0,10,CHIGNE MOZOMBITE MARIA DEL PILAR,41371819,MARIA DEL PILAR CHIGNE MOZOMBITE,CHIGNE MOZOMBITE MARIA DEL PILAR
1,684,OCAMPO VARGAS CLODOMIRA,18138792,CLODOMIRA OCAMPO VARGAS,OCAMPO VARGAS CLODOMIRA
2,3587,DIAZ ALVAN ROBERTO CARLOS,41359917,ROBERTO CARLOS DIAZ ALVAN,DIAZ ALVAN ROBERTO CARLOS
3,5507,BACILIO SALAZAR EMMA CONSUELO,7299339,EMMA CONSUELO BACILIO SALAZAR,BACILIO SALAZAR EMMA CONSUELO
4,5704,ROJAS OBREGON FIORELLA MAGALI,42897314,FIORELLA MAGALI ROJAS OBREGON,ROJAS OBREGON FIORELLA MAGALI
5,6678,GUTIERREZ GALVEZ ZONIA VIRGINIA,28260854,ZONIA VIRGINIA GUTIERREZ GALVEZ,GUTIERREZ GALVEZ ZONIA VIRGINIA


In [161]:
lab_data[["ApellidoPaterno", "ApellidoMaterno", "Nombres", 'participant_apellido_nombre']]

Unnamed: 0,ApellidoPaterno,ApellidoMaterno,Nombres,participant_apellido_nombre
0,ROSAS,CASTAÑEDA,JUAN ANTONIO,ROSAS CASTAÑEDA JUAN ANTONIO
1,HUAMAN,PORTAL,ELVIRA LAURA,HUAMAN PORTAL ELVIRA LAURA
2,ZAVALAGA,VARGAS,ELARD FERNANDO,ZAVALAGA VARGAS ELARD FERNANDO
3,LAIZA,ESPINOZA,GLORIA LUCILA,LAIZA ESPINOZA GLORIA LUCILA
4,MORON,DOMINGUEZ,BENJAMIN ISRAEL,MORON DOMINGUEZ BENJAMIN ISRAEL
...,...,...,...,...
220,PALMER,OLIDEN,CARMEN JULIA,PALMER OLIDEN CARMEN JULIA
221,ABANTO,TAFUR,TATIANOVA,ABANTO TAFUR TATIANOVA
222,MORALES,FERNANDEZ,GIANNY ELEISER,MORALES FERNANDEZ GIANNY ELEISER
223,LLANCA,VARA,NORA ELIZABETH,LLANCA VARA NORA ELIZABETH


In [177]:
len(matched_concat)

51

In [169]:
matched_concat_c = matched_concat.merge(lab_data[["ApellidoPaterno", "ApellidoMaterno", "Nombres", 'participant_apellido_nombre']], on='participant_apellido_nombre', how = 'left')

matched_concat_c = matched_concat_c.drop(columns=['participant_nombre_apellido', 'participant_apellido_nombre', 'index'])

matched_concat_c

Unnamed: 0,juez,nrodocumento,ApellidoPaterno,ApellidoMaterno,Nombres
0,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN
1,AMPARO BEATRIZ RODRIGUEZ CASTILLO,18090278,RODRIGUEZ,CASTILLO,AMPARO BEATRIZ
2,FELIX ENRIQUE RAMIREZ SANCHEZ,18135223,RAMIREZ,SANCHEZ,FELIX ENRIQUE
3,JUAN MANUEL FLORES SANCHEZ,1325221,FLORES,SANCHEZ,JUAN MANUEL
4,CHIGNE MOZOMBITE MARIA DEL PILAR,41371819,CHIGNE,MOZOMBITE,MARIA DEL PILAR
5,OCAMPO VARGAS CLODOMIRA,18138792,OCAMPO,VARGAS,CLODOMIRA
6,DIAZ ALVAN ROBERTO CARLOS,41359917,DIAZ,ALVAN,ROBERTO CARLOS
7,BACILIO SALAZAR EMMA CONSUELO,7299339,BACILIO,SALAZAR,EMMA CONSUELO
8,ROJAS OBREGON FIORELLA MAGALI,42897314,ROJAS,OBREGON,FIORELLA MAGALI
9,GUTIERREZ GALVEZ ZONIA VIRGINIA,28260854,GUTIERREZ,GALVEZ,ZONIA VIRGINIA


In [178]:
amag_ii_cases = pd.concat([matched_concat_c, fuzzies_1, fuzzies_2])
amag_ii_cases

Unnamed: 0,juez,nrodocumento,ApellidoPaterno,ApellidoMaterno,Nombres
0,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN
1,AMPARO BEATRIZ RODRIGUEZ CASTILLO,18090278,RODRIGUEZ,CASTILLO,AMPARO BEATRIZ
2,FELIX ENRIQUE RAMIREZ SANCHEZ,18135223,RAMIREZ,SANCHEZ,FELIX ENRIQUE
3,JUAN MANUEL FLORES SANCHEZ,1325221,FLORES,SANCHEZ,JUAN MANUEL
4,CHIGNE MOZOMBITE MARIA DEL PILAR,41371819,CHIGNE,MOZOMBITE,MARIA DEL PILAR
5,OCAMPO VARGAS CLODOMIRA,18138792,OCAMPO,VARGAS,CLODOMIRA
6,DIAZ ALVAN ROBERTO CARLOS,41359917,DIAZ,ALVAN,ROBERTO CARLOS
7,BACILIO SALAZAR EMMA CONSUELO,7299339,BACILIO,SALAZAR,EMMA CONSUELO
8,ROJAS OBREGON FIORELLA MAGALI,42897314,ROJAS,OBREGON,FIORELLA MAGALI
9,GUTIERREZ GALVEZ ZONIA VIRGINIA,28260854,GUTIERREZ,GALVEZ,ZONIA VIRGINIA


In [179]:
len(amag_ii_cases)

60

In [180]:
amag_ii_cases

Unnamed: 0,juez,nrodocumento,ApellidoPaterno,ApellidoMaterno,Nombres
0,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN
1,AMPARO BEATRIZ RODRIGUEZ CASTILLO,18090278,RODRIGUEZ,CASTILLO,AMPARO BEATRIZ
2,FELIX ENRIQUE RAMIREZ SANCHEZ,18135223,RAMIREZ,SANCHEZ,FELIX ENRIQUE
3,JUAN MANUEL FLORES SANCHEZ,1325221,FLORES,SANCHEZ,JUAN MANUEL
4,CHIGNE MOZOMBITE MARIA DEL PILAR,41371819,CHIGNE,MOZOMBITE,MARIA DEL PILAR
5,OCAMPO VARGAS CLODOMIRA,18138792,OCAMPO,VARGAS,CLODOMIRA
6,DIAZ ALVAN ROBERTO CARLOS,41359917,DIAZ,ALVAN,ROBERTO CARLOS
7,BACILIO SALAZAR EMMA CONSUELO,7299339,BACILIO,SALAZAR,EMMA CONSUELO
8,ROJAS OBREGON FIORELLA MAGALI,42897314,ROJAS,OBREGON,FIORELLA MAGALI
9,GUTIERREZ GALVEZ ZONIA VIRGINIA,28260854,GUTIERREZ,GALVEZ,ZONIA VIRGINIA


In [185]:
amag_ii_cases.to_excel(f'{dc_interm_path}/amag_ii_cases.xlsx', index=False)

# 3. Creating CEJ datasets

A pending task would be to bind the rows of all the dataframes

## Follow up dataframe

In [186]:
files_follow_up = pd.read_csv(data_path + "/DF_follow_up_cleaner_2022.csv", error_bad_lines=False)
files_follow_up = clean_names(files_follow_up)



  files_follow_up = pd.read_csv(data_path + "/DF_follow_up_cleaner_2022.csv", error_bad_lines=False)
Skipping line 138972: expected 10 fields, saw 11
Skipping line 139148: expected 10 fields, saw 11
Skipping line 140120: expected 10 fields, saw 13

Skipping line 203178: expected 10 fields, saw 21

Skipping line 297454: expected 10 fields, saw 16
Skipping line 304348: expected 10 fields, saw 13
Skipping line 307125: expected 10 fields, saw 12
Skipping line 307552: expected 10 fields, saw 12

Skipping line 352440: expected 10 fields, saw 17
Skipping line 386692: expected 10 fields, saw 11

Skipping line 409540: expected 10 fields, saw 15
Skipping line 421440: expected 10 fields, saw 15
Skipping line 446936: expected 10 fields, saw 11

  files_follow_up = pd.read_csv(data_path + "/DF_follow_up_cleaner_2022.csv", error_bad_lines=False)


# Procedural parts dataframe

In [187]:
files_procedural_parts = pd.read_csv(data_path + "/DF_procedural_parts_2022.csv")
files_procedural_parts = clean_names(files_procedural_parts)

In [188]:
files_procedural_parts["expediente_n°_"] = files_procedural_parts["expediente_n°_"].apply(lambda row: row.split("\\")[-1])

## Downloads dataframe

In [189]:
files_downloads = pd.read_csv(data_path + "/DF_DOWNLOADS_2022.csv")
files_downloads = clean_names(files_downloads)

### Merging complementary case data with amag ii cases

In [198]:
files_reports = files_reports.rename(columns = {'juez_':'juez'})
files_reports

Unnamed: 0,expediente_n°_,organo_jurisdiccional_,distrito_judicial_,juez,especialista_legal_,fecha_de_inicio_,proceso_,observacion_,especialidad_,materia_s_,estado_,etapa_procesal_,fecha_conclusion_,ubicacion_,motivo_conclusion_,sumilla_,juez_splitted,n_judges_case
0,00001-2022-0-0101-JP-CI-01,JUZGADO DE PAZ LETRADO - Leymebamba,AMAZONAS,PISCOYA SOSA ALDO FRANCISCO,ROJAS SILVA EDA,14/01/2022,NO CONTENCIOSO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,SUCESION INTESTADA,ARCHIVO DEFINITIVO,GENERAL,,ARCHIVO GENERAL,-------,SUCESION INTESTADA,[PISCOYA SOSA ALDO FRANCISCO],1
1,00001-2022-0-0102-JP-CI-01,JUZGADO PAZ LETRADO - Imaza,AMAZONAS,VEGA BOCANEGRA BETO,GARCIA ODAR JOSE ALFREDO,27/01/2022,SUMARISIMO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,OBLIGACION DE DAR SUMA DE DINERO,TRAMITE,GENERAL,,ESPECIALISTA,-------,DEMANDA DE OBLIGACION DE DAR SUMA DE DINERO,[VEGA BOCANEGRA BETO],1
2,00001-2022-0-0103-JP-CI-01,JUZGADO DE PAZ LETRADO - FLORIDA,AMAZONAS,LOLO JARA ESTELA,DAVILA HORNA HERMES WILMAN,10/01/2022,NO CONTENCIOSO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,SUCESION INTESTADA,ARCHIVO PROVISIONAL,GENERAL,,POOL ASIST. JUDICIAL,-------,SUCESION INTESTADA,[LOLO JARA ESTELA],1
3,00001-2022-0-0104-JP-CI-01,JUZGADO PAZ LETRADO - Sede Yutupis,AMAZONAS,HUAMAN CULQUI IRMA MERCEDES,ELORRIAGA CHAVEZ ROMINA PAOLA,16/03/2022,UNICO DE EJECUCION,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,OBLIGACION DE DAR SUMA DE DINERO,EN PLAZO DE IMPUGNACION,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA OBLIGACION DE DAR SUMA DE DINERO,[HUAMAN CULQUI IRMA MERCEDES],1
4,00001-2022-0-0105-JP-CI-01,JUZGADO DE PAZ LETRADO - Sede Luya,AMAZONAS,BARRERA BARDALES GUIMO ALBERTO,ANGULO CULLAMPE LIZETH DEL CARMEN,06/01/2022,NO CONTENCIOSO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,CIVIL,SUCESION INTESTADA,SENTENCIADO/ RESUELTO,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA DE SUCESIÓN INTESTADA,[BARRERA BARDALES GUIMO ALBERTO],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56241,00006-2022-0-3301-JR-LA-01,JUZGADO DE TRABAJO,PUENTE PIEDRA - VENTANILLA,POMA ALOSILLA NARDA KATHERINE,SALDAÑA GONZALEZ ADRIANA A.,06/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,PAGO DE BENEFICIOS SOCIALES Y/O INDEMNIZACION ...,EN TRAMITE(Pendiente),GENERAL,,POOL ASIST. JUDICIAL,-------,PAGO DE BENEFICIOS SOCIALES,[POMA ALOSILLA NARDA KATHERINE],1
56242,00007-2022-0-3301-JP-LA-01,JUZGADO DE TRABAJO,PUENTE PIEDRA - VENTANILLA,POMA ALOSILLA NARDA KATHERINE,SALDAÑA GONZALEZ ADRIANA A.,07/01/2022,ABREVIADO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,PAGO DE BENEFICIOS SOCIALES Y/O INDEMNIZACION ...,EN TRAMITE(Pendiente),GENERAL,,POOL ASIST. DE AUDIO,-------,PAGO DE BENEFICIOS LABORALES,[POMA ALOSILLA NARDA KATHERINE],1
56243,00007-2022-0-3301-JR-LA-01,JUZGADO DE TRABAJO,PUENTE PIEDRA - VENTANILLA,POMA ALOSILLA NARDA KATHERINE,SALDAÑA GONZALEZ ADRIANA A.,07/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,DESNATURALIZACIÓN DE CONTRATO,EN TRAMITE(Pendiente),GENERAL,,ESPECIALISTA,-------,INTERPONGO DEMANDA POR DESNATURALIZACION DE CO...,[POMA ALOSILLA NARDA KATHERINE],1
56244,00008-2022-0-3301-JR-LA-01,JUZGADO DE TRABAJO,PUENTE PIEDRA - VENTANILLA,POMA ALOSILLA NARDA KATHERINE,SALDAÑA GONZALEZ ADRIANA A.,11/01/2022,ORDINARIO,\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t...,LABORAL,INCUMPLIMIENTO DE DISPOSICIONES Y NORMAS LABOR...,EN TRAMITE(Pendiente),GENERAL,,POOL ASIST. DE AUDIO,-------,DEMANDA SOBRE INCUMPLIMIENTO,[POMA ALOSILLA NARDA KATHERINE],1


In [200]:
reportes_amag_ii_raw = pd.merge(amag_ii_cases, files_reports, how="inner", on="juez")

In [201]:
reportes_amag_ii_raw

Unnamed: 0,juez,nrodocumento,ApellidoPaterno,ApellidoMaterno,Nombres,expediente_n°_,organo_jurisdiccional_,distrito_judicial_,especialista_legal_,fecha_de_inicio_,...,especialidad_,materia_s_,estado_,etapa_procesal_,fecha_conclusion_,ubicacion_,motivo_conclusion_,sumilla_,juez_splitted,n_judges_case
0,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00041-2022-0-0605-JP-CI-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,16/06/2022,...,CIVIL,INDEMNIZACION POR DAÑOS Y PERJUICIOS,INADMISIBLE,GENERAL,,ESPECIALISTA,-------,DEMANDA DE DAÑOS Y PERJUICIOS DERIVADOS DE RES...,[LUIS ALVIN QUISPE SANCHEZ],1
1,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00020-2022-65-0605-JP-FC-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,31/08/2022,...,FAMILIA CIVIL,AUMENTO DE ALIMENTOS,SENTENCIADO/ RESUELTO,GENERAL,,POOL ASIST. JUDICIAL,-------,CUADERNO DE APELACION DE SENTENCIA,[LUIS ALVIN QUISPE SANCHEZ],1
2,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00025-2022-0-0610-JP-FC-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,04/02/2022,...,FAMILIA CIVIL,FILIACION,PARA VISTA DE LA CAUSA,GENERAL,,POOL ASIST. JUDICIAL,-------,DEMANDA DE FILIACIÓN DE PATERNIDAD EXTRAMATRIM...,[LUIS ALVIN QUISPE SANCHEZ],1
3,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00026-2022-72-0605-JP-FC-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,10/03/2023,...,FAMILIA CIVIL,ALIMENTOS,PARA VISTA DE LA CAUSA,GENERAL,,POOL ASIST. JUDICIAL,-------,CUADERNO DEAPELACION DE SENTENCIA,[LUIS ALVIN QUISPE SANCHEZ],1
4,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00027-2022-2-0605-JP-FC-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,10/03/2023,...,FAMILIA CIVIL,ALIMENTOS,APELADO,GENERAL,,POOL ASIST. JUDICIAL,-------,CUADERNO DE APELACION DE SENTENCIA,[LUIS ALVIN QUISPE SANCHEZ],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO,00221-2022-0-0412-JR-CI-01,1º JUZGADO CIVIL DE PAUCARPATA,AREQUIPA,BENAVIDES CASTILLO MAX MAYKOL,13/01/2022,...,CIVIL,NULIDAD DE ACTO JURIDICO,IMPROCEDENTE,GENERAL,,ARCHIVO (ENVIADO),-------,DEMANDA,[ZUÑIGA PORTOCARRERO LINO],1
2656,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO,00023-2022-0-0412-JR-CI-01,1º JUZGADO CIVIL DE PAUCARPATA,AREQUIPA,"SANDOVAL LOAYZA, SHEYLA MAGALY",05/01/2022,...,CIVIL,INDEMNIZACION,IMPROCEDENTE,GENERAL,,ARCHIVO GENERAL,-------,DEMANDA,[ZUÑIGA PORTOCARRERO LINO],1
2657,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO,00080-2022-0-0412-JR-CI-01,1º JUZGADO CIVIL DE PAUCARPATA,AREQUIPA,TORREBLANCA GOMEZ CHRISTIAN OMAR,06/01/2022,...,CIVIL,OBLIGACION DE DAR SUMA DE DINERO,EJECUCION,GENERAL,,ARCHIVO MODULAR,-------,DEMANDA,[ZUÑIGA PORTOCARRERO LINO],1
2658,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO,00009-2022-0-0412-JR-CI-01,1º JUZGADO CIVIL DE PAUCARPATA,AREQUIPA,BENAVIDES CASTILLO MAX MAYKOL,05/01/2022,...,CIVIL,PRESCRIPCION ADQUISITIVA,IMPROCEDENTE,GENERAL,,ARCHIVO (ENVIADO),-------,DEMANDA,[ZUÑIGA PORTOCARRERO LINO],1


In [202]:
follow_up_amag_ii_raw = pd.merge(reportes_amag_ii_raw, files_follow_up, how="inner", on="expediente_n°_")
follow_up_amag_ii_raw

Unnamed: 0,juez,nrodocumento,ApellidoPaterno,ApellidoMaterno,Nombres,expediente_n°_,organo_jurisdiccional_,distrito_judicial_,especialista_legal_,fecha_de_inicio_,...,n_judges_case,link,fecha_de_resolucion_ingreso_,resolucion_,tipo_de_notificacion_,acto_,fojas_folios_,proveido_,sumilla__y,descripcion_de_usuario_
0,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00041-2022-0-0605-JP-CI-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,16/06/2022,...,1,documentoD.html?nid=cPgcldZpqPFZqkAgOKDZ,21/03/2023,DOS,,AUTO INADMISIBLE,3.0,21/03/2023,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t1.\tDECLARAR...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tDESCARGADO P...
1,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00041-2022-0-0605-JP-CI-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,16/06/2022,...,1,,25/07/2022 09:08,,,NOTA,,25/07/2022,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tEL EXPEDIENT...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tINGRESADO PO...
2,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00041-2022-0-0605-JP-CI-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,16/06/2022,...,1,documentoD.html?nid=nTSaIWwzePYhvLfpaBlX,28/06/2022,UNO,Pta. Cedula Not.,AUTO IMPROCEDENTE,2.0,28/06/2022,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tRESUELVE: \n...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tDESCARGADO P...
3,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00041-2022-0-0605-JP-CI-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,16/06/2022,...,1,,11/01/2023 16:00,,,REDISTRIBUCION,8.0,11/01/2023,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tCONTIENDA DE...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tINGRESADO PO...
4,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00041-2022-0-0605-JP-CI-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,16/06/2022,...,1,,22/07/2022 17:25,,,NOTA,,22/07/2022,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tEL EXPEDIENT...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tINGRESADO PO...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23825,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO,00092-2022-0-0412-JR-CI-01,1º JUZGADO CIVIL DE PAUCARPATA,AREQUIPA,TORREBLANCA GOMEZ CHRISTIAN OMAR,06/01/2022,...,1,,03/03/2022 11:54,AUDIENCIA Y SENTENCIA,,ESCRITO,3.0,11/03/2022,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tADJUNTA PUBL...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tINGRESADO PO...
23826,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO,00092-2022-0-0412-JR-CI-01,1º JUZGADO CIVIL DE PAUCARPATA,AREQUIPA,TORREBLANCA GOMEZ CHRISTIAN OMAR,06/01/2022,...,1,,11/08/2022,S/N,,NOTA,1.0,11/08/2022,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tNOTA\n\t\t\t...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tDESCARGADO P...
23827,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO,00092-2022-0-0412-JR-CI-01,1º JUZGADO CIVIL DE PAUCARPATA,AREQUIPA,TORREBLANCA GOMEZ CHRISTIAN OMAR,06/01/2022,...,1,,14/03/2022 17:14,SIETE,,ESCRITO,3.0,18/03/2022,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tCURSAR OFICI...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tINGRESADO PO...
23828,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO,00092-2022-0-0412-JR-CI-01,1º JUZGADO CIVIL DE PAUCARPATA,AREQUIPA,TORREBLANCA GOMEZ CHRISTIAN OMAR,06/01/2022,...,1,documentoD.html?nid=aiSlEDjOMgfGusc,10/03/2022,RESUMEN DE ACUERDOS,Pta. Cedula Not.,DECRETO,2.0,10/03/2022,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tRESUMEN DE A...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tDESCARGADO P...


In [203]:
procedural_parts_amag_ii_raw = pd.merge(reportes_amag_ii_raw, files_procedural_parts, how="inner", on="expediente_n°_")
procedural_parts_amag_ii_raw

Unnamed: 0,juez,nrodocumento,ApellidoPaterno,ApellidoMaterno,Nombres,expediente_n°_,organo_jurisdiccional_,distrito_judicial_,especialista_legal_,fecha_de_inicio_,...,ubicacion_,motivo_conclusion_,sumilla_,juez_splitted,n_judges_case,parte,tipo_depersona,apellido_paterno_razon_social,apellidomaterno,nombres
0,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00041-2022-0-0605-JP-CI-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,16/06/2022,...,ESPECIALISTA,-------,DEMANDA DE DAÑOS Y PERJUICIOS DERIVADOS DE RES...,[LUIS ALVIN QUISPE SANCHEZ],1,DEMANDANTE,NATURAL,CARUAJULCA,MASABEL,NANCI
1,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00041-2022-0-0605-JP-CI-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,16/06/2022,...,ESPECIALISTA,-------,DEMANDA DE DAÑOS Y PERJUICIOS DERIVADOS DE RES...,[LUIS ALVIN QUISPE SANCHEZ],1,DEMANDADO,NATURAL,TAMAY,BAUTISTA,ELTER ANIBAL
2,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00041-2022-0-0605-JP-CI-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,16/06/2022,...,ESPECIALISTA,-------,DEMANDA DE DAÑOS Y PERJUICIOS DERIVADOS DE RES...,[LUIS ALVIN QUISPE SANCHEZ],1,DEMANDANTE,NATURAL,CARUAJULCA,MASABEL,NANCI
3,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00041-2022-0-0605-JP-CI-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,16/06/2022,...,ESPECIALISTA,-------,DEMANDA DE DAÑOS Y PERJUICIOS DERIVADOS DE RES...,[LUIS ALVIN QUISPE SANCHEZ],1,DEMANDADO,NATURAL,TAMAY,BAUTISTA,ELTER ANIBAL
4,LUIS ALVIN QUISPE SANCHEZ,41929968,QUISPE,SANCHEZ,LUIS ALVIN,00041-2022-0-0605-JP-CI-01,JUZGADO CIVIL - SEDE BAMBAMARCA,CAJAMARCA,VASQUEZ DIAZ ROYER JARLIN,16/06/2022,...,ESPECIALISTA,-------,DEMANDA DE DAÑOS Y PERJUICIOS DERIVADOS DE RES...,[LUIS ALVIN QUISPE SANCHEZ],1,DEMANDANTE,NATURAL,CARUAJULCA,MASABEL,NANCI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7076,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO,00009-2022-0-0412-JR-CI-01,1º JUZGADO CIVIL DE PAUCARPATA,AREQUIPA,BENAVIDES CASTILLO MAX MAYKOL,05/01/2022,...,ARCHIVO (ENVIADO),-------,DEMANDA,[ZUÑIGA PORTOCARRERO LINO],1,DEMANDADO,NATURAL,ROJAS,FLORES,ENEO FREDY
7077,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO,00009-2022-0-0412-JR-CI-01,1º JUZGADO CIVIL DE PAUCARPATA,AREQUIPA,BENAVIDES CASTILLO MAX MAYKOL,05/01/2022,...,ARCHIVO (ENVIADO),-------,DEMANDA,[ZUÑIGA PORTOCARRERO LINO],1,DEMANDADO,NATURAL,ROJAS,FLORES,VILMA GLORIA
7078,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO,00009-2022-0-0412-JR-CI-01,1º JUZGADO CIVIL DE PAUCARPATA,AREQUIPA,BENAVIDES CASTILLO MAX MAYKOL,05/01/2022,...,ARCHIVO (ENVIADO),-------,DEMANDA,[ZUÑIGA PORTOCARRERO LINO],1,DEMANDANTE,NATURAL,ROJAS,ROJAS,DANIEL ALBERTO
7079,ZUÑIGA PORTOCARRERO LINO,29558906,ZUÑIGA,PORTOCARRERO,LINO YSAURO,00092-2022-0-0412-JR-CI-01,1º JUZGADO CIVIL DE PAUCARPATA,AREQUIPA,TORREBLANCA GOMEZ CHRISTIAN OMAR,06/01/2022,...,ARCHIVO MODULAR,-------,DEMANDA,[ZUÑIGA PORTOCARRERO LINO],1,DEMANDANTE,NATURAL,ARELA,FLORES,SOFIO MIGUEL


In [46]:
downloads_amag_ii_raw = pd.merge(reportes_amag_ii_raw, files_downloads, how="inner", left_on="expediente_n°_", right_on="expediente_num")

In [47]:
help(create_pickle)

Help on function create_pickle in module __main__:

create_pickle(object_name, file_name: str, path: str) -> None
    Creates a pickle file for object. Note: Path should have no slash 
    at the end



Storing data on `.pkl` files

In [48]:
create_pickle(amag_ii_cases, "amag_ii_cases.pkl", dc_temp_path)

In [49]:
create_pickle(reportes_amag_ii_raw, "reportes_amag_ii_raw.pkl", dc_temp_path)

In [50]:
create_pickle(follow_up_amag_ii_raw, "follow_up_amag_ii_raw.pkl", dc_temp_path)

In [51]:
create_pickle(procedural_parts_amag_ii_raw, "procedural_parts_amag_ii_raw.pkl", dc_temp_path)

In [52]:
create_pickle(downloads_amag_ii_raw, "downloads_amag_ii_raw.pkl", dc_temp_path)

# 4. Preprocessing and cleaning of datasets

Reading gender dataframe

In [53]:
gender_dataset = pd.read_csv(dc_raw_path + "/harvard_set_gender.csv")

In [54]:
def spanish_cleaner(txt_file):
    text = txt_file
    text = re.sub(r"(&[a-zA-Z]*;)", " ", text)  # the txt files had some unwanted text like &rsquo; this line removes such text
    text = text.lower()

    # remove punctuation and numbers from the string
    punctuations = '''!()[]{};:'"\,<>./¿?@#$%^&*_–~=+¨`“”’|0123456789'''  # all but hyphens
    for x in text.lower(): 
        if x in punctuations: 
            text = text.replace(x, "")

    # replacing encoding characters
    enc_characters = [" st ", " nd ", " rd ", " th ", "srl", "lpfvf", "pctc", "jmxcff", "ayrq", "axu", "oadk", "jcxj", "nplt", "eef", "fcfc", "qyoc", "gobpe", "pfg", "vqrx", "csjppj", "xas", "feeback", "hafceqc", "xqj", "hellip", "rsquo", "ldquo", "rdquo", "ndash", "-", "n°", "nº", "º", "°", "dprgdonpdl", "«", "»", "…", "derjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderj", "ii", "iii", "vii", "viii"]
    
    for item in enc_characters:
        text = text.replace(item, " ")
    
    # cleaning for spanish stop words
    stopword_es = nltk.corpus.stopwords.words('spanish') # loading spanish stop words
    custom_substrs = ["http", "hangouts", "meet", "gmailcom"] # html related
    custom_gender_words = ["él", "ella", "la", "ese", "esa", "esos", "esas", "este", "esta", "aquel", "aquella", "aquellos", "aquellas", "lo", "la", "los", "las", "aquel", "aquella", "mío", "mía", "míos", "mías", "suyo", "suya", "suyos", "suyas"] # list with pronouns associated to a specific gender
    length_custom_stopwords = len(custom_substrs)
    words = text.split() # tokenizing sentence
    cleaned_words = [word for word in words if (word not in stopword_es and len(word) > 1) or word in custom_gender_words]
     
    sentence_no_custom = [] # omitting words that contain 
    for cleaned_word in cleaned_words:
        counter_stopwords = 0
        for word in custom_substrs: # evaluating if word contains substr
            if word not in cleaned_word: # if passes, +1 for counter
                counter_stopwords += 1
            if counter_stopwords == length_custom_stopwords: # append if passes all custom substrs tests
                sentence_no_custom.append(cleaned_word)

    return " ".join(sentence_no_custom)

  punctuations = '''!()[]{};:'"\,<>./¿?@#$%^&*_–~=+¨`“”’|0123456789'''  # all but hyphens


## 4.1. Preprocessing of downloads and follow up 

### Downloads Dataframe

In [55]:
# lowercase to text
downloads_amag_ii_raw["text"] = downloads_amag_ii_raw["text"].apply(lambda text: text.lower() 
                                                                    if type(text) is str else text)

In [56]:
# cleaning the text from the cases
downloads_amag_ii_raw["text"] = downloads_amag_ii_raw["text"].apply(lambda text: spanish_cleaner(text) 
                                                                    if type(text) is str else text)

### Downloads Follow Up

Fixing the date of the resolution

In [57]:
follow_up_amag_ii_raw["fecha_de_resolucion_ingreso_"] = follow_up_amag_ii_raw["fecha_de_resolucion_ingreso_"].apply(
                                                        lambda date: datetime.strptime(re.match("(\d+[-/]\d+[-/]\d+)", 
                                                        date)[0], "%d/%m/%Y"))

  lambda date: datetime.strptime(re.match("(\d+[-/]\d+[-/]\d+)",


Creating date and hour variable

In [58]:
follow_up_amag_ii_raw = follow_up_amag_ii_raw.rename(columns={"fecha_de_resolucion_ingreso_": "date"})

Remove extra white space and lower "acto"

In [59]:
# lowercase to text
follow_up_amag_ii_raw["acto_"] = follow_up_amag_ii_raw["acto_"].apply(lambda text: text.lower() 
                                                                    if type(text) is str else text)

In [60]:
# cleaning the text from the cases
follow_up_amag_ii_raw["acto_"] = follow_up_amag_ii_raw["acto_"].apply(lambda text: spanish_cleaner(text) 
                                                                    if type(text) is str else text)

Remove extra white space and lower "sumilla" (from merged dataframes)

In [61]:
sumillas = ["sumilla__x", "sumilla__y"]

In [62]:
for sumilla in sumillas:
    # lowercase to text
    follow_up_amag_ii_raw[sumilla] = follow_up_amag_ii_raw[sumilla].apply(lambda text: text.lower() 
                                                                    if type(text) is str else text)
    # cleaning the text from the cases
    follow_up_amag_ii_raw[sumilla] = follow_up_amag_ii_raw[sumilla].apply(lambda text: spanish_cleaner(text)
                                                                    if type(text) is str else text)

Remove extra white space and lower "descripcion de usuario"

In [63]:
# lowercase to text
follow_up_amag_ii_raw["descripcion_de_usuario_"] = follow_up_amag_ii_raw["descripcion_de_usuario_"].apply(lambda text: text.lower() 
                                                                    if type(text) is str else text)

In [64]:
# cleaning the text from the cases
follow_up_amag_ii_raw["descripcion_de_usuario_"] = follow_up_amag_ii_raw["descripcion_de_usuario_"].apply(lambda text: spanish_cleaner(text) 
                                                                    if type(text) is str else text)

Create variable that identifies rows with pdf or docx file in downloads dataset

In [65]:
follow_up_amag_ii_raw["descargado"] = follow_up_amag_ii_raw["descripcion_de_usuario_"].apply(lambda text: 1
                                                                                            if "descargado" in text
                                                                                            else 0)

### Create dataset of case_id/number of documents (obs with no duplicates)

Droping duplicates in terms `expediente_n°_` and `num`

In [66]:
downloads_full = downloads_amag_ii_raw[downloads_amag_ii_raw["link"].notna()]

In [67]:
downloads_full.drop_duplicates(subset=["expediente_n°_", "num"], inplace=True)

Merging the data with no duplicates

In [68]:
documents_amag = pd.merge(follow_up_amag_ii_raw, downloads_full, on=["expediente_n°_", "nrodocumento", "link"], how="left")

## 4.2. Identify keywords from text, acto and sumilla columns 

### Filtering rows without information

In [69]:
documents_amag = documents_amag[(documents_amag["acto_"] !="auto de saneamiento") & (documents_amag["acto_"] != "nota")]

In [70]:
documents_amag["text"] = documents_amag["text"].apply(lambda text: text if type(text) is not float else "")

### 4.2.1. `Parte resolutiva` variable

In [71]:
documents_amag["parte_resolutiva"] = documents_amag["text"].apply(lambda text: extract_text(text,
                                                                                           r"resuelve\s*([^\n\r]*)|fallo\s*([^\n\r]*)|resuelvo\s*([^\n\r]*)"))

### 4.2.2. `Apela` variable

In [72]:
documents_amag["appeal"] = documents_amag.apply(lambda row: 1 if "apela" in row.acto_ or "apela" in row.sumilla__x
                                                or "apela" in row.sumilla__y else 0, axis=1)

In [73]:
documents_amag["appeal"].value_counts()

0    977
1     19
Name: appeal, dtype: int64

### 4.2.3. `Sentencia` variable

In [74]:
documents_amag["sentencia_acto"] = documents_amag.apply(lambda row: 1 if "sentencia" in row.acto_ else 0, axis=1)
documents_amag["sentencia_sumilla"] = documents_amag.apply(lambda row: 1 if "sentencia" in row.sumilla__x or 
                                                           "sentencia" in row.sumilla__y else 0, axis=1)
documents_amag["sentencia"] = documents_amag.apply(lambda row: 1 if row.sentencia_acto == 1 or row.sentencia_sumilla == 1 
                                                   else 0, axis=1)

In [75]:
documents_amag["sentencia_acto"].value_counts()

0    984
1     12
Name: sentencia_acto, dtype: int64

In [76]:
documents_amag["sentencia_sumilla"].value_counts()

0    985
1     11
Name: sentencia_sumilla, dtype: int64

In [77]:
documents_amag["sentencia"].value_counts()

0    973
1     23
Name: sentencia, dtype: int64

### 4.2.4. `Auto` variable

In [78]:
documents_amag["auto_final"] = documents_amag.apply(lambda row: 1 if "auto final" in row.acto_ or 
                                                           "auto final" in row.sumilla__x or
                                                           "auto final" in row.sumilla__y else 0, axis=1)

In [79]:
documents_amag["auto_definitivo"] = documents_amag.apply(lambda row: 1 if "auto definitivo" in row.acto_ or 
                                                           "auto definitivo" in row.sumilla__x or
                                                           "auto definitivo" in row.sumilla__y else 0, axis=1)

In [80]:
documents_amag["auto_definitivo"].value_counts()

0    993
1      3
Name: auto_definitivo, dtype: int64

### 4.2.5. `Final` variable

In [81]:
documents_amag["auto_improcedente"] = documents_amag.apply(lambda row: 1 if "auto que declara improcedente" in row.acto_ or 
                                                           "auto que declara improcedente" in row.sumilla__x or
                                                           "auto que declara improcedente" in row.sumilla__y or 
                                                           "auto improcedente" in row.acto_ or 
                                                           "auto improcedente" in row.sumilla__x or
                                                           "auto improcedente" in row.sumilla__y else 0, axis=1)

In [82]:
documents_amag["auto_improcedente"].value_counts()

0    986
1     10
Name: auto_improcedente, dtype: int64

### 4.2.6. `Vista 2` variable

In [83]:
documents_amag["vista2"] = documents_amag.apply(lambda row: 1 if "sentencia de vista" in row.acto_ or 
                                                           "sentencia de vista" in row.sumilla__x or
                                                           "sentencia de vista" in row.sumilla__y or 
                                                           "auto de vista" in row.acto_ or 
                                                           "auto de vista" in row.sumilla__x or
                                                           "auto de vista" in row.sumilla__y else 0, axis=1)

In [84]:
documents_amag["vista2"].value_counts()

0    996
Name: vista2, dtype: int64

### 4.2.7. `Revoca 2` variable

In [85]:
documents_amag["revoca2"] = documents_amag.apply(lambda row: 1 if "vista que revoca" in row.acto_ else 0, axis=1)

In [86]:
documents_amag["revoca2"].value_counts()

0    996
Name: revoca2, dtype: int64

### 4.2.8. `Anula 2` variable

In [87]:
documents_amag["nula2"] = documents_amag.apply(lambda row: 1 if "vista que anula" in row.acto_ else 0, axis=1)

In [88]:
documents_amag["nula2"].value_counts()

0    996
Name: nula2, dtype: int64

### 4.2.9. `Confirma 2` variable

In [89]:
documents_amag["confirma2"] = documents_amag.apply(lambda row: 1 if "vista que confirma" in row.acto_ else 0, axis=1)

In [90]:
documents_amag["confirma2"].value_counts()

0    996
Name: confirma2, dtype: int64

### 4.2.10. `Fundada` variable

In [91]:
def evaluate_vals(row, list_substrs: list) -> int:
    """determines whether the case matches a substr from list"""
    
    values = [row.parte_resolutiva, row.sumilla__x, row.sumilla__y, row.acto_]
    output = 0
    for value in values:
        if type(value) is not str:
            break
        else:
            for substr in list_substrs:
                if substr in value:
                    output = 1
                    return output
    return output

In [92]:
documents_amag["fundada"] = documents_amag.apply(lambda row: evaluate_vals(row, [" fundada la demanda", "sentencia fundada"]), axis=1)

In [93]:
documents_amag["fundada"].value_counts()

0    988
1      8
Name: fundada, dtype: int64

### 4.2.11. `Fundada en parte` variable

In [94]:
documents_amag["fundada_parte"] = documents_amag.apply(lambda row: evaluate_vals(row, ["fundada en parte"]), axis=1)

In [95]:
documents_amag["fundada_parte"].value_counts()

0    996
Name: fundada_parte, dtype: int64

### 4.2.12. `Infundada` variable

In [96]:
documents_amag["infundada"] = documents_amag.apply(lambda row: evaluate_vals(row, ["infundada la demanda", "sentencia infundada"]), axis=1)

In [97]:
documents_amag["infundada"].value_counts()

0    996
Name: infundada, dtype: int64

### 4.2.12. `Vista` variable

In [98]:
documents_amag["vista"] = documents_amag.apply(lambda row: evaluate_vals(row, ["sentencia de vista", "auto de vista"]), axis=1)

In [99]:
documents_amag["vista"].value_counts()

0    996
Name: vista, dtype: int64

### 4.2.13. `Revoca` variable

In [100]:
documents_amag["revoca"] = documents_amag.apply(lambda row: evaluate_vals(row, ["vista que revoca", "revocar la sentencia", "revocar la resolucion", "revocar en parte", "revocaron la sentencia", "revocaron la resolución"]), axis=1)

In [101]:
documents_amag["revoca"].value_counts()

0    996
Name: revoca, dtype: int64

### 4.2.13. `Anula` variable

In [102]:
documents_amag["nula"] = documents_amag.apply(lambda row: evaluate_vals(row, ["vista que anula", "declarar nula", "declara nula", "declara nulo", "declarar nulo", "declarar: nula", "declarar la nulidad", "declararon nula"]), axis=1)

In [103]:
documents_amag["nula"].value_counts()

0    996
Name: nula, dtype: int64

### 4.2.14. `Confirma` variable

In [104]:
documents_amag["confirma"] = documents_amag.apply(lambda row: evaluate_vals(row, ["vista que confirma", "confirmaron el auto", "confirmaron la sentencia", "aprobaron la sentencia", "confirma sentencia", "confirma la sentencia", 
                                                                                  "confirmar la sentencia", "confirmar resolucion", "confirmar resolución", "confirmar la resolucion", "confirmar en parte", "confirmar la resolución"]), axis=1)

In [105]:
documents_amag["confirma"].value_counts()

0    996
Name: confirma, dtype: int64

In [106]:
documents_amag.to_csv(dc_interm_path + "/documents_amag_ii_clean.csv")

## 4.3. Preprocessing of reportes and procedural parts

Obtaining reportes

In [107]:
reportes_amag = reportes_amag_ii_raw[["expediente_n°_", "distrito_judicial__x", "distrito_judicial__y", "proceso__x", 
                                      "proceso__y", "especialidad__x", "especialidad__y", "estado__x", "estado__y", 
                                      "etapa_procesal__x", "etapa_procesal__y"]]

In [108]:
reportes_amag.to_csv(dc_interm_path + "/reportes_amag_ii_clean.csv")

Obtaining procedural parts

In [109]:
procedural_parts_amag_ii_raw = read_pickle("procedural_parts_amag_ii_raw.pkl", dc_temp_path)

### 4.3.1. Creating `parties` variable

In [110]:
procedural_parts_amag_ii_raw["parties"] = procedural_parts_amag_ii_raw.apply(lambda row: "plaintiff" if 
                                                                             row.parte == "DEMANDANTE" or 
                                                                             row.parte == "AGRAVIADO" or 
                                                                             row.parte == "VÍCTIMA" or
                                                                             row.parte == "SOLICITANTE" or
                                                                             row.parte == "DENUNCIANTE"
                                                                             else np.NaN, axis=1)

In [111]:
procedural_parts_amag_ii_raw["parties"] = procedural_parts_amag_ii_raw.apply(lambda row: "defendant" if 
                                                                             row.parte == "DEMANDADO" or 
                                                                             row.parte == "AGRESOR" or 
                                                                             row.parte == "DENUNCIADO"
                                                                             else row.parties, axis=1)

In [112]:
procedural_parts_amag_ii_raw["parties"] = procedural_parts_amag_ii_raw.apply(lambda row: "other" if 
                                                                             row.parties != "plaintiff" and 
                                                                             row.parties != "defendant"
                                                                             else row.parties, axis=1)

In [113]:
procedural_parts_amag_ii_raw["parties"].value_counts()

defendant    318
plaintiff    232
other          5
Name: parties, dtype: int64

### 4.3.2. Creating `first_name` and `second_name` variables

In [114]:
procedural_parts_amag_ii_raw["nombres"] = procedural_parts_amag_ii_raw["nombres"].apply(lambda nombres:
                                                            nombres.lower() if nombres != "\t\t\t\t\t\t\t\t\t\t\t\t\t"
                                                            else "")

In [115]:
procedural_parts_amag_ii_raw["first_name"] = procedural_parts_amag_ii_raw["nombres"].apply(lambda nombres: 
                                                                                           nombres.split(" ")[0])
procedural_parts_amag_ii_raw["second_name"] = procedural_parts_amag_ii_raw["nombres"].apply(lambda nombres: 
                                                                                           nombres.split(" ")[1] 
                                                                                           if len(nombres.split(" ")) == 2
                                                                                           else "")

In [116]:
procedural_parts_amag = pd.merge(procedural_parts_amag_ii_raw, gender_dataset, how="left", 
                                 left_on="first_name", right_on="name")
procedural_parts_amag = procedural_parts_amag.rename(columns={"female": "female_first"})
procedural_parts_amag = procedural_parts_amag.drop(columns=["name"])

In [117]:
procedural_parts_amag = pd.merge(procedural_parts_amag, gender_dataset, how="left", 
                                 left_on="second_name", right_on="name")
procedural_parts_amag = procedural_parts_amag.rename(columns={"female": "female_second"})
procedural_parts_amag = procedural_parts_amag.drop(columns=["name"])

In [118]:
procedural_parts_amag["female"] = procedural_parts_amag.apply(lambda row: 1 if row.female_first == 1 else np.NaN, axis=1)
procedural_parts_amag["female"] = procedural_parts_amag.apply(lambda row: 0 if row.female_first == 0 else row.female, axis=1)

In [119]:
procedural_parts_amag["female"].value_counts()

0.0    119
1.0     93
Name: female, dtype: int64

### 4.3.4. Creating `legal_entity` variable

In [120]:
procedural_parts_amag["legal_entity"] = procedural_parts_amag["tipo_depersona"].apply(lambda tipo_de_persona:
                                                                                     1 if tipo_de_persona == "JURIDICA"
                                                                                     else 0)

### 4.3.5. Collapsing at the expediente level

In [121]:
procedural_parts_amag_collapsed = procedural_parts_amag[["expediente_n°_", "parties", "female", "legal_entity"]].groupby(
                             by=["expediente_n°_", "parties"]).agg(female_ratio=("female", "mean"),
                                                                   female_indicator=("female", "max"),
                                                                   legal_entity_ratio=("legal_entity", "mean"),
                                                                   legal_entity_indicator=("legal_entity", "max")
                                                                   ).reset_index()

### 4.3.6. Reshaping dataframe

In [122]:
procedural_parts_amag_reshaped = pd.pivot_table(procedural_parts_amag_collapsed, values=["female_ratio", "legal_entity_ratio", "female_indicator", 
                           "legal_entity_indicator"], columns=["parties"], index=["expediente_n°_"])

In [123]:
procedural_parts_amag_reshaped.columns = ["_".join(col).strip() for col in procedural_parts_amag_reshaped.columns.values]

Storing results

In [124]:
procedural_parts_amag_collapsed.to_csv(dc_interm_path + "/procedural_parts_amag_ii_clean.csv")

# 5. Creating case outcomes variables

In [125]:
documents_amag_clean = pd.read_csv(dc_interm_path + "/documents_amag_ii_clean.csv")

In [126]:
reportes_amag_clean = pd.read_csv(dc_interm_path + "/reportes_amag_ii_clean.csv")

In [127]:
procedural_parts_amag_clean = pd.read_csv(dc_interm_path + "/procedural_parts_amag_ii_clean.csv")

## 5.1. Creating fundada outcomes

### 5.1.1. Preparing fundada variables for outcome creation

In [128]:
documents_amag_fundada = documents_amag_clean.groupby("expediente_n°_").agg(fundada=("fundada", "max"),
                                                   fundada_parte=("fundada_parte", "max"),
                                                   infundada=("infundada", "max")).reset_index()

In [129]:
# dropping NaN values
documents_amag_fundada = documents_amag_fundada[documents_amag_fundada[["expediente_n°_", "fundada", "fundada_parte", "infundada"]].notna()]

### 5.1.2. Creating fundada variables based on intermediate variables

In [130]:
documents_amag_fundada["var_fundada"] = documents_amag_fundada.apply(lambda row: 1 if row["fundada"] == 1 else 
                                                                     0 if row["fundada"] == 0 else np.NaN, axis=1)

In [131]:
documents_amag_fundada["var_fundada"] = documents_amag_fundada.apply(lambda row: 1 if row["fundada_parte"] == 1 else 
                                                                     row["var_fundada"], axis=1) 

Storing results

In [132]:
fundada_var = documents_amag_fundada[["expediente_n°_", "var_fundada"]]

In [133]:
fundada_var["var_fundada"].value_counts()

0    213
1      8
Name: var_fundada, dtype: int64

## 5.2. Creating resolution outcomes

### 5.2.1. Preparing fundada variables for outcome creation

In [134]:
documents_amag_resolution = documents_amag_clean.groupby("expediente_n°_").agg(vista2=("vista2", "max"),
                                                   revoca2=("revoca2", "max"),
                                                   nula2=("nula2", "max"),
                                                   confirma2=("confirma2", "max"),
                                                   auto_improcedente=("auto_improcedente", "max"), 
                                                   auto_final=("auto_final", "max"),
                                                   auto_definitivo=("auto_definitivo", "max"),
                                                   fundada=("fundada", "max"),
                                                   fundada_parte=("fundada_parte", "max"),
                                                   infundada=("infundada", "max")).reset_index()

In [135]:
# dropping NaN values
documents_amag_resolution = documents_amag_resolution[documents_amag_resolution[["expediente_n°_", "vista2", "revoca2", 
                                                                                 "nula2", "confirma2", "auto_improcedente",
                                                                                 "auto_final", "auto_definitivo",
                                                                                 "fundada", "fundada_parte", "infundada"]].notna()]

### 5.1.2. Creating fundada variables based on intermediate variables

In [136]:
documents_amag_resolution["var_resolution"] = documents_amag_resolution.apply(lambda row: 1 if row["vista2"] == 1 or
                                                                              row["revoca2"] == 1 or row["nula2"] == 1 or
                                                                              row["confirma2"] == 1 or 
                                                                              row["auto_improcedente"] == 1 or
                                                                              row["auto_final"] == 1 or
                                                                              row["auto_definitivo"] == 1 or
                                                                              row["fundada"] == 1 or
                                                                              row["fundada_parte"] == 1 or
                                                                              row["infundada"] == 1 else 0, 
                                                                              axis=1)

Storing results

In [137]:
resolution_var = documents_amag_resolution[["expediente_n°_", "var_resolution"]]

In [138]:
resolution_var["var_resolution"].value_counts()

0    163
1     58
Name: var_resolution, dtype: int64

## 5.3. Creating appeal outcomes

### 5.3.1. Filtering data

In [139]:
documents_amag_appeal = documents_amag_clean.copy() # creating appeal df
documents_amag_appeal["decision"] = documents_amag_appeal.apply(lambda row: 1 if row["fundada"] == 1 or 
                                                                row["fundada_parte"] == 1 or
                                                                row["fundada_parte"] == 1 else 0, axis=1)
documents_amag_appeal = documents_amag_appeal[documents_amag_appeal["decision"] == 1]

In [140]:
documents_amag_appeal = documents_amag_appeal[documents_amag_appeal["decision"] == 1]

### 5.3.2. Creating appeal variables 

In [141]:
# appeal col
documents_amag_appeal = documents_amag_appeal.groupby("expediente_n°_").agg(var_appeal=("appeal", "max")).reset_index()

In [142]:
columns = ["c1", "c2", "c3", "c4", "c5", "c6", "c7"] # expediente cols NOTE: this is not used in R script
documents_amag_appeal[columns] = documents_amag_appeal["expediente_n°_"].str.split("-", expand=True)

Storing results

In [143]:
appeal_var = documents_amag_appeal[["expediente_n°_", "var_appeal"]]

In [144]:
appeal_var["var_appeal"].value_counts()

0    8
Name: var_appeal, dtype: int64

## 5.3. Creating reversal outcomes

### 5.3.1. Creating reversal variables 

In [145]:
# reversal col
documents_amag_reversal = documents_amag_clean.groupby("expediente_n°_").agg(vista=("vista", "max"),
                                                                            revoca=("revoca", "max"),
                                                                            nula=("nula", "max"),
                                                                            confirma=("confirma", "max")).reset_index()

In [146]:
documents_amag_reversal["var_reversal"] = documents_amag_reversal.apply(lambda row: 1 if row.revoca == 1 or 
                                                                       row.nula == 1 else 0, axis=1)

In [147]:
documents_amag_reversal["var_reversal"].value_counts()

0    221
Name: var_reversal, dtype: int64

### 5.3.2. Merging with appeal var and storing results

In [148]:
reversal_var = pd.merge(documents_amag_reversal[["expediente_n°_", "var_reversal"]], appeal_var)

## 5.4. Creating first auto outcome

In [149]:
date_first_auto_var = documents_amag_clean.groupby("expediente_n°_").agg(date_first_auto=("date", "min")).reset_index()

## 5.5. Creating end date for resolutions

### 5.5.1. Filtering data

In [150]:
date_resolution_var = documents_amag_clean[(documents_amag_clean["vista2"] == 1) |
                                           (documents_amag_clean["revoca2"] == 1) |
                                           (documents_amag_clean["nula2"] == 1) | 
                                           (documents_amag_clean["confirma2"] == 1) | 
                                           (documents_amag_clean["auto_improcedente"] == 1) | 
                                           (documents_amag_clean["auto_definitivo"] == 1) | 
                                           (documents_amag_clean["auto_final"] == 1) | 
                                           (documents_amag_clean["fundada"] == 1) | 
                                           (documents_amag_clean["fundada_parte"] == 1) | 
                                           (documents_amag_clean["infundada"] == 1)]

### 5.5.2. Creating date variable

In [151]:
date_resolution_var = date_resolution_var.groupby("expediente_n°_").agg(date_resolution=("date", "max")).reset_index()

## 5.6. Creating end date for verdict

### 5.6.1. Filtering data

In [152]:
date_verdict_var = documents_amag_clean[(documents_amag_clean["fundada"] == 1) |
                                         (documents_amag_clean["fundada_parte"] == 1) |
                                         (documents_amag_clean["infundada"] == 1)]

### 5.6.2. Creating date variable and storing data

In [153]:
date_verdict_var = date_verdict_var.groupby("expediente_n°_").agg(date_verdict=("date", "max")).reset_index().drop_duplicates()

## 5.7. Creating end date for reversals

### 5.7.1. Filtering data

In [154]:
documents_amag_d_reversal = documents_amag_clean[(documents_amag_clean["revoca"] == 1) |
                                         (documents_amag_clean["nula"] == 1) |
                                         (documents_amag_clean["confirma"] == 1)]

### 5.7.2. Creating date variables

In [155]:
date_reversal_var = documents_amag_d_reversal.groupby("expediente_n°_").agg(date_reversal=("date", "max")).reset_index().drop_duplicates()

### 5.7.3. Keeping data with `reversal verdict length > 0`

Merging date of reversal date w/ reversal df

In [158]:
date_reversal_var = pd.merge(date_reversal_var, reversal_var)

In [159]:
date_reversal_var = pd.merge(date_reversal_var, date_verdict_var)

Length of reversal verdict

In [161]:
date_reversal_var["length_verdict_reversal"] = date_reversal_var["date_reversal"] - date_reversal_var["date_verdict"]

In [162]:
date_reversal_var = date_reversal_var[date_reversal_var["length_verdict_reversal"] > 0]

## 5.7. Creating case outcomes dataframe

In [163]:
case_outcomes = pd.merge(fundada_var, appeal_var)

In [164]:
case_outcomes = pd.merge(case_outcomes, resolution_var)

In [165]:
case_outcomes = pd.merge(case_outcomes, date_first_auto_var)

In [166]:
case_outcomes = pd.merge(case_outcomes, date_verdict_var)

**TODO:** Run cell when real data available
```python
case_outcomes = pd.merge(case_outcomes, date_reversal_var) # it'll generate empty results if no vals in date_reversal
```

In [167]:
case_outcomes = pd.merge(case_outcomes, date_resolution_var)

In [168]:
case_outcomes = pd.merge(case_outcomes, reportes_amag_clean)

**TODO:** Run cell when real data available
```python
case_outcomes = pd.merge(case_outcomes, procedural_parts_amag_clean) # it'll generate empty results if no vals in date_reversal
```

## 5.8. Year variables

In [169]:
case_outcomes["year"] = case_outcomes["date_first_auto"].apply(lambda date: datetime.strptime(date.split("-")[0], "%Y").year)

In [170]:
case_outcomes["month_year_first_auto"] = case_outcomes["date_first_auto"].apply(lambda date: datetime.strptime(date.split("-")[0] + 
                                                                                "-" + date.split("-")[1], "%Y-%m"))

In [171]:
case_outcomes["month_year_verdict"] = case_outcomes["date_verdict"].apply(lambda date: datetime.strptime(date.split("-")[0] + 
                                                                                "-" + date.split("-")[1], "%Y-%m"))

In [172]:
case_outcomes["month_year_resolution"] = case_outcomes["date_resolution"].apply(lambda date: datetime.strptime(date.split("-")[0] + 
                                                                                "-" + date.split("-")[1], "%Y-%m"))

## 5.9. During/after treatment variables

In [173]:
treatment_start_date = datetime.strptime("2021-06-05", "%Y-%m-%d")
treatment_final_date = datetime.strptime("2021-07-21", "%Y-%m-%d")

### 5.9.1. Case start date variables: before/during/after treatment

In [174]:
case_outcomes["first_auto_before_treatment"] = case_outcomes["date_first_auto"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") < treatment_start_date)

In [175]:
case_outcomes["first_auto_before_treatment"].value_counts()

False    8
Name: first_auto_before_treatment, dtype: int64

In [176]:
# case start during treatment
case_outcomes["first_auto_during_treatment"] = case_outcomes["date_first_auto"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") >= treatment_start_date
                                                                                      and datetime.strptime(date, "%Y-%m-%d") < treatment_final_date)

In [177]:
case_outcomes["first_auto_during_treatment"].value_counts()

False    8
Name: first_auto_during_treatment, dtype: int64

In [178]:
# case start during or after treatment
case_outcomes["first_auto_duringafter_treatment"] = case_outcomes["date_first_auto"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") >= treatment_start_date)

In [179]:
case_outcomes["first_auto_duringafter_treatment"].value_counts()

True    8
Name: first_auto_duringafter_treatment, dtype: int64

In [180]:
# case start after treatment
case_outcomes["first_auto_after_treatment"] = case_outcomes["date_first_auto"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") >= treatment_final_date)

In [181]:
case_outcomes["first_auto_after_treatment"].value_counts()

True    8
Name: first_auto_after_treatment, dtype: int64

### 5.9.2. Verdict date variables: before/during/after treatment

In [182]:
# verdict before treatment
case_outcomes["verdict_before_treatment"] = case_outcomes["date_verdict"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") < treatment_start_date)

In [183]:
case_outcomes["verdict_before_treatment"].value_counts()

False    8
Name: verdict_before_treatment, dtype: int64

In [184]:
# verdict during treatment
case_outcomes["verdict_during_treatment"] = case_outcomes["date_verdict"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") >= treatment_start_date
                                                                                and datetime.strptime(date, "%Y-%m-%d") < treatment_final_date)

In [185]:
case_outcomes["verdict_during_treatment"].value_counts()

False    8
Name: verdict_during_treatment, dtype: int64

In [186]:
# case start during or after treatment
case_outcomes["verdict_duringafter_treatment"] = case_outcomes["date_verdict"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") >= treatment_start_date)

In [187]:
case_outcomes["verdict_duringafter_treatment"].value_counts()

True    8
Name: verdict_duringafter_treatment, dtype: int64

In [188]:
# case start after treatment
case_outcomes["verdict_after_treatment"] = case_outcomes["date_verdict"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") >= treatment_final_date)

In [189]:
case_outcomes["verdict_after_treatment"].value_counts()

True    8
Name: verdict_after_treatment, dtype: int64

### 5.9.3. Reversal date variables: before/during/after treatment

**TODO:** Run cell when real data available
```python
# verdict before treatment
case_outcomes["reversal_before_treatment"] = case_outcomes["date_reversal"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") < treatment_start_date)

# reversal during treatment
case_outcomes["reversal_during_treatment"] = case_outcomes["date_reversal"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") >= treatment_start_date
                                                                                and datetime.strptime(date, "%Y-%m-%d") < treatment_final_date)

# case start during or after treatment
case_outcomes["reversal_duringafter_treatment"] = case_outcomes["date_reversal"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") >= treatment_start_date)

# case start after treatment
case_outcomes["reversal_after_treatment"] = case_outcomes["date_reversal"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") >= treatment_final_date)
```

### 5.9.3. Resolution date variables: before/during/after treatment

In [193]:
# verdict before treatment
case_outcomes["resolution_before_treatment"] = case_outcomes["date_resolution"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") < treatment_start_date)

In [194]:
case_outcomes["resolution_before_treatment"].value_counts()

False    8
Name: resolution_before_treatment, dtype: int64

In [195]:
# resolution during treatment
case_outcomes["resolution_during_treatment"] = case_outcomes["date_resolution"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") >= treatment_start_date
                                                                                and datetime.strptime(date, "%Y-%m-%d") < treatment_final_date)

In [196]:
case_outcomes["resolution_during_treatment"].value_counts()

False    8
Name: resolution_during_treatment, dtype: int64

In [197]:
# case start during or after treatment
case_outcomes["resolution_duringafter_treatment"] = case_outcomes["date_resolution"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") >= treatment_start_date)

In [198]:
case_outcomes["resolution_duringafter_treatment"].value_counts()

True    8
Name: resolution_duringafter_treatment, dtype: int64

In [199]:
# case start after treatment
case_outcomes["resolution_after_treatment"] = case_outcomes["date_resolution"].apply(lambda date: datetime.strptime(date, "%Y-%m-%d") >= treatment_final_date)

In [200]:
case_outcomes["resolution_after_treatment"].value_counts()

True    8
Name: resolution_after_treatment, dtype: int64

### 5.10. Length date variables

In [201]:
# length between first auto and resolution
case_outcomes["length_auto_resolution"] = case_outcomes.apply(lambda row: datetime.strptime(row["date_resolution"], "%Y-%m-%d")
                                                                  - datetime.strptime(row["date_first_auto"], "%Y-%m-%d"), axis=1)

In [202]:
# Length between first auto and sentence
case_outcomes["length_auto_sentence"] = case_outcomes.apply(lambda row: datetime.strptime(row["date_verdict"], "%Y-%m-%d")
                                                                  - datetime.strptime(row["date_first_auto"], "%Y-%m-%d"), axis=1)

**TODO:** Run cell when real data available
```python
# length between first auto and reversal
case_outcomes["length_auto_reversal"] = case_outcomes.apply(lambda row: datetime.strptime(row["date_reversal"], "%Y-%m-%d")
                                                                  - datetime.strptime(row["date_first_auto"], "%Y-%m-%d"), axis=1)

# length between sentence and reversal
case_outcomes["length_sentence_reversal"] = case_outcomes.apply(lambda row: datetime.strptime(row["date_reversal"], "%Y-%m-%d")
                                                                  - datetime.strptime(row["date_first_auto"], "%Y-%m-%d"), axis=1)
```

### 5.11. Unconditional reversal variable

**TODO:** Run cell when real data available
```python
# creating "especialidad" variable
case_outcomes["var_uncon_reversal"] = case_outcomes.apply(lambda row: row.var_reversal if row.var_reversal != np.NaN
                                                          else 0, axis=1)
```

### 5.12. Speciality variables

In [207]:
# creating "especialidad" variable
case_outcomes["especialidad"] = case_outcomes["especialidad__x"].apply(lambda especialidad: especialidad)
case_outcomes["especialidad"] = case_outcomes.apply(lambda row: row.especialidad__y if row.especialidad__x != np.NaN
                                                    else np.NaN, axis=1)

In [208]:
# length between first auto and resolution
case_outcomes["especialidad"] = case_outcomes.apply(lambda row: "otro" if row.especialidad == np.NaN else row.especialidad
                                                    , axis=1)

### 5.13. Case speciality variables

In [209]:
# length between first auto and resolution
case_outcomes["case_speciality"] = case_outcomes.apply(lambda row: "civil" if row.especialidad == "CIVIL" 
                                                       else np.NaN, axis=1)
case_outcomes["case_speciality"] = case_outcomes.apply(lambda row: "familia civil" if row.especialidad == "FAMILIA CIVIL" 
                                                       else row.case_speciality, axis=1)
case_outcomes["case_speciality"] = case_outcomes.apply(lambda row: "familia tutelar" if row.especialidad == "FAMILIA TUTELAR" 
                                                       else row.case_speciality, axis=1)
case_outcomes["case_speciality"] = case_outcomes.apply(lambda row: "laboral" if row.especialidad == "LABORAL" 
                                                       else row.case_speciality, axis=1)
case_outcomes["case_speciality"] = case_outcomes.apply(lambda row: "otro" if row.especialidad == "COMERCIAL" 
                                                       else row.case_speciality, axis=1)
case_outcomes["case_speciality"] = case_outcomes.apply(lambda row: "otro" if row.especialidad == "CONTENCIOSO ADM." 
                                                       else row.case_speciality, axis=1)
case_outcomes["case_speciality"] = case_outcomes.apply(lambda row: "otro" if row.especialidad == "DERECHO CONSTITUCIONAL" 
                                                       else row.case_speciality, axis=1)
case_outcomes["case_speciality"] = case_outcomes.apply(lambda row: "otro" if row.especialidad == "OTRO" 
                                                       else row.case_speciality, axis=1)

In [210]:
# length between first auto and resolution
case_outcomes["speciality_civil"] = case_outcomes.apply(lambda row: 1 if row.case_speciality == "civil" 
                                                        else 0, axis=1)

In [211]:
# length between first auto and resolution
case_outcomes["speciality_familia_civil"] = case_outcomes.apply(lambda row: 1 if row.case_speciality == "familia civil" 
                                                                else 0, axis=1)

In [212]:
# length between first auto and resolution
case_outcomes["speciality_familia_tutelar"] = case_outcomes.apply(lambda row: 1 if row.case_speciality == "familia tutelar" 
                                                                  else 0, axis=1)

In [213]:
# length between first auto and resolution
case_outcomes["speciality_laboral"] = case_outcomes.apply(lambda row: 1 if row.case_speciality == "laboral" else 0, axis=1)

In [214]:
# length between first auto and resolution
case_outcomes["speciality_otro"] = case_outcomes.apply(lambda row: 1 if row.case_speciality == "otro" else 0, axis=1)

## 5.14. Process variables

### 5.14.1. Creating proceso variable based on `proceso__x` and `proceso__y`

In [215]:
case_outcomes["proceso"] = case_outcomes["proceso__x"].apply(lambda proceso: proceso)
case_outcomes["proceso"] = case_outcomes.apply(lambda row: row.proceso__y if row.proceso == np.NaN else
                                               row.proceso, axis=1)

In [216]:
case_outcomes["case_process"] = case_outcomes["proceso"].apply(lambda proceso: "unico" if proceso == "UNICO" else np.NaN)

### 5.14.2. Creating type of process variable

In [217]:
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "ejecucion" if row.proceso == "EJECUCION" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "ejecucion" if row.proceso == "EJECUTIVO" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "ejecucion" if row.proceso == "UNICO DE EJECUCION" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "sumarisimo" if row.proceso == "SUMARISIMO" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "abreviado" if row.proceso == "ABREVIADO" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "conocimiento" if row.proceso == "CONOCIMIENTO" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "no_contensioso" if row.proceso == "NO CONTENCIOSO" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "constitucional" if row.proceso == "CONSTITUCIONAL" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "otro" if row.proceso == "CONTENCIOSO ADMINISTRATIVO" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "otro" if row.proceso == "ESPECIAL" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "otro" if row.proceso == "ESPECIAL LEY 30634" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "otro" if row.proceso == "EXHORTO" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "otro" if row.proceso == "INVESTIGACION TUTELAR" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "otro" if row.proceso == "ORDINARIO" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "otro" if row.proceso == "CAUTELAR" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "otro" if row.proceso == "PROCEDIMIENTOS CIVILES" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "otro" if row.proceso == "URGENTE" else row.case_process, axis=1)
case_outcomes["case_process"] = case_outcomes.apply(lambda row: "otro" if row.proceso == "OTRO" else row.case_process, axis=1)

### 5.14.3. Creating variables per type of process

In [218]:
case_outcomes["process_unico"] = case_outcomes.apply(lambda row: 1 if row.case_process == "unico" else 0, axis=1)

In [219]:
case_outcomes["process_ejecucion"] = case_outcomes.apply(lambda row: 1 if row.case_process == "ejecucion" else 0, axis=1)

In [220]:
case_outcomes["process_sumarisimo"] = case_outcomes.apply(lambda row: 1 if row.case_process == "sumarisimo" else 0, axis=1)

In [221]:
case_outcomes["process_abreviado"] = case_outcomes.apply(lambda row: 1 if row.case_process == "abreviado" else 0, axis=1)

In [222]:
case_outcomes["process_conocimiento"] = case_outcomes.apply(lambda row: 1 if row.case_process == "conocimiento" else 0, axis=1)

In [223]:
case_outcomes["process_no_contencioso"] = case_outcomes.apply(lambda row: 1 if row.case_process == "no_contencioso" else 0, axis=1)

In [224]:
case_outcomes["process_constitucional"] = case_outcomes.apply(lambda row: 1 if row.case_process == "constitucional" else 0, axis=1)

In [225]:
case_outcomes["process_otro"] = case_outcomes.apply(lambda row: 1 if row.case_process == "otro" else 0, axis=1)

### 5.14.4. Storing created outcomes

In [226]:
case_outcomes_for_6 = case_outcomes.drop(columns=["proceso", "Unnamed: 0"])

In [227]:
case_outcomes_for_6.to_csv(dc_interm_path + "/case_outcomes_amag_ii.csv")

# 6. Constructing case outcomes

## 6.1. Loading and joining dataframes

### 6.1.1 Loading dataframes

In [452]:
amag_ii_participants = pd.read_csv(dc_raw_path + "/exp_participants_list.csv")
amag_ii_participants = amag_ii_participants.drop(columns=["Unnamed: 0"])

In [371]:
amag_ii_cases = read_pickle("amag_ii_cases.pkl", dc_temp_path)

In [372]:
case_outcomes = pd.read_csv(dc_interm_path + "/case_outcomes_amag_ii.csv")

In [373]:
date_filter = datetime.strptime("2018-05-01", "%Y-%m-%d") # date for filtering dataframes

### 6.1.2. Creating dataframe at the case-id level

In [374]:
data_participant_caseid = pd.merge(amag_ii_cases, case_outcomes, how="inner")

In [375]:
data_participant_caseid["length_auto_resolution"] = data_participant_caseid["length_auto_resolution"].str.extract("(\d+)").astype(float)

  data_participant_caseid["length_auto_resolution"] = data_participant_caseid["length_auto_resolution"].str.extract("(\d+)").astype(float)


Keeping only the cases after date

In [376]:
date_mask = (pd.to_datetime(data_participant_caseid["date_resolution"]) > date_filter)

In [377]:
data_participant_caseid_temp = data_participant_caseid[date_mask]

Creating `length_resolution` dataframe
**Note:** the length is in days

In [378]:
data_participant_caseid_temp = data_participant_caseid_temp.groupby(by=["case_speciality"]).agg(
                                                                 median_length_resolution=("length_auto_resolution", "median"),
                                                                 mean_length_resolution=("length_auto_resolution", "mean")
                                                                 ).reset_index()

**Creating timely resolved variable:** If length the case took to be solved is lower than the mean, the case was timely resolved

In [380]:
data_participant_caseid_temp[data_participant_caseid_temp["case_speciality"] == "civil"]["mean_length_resolution"].iloc[0] 

0.0

In [381]:
data_participant_caseid["timely_resolved"] = data_participant_caseid.apply(lambda row:
                                            row["length_auto_resolution"] > data_participant_caseid_temp[
                                                data_participant_caseid_temp["case_speciality"] \
                                                == row["case_speciality"]]["mean_length_resolution"].iloc[0],
                                                axis=1)

## 6.2. Create case outcomes variables at the month-judge level

### 6.2.1. For post-treatment regression

**Fundada Appeal Var**

Filtering data

In [433]:
verdict_aft_trt_mask = (data_participant_caseid["verdict_after_treatment"] == 1)
data_participant_caseid_temp = data_participant_caseid[verdict_aft_trt_mask]

Creating `var_fundada`, `var_uncon_reversal` and `var_appeal`at `["expediente_n°_", "month_year_verdict"]` level

**TODO:** Run cell when real data available
```python
fundada_appeal_var = data_participant_caseid_temp.groupby(by=["expediente_n°_", "month_year_verdict"]).agg(
                                 var_fundada=("var_fundada", "mean"),
                                 var_uncon_reversal=("var_uncon_reversal", "mean"),                             
                                 var_appeal=("var_appeal", "mean")
                                 ).reset_index()
```

In [434]:
fundada_appeal_var = data_participant_caseid_temp.groupby(by=["expediente_n°_", "month_year_verdict"]).agg(
                                 var_fundada=("var_fundada", "mean"),
                                 var_appeal=("var_appeal", "mean")
                                 ).reset_index()

Obtaining the variables back to case level

In [435]:
fundada_appeal_var = pd.merge(fundada_appeal_var, data_participant_caseid_temp)
fundada_appeal_var = fundada_appeal_var.rename(columns={"month_year_verdict": "month_year"})

**Resolution Var**

Filtering data

In [436]:
first_auto_aft_trt_mask = (data_participant_caseid["first_auto_after_treatment"] == 1)
data_participant_caseid_temp = data_participant_caseid[first_auto_aft_trt_mask]

Creating `var_resolution`at `["expediente_n°_", "month_year_first_auto"]` level

In [437]:
resolution_var = data_participant_caseid_temp.groupby(by=["expediente_n°_", "month_year_first_auto"]).agg(
                                 var_resolution=("var_resolution", "mean")
                                 ).reset_index()

Obtaining the variables back to case level

In [438]:
resolution_var = pd.merge(resolution_var, data_participant_caseid_temp)
resolution_var = resolution_var.rename(columns={"month_year_first_auto": "month_year"})

**Days to resolution Var**

Filtering data

In [439]:
resol_aft_trt_mask = (data_participant_caseid["resolution_after_treatment"] == 1)
data_participant_caseid_temp = data_participant_caseid[resol_aft_trt_mask]

Creating `var_resolution`at `["expediente_n°_", "month_year_resolution"]` level

In [440]:
days_to_res_var = data_participant_caseid_temp.groupby(by=["expediente_n°_", "month_year_resolution"]).agg(
                                 var_timely_resolved=("timely_resolved", "mean"),
                                 length_auto_resolution=("length_auto_resolution", "mean")
                                 ).reset_index()

Obtaining the variables back to case level

In [441]:
days_to_res_var = pd.merge(days_to_res_var, data_participant_caseid_temp)
days_to_res_var = days_to_res_var.rename(columns={"month_year_resolution": "month_year"})

### 6.2.2. For DiD regression

**Fundada Appeal Var**

Filtering data

In [442]:
date_verdict_mask = ((pd.to_datetime(data_participant_caseid["date_verdict"]) > date_filter) & 
                    (data_participant_caseid["verdict_during_treatment"] != 1))

Creating `verdict_after_treatment`, `var_fundada`, `var_appeal` at `["expediente_n°_", "month_year_verdict"]` level

**TODO:** Run cell when real data available
```python
fundada_appeal_var_did = data_participant_caseid_temp.groupby(by=["expediente_n°_", "month_year_verdict"]).agg(
                                 sentence_after_treatment=("verdict_after_treatment", "first"),
                                 var_fundada=("var_fundada", "mean"),
                                 var_uncon_reversal = ("var_uncon_reversal", "mean"),
                                 var_appeal=("var_appeal", "mean")
                                 ).reset_index()
```

In [443]:
fundada_appeal_var_did = data_participant_caseid_temp.groupby(by=["expediente_n°_", "month_year_verdict"]).agg(
                                 verdict_after_treatment=("verdict_after_treatment", "first"),
                                 var_fundada=("var_fundada", "mean"),
                                 var_appeal=("var_appeal", "mean"),
                                 ).reset_index()

Obtaining the variables back to case level and renaming `month_year_verdict` variable

In [472]:
fundada_appeal_var_did = pd.merge(fundada_appeal_var_did, data_participant_caseid_temp)
fundada_appeal_var_did = fundada_appeal_var_did.rename(columns={"month_year_verdict": "month_year"})

**Resolution Var**

Filtering data

In [445]:
date_resolution_mask = ((pd.to_datetime(data_participant_caseid["date_first_auto"]) > date_filter) & 
                       (data_participant_caseid["first_auto_during_treatment"] != 1))

Creating `first_auto_after_treatment`, `var_resolution` at `["expediente_n°_", "month_year_first_auto"]` level

In [446]:
resolution_var_did = data_participant_caseid_temp.groupby(by=["expediente_n°_", "month_year_first_auto"]).agg(
                                 first_auto_after_treatment=("first_auto_after_treatment", "first"),
                                 var_resolution=("var_resolution", "mean")
                                 ).reset_index()

Obtaining the variables back to case level and renaming `month_year_first_auto` variable

In [447]:
resolution_var_did = pd.merge(resolution_var_did, data_participant_caseid_temp)
resolution_var_did = resolution_var_did.rename(columns={"month_year_first_auto": "month_year"})

**Days to resolution Var**

Filtering data

In [448]:
date_res_to_var_mask = ((pd.to_datetime(data_participant_caseid["date_resolution"]) > date_filter) & 
                       (data_participant_caseid["resolution_during_treatment"] != 1))

Creating `resolution_after_treatment`, `var_timely_resolved`, `length_auto_resolution` at `["expediente_n°_", "month_year_resolution"]` level

In [449]:
days_to_res_var_did = data_participant_caseid_temp.groupby(by=["expediente_n°_", "month_year_resolution"]).agg(
                                 resolution_after_treatment=("resolution_after_treatment", "first"),
                                 var_timely_resolved=("timely_resolved", "mean"),
                                 length_auto_resolution=("length_auto_resolution", "mean")
                                 ).reset_index()

Obtaining the variables back to case level and renaming `month_year_first_auto` variable

In [450]:
days_to_res_var_did = pd.merge(days_to_res_var_did, data_participant_caseid_temp)
days_to_res_var_did = days_to_res_var_did.rename(columns={"month_year_resolution": "month_year"})

## 6.3. Joining dataframes

### 6.3.1. Post-treatment dataframes

**Fundada Appeal Var**

In [502]:
fundada_appeal_data_post = pd.merge(amag_ii_participants, fundada_appeal_var, how="inner")

In [503]:
fundada_appeal_judges = fundada_appeal_data_post.drop_duplicates(subset = "nrodocumento")[["nrodocumento", 
                                                                                      "participant_nombre_apellido"]]

**Resolution Var**

In [504]:
resolution_data_post = pd.merge(amag_ii_participants, resolution_var, how="inner")

In [505]:
resolution_judges = resolution_data_post.drop_duplicates(subset = "nrodocumento")[["nrodocumento", 
                                                                                     "participant_nombre_apellido"]]

**Days to resolution Var**

In [506]:
days_to_res_data_post = pd.merge(amag_ii_participants, days_to_res_var, how="inner")

In [507]:
days_to_res_judges = days_to_res_data_post.drop_duplicates(subset = "nrodocumento")[["nrodocumento", 
                                                                                       "participant_nombre_apellido"]]

### 6.3.2. DiD dataframes

In [483]:
fundada_appeal_data_did = pd.merge(pd.merge(amag_ii_participants, fundada_appeal_var_did, how="inner"), 
                                   fundada_appeal_judges, how="inner")

In [484]:
resolution_data_did = pd.merge(pd.merge(amag_ii_participants, resolution_var_did, how="inner"), 
                               resolution_judges, how="inner")

In [486]:
days_to_res_data_did = pd.merge(pd.merge(amag_ii_participants, days_to_res_var_did, how="inner"), 
                                days_to_res_judges, how="inner")

## 6.4. Saving analysis dataframes

In [493]:
data_participant_caseid.to_csv(dc_final_path + "/dataset_participant_caseid.csv")

In [508]:
fundada_appeal_data_post.to_csv(dc_final_path + "/dataset_caseid_month_fundada_post.csv")

In [509]:
resolution_data_post.to_csv(dc_final_path + "/dataset_caseid_month_resolution_post.csv")

In [510]:
days_to_res_data_post.to_csv(dc_final_path + "/dataset_caseid_month_days_to_res_post.csv")

In [511]:
fundada_appeal_data_did.to_csv(dc_final_path + "/dataset_caseid_month_fundada_did.csv")

In [512]:
resolution_data_did.to_csv(dc_final_path + "/dataset_caseid_month_resolution_did.csv")

In [513]:
days_to_res_data_did.to_csv(dc_final_path + "/dataset_caseid_month_days_to_res_did.csv")