In [1]:
import pickle
import regex as re
import nltk
import pandas as pd
import numpy as np
import json, os, string
from janitor import clean_names
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from datetime import datetime
import d6tjoin.top1
import d6tjoin.utils
import d6tjoin



In [2]:
def extract_text(text: str, pattern: str) -> str:
    """Extracts substring from string using a given regex pattern"""
    
    if type(text) is str:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
        else:
            return ""
    else:
        return ""

In [92]:
def read_json_dict(path: str) -> dict:
    """
    Reads a json file and returns it as dict object
    """
    
    file = open(path) # Opening JSON file
    return json.load(file) # returns JSON object as a dictionary

def folder_creator(folder_name: string, path: string) -> None:
    """
    Generates a folder in specified path
    
    input: name of root folder, path where you want 
    folder to be created
    output: None
    """
    
    # defining paths
    data_folder_path = path + "/" + folder_name
    data_folder_exists = os.path.exists(data_folder_path)

    # creating folders if don't exist
    if data_folder_exists:
        pass
    else:    
        # create a new directory because it does not exist 
        os.makedirs(data_folder_path)

        # create subfolders
        print(f"The new directory {folder_name} was created!")
        
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    """
    :param df_1: the left table to join
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: how close the matches should be to return a match, based on Levenshtein distance
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: dataframe with boths keys and matches
    """
    s = df_2[key2].tolist()
    
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))    
    df_1['matches'] = m
    
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    
    return df_1

def create_pickle(object_name, file_name: str, path: str) -> None:
    """
    Creates a pickle file for object. Note: Path should have no slash 
    at the end
    """
    with open(path + f"/{file_name}", "wb") as storing_output:
        pickle.dump(object_name, storing_output)
        storing_output.close()
        
def read_pickle(file_name: str, path: str) -> None:
    """
    Reads pickle file from specified path 
    """
    pickle_file = open(path + f"/{file_name}", "rb")
    output = pickle.load(pickle_file)
    pickle_file.close()
    return output

## Reading paths

In [4]:
paths = read_json_dict("paths.json")

In [5]:
paths

{'data_path': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice',
 'code_path': '/Users/brandonmora/GitHub/peru-amag-stats/case_outcomes',
 'data_amag_i': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/01_AMAG',
 'data_cej': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/data_cleaned_',
 'data_gender': 'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice/07_Other/02_Raw/names_gender',
 'local_storage': 'D:/Daniel Chen Dropbox/Marco Antonio GutiÃ©rrez ChÃ¡vez/datasets_amag_ii_scrape'}

In [6]:
data_path = paths["data_path"]

In [8]:
data_path

'D:/Accesos directos/Trabajo/World Bank/WB Repos/peru-scrape-justice'

In [10]:
data_cleaned_path = data_path + "/data_cleaned_test"

In [17]:
folder_creator("raw", data_cleaned_path)

In [18]:
dc_raw_path = data_cleaned_path + "/raw"

In [19]:
folder_creator("temp", data_cleaned_path)

In [20]:
dc_temp_path = data_cleaned_path + "/temp"

In [199]:
folder_creator("intermediate", data_cleaned_path)

The new directory intermediate was created!


In [200]:
dc_interm_path = data_cleaned_path + "/intermediate"

# 1. Creating participants list

Reading lab data

In [11]:
lab_data = pd.read_stata(data_path + "/lab_Data/Clean_Full_Data12.dta")

Creating name variables for future fuzzy merge

In [12]:
lab_data["participant_nombre_apellido"] = lab_data["Nombres"] + " " + lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"]
lab_data["participant_nombre_apellido"] = lab_data["participant_nombre_apellido"].str.strip()

  lab_data["participant_nombre_apellido"] = lab_data["Nombres"] + " " + lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"]


In [13]:
lab_data["participant_apellido_nombre"] = lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"] + " " + lab_data["Nombres"]
lab_data["participant_apellido_nombre"] = lab_data["participant_apellido_nombre"].str.strip()

  lab_data["participant_apellido_nombre"] = lab_data["ApellidoPaterno"] + " " + lab_data["ApellidoMaterno"] + " " + lab_data["Nombres"]


In [14]:
lab_data = lab_data.rename(columns={"DNI": "nrodocumento"})

In [15]:
amag_ii_participants = lab_data[["nrodocumento", "participant_nombre_apellido", "participant_apellido_nombre"]]

Exporting the list of participants

In [21]:
amag_ii_participants.to_csv(dc_raw_path + "/amag_ii_participants_list.csv")

# 2. Creating Cases List

### 2.0. Selecting reporte files

In [22]:
files_reports = pd.read_csv(dc_raw_path + "/DF_file_report_2022.csv")
files_reports = clean_names(files_reports)

### 2.1. Cleaning the reporte files

Creating lists with characters to be replace

In [23]:
backslash_reps = ["\\(\\*\\)", "\\", "\\([^()]{0,}\\)"]
trailing_and_special_reps = ["^\\s", "\\,", "\\.$", " \\- JUZ$", "\\*"]
other_strs_reps = ["\\- MIXTO Y LIQ", "\\- MIXTO", "\\- JUZ\\. MIXTO", 
                   "- JM", "- INVESTIGACION", "- PAZ LETRADO", "SECOM - ", "- JT"]

### 2.2. Replacing backlashes, special characters and other uninformative characters

In [24]:
empty_reps = backslash_reps + trailing_and_special_reps + other_strs_reps

In [25]:
for val in empty_reps:    
    files_reports["juez_"] = files_reports["juez_"].str.replace(val, "")

  files_reports["juez_"] = files_reports["juez_"].str.replace(val, "")
  files_reports["juez_"] = files_reports["juez_"].str.replace(val, "")


In [26]:
name_reps = [["ALFREDO E\\.", "ALFREDO E"], ["BERTHA F\\.", "BERTHA F"], ["CLAUDIO W\\.", "CLAUDIO W"], 
            ["CLAVELITO L\\.", "CLAVELITO L"], ["ELMER L\\.", "ELMER L"], ["ERNESTO A\\.", "ERNESTO A"],
            ["HERBERT M\\.", "HERBERT M"], ["LUZ K\\.", "LUZ K"], ["NANCY S\\.", "NANCY S"], ["JESSICA E\\.", "JESSICA E"],
            ["PATRICIA C\\.", "PATRICIA C"], ["JESSICA P\\.", "JESSICA P"], ["YOLANDA B\\.", "YOLANDA B\\."],
            ["LUZ M\\.", "LUZ M"], ["EDGAR\\.", "EDGAR"], ["C\\. ARTURO", "C ARTURO"], ["ALEXANDER A\\.", "ALEXANDER A"],
            ["RENE G\\.", "RENE G"], ["GUILLERMO S\\.", "GUILLERMO S"], ["FANNY L\\. ",  "FANNY L"], ["ELISA \\(LA", "ELISA"],
            ["JULIA \\(LA", "JULIA"], ["ACEVEDO DIEZ CECILIA", "ACEVEDO DIEZ CECILIA DEL PILAR"], [" J. ", " J "],
            [" K. ", " K "]]

### 2.3. Replacing names with issues

In [27]:
for name_rep in name_reps:
    files_reports["juez_"] = files_reports["juez_"].str.replace(name_rep[0], name_rep[1])

  files_reports["juez_"] = files_reports["juez_"].str.replace(name_rep[0], name_rep[1])


### 2.4. Obtaining the names of judges

Some cases have multiple judges assigned to them. As a result, we need to extract these names as we will match the case with the judge information.

In [28]:
files_reports = files_reports[files_reports["juez_"].notna()]

In [29]:
files_reports["juez_splitted"] = files_reports["juez_"].apply(lambda row: row.split("."))

In [30]:
files_reports["n_judges_case"] = files_reports["juez_splitted"].apply(lambda row: len(row))

In [31]:
judge_names = files_reports[files_reports["n_judges_case"] == 1]

In [32]:
multiple_judge_names = files_reports[files_reports["n_judges_case"] != 1]

In [33]:
multiple_judge_names["juez_1"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[0])
multiple_judge_names["juez_2"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[1])
multiple_judge_names["juez_3"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_judge_names["juez_1"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_judge_names["juez_2"] = multiple_judge_names["juez_splitted"].apply(lambda row: row[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_ju

In [34]:
judge_names = judge_names.rename(columns={"juez_": "juez"})

### 2.5. Fuzzy merge with Lab Experiment Data

Creating debuggings dataset to simulate the fuzzy merge

In [35]:
judge_names.columns

Index(['expediente_n°_', 'organo_jurisdiccional_', 'distrito_judicial_',
       'juez', 'especialista_legal_', 'fecha_de_inicio_', 'proceso_',
       'observacion_', 'especialidad_', 'materia_s_', 'estado_',
       'etapa_procesal_', 'fecha_conclusion_', 'ubicacion_',
       'motivo_conclusion_', 'sumilla_', 'juez_splitted', 'n_judges_case'],
      dtype='object')

In [36]:
debug_judges = judge_names[(judge_names["juez"] == "CRUZADO MEJIA MARTIN VALDEMAR") |
                           (judge_names["juez"] == "APAGUEÑO REATEGUI BRYAN ENRIQUE")]

In [37]:
debug_participants = amag_ii_participants[(amag_ii_participants["participant_apellido_nombre"] == "CRUZADO MEJIA MARTIN VALDEMAR") |
                                          (amag_ii_participants["participant_apellido_nombre"] == "APAGUEÑO REATEGUI BRYAN ENRIQUE")]

Fuzzy match of cases

In [38]:
matched_judge_name1 = d6tjoin.top1.MergeTop1(debug_judges, debug_participants, fuzzy_left_on=["juez"], 
                       fuzzy_right_on=["participant_apellido_nombre"]).merge()["merged"]

  df_candidates = df_candidates_exact.append(df_candidates_fuzzy, ignore_index=True)


In [39]:
amag_ii_cases = matched_judge_name1.copy()

# 3. Creating CEJ datasets

A pending task would be to bind the rows of all the dataframes

## Follow up dataframe

In [40]:
files_follow_up = pd.read_csv(dc_raw_path + "/DF_follow_up_cleaner_2022.csv", error_bad_lines=False)
files_follow_up = clean_names(files_follow_up)



  files_follow_up = pd.read_csv(dc_raw_path + "/DF_follow_up_cleaner_2022.csv", error_bad_lines=False)
Skipping line 8128: expected 10 fields, saw 11



# Procedural parts dataframe

In [41]:
files_procedural_parts = pd.read_csv(dc_raw_path + "/DF_procedural_parts_2022.csv")
files_procedural_parts = clean_names(files_procedural_parts)

In [42]:
files_procedural_parts["expediente_n°_"] = files_procedural_parts["expediente_n°_"].apply(lambda row: row.split("\\")[-1])

## Downloads dataframe

In [43]:
files_downloads = pd.read_csv(dc_raw_path + "/DF_DOWNLOADS_2022.csv")
files_downloads = clean_names(files_downloads)

### Merging complementary case data with amag ii cases

In [44]:
reportes_amag_ii_raw = pd.merge(amag_ii_cases, files_reports, how="inner", on="expediente_n°_")

In [45]:
follow_up_amag_ii_raw = pd.merge(amag_ii_cases, files_follow_up, how="inner", on="expediente_n°_")

In [46]:
procedural_parts_amag_ii_raw = pd.merge(amag_ii_cases, files_procedural_parts, how="inner", on="expediente_n°_")

In [47]:
downloads_amag_ii_raw = pd.merge(amag_ii_cases, files_downloads, how="inner", left_on="expediente_n°_", right_on="expediente_num")

In [48]:
help(create_pickle)

Help on function create_pickle in module __main__:

create_pickle(object_name, file_name: str, path: str) -> None
    Creates a pickle file for object. Note: Path should have no slash 
    at the end



Storing data on `.pkl` files

In [49]:
create_pickle(amag_ii_cases, "amag_ii_cases.pkl", dc_temp_path)

In [50]:
create_pickle(reportes_amag_ii_raw, "reportes_amag_ii_raw.pkl", dc_temp_path)

In [51]:
create_pickle(follow_up_amag_ii_raw, "follow_up_amag_ii_raw.pkl", dc_temp_path)

In [52]:
create_pickle(procedural_parts_amag_ii_raw, "procedural_parts_amag_ii_raw.pkl", dc_temp_path)

In [53]:
create_pickle(downloads_amag_ii_raw, "downloads_amag_ii_raw.pkl", dc_temp_path)

# 4. Preprocessing and cleaning of datasets

Reading gender dataframe

In [54]:
gender_dataset = pd.read_csv(dc_raw_path + "/harvard_set_gender.csv")

In [55]:
def spanish_cleaner(txt_file):
    text = txt_file
    text = re.sub(r"(&[a-zA-Z]*;)", " ", text)  # the txt files had some unwanted text like &rsquo; this line removes such text
    text = text.lower()

    # remove punctuation and numbers from the string
    punctuations = '''!()[]{};:'"\,<>./¿?@#$%^&*_–~=+¨`“”’|0123456789'''  # all but hyphens
    for x in text.lower(): 
        if x in punctuations: 
            text = text.replace(x, "")

    # replacing encoding characters
    enc_characters = [" st ", " nd ", " rd ", " th ", "srl", "lpfvf", "pctc", "jmxcff", "ayrq", "axu", "oadk", "jcxj", "nplt", "eef", "fcfc", "qyoc", "gobpe", "pfg", "vqrx", "csjppj", "xas", "feeback", "hafceqc", "xqj", "hellip", "rsquo", "ldquo", "rdquo", "ndash", "-", "n°", "nº", "º", "°", "dprgdonpdl", "«", "»", "…", "derjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderjudicialpoderj", "ii", "iii", "vii", "viii"]
    
    for item in enc_characters:
        text = text.replace(item, " ")
    
    # cleaning for spanish stop words
    stopword_es = nltk.corpus.stopwords.words('spanish') # loading spanish stop words
    custom_substrs = ["http", "hangouts", "meet", "gmailcom"] # html related
    custom_gender_words = ["él", "ella", "la", "ese", "esa", "esos", "esas", "este", "esta", "aquel", "aquella", "aquellos", "aquellas", "lo", "la", "los", "las", "aquel", "aquella", "mío", "mía", "míos", "mías", "suyo", "suya", "suyos", "suyas"] # list with pronouns associated to a specific gender
    length_custom_stopwords = len(custom_substrs)
    words = text.split() # tokenizing sentence
    cleaned_words = [word for word in words if (word not in stopword_es and len(word) > 1) or word in custom_gender_words]
     
    sentence_no_custom = [] # omitting words that contain 
    for cleaned_word in cleaned_words:
        counter_stopwords = 0
        for word in custom_substrs: # evaluating if word contains substr
            if word not in cleaned_word: # if passes, +1 for counter
                counter_stopwords += 1
            if counter_stopwords == length_custom_stopwords: # append if passes all custom substrs tests
                sentence_no_custom.append(cleaned_word)

    return " ".join(sentence_no_custom)

  punctuations = '''!()[]{};:'"\,<>./¿?@#$%^&*_–~=+¨`“”’|0123456789'''  # all but hyphens


## 4.1. Preprocessing of downloads and follow up 

### Downloads Dataframe

In [56]:
# lowercase to text
downloads_amag_ii_raw["text"] = downloads_amag_ii_raw["text"].apply(lambda text: text.lower() 
                                                                    if type(text) is str else text)

In [57]:
# cleaning the text from the cases
downloads_amag_ii_raw["text"] = downloads_amag_ii_raw["text"].apply(lambda text: spanish_cleaner(text) 
                                                                    if type(text) is str else text)

### Downloads Follow Up

Fixing the date of the resolution

In [58]:
follow_up_amag_ii_raw["fecha_de_resolucion_ingreso_"] = follow_up_amag_ii_raw["fecha_de_resolucion_ingreso_"].apply(
                                                        lambda date: datetime.strptime(re.match("(\d+[-/]\d+[-/]\d+)", 
                                                        date)[0], "%d/%m/%Y"))

  lambda date: datetime.strptime(re.match("(\d+[-/]\d+[-/]\d+)",


Remove extra white space and lower "acto"

In [59]:
# lowercase to text
follow_up_amag_ii_raw["acto_"] = follow_up_amag_ii_raw["acto_"].apply(lambda text: text.lower() 
                                                                    if type(text) is str else text)

In [60]:
# cleaning the text from the cases
follow_up_amag_ii_raw["acto_"] = follow_up_amag_ii_raw["acto_"].apply(lambda text: spanish_cleaner(text) 
                                                                    if type(text) is str else text)

Remove extra white space and lower "sumilla" (from merged dataframes)

In [61]:
sumillas = ["sumilla__x", "sumilla__y"]

In [62]:
for sumilla in sumillas:
    # lowercase to text
    follow_up_amag_ii_raw[sumilla] = follow_up_amag_ii_raw[sumilla].apply(lambda text: text.lower() 
                                                                    if type(text) is str else text)
    # cleaning the text from the cases
    follow_up_amag_ii_raw[sumilla] = follow_up_amag_ii_raw[sumilla].apply(lambda text: spanish_cleaner(text)
                                                                    if type(text) is str else text)

Remove extra white space and lower "descripcion de usuario"

In [63]:
# lowercase to text
follow_up_amag_ii_raw["descripcion_de_usuario_"] = follow_up_amag_ii_raw["descripcion_de_usuario_"].apply(lambda text: text.lower() 
                                                                    if type(text) is str else text)

In [64]:
# cleaning the text from the cases
follow_up_amag_ii_raw["descripcion_de_usuario_"] = follow_up_amag_ii_raw["descripcion_de_usuario_"].apply(lambda text: spanish_cleaner(text) 
                                                                    if type(text) is str else text)

Create variable that identifies rows with pdf or docx file in downloads dataset

In [65]:
follow_up_amag_ii_raw["descargado"] = follow_up_amag_ii_raw["descripcion_de_usuario_"].apply(lambda text: 1
                                                                                            if "descargado" in text
                                                                                            else 0)

### Create dataset of case_id/number of documents (obs with no duplicates)

Droping duplicates in terms `expediente_n°_` and `num`

In [66]:
downloads_full = downloads_amag_ii_raw[downloads_amag_ii_raw["link"].notna()]

In [67]:
downloads_full.drop_duplicates(subset=["expediente_n°_", "num"], inplace=True)

Merging the data with no duplicates

In [68]:
documents_amag = pd.merge(follow_up_amag_ii_raw, downloads_full, on=["expediente_n°_", "nrodocumento", "link"], how="left")

## 4.2. Identify keywords from text, acto and sumilla columns 

### Filtering rows without information

In [69]:
documents_amag = documents_amag[(documents_amag["acto_"] !="auto de saneamiento") & (documents_amag["acto_"] != "nota")]

In [70]:
documents_amag["text"] = documents_amag["text"].apply(lambda text: text if type(text) is not float else "")

### 4.2.1. `Parte resolutiva` variable

In [71]:
documents_amag["parte_resolutiva"] = documents_amag["text"].apply(lambda text: extract_text(text,
                                                                                           r"resuelve\s*([^\n\r]*)|fallo\s*([^\n\r]*)|resuelvo\s*([^\n\r]*)"))

### 4.2.2. `Apela` variable

In [73]:
documents_amag["appeal"] = documents_amag.apply(lambda row: 1 if "apela" in row.acto_ or "apela" in row.sumilla__x
                                                or "apela" in row.sumilla__y else 0, axis=1)

In [111]:
documents_amag["appeal"].value_counts()

0    977
1     19
Name: appeal, dtype: int64

### 4.2.3. `Sentencia` variable

In [74]:
documents_amag["sentencia_acto"] = documents_amag.apply(lambda row: 1 if "sentencia" in row.acto_ else 0, axis=1)
documents_amag["sentencia_sumilla"] = documents_amag.apply(lambda row: 1 if "sentencia" in row.sumilla__x or 
                                                           "sentencia" in row.sumilla__y else 0, axis=1)
documents_amag["sentencia"] = documents_amag.apply(lambda row: 1 if row.sentencia_acto == 1 or row.sentencia_sumilla == 1 
                                                   else 0, axis=1)

In [108]:
documents_amag["sentencia_acto"].value_counts()

0    984
1     12
Name: sentencia_acto, dtype: int64

In [109]:
documents_amag["sentencia_sumilla"].value_counts()

0    985
1     11
Name: sentencia_sumilla, dtype: int64

In [110]:
documents_amag["sentencia"].value_counts()

0    973
1     23
Name: sentencia, dtype: int64

### 4.2.4. `Auto` variable

In [75]:
documents_amag["auto_final"] = documents_amag.apply(lambda row: 1 if "auto final" in row.acto_ or 
                                                           "auto final" in row.sumilla__x or
                                                           "auto final" in row.sumilla__y else 0, axis=1)

In [76]:
documents_amag["auto_definitivo"] = documents_amag.apply(lambda row: 1 if "auto definitivo" in row.acto_ or 
                                                           "auto definitivo" in row.sumilla__x or
                                                           "auto definitivo" in row.sumilla__y else 0, axis=1)

In [107]:
documents_amag["auto_definitivo"].value_counts()

0    993
1      3
Name: auto_definitivo, dtype: int64

### 4.2.5. `Final` variable

In [77]:
documents_amag["auto_improcedente"] = documents_amag.apply(lambda row: 1 if "auto que declara improcedente" in row.acto_ or 
                                                           "auto que declara improcedente" in row.sumilla__x or
                                                           "auto que declara improcedente" in row.sumilla__y or 
                                                           "auto improcedente" in row.acto_ or 
                                                           "auto improcedente" in row.sumilla__x or
                                                           "auto improcedente" in row.sumilla__y else 0, axis=1)

In [106]:
documents_amag["auto_improcedente"].value_counts()

0    986
1     10
Name: auto_improcedente, dtype: int64

### 4.2.6. `Vista 2` variable

In [78]:
documents_amag["vista2"] = documents_amag.apply(lambda row: 1 if "sentencia de vista" in row.acto_ or 
                                                           "sentencia de vista" in row.sumilla__x or
                                                           "sentencia de vista" in row.sumilla__y or 
                                                           "auto de vista" in row.acto_ or 
                                                           "auto de vista" in row.sumilla__x or
                                                           "auto de vista" in row.sumilla__y else 0, axis=1)

In [105]:
documents_amag["vista2"].value_counts()

0    996
Name: vista2, dtype: int64

### 4.2.7. `Revoca 2` variable

In [79]:
documents_amag["revoca2"] = documents_amag.apply(lambda row: 1 if "vista que revoca" in row.acto_ else 0, axis=1)

In [102]:
documents_amag["revoca2"].value_counts()

0    996
Name: revoca2, dtype: int64

### 4.2.8. `Anula 2` variable

In [80]:
documents_amag["nula2"] = documents_amag.apply(lambda row: 1 if "vista que anula" in row.acto_ else 0, axis=1)

In [103]:
documents_amag["nula2"].value_counts()

0    996
Name: nula2, dtype: int64

### 4.2.9. `Confirma 2` variable

In [81]:
documents_amag["confirma2"] = documents_amag.apply(lambda row: 1 if "vista que confirma" in row.acto_ else 0, axis=1)

In [104]:
documents_amag["confirma2"].value_counts()

0    996
Name: confirma2, dtype: int64

### 4.2.10. `Fundada` variable

In [82]:
def evaluate_vals(row, list_substrs: list) -> int:
    """determines whether the case matches a substr from list"""
    
    values = [row.parte_resolutiva, row.sumilla__x, row.sumilla__y, row.acto_]
    output = 0
    for value in values:
        if type(value) is not str:
            break
        else:
            for substr in list_substrs:
                if substr in value:
                    output = 1
                    return output
    return output

In [83]:
documents_amag["fundada"] = documents_amag.apply(lambda row: evaluate_vals(row, [" fundada la demanda", "sentencia fundada"]), axis=1)

In [101]:
documents_amag["fundada"].value_counts()

0    988
1      8
Name: fundada, dtype: int64

### 4.2.11. `Fundada en parte` variable

In [84]:
documents_amag["fundada_parte"] = documents_amag.apply(lambda row: evaluate_vals(row, ["fundada en parte"]), axis=1)

In [85]:
documents_amag["fundada_parte"].value_counts()

0    996
Name: fundada_parte, dtype: int64

### 4.2.12. `Infundada` variable

In [86]:
documents_amag["infundada"] = documents_amag.apply(lambda row: evaluate_vals(row, ["infundada la demanda", "sentencia infundada"]), axis=1)

In [113]:
documents_amag["infundada"].value_counts()

0    996
Name: infundada, dtype: int64

### 4.2.12. `Vista` variable

In [87]:
documents_amag["vista"] = documents_amag.apply(lambda row: evaluate_vals(row, ["sentencia de vista", "auto de vista"]), axis=1)

In [114]:
documents_amag["vista"].value_counts()

0    996
Name: vista, dtype: int64

### 4.2.13. `Revoca` variable

In [88]:
documents_amag["revoca"] = documents_amag.apply(lambda row: evaluate_vals(row, ["vista que revoca", "revocar la sentencia", "revocar la resolucion", "revocar en parte", "revocaron la sentencia", "revocaron la resolución"]), axis=1)

In [115]:
documents_amag["revoca"].value_counts()

0    996
Name: revoca, dtype: int64

### 4.2.13. `Anula` variable

In [89]:
documents_amag["nula"] = documents_amag.apply(lambda row: evaluate_vals(row, ["vista que anula", "declarar nula", "declara nula", "declara nulo", "declarar nulo", "declarar: nula", "declarar la nulidad", "declararon nula"]), axis=1)

In [116]:
documents_amag["nula"].value_counts()

0    996
Name: nula, dtype: int64

### 4.2.14. `Confirma` variable

In [90]:
documents_amag["confirma"] = documents_amag.apply(lambda row: evaluate_vals(row, ["vista que confirma", "confirmaron el auto", "confirmaron la sentencia", "aprobaron la sentencia", "confirma sentencia", "confirma la sentencia", 
                                                                                  "confirmar la sentencia", "confirmar resolucion", "confirmar resolución", "confirmar la resolucion", "confirmar en parte", "confirmar la resolución"]), axis=1)

In [117]:
documents_amag["confirma"].value_counts()

0    996
Name: confirma, dtype: int64

## 4.3. Preprocessing of reportes and procedural parts

Obtaining reportes

In [203]:
reportes_amag = reportes_amag_ii_raw[["expediente_n°_", "distrito_judicial__x", "distrito_judicial__y", "proceso__x", 
                                      "proceso__y", "especialidad__x", "especialidad__y", "estado__x", "estado__y", 
                                      "etapa_procesal__x", "etapa_procesal__y"]]

In [204]:
reportes_amag.to_csv(dc_interm_path + "/reportes_amag_ii_clean.csv")

Obtaining procedural parts

In [93]:
procedural_parts_amag_ii_raw = read_pickle("procedural_parts_amag_ii_raw.pkl", dc_temp_path)

### 4.3.1. Creating `parties` variable

In [98]:
procedural_parts_amag_ii_raw["parties"] = procedural_parts_amag_ii_raw.apply(lambda row: "plaintiff" if 
                                                                             row.parte == "DEMANDANTE" or 
                                                                             row.parte == "AGRAVIADO" or 
                                                                             row.parte == "VÍCTIMA" or
                                                                             row.parte == "SOLICITANTE" or
                                                                             row.parte == "DENUNCIANTE"
                                                                             else np.NaN, axis=1)

In [99]:
procedural_parts_amag_ii_raw["parties"] = procedural_parts_amag_ii_raw.apply(lambda row: "defendant" if 
                                                                             row.parte == "DEMANDADO" or 
                                                                             row.parte == "AGRESOR" or 
                                                                             row.parte == "DENUNCIADO"
                                                                             else row.parties, axis=1)

In [97]:
procedural_parts_amag_ii_raw["parties"] = procedural_parts_amag_ii_raw.apply(lambda row: "other" if 
                                                                             row.parties != "plaintiff" and 
                                                                             row.parties != "defendant", 
                                                                             axis=1)

In [100]:
procedural_parts_amag_ii_raw["parties"].value_counts()

defendant    318
plaintiff    232
Name: parties, dtype: int64

### 4.3.2. Creating `first_name` and `second_name` variables

In [155]:
procedural_parts_amag_ii_raw["nombres"] = procedural_parts_amag_ii_raw["nombres"].apply(lambda nombres:
                                                            nombres.lower() if nombres != "\t\t\t\t\t\t\t\t\t\t\t\t\t"
                                                            else "")

In [156]:
procedural_parts_amag_ii_raw["first_name"] = procedural_parts_amag_ii_raw["nombres"].apply(lambda nombres: 
                                                                                           nombres.split(" ")[0])
procedural_parts_amag_ii_raw["second_name"] = procedural_parts_amag_ii_raw["nombres"].apply(lambda nombres: 
                                                                                           nombres.split(" ")[1] 
                                                                                           if len(nombres.split(" ")) == 2
                                                                                           else "")

In [157]:
procedural_parts_amag = pd.merge(procedural_parts_amag_ii_raw, gender_dataset, how="left", 
                                 left_on="first_name", right_on="name")
procedural_parts_amag = procedural_parts_amag.rename(columns={"female": "female_first"})
procedural_parts_amag = procedural_parts_amag.drop(columns=["name"])

In [158]:
procedural_parts_amag = pd.merge(procedural_parts_amag, gender_dataset, how="left", 
                                 left_on="second_name", right_on="name")
procedural_parts_amag = procedural_parts_amag.rename(columns={"female": "female_second"})
procedural_parts_amag = procedural_parts_amag.drop(columns=["name"])

In [160]:
procedural_parts_amag["female"] = procedural_parts_amag.apply(lambda row: 1 if row.female_first == 1 else np.NaN, axis=1)
procedural_parts_amag["female"] = procedural_parts_amag.apply(lambda row: 0 if row.female_first == 0 else row.female, axis=1)

In [161]:
procedural_parts_amag["female"].value_counts()

0.0    119
1.0     93
Name: female, dtype: int64

### 4.3.4. Creating `legal_entity` variable

In [164]:
procedural_parts_amag["legal_entity"] = procedural_parts_amag["tipo_depersona"].apply(lambda tipo_de_persona:
                                                                                     1 if tipo_de_persona == "JURIDICA"
                                                                                     else 0)

### 4.3.5. Collapsing at the expediente level

In [176]:
procedural_parts_amag_collapsed = procedural_parts_amag[["expediente_n°_", "parties", "female", "legal_entity"]].groupby(
                             by=["expediente_n°_", "parties"]).agg(female_ratio=("female", "mean"),
                                                                   female_indicator=("female", "max"),
                                                                   legal_entity_ratio=("legal_entity", "mean"),
                                                                   legal_entity_indicator=("legal_entity", "max")
                                                                   ).reset_index()

### 4.3.6. Reshaping dataframe

In [196]:
procedural_parts_amag_collapsed = pd.pivot_table(procedural_parts_amag_collapsed, values=["female_ratio", "legal_entity_ratio", "female_indicator", 
                           "legal_entity_indicator"], columns=["parties"], index=["expediente_n°_"])

In [197]:
procedural_parts_amag_collapsed.columns = ["_".join(col).strip() for col in procedural_parts_amag_reshaped.columns.values]

Storing results

In [202]:
procedural_parts_amag_collapsed.to_csv(dc_interm_path + "/procedural_parts_amag_ii_clean.csv")

# 5. Creating case outcomes

## 5.1. Creating fundada outcomes