In [66]:
import pandas as pd
import math
import pickle
pd.options.display.max_rows

60

# Read data

In [2]:
# Root folder of the dump; every entry of `files_path` is joined onto it.
path = "datajud/justica_federal/"

# Shard numbers are listed in the exact order the files are read; the order is
# kept as-is because it determines the row order of the loaded data.
files_path = [
    'processos-trf1/processos-trf1_{}.json'.format(shard)
    for shard in (1, 5, 2, 6, 3, 7, 4, 8, 9, 10, 11)
]

In [80]:
# Lookup tables (classes, subjects, movements), all ';'-separated CSV files.
classes, subjects, moviments = (
    pd.read_csv(csv_file, sep=';')
    for csv_file in (
        'datajud/sgt_classes.csv',
        'datajud/sgt_assuntos.csv',
        'datajud/sgt_movimentos.csv',
    )
)

In [44]:
# Read every JSON shard, then combine them in a single concat.
# `DataFrame.append` was removed in pandas 2.0, and calling it inside the loop
# re-copied the whole accumulated frame on every iteration.
frames = []
for file_path in files_path:
    print(path + file_path)
    frames.append(pd.read_json(path + file_path))

# ignore_index=True gives the combined frame a unique index, so later
# index-aligned concatenations (see expand_dict) cannot fail on labels that
# repeat from one shard to the next.
original_data = pd.concat(frames, ignore_index=True)

# The shards are no longer needed once combined; release them to save memory.
del frames

# Work on a copy so the raw load stays untouched by the steps that follow.
data = original_data.copy()

datajud/justica_federal/processos-trf1/processos-trf1_1.json
datajud/justica_federal/processos-trf1/processos-trf1_5.json
datajud/justica_federal/processos-trf1/processos-trf1_2.json
datajud/justica_federal/processos-trf1/processos-trf1_6.json
datajud/justica_federal/processos-trf1/processos-trf1_3.json
datajud/justica_federal/processos-trf1/processos-trf1_7.json
datajud/justica_federal/processos-trf1/processos-trf1_4.json
datajud/justica_federal/processos-trf1/processos-trf1_8.json
datajud/justica_federal/processos-trf1/processos-trf1_9.json
datajud/justica_federal/processos-trf1/processos-trf1_10.json
datajud/justica_federal/processos-trf1/processos-trf1_11.json


In [45]:
def expand_dict(df, column_name, drop=False):
    """Spread a column of dict-like records into one column per key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to expand. It is never modified.
    column_name : str
        Column whose non-null values are each passed to ``pd.Series``
        (e.g. dicts), yielding one new column per key.
    drop : bool, default False
        When true, the returned frame omits ``column_name``.

    Returns
    -------
    pandas.DataFrame
        ``df`` (without ``column_name`` when ``drop`` is true) with the
        expanded columns appended on the right, aligned on the index. Rows
        whose value was null get NaN in the new columns.
    """
    expanded = df[column_name].dropna().apply(pd.Series)

    # Drop on a new frame instead of in place: the caller's `df` may still be
    # referenced elsewhere and must not lose the column as a side effect.
    base = df.drop(columns=column_name) if drop else df

    return pd.concat([base, expanded], axis=1)

## Expand basic metadata

In [46]:
# Flatten the two nested record columns, in this order, removing each source
# column once it has been expanded.
data = expand_dict(data, 'dadosBasicos', drop=True)
data = expand_dict(data, 'orgaoJulgador', drop=True)

In [81]:
# One row per (numero, subject entry): list-valued 'assunto' cells are exploded
# and the result is renumbered from 0.
subject_data = (
    data.loc[:, ['numero', 'assunto']]
    .explode('assunto')
    .reset_index(drop=True)
)

## Expand subject and movement data

In [82]:
# Turn each 'assunto' record into columns and discard the original column.
subject_data = expand_dict(subject_data, 'assunto', drop=True)

In [83]:
# Same treatment for the nested 'assuntoLocal' record.
subject_data = expand_dict(subject_data, 'assuntoLocal', drop=True)

# Get subject parent and grandparent codes

In [50]:
def get_code(row):
    """Pick the subject code to use for a row.

    ``codigoNacional`` is returned when it is not NaN; otherwise
    ``codigoPaiNacional`` is returned when it is not NaN; if both are NaN the
    result is NaN.
    """
    # Fields are tried in priority order; the first non-NaN value wins.
    for field in ('codigoNacional', 'codigoPaiNacional'):
        candidate = row[field]
        if not math.isnan(candidate):
            return candidate

    return float('NaN')

def get_parent(row):
    """Return the parent code of the row's subject.

    The subject code is ``codigoNacional`` when it is not NaN, otherwise
    ``codigoPaiNacional``; NaN is returned when both are NaN. The parent is
    read from the ``cod_pai`` column of the module-level ``subjects`` table.
    When the code does not match exactly one ``subjects`` row, the code itself
    is returned.
    """
    code = row['codigoNacional']
    if math.isnan(code):
        code = row['codigoPaiNacional']
        if math.isnan(code):
            return float('NaN')

    matches = subjects.loc[subjects['codigo'] == code, 'cod_pai']
    if matches.size != 1:
        # Unknown (or ambiguous) code: fall back to the code itself.
        return code
    return matches.iloc[0]

def get_grandparent(row):
    """Return the parent of the row's ``cod_pai`` value.

    NaN is returned when ``cod_pai`` is NaN. Otherwise ``cod_pai`` is looked up
    in the ``codigo`` column of the module-level ``subjects`` table and that
    row's ``cod_pai`` is returned; when the lookup does not match exactly one
    row, the row's own ``cod_pai`` is returned unchanged.
    """
    parent = row['cod_pai']
    if math.isnan(parent):
        return float('NaN')

    matches = subjects.loc[subjects['codigo'] == parent, 'cod_pai']
    return matches.iloc[0] if matches.size == 1 else parent

In [84]:
# Row-wise derivation of the subject code, its parent and its grandparent.
# Order matters: get_grandparent reads the 'cod_pai' column written just before.
subject_data['cod'] = subject_data.apply(get_code, axis='columns')
subject_data['cod_pai'] = subject_data.apply(get_parent, axis='columns')
subject_data['cod_avo'] = subject_data.apply(get_grandparent, axis='columns')

In [85]:
# Inner join of the case-level frame with its per-subject rows on 'numero'.
subject_basic = data.merge(subject_data, on='numero', how='inner')

## Add parent codes to classes and subjects

In [61]:
def get_parents(df, code_col = 'codigo', parent_col = 'cod_pai'):
    """Build, for every code in ``df``, its chain of ancestors.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per node.
    code_col : str, default 'codigo'
        Column holding each node's code.
    parent_col : str, default 'cod_pai'
        Column holding the code of each node's parent (null/NaN for a root).

    Returns
    -------
    dict
        Maps each code to a list ``[root, ..., parent, code]``: the path from
        the topmost ancestor down to the code itself, each code appearing once.
    """
    codes = df[code_col].tolist()
    parent_codes = df[parent_col].tolist()

    # code -> parent lookup built once, instead of scanning the frame for every
    # ancestor step. setdefault keeps the first row when a code is repeated.
    parent_of = {}
    for code, parent in zip(codes, parent_codes):
        parent_of.setdefault(code, parent)

    parents = {}
    for current_code, parent_code in zip(codes, parent_codes):
        if not pd.isna(parent_code) and parent_code in parents:
            # The cached chain already ends with parent_code, so only the
            # current code is appended (appending the parent again would
            # duplicate it and shift every deeper level).
            parents[current_code] = parents[parent_code] + [current_code]
            continue

        chain = [current_code]
        visited = {current_code}
        # Walk upwards until a root (null parent), a parent missing from the
        # table, or a cycle is reached.
        while not pd.isna(parent_code) and parent_code not in visited:
            chain.insert(0, parent_code)
            visited.add(parent_code)
            parent_code = parent_of.get(parent_code, float('nan'))

        parents[current_code] = chain

    return parents

def get_sons(df, code_col = 'codigo', parents_col = 'cod_pais'):
    """Invert the ancestor lists: map each code to the codes listing it.

    For every row, the row's ``code_col`` value is appended to the entry of
    each code found in its ``parents_col`` list. Entries are created on first
    use and keep the row order of ``df``.
    """
    sons = {}
    for _, record in df.iterrows():
        code = record[code_col]
        for ancestor in record[parents_col]:
            sons.setdefault(ancestor, []).append(code)

    return sons

In [86]:
# One row per code, with its ancestor chain stored as a list in 'cod_pais'.
subject_parents = pd.DataFrame(
    list(get_parents(subjects).items()),
    columns=['codigo', 'cod_pais'],
)
class_parents = pd.DataFrame(
    list(get_parents(classes).items()),
    columns=['codigo', 'cod_pais'],
)

In [87]:
# Invert the ancestor chains: code -> every code whose chain contains it.
subject_sons_dict = get_sons(subject_parents)
class_sons_dict = get_sons(class_parents)

# Same information as two-column frames so it can be merged on 'codigo'.
subject_sons = pd.DataFrame(
    list(subject_sons_dict.items()),
    columns=['codigo', 'todos_filhos'],
)
class_sons = pd.DataFrame(
    list(class_sons_dict.items()),
    columns=['codigo', 'todos_filhos'],
)

In [88]:
# Attach the ancestor chain ('cod_pais') and the 'todos_filhos' list to each
# lookup table, joining on 'codigo'.
classes = (
    classes
    .merge(class_parents, on='codigo')
    .merge(class_sons, on='codigo')
)
subjects = (
    subjects
    .merge(subject_parents, on='codigo')
    .merge(subject_sons, on='codigo')
)

In [89]:
# Convert each ancestor chain into a {'<prefix>_level_<position>': code} dict,
# position 0 being the first element of the chain.
subjects['cod_pais_obj'] = subjects['cod_pais'].apply(
    lambda chain: dict(
        ('subject_level_{}'.format(str(level)), code)
        for level, code in enumerate(chain)
    )
)
classes['cod_pais_obj'] = classes['cod_pais'].apply(
    lambda chain: dict(
        ('class_level_{}'.format(str(level)), code)
        for level, code in enumerate(chain)
    )
)

In [90]:
# Spread the per-level dicts into one column per level and drop the helper column.
subjects = expand_dict(subjects, 'cod_pais_obj', drop=True)
classes = expand_dict(classes, 'cod_pais_obj', drop=True)

In [91]:
# Keep only the code plus its level columns: levels 0-5 for subjects and
# levels 0-4 for classes.
subs = subjects[['codigo'] + ['subject_level_{}'.format(level) for level in range(6)]]
clas = classes[['codigo'] + ['class_level_{}'.format(level) for level in range(5)]]

In [92]:
# Left-join the subject level columns, then the class level columns.
# NOTE(review): the subject join key is 'codigoPaiNacional', not the 'cod'
# column computed by get_code -- confirm this is the intended key.
subject_basic = subject_basic.merge(
    subs, left_on='codigoPaiNacional', right_on='codigo', how='left'
)
subject_basic = subject_basic.merge(
    clas, left_on='classeProcessual', right_on='codigo', how='left'
)

# Keep only rows flagged as principal, then remove the columns listed below
# (including the two 'codigo' join keys left behind by the merges).
subject_basic = subject_basic[subject_basic['principal'] == True].drop(
    columns=[
        'movimento',
        'assunto',
        'dataAjuizamento',
        'nivelSigilo',
        'codigoMunicipioIBGE',
        'competencia',
        'codigoLocalidade',
        'valorCausa',
        'principal',
        'codigoNacional',
        'codigoAssunto',
        'codigo_x',
        'codigo_y',
    ]
)

In [96]:
# Persist the three resulting frames as pickle files in the working directory.
pd.to_pickle(subject_basic, 'subject_basic.pkl')
pd.to_pickle(classes, 'classes.pkl')
pd.to_pickle(subjects, 'subjects.pkl')