# POC zonage décision CA - cleanup
Based on “Sortie CSV - récap des demandes.docx”.

Cleans up the data a bit and adds a few new columns extracted from it.

We chose to keep the original data unchanged (especially `types`), and to add new values for a sample task (classification).

## Libs versions

In [1]:
import os
import pathlib
import csv as csv
import pandas as pd
import numpy as np
import itertools

In [2]:
pd.__version__

'0.22.0'

In [3]:
np.__version__

'1.12.1'

## Set display options (for the Jupyter notebook)

In [4]:
# widen the notebook display: longer cell contents and more rows before truncation
pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 100

In [5]:
def contrasting_text_color(hex_color:str) -> str:
    """Return white for a light `hex_color` and black for a dark one.

    The luminance is estimated as (0.299 R + 0.587 G + 0.114 B) / 255;
    `#FFFFFF` is returned when it is above 0.5, `#000000` otherwise.

    NOTE: the result has the same lightness as `hex_color`. To get a text
    color readable on a background, pass the inverse of the background color
    (this is what `highlight_color` does).

    Parameter format: `#rrggbb`"""
    (r, g, b) = (hex_color[1:3], hex_color[3:5], hex_color[5:])
    luminance = (int(r, 16) * 0.299 + int(g, 16) * 0.587 + int(b, 16) * 0.114) / 255
    # black must be written with six hex digits to be a valid CSS color
    return '#FFFFFF' if 1 - luminance < 0.5 else '#000000'

def highlight_color(x:str):
    """Build a CSS style giving each value of `x` (a file name) its own background color.

    The background is derived from a hash of `x`; the text color is chosen by
    `contrasting_text_color` from the inverse of the background."""
    white = 0xFFFFFF

    # hash twice so that close file numbers still yield clearly different values
    digits = str(np.abs(hash(str(hash(x)))))
    # six digits taken from the hash, scaled onto the 0x000000..0xFFFFFF range
    background = int(int(digits[-10:-4]) / 999999  * white)
    inverse = white - background

    bg_color = '#{:06X}'.format(background)
    fg_color = '#{:06X}'.format(inverse)
    return 'background-color: {}; color: {};'.format(bg_color, contrasting_text_color(fg_color))

## Load the CSV (generated by `annotations2csv.py`)

In [6]:
# NOTE: machine-specific absolute path; change `dir_base` to run the notebook elsewhere
dir_base = '/home/ojeulin/developpement/data/IA-et-droit-zonage-décisions-lots/brat_6-12'
file_csv_annotations = pathlib.Path(dir_base) / 'annotations.csv'

# separators for the CSV that is read and for the CSV that is written; same encoding for both
csv_separator_in = ','
csv_separator_out = ','
csv_encoding = 'utf-8'
n_a_value = 'n_a' # value for 'not applicable'

In [7]:
print(f'data file: {file_csv_annotations}')

data file: /home/ojeulin/developpement/data/IA-et-droit-zonage-décisions-lots/brat_6-12/annotations.csv


In [8]:
# load the annotations with the separator and encoding configured earlier in the notebook
df = pd.read_csv(file_csv_annotations, sep=csv_separator_in, encoding=csv_encoding)

In [9]:
df.head(3)

Unnamed: 0,filename,line_num,types,annotation_difficulty,text
0,/home/ojeulin/developpement/data/france/cours-appel-zonnées/brat_6-12/lot_0037/JURITEXT000033216...,1,n_a,Difficile,COUR D'APPEL DE VERSAILLES
1,/home/ojeulin/developpement/data/france/cours-appel-zonnées/brat_6-12/lot_0037/JURITEXT000033216...,2,n_a,Difficile,Code nac : 53B
2,/home/ojeulin/developpement/data/france/cours-appel-zonnées/brat_6-12/lot_0037/JURITEXT000033216...,3,n_a,Difficile,16e chambre


### Consistency check (works only for \*nix-like OS)

In [10]:
def check_csv_nb_lines(file_csv, df):
    """Check that `df` has as many rows as `file_csv` has lines, header excluded.

    The lines are counted in pure Python (no call to the `wc` shell command),
    so the check does not depend on IPython or on the operating system.

    Raises `AssertionError` when the two counts differ."""
    # binary mode: lines are split on '\n' only, without decoding the content
    with open(file_csv, 'rb') as csv_file:
        line_count_csv = sum(1 for _ in csv_file) - 1 # skip 1st line, the header
    line_count_df = df.shape[0]
    # raised explicitly so that the check is not stripped when Python runs with -O
    if line_count_csv != line_count_df:
        raise AssertionError(f'Error: number of lines in the CSV file (#{line_count_csv}) ≠ number of lines in the DataFrame (#{line_count_df})')

In [11]:
check_csv_nb_lines(file_csv_annotations, df)

## Cleaning the raw data

### Extract the information we need from the filename
Extract the last directory's name (= *batch* name) and the file name from the `filename` column, then drop the `filename` column.

In [12]:
# split off the last two path components: …/<batch>/<file>
path_parts = df['filename'].str.rsplit('/', n=2, expand=True)
df['batch'], df['file'] = path_parts[1], path_parts[2]
del path_parts

In [13]:
# keep every column except the raw `filename` path
df_clean = df[[name for name in df.columns if name != 'filename']].copy()
df_clean.shape

(52935, 6)

In [14]:
df_clean.head(3)

Unnamed: 0,line_num,types,annotation_difficulty,text,batch,file
0,1,n_a,Difficile,COUR D'APPEL DE VERSAILLES,lot_0037,JURITEXT000033216500.txt
1,2,n_a,Difficile,Code nac : 53B,lot_0037,JURITEXT000033216500.txt
2,3,n_a,Difficile,16e chambre,lot_0037,JURITEXT000033216500.txt


## Generate new data from existing columns

### Vectorize `types` to columns `0/1`
⇒ it's easier to rename and delete types

In [15]:
# one 0/1 column per distinct space-separated label found in `types`
df_types = df_clean.types.str.get_dummies(sep=' ')

### Correct misspelled word in `types`
Dipositif → Di**s**positif

In [16]:
# only the vectorized column is renamed; the raw `types` column keeps the original 'Dipositif' spelling
df_types.rename(columns={'Dipositif': 'Dispositif'}, inplace=True);

In [17]:
types_values = df_types.columns # keep for later, if we need to change/delete
types_values

Index(['Dispositif', 'Dispositif-1', 'Dispositif-2', 'Dispositif-3',
       'Dispositif-demandes_accessoires', 'Entete_appelant', 'Entete_avocat',
       'Entete_composition_de_la_cour', 'Entete_intime', 'Expose_litige',
       'Faits_et_procedure', 'Faits_et_procedure_faits',
       'Faits_et_procedure_procedure', 'Motif-1', 'Motif-1_faits',
       'Motif-1_pretentions_appelant', 'Motif-1_pretentions_intime',
       'Motif-1_texte', 'Motif-2', 'Motif-2_faits',
       'Motif-2_pretentions_appelant', 'Motif-2_pretentions_intime',
       'Motif-2_texte', 'Motif-3', 'Motif-3_faits',
       'Motif-3_pretentions_appelant', 'Motif-3_pretentions_intime',
       'Motif-3_texte', 'Motif-demandes_accessoires', 'Motif_de_la_decision',
       'Moyens_et_pretentions', 'Moyens_et_pretentions_appelant',
       'Moyens_et_pretentions_intime', 'References_decision_attaquee', 'n_a'],
      dtype='object')

Add the vectorized columns.

In [18]:
# side-by-side concatenation: the original columns followed by the 0/1 type columns
df_clean_vec = pd.concat([df_clean, df_types], axis=1)

In [19]:
df_clean_vec.columns

Index(['line_num', 'types', 'annotation_difficulty', 'text', 'batch', 'file',
       'Dispositif', 'Dispositif-1', 'Dispositif-2', 'Dispositif-3',
       'Dispositif-demandes_accessoires', 'Entete_appelant', 'Entete_avocat',
       'Entete_composition_de_la_cour', 'Entete_intime', 'Expose_litige',
       'Faits_et_procedure', 'Faits_et_procedure_faits',
       'Faits_et_procedure_procedure', 'Motif-1', 'Motif-1_faits',
       'Motif-1_pretentions_appelant', 'Motif-1_pretentions_intime',
       'Motif-1_texte', 'Motif-2', 'Motif-2_faits',
       'Motif-2_pretentions_appelant', 'Motif-2_pretentions_intime',
       'Motif-2_texte', 'Motif-3', 'Motif-3_faits',
       'Motif-3_pretentions_appelant', 'Motif-3_pretentions_intime',
       'Motif-3_texte', 'Motif-demandes_accessoires', 'Motif_de_la_decision',
       'Moyens_et_pretentions', 'Moyens_et_pretentions_appelant',
       'Moyens_et_pretentions_intime', 'References_decision_attaquee', 'n_a'],
      dtype='object')

### Add the total number of lines per file

In [20]:
# Give every row the highest `line_num` of its file: `transform('max')` returns
# the per-file maximum aligned on the original rows, so no merge is needed.
# NOTE(review): rows are grouped by file name only; this assumes file names are
# unique across batches — confirm.
df_clean_total_lines = df_clean_vec.assign(
    total_line_number=df_clean_vec.groupby('file').line_num.transform('max'))

Check…

In [21]:
# show the rows around the end of the first file: its last lines and the first lines of the next file
# (the variable is not called `slice`, to avoid shadowing the builtin)
first_file_total = df_clean_total_lines.total_line_number[0]
df_clean_total_lines.loc[first_file_total-2:first_file_total+1, ['file', 'line_num', 'total_line_number']]

Unnamed: 0,file,line_num,total_line_number
175,JURITEXT000033216500.txt,176,177
176,JURITEXT000033216500.txt,177,177
177,JURITEXT000033214445.txt,1,60
178,JURITEXT000033214445.txt,2,60


### Add types_macro
**Types macro** : Entete, Expose_litige, Motif_de_la_decision, Dispositif.
- début de la décision au 1er type macro → Entete
- types pour les lignes comprises dans [Expose_litige, Motif_de_la_decision[ → Expose_litige
- types pour les lignes comprises dans [Motif_de_la_decision, Dispositif[ → Motif_de_la_decision
- types pour les lignes comprises dans [Dispositif, *fin du fichier*] → Dispositif

⇒ using forward fill N/A : ffill()

Check the data…

In [22]:
# macro types that have their own 0/1 column
types_macro_cols = ['Expose_litige', 'Motif_de_la_decision', 'Dispositif']
# 'Entete' has no column of its own; it comes first, so it gets index 0 in this list
types_macro = ['Entete'] + types_macro_cols

In [23]:
# first rows carrying a macro type, with one background color per file
(df_clean_total_lines
     .loc[df_clean_total_lines[types_macro_cols].any(axis=1),
          ['line_num', 'batch', 'file'] + types_macro_cols]
     .head()
     .style
     .applymap(highlight_color, subset='file'))

Unnamed: 0,line_num,batch,file,Expose_litige,Motif_de_la_decision,Dispositif
29,30,lot_0037,JURITEXT000033216500.txt,1,0,0
87,88,lot_0037,JURITEXT000033216500.txt,0,1,0
167,168,lot_0037,JURITEXT000033216500.txt,0,0,1
198,22,lot_0037,JURITEXT000033214445.txt,1,0,0
221,45,lot_0037,JURITEXT000033214445.txt,0,1,0


Create a column containing the index value of the list `types_macro` (sum the values of each line):

$$\text{DataFrame.types_macro} = \sum_{i=0}^{\text{len(types_macro_cols)} - 1}
                                       \text{DataFrame.}\mathit{\text{types_macro_cols}_i}  \cdot (i+1)$$

($\Leftrightarrow$ `df_clean_macro.types_macro = df_clean_macro.types_macro + df_clean_macro[colname] * (i + 1)`, but using pandas' vectorized operations)


In [24]:
df_clean_macro = df_clean_total_lines.copy()

# Weight the i-th macro column by (i + 1), i.e. by the index of its label in
# `types_macro` (index 0 is 'Entete'), then sum across the columns: a row with
# a single macro type gets that type's index, a row with none gets 0.
# The weights are derived from `types_macro_cols` instead of being hard-coded.
# NOTE: a row flagged with several macro types gets the sum of their weights.
macro_weights = np.arange(1, len(types_macro_cols) + 1)
df_clean_macro['types_macro'] = (df_clean_macro[types_macro_cols] * macro_weights).sum(axis=1)

Check…

In [25]:
df_clean_macro[df_clean_macro[types_macro_cols].any(axis=1)][['types_macro'] + types_macro_cols].head()

Unnamed: 0,types_macro,Expose_litige,Motif_de_la_decision,Dispositif
29,1,1,0,0
87,2,0,1,0
167,3,0,0,1
198,1,1,0,0
221,2,0,1,0


In [26]:
# propagate the values:
# replace 0 with NaN, set the Entete index for the 1st lines, then ffill.
# The result is assigned back to the column: the original discarded the output
# of `astype(int)` and used an in-place `ffill` on an attribute-accessed column.
df_clean_macro['types_macro'] = (
    df_clean_macro['types_macro']
    .where(df_clean_macro['types_macro'] != 0)   # 0 (no macro type on the line) → NaN
    .mask(df_clean_macro['line_num'] == 1, 0)    # 0 = index of 'Entete' in the list 'types_macro'
    .ffill()
    .astype(int))

In [27]:
# replace each index by its label in `types_macro`
mapping_types_macro = dict(enumerate(types_macro))
df_clean_macro['types_macro'] = df_clean_macro['types_macro'].map(mapping_types_macro)

Check…

In [28]:
df_clean_macro.types_macro.sample(5)

42900           Entete
40415    Expose_litige
50092           Entete
14729    Expose_litige
36429           Entete
Name: types_macro, dtype: object

### Define generic functions to add a new column based on the value of other columns in a dataframe

In [29]:
def filter_row(df, by_cols, strict):
    """Return the index labels of the rows of `df` where at least one column
    of `by_cols` is true.

    When `strict` is true, a row is kept only if all the other type columns
    (the entries of the module-level `types_values`, minus `by_cols` and
    'n_a') are false.
    """
    # 'n_a' is left out of the "other" columns: it is useless for the strict comparison
    excluded = set(by_cols) | {'n_a'}
    remaining_cols = [name for name in types_values if name not in excluded]

    matched = df[by_cols].any(axis=1)
    if strict:
        matched = matched & ~df[remaining_cols].any(axis=1)

    return matched[matched].index


def get_values(df, col, by_cols, value, strict):
    """Return a copy of `df[col]` where `value` is written on the rows selected
    by `filter_row(df, by_cols, strict)`; all other rows keep their current value.
    """
    matching_rows = filter_row(df, by_cols, strict)
    updated = df[col].copy()  # work on a copy so that `df` is left untouched
    updated.loc[matching_rows] = value

    return updated


def set_col_value(df, col, by_cols, value, strict=False):
    """Return a copy of `df` whose column `col` holds `value` on the rows
    matched through `by_cols`; `strict` selects the matching mode
    (see `get_values()`). `df` itself is not modified."""
    new_column = get_values(df=df, col=col, by_cols=by_cols, value=value, strict=strict)
    updated = df.copy()
    updated[col] = new_column

    return updated

### Add column `sub_type`

In [30]:
def set_sub_type(df):
    """Return a copy of `df` whose `sub_type` column is filled from the 0/1 type columns.

    Each rule is `(source columns, label, strict)` and is applied with
    `set_col_value`. The rules run in order, so a later rule overwrites an
    earlier one on the rows that both match. The `sub_type` column must
    already exist in `df`; rows matched by no rule keep their current value.
    """
    col = 'sub_type'
    rules = [
        (['References_decision_attaquee'],
         'References_decision_attaquee', False),
        (['Entete_appelant', 'Entete_intime'],
         'Entete_parties', True),
        (['Entete_avocat'],
         'Entete_avocat', False),
        (['Entete_composition_de_la_cour'],
         'Entete_composition_de_la_cour', False),
        (['Faits_et_procedure', 'Faits_et_procedure_faits', 'Faits_et_procedure_procedure'],
         'Faits_et_procedure', True),
        (['Moyens_et_pretentions', 'Moyens_et_pretentions_appelant', 'Moyens_et_pretentions_intime'],
         'Moyens_et_pretentions', True),
        (['Motif-1_texte', 'Motif-2_texte', 'Motif-3_texte'],
         'Motif_texte', False),
        (['Motif-1_faits', 'Motif-2_faits', 'Motif-3_faits'],
         'Motif_faits', True),
        # the 'Motif-2' columns were missing from this rule, unlike in
        # `set_parties` and `set_argument`
        (['Motif-1_pretentions_appelant', 'Motif-1_pretentions_intime',
          'Motif-2_pretentions_appelant', 'Motif-2_pretentions_intime',
          'Motif-3_pretentions_appelant', 'Motif-3_pretentions_intime'],
         'Motif_parties', True),
        (['Motif-1', 'Motif-2', 'Motif-3'],
         'Motif_juge', True),
        (['Motif-demandes_accessoires'],
         'Motif_demandes_accessoires', False),
        (['Dispositif-1', 'Dispositif-2', 'Dispositif-3'],
         'Dispositif_contenu', True),
        (['Dispositif-demandes_accessoires'],
         'Dispositif_demandes_accessoires', False),
        (['n_a'],
         n_a_value, False),
    ]

    result = df
    for by_cols, value, strict in rules:
        result = set_col_value(result, col, by_cols, value, strict)
    return result

In [31]:
df_clean_st = df_clean_macro.copy()
# initialize the column's values: rows matched by no rule of `set_sub_type` keep this default
df_clean_st['sub_type'] = n_a_value

df_clean_st = set_sub_type(df_clean_st)

In [32]:
df_clean_st.sample(5)[list(df_clean_st.columns[:2]) + ['sub_type']]

Unnamed: 0,line_num,types,sub_type
37039,90,Dispositif-1,Dispositif_contenu
31896,39,n_a,n_a
36762,31,Faits_et_procedure_faits,Faits_et_procedure
10213,17,Entete_composition_de_la_cour,Entete_composition_de_la_cour
24597,90,Motif_de_la_decision,n_a


### Add column `Parties`

In [33]:
def set_parties(df):
    """Return a copy of `df` whose `Parties` column is set to 'Appelant' or
    'Intime' from the matching type columns.

    'Appelant' is written first with a non-strict match, then 'Intime' with a
    strict match (see `set_col_value`). The `Parties` column must already
    exist in `df`; rows matching neither keep their current value."""
    target = 'Parties'
    appelant_cols = ['Entete_appelant', 'Moyens_et_pretentions_appelant', 'Motif-1_pretentions_appelant', 'Motif-2_pretentions_appelant', 'Motif-3_pretentions_appelant']
    intime_cols = ['Entete_intime', 'Moyens_et_pretentions_intime', 'Motif-1_pretentions_intime', 'Motif-2_pretentions_intime', 'Motif-3_pretentions_intime']

    with_appelant = set_col_value(df, target, appelant_cols, 'Appelant')
    return set_col_value(with_appelant, target, intime_cols, 'Intime', True)

In [34]:
df_clean_parties = df_clean_st.copy()
# initialize the column's values: rows matched by no rule of `set_parties` keep this default
df_clean_parties['Parties'] = n_a_value
df_clean_parties = set_parties(df_clean_parties)

In [35]:
df_clean_parties.sample(15)[list(df_clean_parties.columns[:2]) + ['Parties']]

Unnamed: 0,line_num,types,Parties
15127,55,Moyens_et_pretentions_appelant,Appelant
29011,91,Motif_de_la_decision,n_a
30004,122,Motif-demandes_accessoires,n_a
4406,46,Motif-1,n_a
23146,19,Entete_composition_de_la_cour,n_a
33641,56,Moyens_et_pretentions_intime,Intime
50079,24,Entete_intime,Intime
6076,66,Motif-1_faits,n_a
49542,14,Entete_appelant,Appelant
43594,20,Entete_appelant,Appelant


### Add column `Argument`

In [36]:
# values are overwritten by the next pipe(), so we order them from lowest to highest priority

def set_argument(df):
    """Return a copy of `df` whose `Argument` column holds the argument number
    (1, 2 or 3) found in the 'Motif-N…' / 'Dispositif-N' type columns.

    The numbers are written from 3 down to 1 with a non-strict match, so on a
    row matching several numbers the lowest one is kept. The `Argument` column
    must already exist in `df`; rows matching none keep their current value."""
    tagged = df
    for number in (3, 2, 1):
        motif = f'Motif-{number}'
        source_cols = [motif,
                       f'{motif}_pretentions_appelant',
                       f'{motif}_pretentions_intime',
                       f'{motif}_texte',
                       f'{motif}_faits',
                       f'Dispositif-{number}']
        tagged = set_col_value(tagged, 'Argument', source_cols, number)
    return tagged

In [37]:
df_clean_arg = df_clean_parties.copy()
# NaN is the default: rows matched by no rule of `set_argument` keep it
df_clean_arg['Argument'] = np.nan # the number of the argument used in the case
df_clean_arg = set_argument(df_clean_arg)

In [38]:
# check that `Argument` is NaN for rows whose types match neither "Motif-#" nor "Dispositif-#" …
cols_check = list(df_clean_arg.columns[:2]) + ['Argument']
df_clean_arg[:3][cols_check]

Unnamed: 0,line_num,types,Argument
0,1,n_a,
1,2,n_a,
2,3,n_a,


In [39]:
# … and that it is not NaN otherwise
df_clean_arg[df_clean_arg['Argument'].notnull()].sample(5)[cols_check]

Unnamed: 0,line_num,types,Argument
16492,33,Motif-1_pretentions_appelant,1.0
23585,105,Motif-1_faits,1.0
19224,68,Motif-1,1.0
26632,63,Motif-3,3.0
25721,59,Dispositif-1,1.0


## Save as CSV

In [40]:
file_result = pathlib.Path(dir_base) / 'annotations-clean.csv'

In [41]:
# columns written to the CSV, in this order (the 0/1 type columns are left out)
cols_for_csv = ['batch', 'file', 'total_line_number', 'line_num', 'types', 'annotation_difficulty', 'types_macro', 'sub_type', 'Parties', 'Argument', 'text']
print(f'Columns of the CSV:\n{cols_for_csv}', end='\n\n')

df_clean_arg.to_csv(file_result,
                    columns=cols_for_csv,
                    sep=csv_separator_out,
                    encoding=csv_encoding,
                    quoting=csv.QUOTE_NONNUMERIC,
                    index=False)  # do not write the DataFrame index
print(f'Result in {file_result}')

Columns of the CSV:
['batch', 'file', 'total_line_number', 'line_num', 'types', 'annotation_difficulty', 'types_macro', 'sub_type', 'Parties', 'Argument', 'text']

Result in /home/ojeulin/developpement/data/IA-et-droit-zonage-décisions-lots/brat_6-12/annotations-clean.csv


# Print the sorted list of values for types, types_macro, sub_type, Parties, Argument:

In [42]:
print('——————— types ———————')
# `types` holds space-separated labels: flatten them into a set before sorting
for t in sorted({t for tt in df_clean_arg.types.str.split(' ').tolist() for t in tt}):
    print(t)

for col in ['types_macro', 'sub_type', 'Parties', 'Argument']:
    print(f'\n——————— {col} ———————')
    # NaN is removed before sorting: every comparison with NaN is False, so
    # sorted() would return the values in an arbitrary order
    for t in sorted(df_clean_arg[col].dropna().unique()):
        print(t)
    # still report the missing value, once, after the sorted values
    if df_clean_arg[col].isnull().any():
        print(np.nan)

——————— types ———————
Dipositif
Dispositif-1
Dispositif-2
Dispositif-3
Dispositif-demandes_accessoires
Entete_appelant
Entete_avocat
Entete_composition_de_la_cour
Entete_intime
Expose_litige
Faits_et_procedure
Faits_et_procedure_faits
Faits_et_procedure_procedure
Motif-1
Motif-1_faits
Motif-1_pretentions_appelant
Motif-1_pretentions_intime
Motif-1_texte
Motif-2
Motif-2_faits
Motif-2_pretentions_appelant
Motif-2_pretentions_intime
Motif-2_texte
Motif-3
Motif-3_faits
Motif-3_pretentions_appelant
Motif-3_pretentions_intime
Motif-3_texte
Motif-demandes_accessoires
Motif_de_la_decision
Moyens_et_pretentions
Moyens_et_pretentions_appelant
Moyens_et_pretentions_intime
References_decision_attaquee
n_a

——————— types_macro ———————
Dispositif
Entete
Expose_litige
Motif_de_la_decision

——————— sub_type ———————
Dispositif_contenu
Dispositif_demandes_accessoires
Entete_avocat
Entete_composition_de_la_cour
Entete_parties
Faits_et_procedure
Motif_demandes_accessoires
Motif_faits
Motif_juge
Motif_part