In [1]:
import os
import json 
import pandas as pd

In [2]:
# Folder holding the annotated dataset: "<current working directory>/../data/annotated".
DATA_FOLDER = os.path.join(os.path.abspath(""), os.pardir, "data", "annotated")

In [3]:
# Read the annotated dataset from its JSON export.
annotated_path = os.path.join(DATA_FOLDER, "annotated_data.json")
with open(annotated_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# One row per annotation: explode each list-like "annotations" cell, then keep
# only the rows whose "annotations" value is not missing.
df = pd.DataFrame(data).explode("annotations")
df = df[df["annotations"].notna()]

# Promote the fields of each annotation to top-level columns; a falsy
# annotation (e.g. an empty dict) yields None for every field.
for c in ("start", "end", "label", "text"):
    df[c] = df["annotations"].apply(lambda x: x[c] if x else None)

df.head()


Unnamed: 0,id,checksum,annotations,timestamp,start,end,label,text
0,672d22474e0888abb7bfa1b1,717d19039e1b053fe9b8a39270fcd285,"{'start': 924, 'end': 962, 'label': 'article',...",2024-11-18 19:56:33.889071,924,962,article,article 28 du code de procédure civile
0,672d22474e0888abb7bfa1b1,717d19039e1b053fe9b8a39270fcd285,"{'start': 1152, 'end': 1191, 'label': 'article...",2024-11-18 19:56:33.889071,1152,1191,article,article 462 du code de procédure civile
1,672c98ef75a19b13ac453303,2b873a7fc7a9f33d488bfe7544128901,"{'start': 1703, 'end': 1756, 'label': 'article...",2024-11-18 19:56:33.889071,1703,1756,article,"article 1014, alinéa 1er, du code de procédure..."
1,672c98ef75a19b13ac453303,2b873a7fc7a9f33d488bfe7544128901,"{'start': 1988, 'end': 2027, 'label': 'article...",2024-11-18 19:56:33.889071,1988,2027,article,article 700 du code de procédure civile
2,672d11e14e0888abb7bf5dab,9f28ffd5b4fc73b4efe9d9dd849cb7d8,"{'start': 3103, 'end': 3142, 'label': 'article...",2024-11-18 19:56:33.889071,3103,3142,article,article 700 du code de procédure civile


In [4]:
# (rows, columns) of the frame after exploding and filtering the annotations.
df.shape

(833, 8)

In [5]:
# Frequency of each annotated text, compared case-insensitively.
df["text"].str.lower().value_counts()

text
article 700 du code de procédure civile                                                      128
article 455 du code de procédure civile                                                       17
article 1014, alinéa 1er, du code de procédure civile                                         13
article 450 du code de procédure civile                                                       10
article 1343-2 du code civil                                                                   9
                                                                                            ... 
article 1213 du code civil dans sa rédaction antérieure à l'ordonnance du 10 février 2016      1
articles 382 du code de procédure civile                                                       1
articles 452 et 456 du code de procédure civile                                                1
article 514-1 du même code                                                                     1
l'alinéa 3 de l'article 3

In [6]:
import re


# Accented characters grouped by the plain letter that replaces them.
# The upper- and lower-case groups mirror each other.
_DEACCENT_MAPPING = {
    "A": "ÀÁÂÃÄÅÆ",
    "C": "Ç",
    "E": "ÈÉÊË",
    "I": "ÌÍÎÏ",
    "N": "Ñ",
    "O": "ÒÓÔÕÖ",
    "U": "ÙÚÛÜŨ",
    "Y": "ÝŶŸ",
    "a": "àáâãäåæ",
    "c": "ç",
    "e": "èéêë",
    "i": "ìíîï",
    "n": "ñ",
    "o": "òóôõö",
    "u": "ùúûüũ",
    "y": "ýŷÿ",
}

# Translation table built once, so every call is a single pass over the text
# instead of one regular-expression substitution per letter.
_DEACCENT_TABLE = str.maketrans(
    {
        accented: letter
        for letter, accented_group in _DEACCENT_MAPPING.items()
        for accented in accented_group
    }
)


def deaccent(text):
    """Remove letter accents from the given string.

    Parameters
    ----------
    text : str or bytes
        Input string. Byte strings are decoded as UTF-8 (strict error
        handling) before processing.

    Returns
    -------
    str
        The input with every accented character listed in
        ``_DEACCENT_MAPPING`` replaced by its unaccented letter. All other
        characters are returned unchanged.

    Raises
    ------
    AttributeError
        If ``text`` is neither a ``str`` nor an object with a ``decode``
        method (e.g. ``None`` or a float NaN).
    UnicodeDecodeError
        If ``text`` is a byte string that is not valid UTF-8.

    Notes
    -----
    Only precomposed characters are handled: a base letter followed by a
    separate combining accent is left as is. The ligatures "Æ"/"æ" are
    mapped to a single "A"/"a".

    Examples
    --------
        >>> deaccent("ÀÁÂÃÄàáâãäÈÉÊËèéêëÍÌÎÏíìîïÒÓÔÕÖòóôõöÙÚÛÜùúûüÑñÇç")
        'AAAAAaaaaaEEEEeeeeIIIIiiiiOOOOOoooooUUUUuuuuNnCc'
    """
    if not isinstance(text, str):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode("utf8")
    return text.translate(_DEACCENT_TABLE)

In [7]:
# Matching key: the annotated text, lower-cased and then stripped of accents.
text_lowercase = df["text"].str.lower()
df["clean_text"] = text_lowercase.apply(deaccent)

In [8]:
# Legal codes to look for, written lower-case and without accents so that they
# match the normalization applied to the "clean_text" column.
CODES = [
    "code civil",
    "code penal",
    "code de procedure civile",
    "code de procedure penale",
    "code de l'organisation judiciaire",
]

# One boolean column per code, named after the code with spaces replaced by
# underscores and apostrophes removed (e.g. "code_de_lorganisation_judiciaire").
# NOTE(review): a typographic apostrophe (U+2019) in the source text would not
# match "l'organisation" - confirm whether apostrophes need normalizing.
for code in CODES:
    column_name = code.replace(" ", "_").replace("'", "")
    # regex=False: the code names are literal substrings, not regex patterns.
    # na=False: a missing clean_text gives False, keeping the column boolean.
    df[column_name] = df["clean_text"].str.contains(code, regex=False, na=False)

In [9]:
# Preview the frame with the "clean_text" column and the per-code flag columns.
df.head()

Unnamed: 0,id,checksum,annotations,timestamp,start,end,label,text,clean_text,code_civil,code_penal,code_de_procedure_civile,code_de_procedure_penale,code_de_lorganisation_judiciaire
0,672d22474e0888abb7bfa1b1,717d19039e1b053fe9b8a39270fcd285,"{'start': 924, 'end': 962, 'label': 'article',...",2024-11-18 19:56:33.889071,924,962,article,article 28 du code de procédure civile,article 28 du code de procedure civile,False,False,True,False,False
0,672d22474e0888abb7bfa1b1,717d19039e1b053fe9b8a39270fcd285,"{'start': 1152, 'end': 1191, 'label': 'article...",2024-11-18 19:56:33.889071,1152,1191,article,article 462 du code de procédure civile,article 462 du code de procedure civile,False,False,True,False,False
1,672c98ef75a19b13ac453303,2b873a7fc7a9f33d488bfe7544128901,"{'start': 1703, 'end': 1756, 'label': 'article...",2024-11-18 19:56:33.889071,1703,1756,article,"article 1014, alinéa 1er, du code de procédure...","article 1014, alinea 1er, du code de procedure...",False,False,True,False,False
1,672c98ef75a19b13ac453303,2b873a7fc7a9f33d488bfe7544128901,"{'start': 1988, 'end': 2027, 'label': 'article...",2024-11-18 19:56:33.889071,1988,2027,article,article 700 du code de procédure civile,article 700 du code de procedure civile,False,False,True,False,False
2,672d11e14e0888abb7bf5dab,9f28ffd5b4fc73b4efe9d9dd849cb7d8,"{'start': 3103, 'end': 3142, 'label': 'article...",2024-11-18 19:56:33.889071,3103,3142,article,article 700 du code de procédure civile,article 700 du code de procedure civile,False,False,True,False,False


In [10]:
# Number of annotations whose clean_text contains "code civil" (True counts as 1).
df["code_civil"].sum()

99

In [11]:
# Number of annotations whose clean_text contains "code de procedure civile".
df["code_de_procedure_civile"].sum()

377

In [12]:
# Number of annotations whose clean_text contains "code de procedure penale".
df["code_de_procedure_penale"].sum()

26