In [38]:
import os
import pandas as pd
import re
import numpy as np
from typing import List, Tuple

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

In [2]:
#x_train data
x_tr_df = pd.read_csv('./train_folder/x_train_ids.csv', index_col=0)
tr_dir = "./train_folder/txt_files/"
tr_files = x_tr_df['filename'].values
tr_texts = []
for tr_file in tr_files:
    f = open(os.path.join(tr_dir, tr_file), "r")
    tr_texts.append(f.read().replace('\xa0', ' ').replace('\n', ' '))

#x_test data
x_te_df = pd.read_csv('./test_folder/x_test_ids.csv', index_col=0)
te_dir = "./test_folder/txt_files/"
te_files = x_te_df['filename'].values
te_texts = []
for te_file in te_files:
    f = open(os.path.join(te_dir, te_file), "r")
    te_texts.append(f.read().replace('\xa0', ' ').replace('\n', ' '))


#y_train data
y_tr_df = pd.read_csv('./Y_train_predilex.csv', index_col=0)

In [3]:
i = 0
print('File :', x_tr_df.iloc[i]['filename'])
print('Text (200 first char):', tr_texts[i][:200])
print('Output : ')
print(y_tr_df.iloc[i])

File : Agen_100515.txt
Text (200 first char): Le : 12/11/2019     Cour d’appel d’Agen    chambre sociale    Audience publique du 30 avril 2002    N° de RG: 01/00515          REPUBLIQUE FRANCAISE   AU NOM DU PEUPLE FRANCAIS   ARRET DU 30 AVRIL 200
Output : 
sexe                       homme
date_accident         1991-04-09
date_consolidation          n.c.
Name: 0, dtype: object


In [4]:
print(f"Number of training samples: {len(tr_texts)}")
print(f"Number of test samples: {len(te_texts)}")

Number of training samples: 770
Number of test samples: 257


In [5]:
tr_texts[0]

'Le : 12/11/2019     Cour d’appel d’Agen    chambre sociale    Audience publique du 30 avril 2002    N° de RG: 01/00515          REPUBLIQUE FRANCAISE   AU NOM DU PEUPLE FRANCAIS   ARRET DU 30 AVRIL 2002 ----------------------- 01/00515 ----------------------- Yvon X... Annie X... C/ GROUPE AZUR ASSURANCES IARD CAISSE PRIMAIRE D’ASSURANCE MALADIE DE LOT ET GARONNE Yannick GUGUEN ès qualités de mandataire liquidateur à la liquidation judiciaire de la société des transports MOREL ----------------------- ARRET N° COUR D’APPEL D’AGEN CHAMBRE SOCIALE Prononcé à l’audience publique du trente Avril deux mille deux par Madame LATRABE, Conseiller, La COUR d’APPEL D’AGEN, CHAMBRE SOCIALE, dans l’affaire ENTRE : Monsieur Yvon X... né le 30 Septembre 1955 à TONNEINS (47) Y... du Laurier 47300 BIAS Rep/assistant : Me Pascale LUGUET (avocat au barreau d’AGEN) Madame Annie X... Y... du Laurier 47300 BIAS Rep/assistant : Me Pascale LUGUET (avocat au barreau d’AGEN) APPELANTS d’un jugement du Tribunal d

### Gender classification

In [6]:
# drop rows with unknow gender in training set
gender_na_idx = y_tr_df[y_tr_df['sexe'] == 'n.c.'].index
y_tr_df = y_tr_df[y_tr_df['sexe'] != 'n.c.']
y_tr_df.reset_index(inplace=True, drop=True)

tr_texts = [tr_texts[i] for i in range(len(tr_texts)) if i not in gender_na_idx]

#### Method 1: specific words count per gender (NOT GOOD)

In [7]:
reg_exp_male = r"il|monsieur| né| représenté"
reg_exp_female = r"elle|madame|mademoiselle| née| représentée"

In [8]:
def ismale(text:str):
    male_words = re.findall(reg_exp_male, text, flags=re.IGNORECASE)
    female_words = re.findall(reg_exp_female, text, flags=re.IGNORECASE)
    return len(male_words) >= len(female_words)

In [9]:
y_pred_train = np.array([ismale(text) for text in tr_texts], dtype=int)
y_train = y_tr_df['sexe'].map({'homme': 1, 'femme': 0}).values

print(f"Accuracy on train set: {round((1 - (y_pred_train - y_train).mean()) * 100, 2)}%")

Accuracy on train set: 78.17%


#### Method 2: bag of words (GOOD)

In [62]:
# clean text without using stop words nor lemmitizer
def clean_text(text: str) -> str:
    """ remove unwanted characters from text """
    # remove line breaks
    text = text.replace('\n', '')
    # remove quotes
    text = text.replace('"', '')
    #lowercase text
    text = text.lower()
    #remove punctuation and special characters ?:!.,[]();
    text = text.translate(str.maketrans('', '', '?:!.,;[]()*-'))
    # remove possessive pronouns termination
    text = text.replace("'s", "")
    # remove unnecessery white spaces
    text = re.sub(r"\s\s+", " ", text)
        
    return text

In [8]:
tfidf_vec = TfidfVectorizer(ngram_range=(1, 2), max_features=100, norm='l2', sublinear_tf=True)
x_train = tfidf_vec.fit_transform(tr_texts)
y_train = y_tr_df['sexe'].map({'homme': 1, 'femme': 0}).values

In [9]:
c_val, c_count = np.unique(y_train, return_counts=True)
for i, c in enumerate(c_val):
    print(f"Number of samples for class {c}: {c_count[i]}")

Number of samples for class 0: 206
Number of samples for class 1: 559


In [10]:
def apply_cv(cls, cv=3):
    cv_results = cross_validate(cls, x_train, y_train, scoring='accuracy', return_train_score=True, cv=cv)
    print(f"CV Accuracy: {round(cv_results['train_score'].mean() * 100, 2)}%")

In [11]:
logistic_reg = LogisticRegression()
apply_cv(logistic_reg)

CV Accuracy: 75.56%


In [12]:
def perform_grid_search(cls, param_grid):
    """ perform a grid search on a classifier using 3-fold cross validation and the passed prameters """
    grid_search = GridSearchCV(estimator=cls, 
                               param_grid=param_grid,
                               scoring='accuracy',
                               cv=3,
                               verbose=1)
    grid_search.fit(x_train, y_train)
    print(f"The best hyperparameters from Grid Seach are: {grid_search.best_params_}")
    return grid_search.best_estimator_

In [15]:
random_state = 42

param_grid = {
    'max_depth': [3, 5],
    'learning_rate': [.01, .1],
    'subsample': [.7, .9, 1],
    'colsample_bytree': [.7, .9, 1],
}
xgb_cls = XGBClassifier(random_state=random_state)
xgb_cls = perform_grid_search(xgb_cls, param_grid)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  1.0min finished


The best hyperparameters from Grid Seach are: {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.9}


In [16]:
apply_cv(xgb_cls, 10)

CV Accuracy: 99.87%


### Accident date

In [3]:
# drop rows with unknow accident date in training set
accdate_na_idx = y_tr_df[y_tr_df['date_accident'] == 'n.c.'].index
y_tr_df = y_tr_df[y_tr_df['date_accident'] != 'n.c.']
y_tr_df.reset_index(inplace=True, drop=True)

tr_texts = [tr_texts[i] for i in range(len(tr_texts)) if i not in accdate_na_idx]

    
print(f"Remaining rows: {len(tr_texts)}")

Remaining rows: 743


In [7]:
re.findall(r"(\d+) (\w+) (\d+)", tr_texts[0])

[('30', 'avril', '2002'),
 ('30', 'AVRIL', '2002'),
 ('30', 'Septembre', '1955'),
 ('12', 'Février', '2001'),
 ('19', 'Mars', '2002'),
 ('9', 'avril', '1991'),
 ('24', 'avril', '1997'),
 ('30', 'mars', '1999'),
 ('2', 'février', '2'),
 ('12', 'février', '2')]

In [42]:
# REGEX PATTERNS

pattern1 = r"\d+ \w+ \d+"
pattern2 = r"1° \w+ \d+"
pattern3 = r"1er \w+ \d+"
pattern4 = r"\d+\.\d+\.\d+"
pattern5 = r"\d+/\d+/\d+"
pattern6 = r"\d+ / \d+ / \d+"
pattern7 = r"\d+/ \d+/ \d+"
pattern8 = r"\d+\. \d+\. \d+"

pattern1_bis = r"(\d+) (\w+) (\d+)"
pattern2_bis = r"(1°) (\w+) (\d+)"
pattern3_bis = r"(1er) (\w+) (\d+)"
pattern4_bis = r"(\d+)\.(\d+)\.(\d+)"
pattern5_bis = r"(\d+)/(\d+)/(\d+)"
pattern6_bis = r"(\d+) / (\d+) / (\d+)"
pattern7_bis = r"(\d+)/ (\d+)/ (\d+)"
pattern8_bis = r"(\d+)\. (\d+)\. (\d+)"

months_str_to_int = dict(janvier='01', fevrier='02', mars='03', avril='04', mai='05', juin='06',
                         juillet='07', aout='08', septembre='09', novembre='10', octobre='11',
                         decembre='12')

def format_date(day:str, month:str):
    if len(day) == 1:
        day = '0' + day
    if not month.isnumeric():
        month = months_str_to_int[month.lower().replace('é', 'e').replace('ô', 'o')]
    elif len(month) == 1:
        month = '0' + month
    return day, month


In [63]:
patterns = [pattern1]
patterns_bis = [pattern1_bis]


window_trunc = 300
    

def find_sentences_with_dates(text: str, tr_idx) -> pd.DataFrame:
    """ return all sentences containing a date from corpus text """

    # remove ... from text
    text = re.sub(r"[.]{2,}", "", text)
    
    #initialze dataframe
    sentences_df = pd.DataFrame(columns=['tr_idx', 'sentence', 'class'])
    
    df = pd.DataFrame(columns=['tr_idx', 'sentence', 'class'])
    accident_date = y_tr_df.iloc[tr_idx]['date_accident']
    for pattern_idx, pattern in enumerate(patterns):
        for date in re.findall(pattern, text):
            reg_match = re.match(patterns_bis[pattern_idx], date)
            day, month, year = reg_match[1], reg_match[2], reg_match[3]
            day, month = format_date(day, month)
            current_date = f"{year}-{month}-{day}"
            sentences = [senten[:window_trunc] for senten in re.findall(r"[^.]*?"+date+r"[^.]*", text)]
            df = df.append(pd.DataFrame([[tr_idx, clean_text(senten), int(current_date == accident_date)] for senten in sentences],
                                columns=['tr_idx', 'sentence', 'class']), ignore_index=True)
        
    return df

In [64]:
senten_train_df = pd.DataFrame(columns=['tr_idx', 'sentence', 'class'])


Unnamed: 0,tr_idx,sentence,class
0,0,le 12/11/2019 cour d’appel d’agen chambre soci...,0
1,0,le 12/11/2019 cour d’appel d’agen chambre soci...,0
2,0,le 12/11/2019 cour d’appel d’agen chambre soci...,0
3,0,le 12/11/2019 cour d’appel d’agen chambre soci...,0
4,0,la cause a été débattue et plaidée en audienc...,0
5,0,le 9 avril 1991 monsieur x a été victime d’un...,1
6,0,suivant jugement en date du 24 avril 1997 con...,0
7,0,suivant jugement en date du 24 avril 1997 con...,0
8,0,transports morel employeur de monsieur x et a...,0
9,0,suivant jugement en date du 12 février 2 001 ...,0
