#  Inférence de l'effet - Stratégie Multilabels
Dans ce Notebook, nous cosntruisons un modèle qui permet d'inférer l'EFFET à partir de la classification de l'incident et des données textuelles

Nous considérons ce problème comme un problème de classification multiclasses et multilabels. En effet, il y a plusieurs effets possibles et un incidents peut entrainer plusieurs effets.

Ainsi, notre métrique d'évaluation sera le f1_samples

Dans le Notebook précedent, nous n'avions pas pris en compte l'aspect multilabel et notre score était de  f1_weighted = 0,28.

Volontairement dans un premier temps, nous ne modifions pas les paramères de notre modèle afin d'avaluer l'apport de la stratégie multilabelles.

In [2]:
import warnings
warnings.filterwarnings('ignore')

from pprint import pprint
from time import time
import logging

import pandas as pd

import numpy as np
import sklearn as sk
import seaborn as sns

import nltk
from nltk import word_tokenize
lang ='french'

import clean_text
import skmultilearn




import matplotlib.pyplot as plt

from sklearn.ensemble import BaggingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score, cross_validate

from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score,f1_score,classification_report
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD,IncrementalPCA,SparsePCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier

import spacy
nlp =spacy.load('fr')
from spacy.lang.fr.stop_words import STOP_WORDS

## 0.1 Chargement des données

In [324]:
%time
df_declaration_mrv = pd.read_csv("data/data_mrv/declaration_mrv_complet.csv")#delimiter=';',encoding='ISO-8859-1')
id_to_dco = pd.read_csv("data/ref_MRV/referentiel_dispositif.csv",delimiter=';',encoding='ISO-8859-1')
df_effets = pd.read_csv("data/ref_MRV/referentiel_dispositif_effets_connus.csv",delimiter=';',encoding='ISO-8859-1')
df_dys = pd.read_csv("data/ref_MRV/referentiel_dispositif_dysfonctionnement.csv",delimiter=';',encoding='ISO-8859-1')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.15 µs


## 0.2 Netoyage des données :
- Elimination de l'effet : PAS D'EFFET NEFASTE DECLARE
- Suppression des classes sous représentées (<15 occurences)
- Netoyyage des donénes textuelles
- Encodage de la classification et des effets

In [325]:
%%time
#On complète les effets vide comme étant sans effets
df_declaration_mrv['TYPE_EFFET']  = df_declaration_mrv['TYPE_EFFET'].fillna("PAS D'EFFET NEFASTE DECLARE")
df_declaration_mrv['TEF_ID']= df_declaration_mrv['TEF_ID'].fillna('E1213')

#on selectionne les colonnes avec des effets
df = df_declaration_mrv[['DESCRIPTION_INCIDENT','TYPE_VIGILANCE','LIBELLE_COMMERCIAL',
                         'REFERENCE_COMMERCIALE','ETAT_PATIENT','FABRICANT','DCO_ID',
                         'ACTION_PATIENT','CLASSIFICATION','TEF_ID']]#[df_declaration_mrv['TEF_ID']!='E1213']
# On complète les NaN avec du vide
df['ETAT_PATIENT'] = df['ETAT_PATIENT'].fillna("")
df['DESCRIPTION_INCIDENT'] = df['DESCRIPTION_INCIDENT'].fillna("")
df['LIBELLE_COMMERCIAL'] = df['LIBELLE_COMMERCIAL'].fillna("")
df['FABRICANT'] = df['FABRICANT'].fillna("")
df["REFERENCE_COMMERCIALE"] = df['REFERENCE_COMMERCIALE'].fillna("")
df['TYPE_VIGILANCE'] = df['TYPE_VIGILANCE'].fillna("")
df['CLASSIFICATION'] = df['CLASSIFICATION'].fillna('')
df['DCO_ID'] = df['DCO_ID'].fillna(-1)
#On nettoieles variables textueelles : 

for col in  ['DESCRIPTION_INCIDENT','LIBELLE_COMMERCIAL','ETAT_PATIENT','FABRICANT','ACTION_PATIENT'] :
    df[col] = df[col].map(lambda x: clean_text.preprocess_text(x))


print(len(df))
n = 15
# On filtre pour a voir plus de n observations par classse
df = df.groupby("TEF_ID").filter(lambda x: len(x) > n)
print(len(df))
le_v = LabelEncoder()
df.TYPE_VIGILANCE = le_v.fit_transform(df.TYPE_VIGILANCE.values)
le = LabelEncoder()
df.TEF_ID = le.fit_transform(df.TEF_ID.values)

df_m = df.groupby('DESCRIPTION_INCIDENT')['TEF_ID'].apply(list).reset_index(name='multilabels')


df_ = pd.merge(df,df_m, on = 'DESCRIPTION_INCIDENT')
df_['multilabels'] = df_['multilabels'].apply(np.array)
df_['multilabels'] = df_['multilabels'].map(np.unique)

#df_.to_csv('Multilabel_dataset.csv')

#df_ = df_.drop_duplicates('DESCRIPTION_INCIDENT')

76954
76402
CPU times: user 31.4 s, sys: 0 ns, total: 31.4 s
Wall time: 31.4 s


## 0.3 Construction du jeu de données d'entrainement et de test

In [326]:
# selection des train et test set
train_index,test_index = next(GroupShuffleSplit(random_state=1029).split(df_, groups=df_['DESCRIPTION_INCIDENT']))
df_train, df_test = df_.iloc[train_index], df_.iloc[test_index]
y = df_train.multilabels
y_test =df_test.multilabels

## 0.4 Encodage multilabel

In [349]:
lb = MultiLabelBinarizer()
y_lb =lb.fit_transform(y) 
y_test_lb = lb.transform(y_test)
X = df_train[['FABRICANT','CLASSIFICATION','DESCRIPTION_INCIDENT','ETAT_PATIENT']]
X_test = df_test[['FABRICANT','CLASSIFICATION','DESCRIPTION_INCIDENT','ETAT_PATIENT']]

## 1.1 Construction du pipeline avec une stratégie ONE-VS-REST

This strategy, also known as one-vs-all, is implemented in OneVsRestClassifier. The strategy consists in fitting one classifier per class. For each classifier, the class is fitted against all the other classes. In addition to its computational efficiency (only n_classes classifiers are needed), one advantage of this approach is its interpretability. Since each class is represented by one and only one classifier, it is possible to gain knowledge about the class by inspecting its corresponding classifier. This is the most commonly used strategy and is a fair default choice.

In [322]:
%%time
preprocess = ColumnTransformer(
    [('description_tfidf',TfidfVectorizer(sublinear_tf=True, min_df=3,
                            ngram_range=(1, 1),
                            stop_words=STOP_WORDS,
                            max_features = 10000,norm = 'l2'), 'DESCRIPTION_INCIDENT'),
     
     ('etat_pat_tfidf', TfidfVectorizer(sublinear_tf=True, min_df=3,ngram_range=(1, 1),
                                       stop_words=STOP_WORDS,
                                       max_features = 10000,norm = 'l2'), 'ETAT_PATIENT'),
     
     ('fabricant_tfidf',TfidfVectorizer(sublinear_tf=True, min_df=3,
                            ngram_range=(1, 1),
                            stop_words=STOP_WORDS,
                            max_features = 5000,norm = 'l2'), 'FABRICANT'),
    
    ('classification_enc', TfidfVectorizer(sublinear_tf=True, min_df=5,
                            ngram_range=(1, 1),
                            stop_words=STOP_WORDS,
                            max_features = 100,norm = 'l2'),'CLASSIFICATION')
     ],
    
    remainder='passthrough')


pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', OneVsRestClassifier(CalibratedClassifierCV(LinearSVC(class_weight='balanced'),cv=3, method='isotonic'))),
])

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 2.03 ms


## 1.2 Evaluation du pipeline en cross validation : l'importance de la séparation train/test

In [162]:
%%time
cv = KFold(n_splits=5, random_state=0)
result= cross_validate(pipeline, X, y_lb, scoring='f1_samples', cv=cv)
result['test_score'].mean()

CPU times: user 26min, sys: 0 ns, total: 26min
Wall time: 26min


0.4733920669007727

In [163]:
%%time
cv = ShuffleSplit(n_splits=5, random_state=0)
result= cross_validate(pipeline, X, y_lb, scoring='f1_samples', cv=cv)
result['test_score'].mean()

CPU times: user 30min 25s, sys: 0 ns, total: 30min 25s
Wall time: 30min 24s


0.903673111438492

In [20]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold, MultilabelStratifiedShuffleSplit
mrv =  pd.read_csv('./data_clean/clean_data.csv')
mrv = mrv[mrv['TEF_ID']!='E1213']

msss = MultilabelStratifiedKFold(n_splits=2, random_state=1029)

mlb = MultiLabelBinarizer()

for train_index, test_index in msss.split(mrv['text'], mlb.fit_transform(mrv['TEF_ID'])):
    train, test = mrv.loc[train_index],  mrv.loc[test_index]

X_train = train[['FABRICANT','CLASSIFICATION','DESCRIPTION_INCIDENT','ETAT_PATIENT']]
y_train = mlb.transform(train['TEF_ID'])

X_test = test[['FABRICANT','CLASSIFICATION','DESCRIPTION_INCIDENT','ETAT_PATIENT']]
y_test = mlb.transform(test['TEF_ID'])

pipeline.fit(X_train,y_train)

y_pred = pipeline.predict(X_test)
f1 = f1_score(y_test , y_pred,average='samples')
print('f1_score samples : ',f1)

f1_score samples :  0.8488677419996127


In [30]:
len(mrv['TEF_ID'].unique())

2994

In [34]:
mlb = MultiLabelBinarizer()
mlb.fit(mrv['TEF_ID'].values)
a = mlb.transform(mrv['TEF_ID'])
a.shape

(32731, 14)

In [23]:
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.54      0.62      0.58      2615
           1       0.54      0.62      0.58      2615
           2       0.85      0.83      0.84     11388
           3       0.94      0.92      0.93     14604
           4       0.47      0.56      0.51      2711
           5       0.66      0.68      0.67      3758
           6       0.49      0.54      0.51      1681
           7       0.69      0.74      0.72      2286
           8       0.81      0.79      0.80     11223
           9       0.61      0.66      0.63      2282
          10       0.53      0.57      0.55      1327
          11       0.53      0.62      0.57      2306
          12       1.00      1.00      1.00     16389
          13       1.00      1.00      1.00     16389

   micro avg       0.84      0.85      0.85     91574
   macro avg       0.69      0.73      0.71     91574
weighted avg       0.85      0.85      0.85     91574
 samples avg       0.86   

In [None]:
## 1.3 Si on supprimer les doublons ?

In [317]:
## Supression des doublons
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


df_s = df_.drop_duplicates('DESCRIPTION_INCIDENT')

lb_s = MultiLabelBinarizer()
y_s = df_s.multilabels
y_s  = lb_s.fit_transform(y_s)

df_s = df_s[['FABRICANT','CLASSIFICATION','TYPE_VIGILANCE','DESCRIPTION_INCIDENT','ETAT_PATIENT']]

mskf = MultilabelStratifiedKFold(n_splits=2, random_state=0)

train_index_s,test_index_s = next(mskf.split(df_s,y_s))
df_train_s, df_test_s = df_s.iloc[train_index_s], df_s.iloc[test_index_s]
y_train_s = y_s[train_index_s]
y_test_s =y_s[test_index_s]

#lb_s = MultiLabelBinarizer()
#y_train_lb_s =lb_s.fit_transform(y_train_s) 
#y_test_lb_s = lb.transform(y_test_s)
X_train_s = df_train_s[['FABRICANT','CLASSIFICATION','TYPE_VIGILANCE','DESCRIPTION_INCIDENT','ETAT_PATIENT']]
X_test_s = df_test_s[['FABRICANT','CLASSIFICATION','TYPE_VIGILANCE','DESCRIPTION_INCIDENT','ETAT_PATIENT']]

pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced'))),
])

pipeline.fit(X_train_s,y_train_s)
#y_test_lb = lb.transform(y_test)
Y_pred_ovr_s = pipeline.predict(X_test_s)
f1 = f1_score(y_test_s , Y_pred_ovr_s,average='samples')
print('f1_score samples : ',f1)

f1_score samples :  0.5008347403295654


In [318]:
print(classification_report(y_test_s , Y_pred_ovr_s))

              precision    recall  f1-score   support

           0       1.00      0.22      0.36        18
           1       1.00      0.09      0.17        11
           2       0.00      0.00      0.00         7
           3       0.68      0.36      0.47        36
           4       0.40      0.16      0.23        25
           5       0.64      0.55      0.59       110
           6       0.30      0.18      0.22       119
           7       1.00      0.20      0.33         5
           8       0.50      0.10      0.16        21
           9       0.32      0.12      0.17        50
          10       1.00      0.15      0.27        13
          11       0.40      0.09      0.14        23
          12       0.60      0.14      0.22        22
          13       0.40      0.11      0.17        18
          14       1.00      0.20      0.33        20
          15       1.00      0.08      0.14        13
          16       0.00      0.00      0.00         7
          17       1.00    

## 1.3 Evaluation du pipeline sur les données de test
### Avec le SVM probabilisé

In [350]:
%%time
pipeline.fit(X,y_lb)
#y_test_lb = lb.transform(y_test)
Y_pred_ovr = pipeline.predict(X_test)
f1 = f1_score(y_test_lb , Y_pred_ovr,average='samples')
print('f1_score samples : ',f1)

TypeError: 'numpy.float64' object is not callable

In [352]:
f1 = f1_score(y_test_lb , Y_pred_ovr,average='samples')
print('f1_score samples : ',f1)

f1_score samples :  0.5944730016530945


In [329]:
print(classification_report(y_test_lb , Y_pred_ovr))

              precision    recall  f1-score   support

           0       0.33      0.08      0.13        12
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00        38
           3       0.94      0.32      0.48        50
           4       0.00      0.00      0.00        15
           5       0.60      0.09      0.16        64
           6       0.91      0.09      0.17       322
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00        37
           9       0.00      0.00      0.00        36
          10       0.00      0.00      0.00         6
          11       1.00      0.11      0.20        18
          12       1.00      0.17      0.29        12
          13       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        17
          15       0.00      0.00      0.00         7
          16       0.00      0.00      0.00         5
          17       0.00    

In [361]:
import joblib
filename = 'Encoder_effets.sav'
joblib.dump(le, filename)


['Encoder_effets.sav']

In [354]:
model_e = joblib.load(filename)

In [359]:
l = X_test[['FABRICANT','CLASSIFICATION','DESCRIPTION_INCIDENT','ETAT_PATIENT']].fillna('').loc[1:1]

In [360]:
pred = model_e.predict_proba(l)

In [346]:
def get_name(x:int,dico,col)-> str :
    """
    Renvoie le nom du dispositif à partir du numéro précisé dans le reférentiel
    si le numéro n'existe pas, alors il ne renvoi rien
    """
    try :
        return (df_effets[df_effets['TEF_ID']==int(x[1:])]['TYPE_EFFET'].iloc[0])
    except :
        return("")

In [347]:
pred = model_e.predict_proba(X_test.loc[1:1])
df_r = pd.DataFrame(pred[0])
df_r["class"] = le.inverse_transform(df_r.index.values)
df_r['class_name'] = df_r['class'].apply(lambda x :  get_name(x))
df_r['proba'] = df_r[0]
df_r = df_r.drop(0,axis=1)
df_r = df_r.sort_values('proba',ascending=False)

In [348]:
df_r

Unnamed: 0,class,class_name,proba
82,E1232,REACTION ALLERGIQUE,0.559276
73,E1213,PAS D'EFFET NEFASTE DECLARE,0.444602
6,E1111,ALLERGIE,0.106148
131,E1467,SF DERMATO,0.099553
140,E1476,PRURIT,0.097375
...,...,...,...
119,E1449,SF CARDIO,0.000000
33,E1146,DISSECTION,0.000000
134,E1470,SC DIGESTIF,0.000000
77,E1218,PERTE DE CHANCE PROCREATION,0.000000


### Sans le SVM probabilisé

In [298]:
pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced'))),
])

In [310]:
%%time
pipeline.fit(X,y_lb)
Y_pred_ovr = pipeline.predict(X_test)
f1 = f1_score(y_test_lb , Y_pred_ovr,average='samples')
print('f1_score samples : ',f1)

f1_score samples :  0.631352294551771
CPU times: user 2min 55s, sys: 0 ns, total: 2min 55s
Wall time: 2min 55s


In [300]:
print('f1_score samples : ',f1)

f1_score samples :  0.572080991241877


In [313]:
print(classification_report(y_test_lb , Y_pred_ovr))

              precision    recall  f1-score   support

           0       0.50      0.08      0.14        12
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00        38
           3       0.96      0.50      0.66        50
           4       1.00      0.07      0.12        15
           5       0.43      0.50      0.46        64
           6       0.60      0.30      0.40       322
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00        37
           9       0.38      0.22      0.28        36
          10       1.00      0.67      0.80         6
          11       1.00      0.11      0.20        18
          12       0.57      0.33      0.42        12
          13       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        17
          15       0.50      0.29      0.36         7
          16       0.00      0.00      0.00         5
          17       0.00    

In [None]:
               precision    recall  f1-score   support   
samples avg       0.62      0.66      0.62     23357
(sans doublon)
Samples avg      0.67      0.65      0.63     42151
(avec doublon)

## Commentaire : 
#### **Les principaux enseignements sont les suivants :**
1)  L'approche multilabels est pertinente : Nous obtenons enfin un score "correct" 0.55 qui se rapproche de la précision à 4 du model simple Label. Ce qui est assez cohérent car il y a en moyenne 4 classes par evénement. Cela signifie que nous avons simplement tirer partie de la métrique mais que notre modèle n'apprend pas mieux... Dommage ! C'est en fait assez logique car le SVM en multiclasse applique déja une stratégie One vs REST

2)  Elle est pertinente.. Mais moins qu'on pourrait le penser avec la shuffle validation : comme le montre la différence entre le Kfold et le ShuffleSplit, les doublons joue un rôle important. Cette différence s'explique apr le fait que les doublons joue un rôle important dans notre base de données car ils ne sont pas exactement des "doublons". De plus ils nous permettent, de tenir compte de la proportion des classes dans notre evaluation.

3) Cette approche multilabels combinée avec la probalisation du SVM est couteuse en temps de construction de modèle et nous devrons le prendre en compte dans notre approche de finetuning.

#### **Les pistes d'améliorations sont :**

1) Finetuner le modèle pour vérifier que notre jeu de paramètre n'est pas un cas particulier de performances 


2) Essayer une approche
- MultiOutputClassifier
- ClassifierChain https://scikit-learn.org/stable/auto_examples/multioutput/plot_classifier_chain_yeast.html

3) Essayer le one-sht learning

## 2.1 Optune et la marge de progression du modèle
Pour le finetunning, nous enlevons la probabilisation du modèle

In [180]:
pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced'))),
])

In [184]:
import optuna
from optuna import Trial

def objective(trial):    
    
    train_index,test_index = next(GroupShuffleSplit(random_state=1029).split(X, groups=X['DESCRIPTION_INCIDENT']))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    y_train_lb = lb.transform(y_train)
    y_test_lb = lb.transform(y_test)
    
    #Etat Patient
    #vect__etat_pat_tfidf__analyzer = trial.suggest_categorical('vect__etat_pat_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__etat_pat_tfidf__max_features = trial.suggest_int('vect__etat_pat_tfidf__max_features', 500, 20_000)
    vect__etat_pat_tfidf__min_df =  trial.suggest_int('vect__etat_pat_tfidf__min_df', 1,5)
    vect__etat_pat_tfidf__norm = trial.suggest_categorical('vect__etat_pat_tfidf__norm', ('l1', 'l2'))
    #Description
    #vect__description_tfidf__analyzer = trial.suggest_categorical('vect__description_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__description_tfidf__max_features = trial.suggest_int('vect__description_tfidf__max_features', 1500, 60_000)
    vect__description_tfidf__min_df =  trial.suggest_int('vect__description_tfidf__min_df', 1,5)
    vect__description_tfidf__norm = trial.suggest_categorical('vect__description_tfidf__norm', ('l1', 'l2'))
    #Fabricant
    vect__fabricant_tfidf__analyzer = trial.suggest_categorical('vect__fabricant_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__fabricant_tfidf__max_features = trial.suggest_int('vect__fabricant_tfidf__max_features', 500, 10_000)
    vect__fabricant_tfidf__min_df =  trial.suggest_int('vect__fabricant_tfidf__min_df', 1,5)
    vect__fabricant_tfidf__norm = trial.suggest_categorical('vect__fabricant_tfidf__norm', ('l1', 'l2'))
    #action patient
    
    #Classification
    vect__classification_enc__analyzer = trial.suggest_categorical('vect__classification_enc__analyzer', ['word', 'char', 'char_wb']) 
    vect__classification_enc__max_features = trial.suggest_int('vect__classification_enc__max_features', 500, 5000)
    vect__classification_enc__min_df =  trial.suggest_int('vect__classification_enc__min_df', 1,5)
    vect__classification_enc__norm = trial.suggest_categorical('vect__classification_enc__norm', ('l1', 'l2'))
    
    #clf__C =trial.suggest_loguniform('svr_c', 1e-5, 1e5)
    

    
    params = {
        #'vect__etat_pat_tfidf__analyzer':vect__etat_pat_tfidf__analyzer,
        'vect__etat_pat_tfidf__max_features': vect__etat_pat_tfidf__max_features,
        'vect__etat_pat_tfidf__min_df':vect__etat_pat_tfidf__min_df,
        'vect__etat_pat_tfidf__norm':vect__etat_pat_tfidf__norm,
        
        #'vect__description_tfidf__analyzer':vect__description_tfidf__analyzer,
        'vect__description_tfidf__max_features': vect__description_tfidf__max_features,
        'vect__description_tfidf__min_df':vect__description_tfidf__min_df,
        'vect__description_tfidf__norm':vect__description_tfidf__norm,
        
        'vect__fabricant_tfidf__analyzer':vect__fabricant_tfidf__analyzer,
        'vect__fabricant_tfidf__max_features': vect__fabricant_tfidf__max_features,
        'vect__fabricant_tfidf__min_df':vect__fabricant_tfidf__min_df,
        'vect__fabricant_tfidf__norm':vect__fabricant_tfidf__norm,
        
        
        'vect__classification_enc__analyzer':vect__classification_enc__analyzer,
        'vect__classification_enc__max_features': vect__classification_enc__max_features,
        'vect__classification_enc__min_df':vect__classification_enc__min_df,
        'vect__classification_enc__norm':vect__classification_enc__norm,
        
        #'clf__C':clf__C
    }
    
    pipeline.set_params(**params)
    pipeline.fit(X_train,y_train_lb)
    Y_pred_ovr = pipeline.predict(X_test)
    score = f1_score(y_test_lb , Y_pred_ovr,average='samples')
    
    return score

In [185]:
time_out = 75*100
studyName = 'EFFET_ml_optimisation_svm'
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=time_out)

#Suvegarde du resultat
df = study.trials_dataframe()
df.to_json(studyName+'.json')
print(study.best_trial)

[32m[I 2020-06-10 18:43:30,547][0m Finished trial#0 with value: 0.5268400357946823 with parameters: {'vect__etat_pat_tfidf__max_features': 9211, 'vect__etat_pat_tfidf__min_df': 4, 'vect__etat_pat_tfidf__norm': 'l2', 'vect__description_tfidf__max_features': 34910, 'vect__description_tfidf__min_df': 4, 'vect__description_tfidf__norm': 'l1', 'vect__fabricant_tfidf__analyzer': 'char_wb', 'vect__fabricant_tfidf__max_features': 7623, 'vect__fabricant_tfidf__min_df': 3, 'vect__fabricant_tfidf__norm': 'l1', 'vect__action_pat_tfidf__analyzer': 'char', 'vect__classification_enc__max_features': 3078, 'vect__classification_enc__min_df': 5, 'vect__classification_enc__norm': 'l1'}. Best is trial#0 with value: 0.5268400357946823.[0m
[32m[I 2020-06-10 18:45:45,297][0m Finished trial#1 with value: 0.5314562900155817 with parameters: {'vect__etat_pat_tfidf__max_features': 2892, 'vect__etat_pat_tfidf__min_df': 1, 'vect__etat_pat_tfidf__norm': 'l2', 'vect__description_tfidf__max_features': 36279, 've

FrozenTrial(number=13, value=0.5638014529635027, datetime_start=datetime.datetime(2020, 6, 10, 19, 7, 53, 904403), datetime_complete=datetime.datetime(2020, 6, 10, 19, 9, 11, 38205), params={'vect__etat_pat_tfidf__max_features': 19700, 'vect__etat_pat_tfidf__min_df': 4, 'vect__etat_pat_tfidf__norm': 'l2', 'vect__description_tfidf__max_features': 20401, 'vect__description_tfidf__min_df': 2, 'vect__description_tfidf__norm': 'l2', 'vect__fabricant_tfidf__analyzer': 'word', 'vect__fabricant_tfidf__max_features': 9992, 'vect__fabricant_tfidf__min_df': 3, 'vect__fabricant_tfidf__norm': 'l1', 'vect__action_pat_tfidf__analyzer': 'word', 'vect__classification_enc__max_features': 2062, 'vect__classification_enc__min_df': 4, 'vect__classification_enc__norm': 'l1'}, distributions={'vect__etat_pat_tfidf__max_features': IntUniformDistribution(high=20000, low=500, step=1), 'vect__etat_pat_tfidf__min_df': IntUniformDistribution(high=5, low=1, step=1), 'vect__etat_pat_tfidf__norm': CategoricalDistribut

In [226]:
%%time
Param = {'vect__etat_pat_tfidf__max_features': 19700, 'vect__etat_pat_tfidf__min_df': 4, 'vect__etat_pat_tfidf__norm': 'l2', 
         'vect__description_tfidf__max_features': 20401, 'vect__description_tfidf__min_df': 2, 'vect__description_tfidf__norm': 'l2', 
         'vect__fabricant_tfidf__analyzer': 'word', 'vect__fabricant_tfidf__max_features': 9992, 'vect__fabricant_tfidf__min_df': 3, 
         'vect__fabricant_tfidf__norm': 'l1', 'vect__classification_enc_analyzer': 'word', 'vect__classification_enc__max_features': 2062,
         'vect__classification_enc__min_df': 4, 'vect__classification_enc__norm': 'l1'}

preprocess2 = ColumnTransformer(
    [('description_tfidf',TfidfVectorizer(sublinear_tf=True, min_df=2,
                            ngram_range=(1, 1),
                            stop_words=STOP_WORDS,
                            max_features = 20401,norm = 'l2'), 'DESCRIPTION_INCIDENT'),
     
     ('etat_pat_tfidf', TfidfVectorizer(sublinear_tf=True, min_df=4,
                                        ngram_range=(1, 1),
                                        stop_words=STOP_WORDS,
                                        max_features = 19700,
                                        norm = 'l2'), 'ETAT_PATIENT'),
     
     ('fabricant_tfidf',TfidfVectorizer(sublinear_tf=True, min_df=3,
                            ngram_range=(1, 1),
                            stop_words=STOP_WORDS,
                            max_features = 9992,norm = 'l1'), 'FABRICANT'),
    
    ('classification_enc', TfidfVectorizer(sublinear_tf=True, min_df=4,
                            ngram_range=(1, 1),
                            stop_words=STOP_WORDS,
                            max_features = 2062,norm = 'l2'),'CLASSIFICATION')
     ],
    
    remainder='passthrough')


pipeline = Pipeline([
    ('vect', preprocess2),
    ('clf', OneVsRestClassifier(CalibratedClassifierCV(LinearSVC(class_weight='balanced'),cv=3, method='isotonic'))),
])


pipeline.fit(X,y_lb)
Y_pred_ovr = pipeline.predict(X_test)
f1 = f1_score(y_test_lb , Y_pred_ovr,average='samples')
print('f1_score samples : ',f1)

f1_score samples :  0.5491033504163741


In [227]:
print(classification_report(y_test_lb , Y_pred_ovr))

              precision    recall  f1-score   support

           0       0.36      0.20      0.26        49
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00        28
           3       0.78      0.31      0.44        68
           4       0.00      0.00      0.00        34
           5       0.45      0.37      0.41        68
           6       0.00      0.00      0.00       353
           7       1.00      0.76      0.86        21
           8       1.00      0.26      0.41        23
           9       0.00      0.00      0.00        34
          10       0.00      0.00      0.00        25
          11       0.00      0.00      0.00        22
          12       0.67      0.12      0.21        16
          13       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        39
          15       0.00      0.00      0.00        18
          16       0.00      0.00      0.00         1
          17       0.00    

## Commantaire
 Notre première approximation des performances était la bonne. En effet, le finetunning des paramètres des TFIDF n'a pas permis d'augmenter significativmement les résultats
 
## 3.0 L'approche Multioutput

> Multioutput classification support can be added to any classifier with MultiOutputClassifier. This strategy consists of fitting one classifier per target. This allows multiple target variable classifications. The purpose of this class is to extend estimators to be able to estimate a series of target functions (f1,f2,f3…,fn) that are trained on a single X predictor matrix to predict a series of responses (y1,y2,y3…,yn).

https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html




In [230]:
from sklearn.multioutput import MultiOutputClassifier

pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', MultiOutputClassifier(LinearSVC(class_weight='balanced'))),
])
#### prédiction 
pipeline.fit(X,y_lb)
Y_pred_ovr = pipeline.predict(X_test)
f1 = f1_score(y_test_lb , Y_pred_ovr,average='samples')
print('f1_score samples : ',f1)

f1_score samples :  0.572080991241877


In [231]:
print(classification_report(y_test_lb , Y_pred_ovr))

              precision    recall  f1-score   support

           0       1.00      0.20      0.34        49
           1       1.00      0.50      0.67         4
           2       0.00      0.00      0.00        28
           3       0.66      0.31      0.42        68
           4       0.57      0.12      0.20        34
           5       0.42      0.50      0.46        68
           6       0.29      0.11      0.16       353
           7       1.00      0.76      0.86        21
           8       0.88      0.30      0.45        23
           9       0.10      0.03      0.05        34
          10       0.00      0.00      0.00        25
          11       0.00      0.00      0.00        22
          12       0.67      0.12      0.21        16
          13       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        39
          15       0.00      0.00      0.00        18
          16       0.00      0.00      0.00         1
          17       0.00    

### Commentaire
Comme attendu, nous n'observons pas de grande différence car les deux approches sont très similaires

## 3.1 Approche One vs One

>This strategy consists in fitting one classifier per class pair. At prediction time, the class which received the most votes is selected. Since it requires to fit n_classes * (n_classes - 1) / 2 classifiers, this method is usually slower than one-vs-the-rest, due to its O(n_classes^2) complexity. However, this method may be advantageous for algorithms such as kernel algorithms which don’t scale well with n_samples. This is because each individual learning problem only involves a small subset of the data whereas, with one-vs-the-rest, the complete dataset is used n_classes times.



In [240]:
%%time
from sklearn.multiclass import OneVsOneClassifier
pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', MultiOutputClassifier(OneVsOneClassifier(LinearSVC(class_weight='balanced')))),
])
#### prédiction 
pipeline.fit(X,y_lb)
Y_pred_ovr = pipeline.predict(X_test)
f1 = f1_score(y_test_lb , Y_pred_ovr,average='samples')
print('f1_score samples : ',f1)

f1_score samples :  0.572080991241877


### Commentaire
Nous n'oservons pas de changement de performances, seulement une hausse du temps de calcul
## 3.2 l'approche ClassifierChain
>A multi-label model that arranges binary classifiers into a chain.
Each model makes a prediction in the order specified by the chain using all of the available features provided to the model plus the predictions of models that are earlier in the chain.



In [188]:
from sklearn.multioutput import ClassifierChain

In [189]:
train_index_,test_index_ = next(GroupShuffleSplit(random_state=1029).split(X, groups=X['DESCRIPTION_INCIDENT']))
X_train_, X_test_ = X.iloc[train_index_], X.iloc[test_index_]
y_train_, y_test_ = y.iloc[train_index_], y.iloc[test_index_]

X_train_, X_test_ =preprocess.fit_transform(X_train_),preprocess.transform(X_test_)
y_train_lb_ = lb.transform(y_train_)
y_test_lb_ = lb.transform(y_test_)

In [245]:
%%time
X_train, X_test_ =preprocess.fit_transform(X),preprocess.transform(X_test)
clf = LinearSVC(class_weight='balanced')


chains = [ClassifierChain(clf, order='random', random_state=i) for i in range(10)]

for chain in chains:
    chain.fit(X_train, y_lb)
    
y_pred_chains = np.array([chain.predict(X_test_) for chain in chains])

chain_f1_scores = [f1_score(y_test_lb, y_pred_chain, average='samples') for y_pred_chain in y_pred_chains]

y_pred_ensemble = y_pred_chains.mean(axis=0)

y_e = y_pred_ensemble>=0.4

ensemble_f1_score = f1_score(y_test_lb,y_e, average='samples')

print(ensemble_f1_score)

0.5956181738659063


In [321]:
print(classification_report(y_test_lb,y_e))

              precision    recall  f1-score   support

           0       1.00      0.20      0.34        49
           1       1.00      0.50      0.67         4
           2       0.00      0.00      0.00        28
           3       0.66      0.31      0.42        68
           4       0.57      0.12      0.20        34
           5       0.46      0.63      0.53        68
           6       0.29      0.11      0.16       353
           7       1.00      0.76      0.86        21
           8       1.00      0.30      0.47        23
           9       0.11      0.03      0.05        34
          10       0.00      0.00      0.00        25
          11       0.67      0.18      0.29        22
          12       0.50      0.19      0.27        16
          13       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        39
          15       0.00      0.00      0.00        18
          16       0.00      0.00      0.00         1
          17       0.00    

In [224]:
#Random test
test = np.random.rand(y_e.shape[0],y_e.shape[1])>=0.5
f1_score(y_test_lb_,test, average='samples')

0.04225001471861698

### Commentaire : 
L'approche chain combiner à une méthode ensemblise améliore les performaces de quelques pourcent mais l'ordre de grandeur reste le même.

L'approche multilabel est une piste à suivre mais elle n'améliore pas ou peu la qualité de notre apprentissage.

### Si on appliue un mapping inteligent

In [363]:
%%time
import json 

df_declaration_mrv = pd.read_csv("data/data_mrv/declaration_mrv_complet.csv")#delimiter=';',encoding='ISO-8859-1')
id_to_dco = pd.read_csv("data/ref_MRV/referentiel_dispositif.csv",delimiter=';',encoding='ISO-8859-1')
df_effets = pd.read_csv("data/ref_MRV/referentiel_dispositif_effets_connus.csv",delimiter=';',encoding='ISO-8859-1')
df_dys = pd.read_csv("data/ref_MRV/referentiel_dispositif_dysfonctionnement.csv",delimiter=';',encoding='ISO-8859-1')

with open ('mapping_effet.json', 'r') as file :
    mapping_effet = json.load(file)
    
#On complète les effets vide comme étant sans effets
df_declaration_mrv['TYPE_EFFET']  = df_declaration_mrv['TYPE_EFFET'].fillna("PAS D'EFFET NEFASTE DECLARE")
df_declaration_mrv['TEF_ID']= df_declaration_mrv['TEF_ID'].fillna('E1213')


#on selectionne les colonnes avec des effets
df = df_declaration_mrv[['DESCRIPTION_INCIDENT','TYPE_VIGILANCE','LIBELLE_COMMERCIAL',
                         'REFERENCE_COMMERCIALE','ETAT_PATIENT','FABRICANT','DCO_ID',
                         'ACTION_PATIENT','CLASSIFICATION','TYPE_EFFET','TEF_ID']][df_declaration_mrv['TEF_ID']!='E1213']
# On complète les NaN avec du vide
df['ETAT_PATIENT'] = df['ETAT_PATIENT'].fillna("")
df['DESCRIPTION_INCIDENT'] = df['DESCRIPTION_INCIDENT'].fillna("")
df['LIBELLE_COMMERCIAL'] = df['LIBELLE_COMMERCIAL'].fillna("")
df['FABRICANT'] = df['FABRICANT'].fillna("")
df["REFERENCE_COMMERCIALE"] = df['REFERENCE_COMMERCIALE'].fillna("")
df['TYPE_VIGILANCE'] = df['TYPE_VIGILANCE'].fillna("")
df['CLASSIFICATION'] = df['CLASSIFICATION'].fillna('')
df['DCO_ID'] = df['DCO_ID'].fillna(-1)
#On nettoieles variables textueelles : 

for col in  ['DESCRIPTION_INCIDENT','LIBELLE_COMMERCIAL','ETAT_PATIENT','FABRICANT','ACTION_PATIENT'] :
    df[col] = df[col].map(lambda x: clean_text.preprocess_text(x))

def apply_mapping(x,mapping):
    cle = list(mapping.keys())
    if x in cle:
        return(x)
    else:
        for elt in cle:
            if x in mapping[elt] :
                return elt

n = 15
# On filtre pour a voir plus de n observations par classse
df = df.groupby("TEF_ID").filter(lambda x: len(x) > n)

print(len(df.groupby("TYPE_EFFET")))
df.TYPE_EFFET = df.TYPE_EFFET.map(lambda x: apply_mapping(x,mapping_effet))
print(len(df.groupby("TYPE_EFFET")))

le = LabelEncoder()
df.TYPE_VIGILANCE = le.fit_transform(df.TYPE_VIGILANCE.values)
le = LabelEncoder()
df.TYPE_EFFET = le.fit_transform(df.TYPE_EFFET.values)


# Encodage des varaible  multilabelle
df_m = df.groupby('DESCRIPTION_INCIDENT')['TYPE_EFFET'].apply(list).reset_index(name='multilabels')


df_ = pd.merge(df,df_m, on = 'DESCRIPTION_INCIDENT')
df_['multilabels'] = df_['multilabels'].apply(np.array)
df_['multilabels'] = df_['multilabels'].map(np.unique)


# selection des train et test set
train_index,test_index = next(GroupShuffleSplit(random_state=1029).split(df_, groups=df_['DESCRIPTION_INCIDENT']))
df_train, df_test = df_.iloc[train_index], df_.iloc[test_index]
y = df_train.multilabels
y_test =df_test.multilabels

lb = MultiLabelBinarizer()
y_lb =lb.fit_transform(y) 
y_test_lb = lb.transform(y_test)
X = df_train[['FABRICANT','CLASSIFICATION','DESCRIPTION_INCIDENT','ETAT_PATIENT']]
X_test = df_test[['FABRICANT','CLASSIFICATION','DESCRIPTION_INCIDENT','ETAT_PATIENT']]

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10.3 µs
172
145


In [364]:
%%time
from sklearn.multioutput import MultiOutputClassifier

pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', MultiOutputClassifier(LinearSVC(class_weight='balanced'))),
])
#### prédiction 
pipeline.fit(X,y_lb)
Y_pred_ovr = pipeline.predict(X_test)
f1 = f1_score(y_test_lb , Y_pred_ovr,average='samples')
print('f1_score samples : ',f1)

f1_score samples :  0.5916493372568136
CPU times: user 1min 38s, sys: 0 ns, total: 1min 38s
Wall time: 1min 39s


In [366]:
len(mapping_effet.keys())

237

In [247]:
C = sk.metrics.multilabel_confusion_matrix(y_test_lb,y_e)

In [257]:
df_re = pd.DataFrame(classification_report(y_test_lb , Y_pred_ovr,output_dict=True)).T

In [273]:
df_re['Class'] = le.inverse_transform(df_re.index.map(int).values[:-5])

ValueError: invalid literal for int() with base 10: 'micro avg'

In [269]:
L =df_re.index.to_list()

In [271]:
L.map(int)

AttributeError: 'list' object has no attribute 'map'