# Inférence de la GRAVITE à partir des données de MRV 
## Optuna et SMV Ordinal
La variable GRAVITE représente la gravité de l'évenement avec 5 echelons




**Stratégie ML 1.1**



## 0) Chargement des librairies

In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd



import clean_text

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score,f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.calibration import CalibratedClassifierCV
import spacy
nlp =spacy.load('fr')
from spacy.lang.fr.stop_words import STOP_WORDS


import joblib

import optuna
from optuna import Trial

from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score,f1_score

In [3]:
%time
df_declaration_mrv = pd.read_csv("data/data_mrv/declaration_mrv_complet.csv")#delimiter=';',encoding='ISO-8859-1')
id_to_dco = pd.read_csv("data/ref_MRV/referentiel_dispositif.csv",delimiter=';',encoding='ISO-8859-1')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.91 µs


## 1) Constructions du jeux d'évaluation
On met de coté environ 20% du dataset pour l'évaluation et on ne garde pour l'entrainement seulement les classes avec plus de 10 observations

In [4]:
%%time
df_declaration_mrv = pd.read_csv("data/data_mrv/declaration_mrv_complet.csv")#delimiter=';',encoding='ISO-8859-1')
id_to_dco = pd.read_csv("data/ref_MRV/referentiel_dispositif.csv",delimiter=';',encoding='ISO-8859-1')

#Supression des NaN

#Charegement des colonnes utiles et suppression des NaN

df = df_declaration_mrv[['DESCRIPTION_INCIDENT','TYPE_VIGILANCE','LIBELLE_COMMERCIAL','DCO_ID',
                         'REFERENCE_COMMERCIALE','ETAT_PATIENT','ACTION_PATIENT','FABRICANT',
                          'GRAVITE','CLASSIFICATION']][df_declaration_mrv['GRAVITE'].notna()]

# On complète les NaN avec du vide
df['ETAT_PATIENT'] = df['ETAT_PATIENT'].fillna("")
df['DESCRIPTION_INCIDENT'] = df['DESCRIPTION_INCIDENT'].fillna("")
df['LIBELLE_COMMERCIAL'] = df['LIBELLE_COMMERCIAL'].fillna("")
df['FABRICANT'] = df['FABRICANT'].fillna("")
df["REFERENCE_COMMERCIALE"] = df['REFERENCE_COMMERCIALE'].fillna("")
df['TYPE_VIGILANCE'] = df['TYPE_VIGILANCE'].fillna("")
df['CLASSIFICATION'] = df['CLASSIFICATION'].fillna('')
df['ACTION_PATIENT'] = df['ACTION_PATIENT'].fillna('')
df['DCO_ID'] = df['DCO_ID'].fillna(-1)

# On ajoute des collones pertinentes
df['des_lib'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['DESCRIPTION_INCIDENT']
df['fab_lib'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['FABRICANT']
df['com'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['REFERENCE_COMMERCIALE']
df['Text'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['FABRICANT'] + "" + df['DESCRIPTION_INCIDENT']

# On nettoie les données :
for col in  ['DESCRIPTION_INCIDENT','LIBELLE_COMMERCIAL','ETAT_PATIENT','Text',"des_lib","fab_lib"] :
        df[col] = df[col].map(lambda x: clean_text.preprocess_text(x))

n = 15
# On filtre pour a voir plus de n observations par classse
df_n = df.groupby("GRAVITE").filter(lambda x: len(x) > n)

# On encode les labels
def GRAVITE_ENC(x):
    if x =='NULLE':
        return 0
    elif x == 'MINEU':
        return 1
    elif x == 'MOYEN':
        return 2
    elif x== 'SEVER':
        return 3
    elif x== 'CRITI':
        return 4
df_n.GRAVITE = df_n.GRAVITE.map(lambda x:GRAVITE_ENC(x))

#OnEncode les autres varibles
le = LabelEncoder()
#On encode le type de vigilance
#df_n.TYPE_VIGILANCE = le.fit_transform(df_n.TYPE_VIGILANCE.values)
#On encode la classifcation 
#df_n.CLASSIFICATION = le.fit_transform(df_n.CLASSIFICATION.values)
#on encode le DCO
#df_n.DCO_ID = le.fit_transform(df_n.DCO_ID.values)

# On selection les variables de test en faisant attention aux doublons
train_index,test_index = next(GroupShuffleSplit(random_state=1029,test_size=0.2).split(df_n, groups=df_n['DESCRIPTION_INCIDENT']))
df_train, df_test = df_n.iloc[train_index], df_n.iloc[test_index]


CPU times: user 36.8 s, sys: 400 ms, total: 37.2 s
Wall time: 37.2 s


In [7]:
from sklearn.base import clone


class OrdinalClassifier():
    
    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}
    
    def fit(self, X, y):
        self.unique_class = np.sort(np.unique(y))
        if self.unique_class.shape[0] > 2:
            for i in range(self.unique_class.shape[0]-1):
                # for each k - 1 ordinal value we fit a binary classification problem
                binary_y = (y > self.unique_class[i]).astype(np.uint8)
                clf = clone(self.clf)
                clf.fit(X, binary_y)
                self.clfs[i] = clf
    
    def predict_proba(self, X):
        clfs_predict = {k:self.clfs[k].predict_proba(X) for k in self.clfs}
        predicted = []
        for i,y in enumerate(self.unique_class):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - clfs_predict[y][:,1])
            elif y in clfs_predict:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                 predicted.append(clfs_predict[y-1][:,1] - clfs_predict[y][:,1])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(clfs_predict[y-1][:,1])
        return np.vstack(predicted).T
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

## 2) Construction du pipeline  pour la gravité ordonnée

In [8]:
%%time
preprocess = ColumnTransformer(
    [('etat_pat_tfidf', TfidfVectorizer(sublinear_tf=True,
                                       stop_words=STOP_WORDS), 'ETAT_PATIENT'),
     
     ('description_tfidf',TfidfVectorizer(sublinear_tf=True,
                                       stop_words=STOP_WORDS), 'DESCRIPTION_INCIDENT'),
     
     ('action_pat_tfidf',TfidfVectorizer(sublinear_tf=True,
                                       stop_words=STOP_WORDS), 'ACTION_PATIENT'),
     
     ('fabricant_tfidf',TfidfVectorizer(sublinear_tf=True,
                                       stop_words=STOP_WORDS), 'FABRICANT'),
     
     ('classification_enc', TfidfVectorizer(sublinear_tf=True,
                                       stop_words=STOP_WORDS),'CLASSIFICATION')
          
    ],
    #
    remainder='passthrough')


pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', OrdinalClassifier(CalibratedClassifierCV(LinearSVC(class_weight='balanced'),cv=5, method='isotonic'))),
])

X = df_train[['DESCRIPTION_INCIDENT','ETAT_PATIENT','ACTION_PATIENT','FABRICANT','CLASSIFICATION']] # 
y = df_train.GRAVITE
CV = 5


#result= cross_validate(pipeline, X, y, scoring=['accuracy','balanced_accuracy','f1_weighted' ], cv=CV)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 7.45 ms


In [18]:
pd.DataFrame(result)[['test_accuracy','test_balanced_accuracy','test_f1_weighted']].mean()

test_accuracy             0.714910
test_balanced_accuracy    0.475036
test_f1_weighted          0.706529
dtype: float64

## 2.2 Optimisation avec Optuna

In [22]:
def objective(trial):    
    
    joblib.dump(study, 'GRAVITE_study.pkl')
    train_index,test_index = next(GroupShuffleSplit(random_state=1029).split(X, groups=X['DESCRIPTION_INCIDENT']))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #Etat Patient
    #vect__etat_pat_tfidf__analyzer = trial.suggest_categorical('vect__etat_pat_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__etat_pat_tfidf__max_features = trial.suggest_int('vect__etat_pat_tfidf__max_features', 500, 20_000)
    vect__etat_pat_tfidf__min_df =  trial.suggest_int('vect__etat_pat_tfidf__min_df', 1,5)
    vect__etat_pat_tfidf__norm = trial.suggest_categorical('vect__etat_pat_tfidf__norm', ('l1', 'l2'))
    #Description
    #vect__description_tfidf__analyzer = trial.suggest_categorical('vect__description_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__description_tfidf__max_features = trial.suggest_int('vect__description_tfidf__max_features', 1500, 60_000)
    vect__description_tfidf__min_df =  trial.suggest_int('vect__description_tfidf__min_df', 1,5)
    vect__description_tfidf__norm = trial.suggest_categorical('vect__description_tfidf__norm', ('l1', 'l2'))
    #Fabricant
    vect__fabricant_tfidf__analyzer = trial.suggest_categorical('vect__fabricant_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__fabricant_tfidf__max_features = trial.suggest_int('vect__fabricant_tfidf__max_features', 500, 10_000)
    vect__fabricant_tfidf__min_df =  trial.suggest_int('vect__fabricant_tfidf__min_df', 1,5)
    vect__fabricant_tfidf__norm = trial.suggest_categorical('vect__fabricant_tfidf__norm', ('l1', 'l2'))
    #action patient
    #vect__action_pat_tfidf__analyzer = trial.suggest_categorical('vect__action_pat_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__action_pat_tfidf__max_features = trial.suggest_int('vect__action_pat_tfidf__max_features', 500, 20_000)
    vect__action_pat_tfidf__min_df =  trial.suggest_int('vect__action_pat_tfidf__min_df', 1,5)
    vect__action_pat_tfidf__norm = trial.suggest_categorical('vect__action_pat_tfidf__norm', ('l1', 'l2'))
    #Classification
    #vect__action_pat_tfidf__analyzer = trial.suggest_categorical('vect__action_pat_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__classification_enc__max_features = trial.suggest_int('vect__classification_enc__max_features', 500, 5000)
    vect__classification_enc__min_df =  trial.suggest_int('vect__classification_enc__min_df', 1,5)
    vect__classification_enc__norm = trial.suggest_categorical('vect__classification_enc__norm', ('l1', 'l2'))
    
    #clf__C =trial.suggest_loguniform('svr_c', 1e-5, 1e5)
    

    
    params = {
        #'vect__etat_pat_tfidf__analyzer':vect__etat_pat_tfidf__analyzer,
        'vect__etat_pat_tfidf__max_features': vect__etat_pat_tfidf__max_features,
        'vect__etat_pat_tfidf__min_df':vect__etat_pat_tfidf__min_df,
        'vect__etat_pat_tfidf__norm':vect__etat_pat_tfidf__norm,
        
        #'vect__description_tfidf__analyzer':vect__description_tfidf__analyzer,
        'vect__description_tfidf__max_features': vect__description_tfidf__max_features,
        'vect__description_tfidf__min_df':vect__description_tfidf__min_df,
        'vect__description_tfidf__norm':vect__description_tfidf__norm,
        
        'vect__fabricant_tfidf__analyzer':vect__fabricant_tfidf__analyzer,
        'vect__fabricant_tfidf__max_features': vect__fabricant_tfidf__max_features,
        'vect__fabricant_tfidf__min_df':vect__fabricant_tfidf__min_df,
        'vect__fabricant_tfidf__norm':vect__fabricant_tfidf__norm,
        
        #'vect__action_pat_tfidf__analyzer':vect__action_pat_tfidf__analyzer,
        'vect__action_pat_tfidf__max_features': vect__action_pat_tfidf__max_features,
        'vect__action_pat_tfidf__min_df':vect__action_pat_tfidf__min_df,
        'vect__action_pat_tfidf__norm':vect__action_pat_tfidf__norm,
        
        #'vect__action_pat_tfidf__analyzer':vect__action_pat_tfidf__analyzer,
        'vect__classification_enc__max_features': vect__classification_enc__max_features,
        'vect__classification_enc__min_df':vect__classification_enc__min_df,
        'vect__classification_enc__norm':vect__classification_enc__norm,
        
        #'clf__C':clf__C
    }
    
    pipeline.set_params(**params)
    pipeline.fit(X_train,y_train)
    y_pred = pipeline.predict(X_test)
    
    return balanced_accuracy_score(y_pred,y_test)

In [None]:
time_out = 75*100
studyName = 'GRAVITE_optimisation_svm'
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=time_out)

#Suvegarde du resultat
df = study.trials_dataframe()
df.to_json(studyName+'.json')
print(study.best_trial)

[32m[I 2020-06-04 14:53:30,572][0m Finished trial#0 with value: 0.6086453304921464 with parameters: {'vect__etat_pat_tfidf__max_features': 2083, 'vect__etat_pat_tfidf__min_df': 1, 'vect__etat_pat_tfidf__norm': 'l1', 'vect__description_tfidf__max_features': 23675, 'vect__description_tfidf__min_df': 5, 'vect__description_tfidf__norm': 'l1', 'vect__fabricant_tfidf__analyzer': 'char', 'vect__fabricant_tfidf__max_features': 9132, 'vect__fabricant_tfidf__min_df': 1, 'vect__fabricant_tfidf__norm': 'l2', 'vect__action_pat_tfidf__max_features': 14576, 'vect__action_pat_tfidf__min_df': 5, 'vect__action_pat_tfidf__norm': 'l2', 'vect__classification_enc__max_features': 2552, 'vect__classification_enc__min_df': 1, 'vect__classification_enc__norm': 'l2'}. Best is trial#0 with value: 0.6086453304921464.[0m
[32m[I 2020-06-04 14:54:03,767][0m Finished trial#1 with value: 0.7344201512943285 with parameters: {'vect__etat_pat_tfidf__max_features': 12303, 'vect__etat_pat_tfidf__min_df': 3, 'vect__etat

## 2.3 Prédiction avec le SVM

In [17]:
preprocess = ColumnTransformer(
    [('etat_pat_tfidf_tfidf', TfidfVectorizer(sublinear_tf=True,
                                         analyzer='word',
                                         min_df=4,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 13463,
                                         norm = 'l1'), 'ETAT_PATIENT'),
     
     ('action_pat_tfidf', TfidfVectorizer(sublinear_tf=True,
                                         analyzer='word',
                                         min_df=1,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 13735,
                                         norm = 'l1'), 'ACTION_PATIENT'),
     
     ('description_tfidf',TfidfVectorizer(sublinear_tf=True,
                                         min_df=1,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 22359,
                                         norm = 'l1'), 'DESCRIPTION_INCIDENT'),
     
    ('fabricant_tfidf',TfidfVectorizer(sublinear_tf=True,
                                         analyzer='word',
                                         min_df=3,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 7836,
                                         norm = 'l2'), 'FABRICANT'),
    ('classification_enc',TfidfVectorizer(sublinear_tf=True,
                                         analyzer='word',
                                         min_df=5,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 2977,
                                         norm = 'l2'), 'CLASSIFICATION')
        ],
    
    remainder='passthrough')

    

In [19]:
pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', OrdinalClassifier(CalibratedClassifierCV(LinearSVC(class_weight='balanced'),cv=3, method='isotonic')))
])

X = df_train[['DESCRIPTION_INCIDENT','ETAT_PATIENT','ACTION_PATIENT','FABRICANT','CLASSIFICATION']] # 
y = df_train.GRAVITE


pipeline.fit(X,y)

y_pred = pipeline.predict(df_test[['CLASSIFICATION','DESCRIPTION_INCIDENT','ETAT_PATIENT','ACTION_PATIENT','FABRICANT']])
y_true = df_test.GRAVITE

print('Justesse:',accuracy_score(y_pred,y_true))
print('Justesse pondéré: ', balanced_accuracy_score(y_pred,y_true))
print('f1_weighted : ',f1_score(y_pred,y_true,average='weighted'))

précision: 0.7354525503757399
présision pondéré:  0.7567361506788588
f1_weighted :  0.7461994757033263
