# Inférence de la Typologie à partir des données de MRV (Stratégie ML Naive)
La variable TYPO présente les types de dysfocntionnements et d'effets des dispositifs. Il y a 3 varaibles concernée
- TYPE DE DYSFONCTIONNEMENT
- CONSEQUENCE DYSFONCTIONNEMENT
- TYPE D'EFFET




**Stratégie ML**

## 0) Chargement des librairies

In [98]:
import warnings
warnings.filterwarnings('ignore')

from pprint import pprint
from time import time
import logging

import pandas as pd

import numpy as np
import sklearn as sk
import seaborn as sns

import nltk
from nltk import word_tokenize
lang ='french'

import clean_text





import matplotlib.pyplot as plt


from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score, cross_validate

from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score,f1_score
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD,IncrementalPCA,SparsePCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.calibration import CalibratedClassifierCV

import spacy
nlp =spacy.load('fr')
from spacy.lang.fr.stop_words import STOP_WORDS

## 0.1 Chargement et exploration des données

In [99]:
%time
df_declaration_mrv = pd.read_csv("data/data_mrv/declaration_mrv_complet.csv")#delimiter=';',encoding='ISO-8859-1')
id_to_dco = pd.read_csv("data/ref_MRV/referentiel_dispositif.csv",delimiter=';',encoding='ISO-8859-1')
df_effets = pd.read_csv("data/ref_MRV/referentiel_dispositif_effets_connus.csv",delimiter=';',encoding='ISO-8859-1')
df_dys = pd.read_csv("data/ref_MRV/referentiel_dispositif_dysfonctionnement.csv",delimiter=';',encoding='ISO-8859-1')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.63 µs


In [100]:
df_declaration_mrv.columns

Index(['Unnamed: 0', 'NUMERO_DECLARATION', 'TYPE_DECLARATION',
       'TYPE_DECLARANT', 'NB_PATIENT_CONCERNE', 'NB_DISPOSITIF_CONCERNE',
       'DESCRIPTION_INCIDENT', 'ETAT_PATIENT', 'ACTION_PATIENT', 'DCO_ID',
       'DCO', 'LIBELLE_COMMERCIAL', 'REFERENCE_COMMERCIALE', 'NUMERO_SERIE',
       'NUMERO_LOT', 'FABRICANT', 'MANDATAIRE', 'DISTRIBUTEUR', 'TDY_ID',
       'TYPE_DYSFONCTIONNEMENT', 'CDY_ID', 'CONSEQUENCE_DYSFONCTIONNEMENT',
       'TEF_ID', 'TYPE_EFFET', 'GRAVITE', 'NUMERO', 'TYPE_VIGILANCE',
       'CLASSIFICATION'],
      dtype='object')

In [107]:
%%time
#On complète les effets vide comme étant sans effets
df_declaration_mrv['TYPE_EFFET']  = df_declaration_mrv['TYPE_EFFET'].fillna("PAS D'EFFET NEFASTE DECLARE")
df_declaration_mrv['TEF_ID']= df_declaration_mrv['TEF_ID'].fillna('E1213')

#on selectionne les colonnes
df = df_declaration_mrv[['DESCRIPTION_INCIDENT','TYPE_VIGILANCE','LIBELLE_COMMERCIAL',
                         'REFERENCE_COMMERCIALE','ETAT_PATIENT','FABRICANT','DCO_ID',
                         'ACTION_PATIENT','CLASSIFICATION','TEF_ID']]
# On complète les NaN avec du vide
df['ETAT_PATIENT'] = df['ETAT_PATIENT'].fillna("")
df['DESCRIPTION_INCIDENT'] = df['DESCRIPTION_INCIDENT'].fillna("")
df['LIBELLE_COMMERCIAL'] = df['LIBELLE_COMMERCIAL'].fillna("")
df['FABRICANT'] = df['FABRICANT'].fillna("")
df["REFERENCE_COMMERCIALE"] = df['REFERENCE_COMMERCIALE'].fillna("")
df['TYPE_VIGILANCE'] = df['TYPE_VIGILANCE'].fillna("")
df['CLASSIFICATION'] = df['CLASSIFICATION'].fillna('')
df['DCO_ID'] = df['DCO_ID'].fillna(-1)
#On nettoieles variables textueelles : 

for col in  ['DESCRIPTION_INCIDENT','LIBELLE_COMMERCIAL','ETAT_PATIENT','FABRICANT','ACTION_PATIENT'] :
    df[col] = df[col].map(lambda x: clean_text.preprocess_text(x))

def EFFET_ENC(x):
    if x =='E1213':
        return 0
    else :
        return 1

le = LabelEncoder()
df.TYPE_VIGILANCE = le.fit_transform(df.TYPE_VIGILANCE.values)
df['TARGET'] = df.TEF_ID.map(EFFET_ENC)


# Encodage des varaible catégorielle

# selection des train et test set
train_index,test_index = next(GroupShuffleSplit(random_state=1029).split(df, groups=df['DESCRIPTION_INCIDENT']))
df_train, df_test = df.iloc[train_index], df.iloc[test_index]
y = df_train.TARGET
y_test =df_test.TARGET 



CPU times: user 22.5 s, sys: 140 ms, total: 22.6 s
Wall time: 22.6 s


In [102]:
# baseline
print(' En inférant toujours la classe majoritaire, le score est :', len(y.iloc[np.where(y==1)])/len(y))

 En inférant toujours la classe majoritaire, le score est : 0.6107907006969604


In [110]:
%%time
preprocess = ColumnTransformer(
    [('description_tfidf',TfidfVectorizer(sublinear_tf=True, min_df=3,
                            ngram_range=(1, 1),
                            stop_words=STOP_WORDS,
                            max_features = 10000,norm = 'l2'), 'DESCRIPTION_INCIDENT'),
     
     ('etat_pat_tfidf', TfidfVectorizer(sublinear_tf=True, min_df=3,ngram_range=(1, 1),
                                       stop_words=STOP_WORDS,
                                       max_features = 10000,norm = 'l2'), 'ETAT_PATIENT'),
     
     ('fabricant_tfidf',TfidfVectorizer(sublinear_tf=True, min_df=3,
                            ngram_range=(1, 1),
                            stop_words=STOP_WORDS,
                            max_features = 5000,norm = 'l2'), 'FABRICANT'),
    
    ('classification_enc', TfidfVectorizer(sublinear_tf=True, min_df=5,
                            ngram_range=(1, 1),
                            stop_words=STOP_WORDS,
                            max_features = 100,norm = 'l2'),'CLASSIFICATION')
     ],
    
    remainder='passthrough')


pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', CalibratedClassifierCV(LinearSVC(class_weight='balanced'),cv=3, method='isotonic')),
])

CV = 5
X = df_train[['FABRICANT','CLASSIFICATION','TYPE_VIGILANCE','DESCRIPTION_INCIDENT','ETAT_PATIENT']]
result= cross_validate(pipeline, X, y, scoring=['accuracy','balanced_accuracy','f1_weighted' ], cv=CV)

CPU times: user 48.4 s, sys: 60 ms, total: 48.4 s
Wall time: 48.4 s


In [111]:
pd.DataFrame(result)[['test_accuracy','test_balanced_accuracy','test_f1_weighted']].mean()

test_accuracy             0.817327
test_balanced_accuracy    0.813348
test_f1_weighted          0.817674
dtype: float64

In [114]:
pipeline.fit(X,y)
X_test = df_test[['FABRICANT','CLASSIFICATION','TYPE_VIGILANCE','DESCRIPTION_INCIDENT','ETAT_PATIENT']]
y_pred = pipeline.predict(X_test)

print('Justesse:',accuracy_score(y_pred,y_test))
print('Justesse pondéré: ', balanced_accuracy_score(y_pred,y_test))
print('f1_weighted : ',f1_score(y_pred,y_test,average='weighted'))

Justesse: 0.8367638465034738
Justesse pondéré:  0.8298282494697896
f1_weighted :  0.836643451664396


In [123]:
CV=5
score = cross_validate(pipeline, X, y, scoring='balanced_accuracy', cv=CV)

## Optuna 

In [129]:
import optuna
from optuna import Trial

def objective(trial):    
    
    train_index,test_index = next(GroupShuffleSplit(random_state=1029).split(X, groups=X['DESCRIPTION_INCIDENT']))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #Etat Patient
    #vect__etat_pat_tfidf__analyzer = trial.suggest_categorical('vect__etat_pat_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__etat_pat_tfidf__max_features = trial.suggest_int('vect__etat_pat_tfidf__max_features', 500, 20_000)
    vect__etat_pat_tfidf__min_df =  trial.suggest_int('vect__etat_pat_tfidf__min_df', 1,5)
    vect__etat_pat_tfidf__norm = trial.suggest_categorical('vect__etat_pat_tfidf__norm', ('l1', 'l2'))
    #Description
    #vect__description_tfidf__analyzer = trial.suggest_categorical('vect__description_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__description_tfidf__max_features = trial.suggest_int('vect__description_tfidf__max_features', 1500, 60_000)
    vect__description_tfidf__min_df =  trial.suggest_int('vect__description_tfidf__min_df', 1,5)
    vect__description_tfidf__norm = trial.suggest_categorical('vect__description_tfidf__norm', ('l1', 'l2'))
    #Fabricant
    vect__fabricant_tfidf__analyzer = trial.suggest_categorical('vect__fabricant_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__fabricant_tfidf__max_features = trial.suggest_int('vect__fabricant_tfidf__max_features', 500, 10_000)
    vect__fabricant_tfidf__min_df =  trial.suggest_int('vect__fabricant_tfidf__min_df', 1,5)
    vect__fabricant_tfidf__norm = trial.suggest_categorical('vect__fabricant_tfidf__norm', ('l1', 'l2'))
    #action patient
    
    #Classification
    vect__classification_enc__analyzer = trial.suggest_categorical('vect__action_pat_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__classification_enc__max_features = trial.suggest_int('vect__classification_enc__max_features', 500, 5000)
    vect__classification_enc__min_df =  trial.suggest_int('vect__classification_enc__min_df', 1,5)
    vect__classification_enc__norm = trial.suggest_categorical('vect__classification_enc__norm', ('l1', 'l2'))
    
    #clf__C =trial.suggest_loguniform('svr_c', 1e-5, 1e5)
    

    
    params = {
        #'vect__etat_pat_tfidf__analyzer':vect__etat_pat_tfidf__analyzer,
        'vect__etat_pat_tfidf__max_features': vect__etat_pat_tfidf__max_features,
        'vect__etat_pat_tfidf__min_df':vect__etat_pat_tfidf__min_df,
        'vect__etat_pat_tfidf__norm':vect__etat_pat_tfidf__norm,
        
        #'vect__description_tfidf__analyzer':vect__description_tfidf__analyzer,
        'vect__description_tfidf__max_features': vect__description_tfidf__max_features,
        'vect__description_tfidf__min_df':vect__description_tfidf__min_df,
        'vect__description_tfidf__norm':vect__description_tfidf__norm,
        
        'vect__fabricant_tfidf__analyzer':vect__fabricant_tfidf__analyzer,
        'vect__fabricant_tfidf__max_features': vect__fabricant_tfidf__max_features,
        'vect__fabricant_tfidf__min_df':vect__fabricant_tfidf__min_df,
        'vect__fabricant_tfidf__norm':vect__fabricant_tfidf__norm,
        
        
        'vect__classification_enc__analyzer':vect__action_pat_tfidf__analyzer,
        'vect__classification_enc__max_features': vect__classification_enc__max_features,
        'vect__classification_enc__min_df':vect__classification_enc__min_df,
        'vect__classification_enc__norm':vect__classification_enc__norm,
        
        #'clf__C':clf__C
    }
    
    pipeline.set_params(**params)
    CV=5
    score = cross_validate(pipeline, X, y, scoring='balanced_accuracy', cv=CV)
    
    return score['test_score'].mean()

In [130]:
time_out = 75*100
studyName = 'EFFET_O1_optimisation_svm'
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=time_out)

#Suvegarde du resultat
df = study.trials_dataframe()
df.to_json(studyName+'.json')
print(study.best_trial)

[32m[I 2020-06-08 15:25:22,336][0m Finished trial#0 with value: 0.8045990730249926 with parameters: {'vect__etat_pat_tfidf__max_features': 16427, 'vect__etat_pat_tfidf__min_df': 5, 'vect__etat_pat_tfidf__norm': 'l1', 'vect__description_tfidf__max_features': 9558, 'vect__description_tfidf__min_df': 2, 'vect__description_tfidf__norm': 'l2', 'vect__fabricant_tfidf__analyzer': 'char', 'vect__fabricant_tfidf__max_features': 5492, 'vect__fabricant_tfidf__min_df': 1, 'vect__fabricant_tfidf__norm': 'l2', 'vect__action_pat_tfidf__analyzer': 'word', 'vect__classification_enc__max_features': 4882, 'vect__classification_enc__min_df': 1, 'vect__classification_enc__norm': 'l1'}. Best is trial#0 with value: 0.8045990730249926.[0m
[32m[I 2020-06-08 15:26:20,435][0m Finished trial#1 with value: 0.8074814231354507 with parameters: {'vect__etat_pat_tfidf__max_features': 5218, 'vect__etat_pat_tfidf__min_df': 4, 'vect__etat_pat_tfidf__norm': 'l2', 'vect__description_tfidf__max_features': 1522, 'vect__

KeyboardInterrupt: 

In [137]:
params = {'vect__etat_pat_tfidf__max_features': 1605, 'vect__etat_pat_tfidf__min_df': 4,
          'vect__etat_pat_tfidf__norm': 'l1', 'vect__description_tfidf__max_features': 45053, 
          'vect__description_tfidf__min_df': 2, 'vect__description_tfidf__norm': 'l1', 
          'vect__fabricant_tfidf__analyzer': 'char', 'vect__fabricant_tfidf__max_features': 2727, 
          'vect__fabricant_tfidf__min_df': 2, 'vect__fabricant_tfidf__norm': 'l1', 
          'vect__action_pat_tfidf__analyzer': 'char', 'vect__classification_enc__max_features': 4687, 
          'vect__classification_enc__min_df': 3, 'vect__classification_enc__norm': 'l2'}

preprocess = ColumnTransformer(
    [('etat_pat_tfidf_tfidf', TfidfVectorizer(sublinear_tf=True,
                                         analyzer='word',
                                         min_df=4,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 1605,
                                         norm = 'l1'), 'ETAT_PATIENT'),
    
     
     ('description_tfidf',TfidfVectorizer(sublinear_tf=True,
                                         min_df=2,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 45053,
                                         norm = 'l1'), 'DESCRIPTION_INCIDENT'),
     
    ('fabricant_tfidf',TfidfVectorizer(sublinear_tf=True,
                                         analyzer='char',
                                         min_df=2,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 2727,
                                         norm = 'l1'), 'FABRICANT'),
    ('classification_enc',TfidfVectorizer(sublinear_tf=True,
                                         analyzer='char',
                                         min_df=3,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 4687,
                                         norm = 'l2'), 'CLASSIFICATION')
        ],
    
    remainder='passthrough')

In [138]:
pipeline.fit(X,y)
X_test = df_test[['FABRICANT','CLASSIFICATION','TYPE_VIGILANCE','DESCRIPTION_INCIDENT','ETAT_PATIENT']]
y_pred = pipeline.predict(X_test)

print('Justesse:',accuracy_score(y_pred,y_test))
print('Justesse pondéré: ', balanced_accuracy_score(y_pred,y_test))
print('f1_weighted : ',f1_score(y_pred,y_test,average='weighted'))

Justesse: 0.8428673462762158
Justesse pondéré:  0.8355198303263203
f1_weighted :  0.8422250648563704
