#  Inférence de la variable DCO à partir des données de MRV (Stratégie ML conclusion )
## OPTUNA, SVM

Rappel : La variable DCO représente le nom des dispositifs.Il y en a 1500 de différents et notre objectif est d'identifier le dispositif impliqué dans l'incident à partir de deux variables:
- DESCRIPTION DE L'INCIDENT
- LIBELLE COMMERCIAL
- FABRICANT
- REFERENCE

Ce notebook conclu le premier Batch de l'approche machine learning pour le DCO. La stratégie est la suivante :
- Netoyage des collones textuelles selon clean_text.py
- Sélection des classes avec plus de 15 occurences
- Séparation en train test avec 0,20 dans le test en en évitant que les doublons ne soient séparés.
- Utilisation de 4 collones de texte : DESCRIPTION DE L'INCIDENT, LIBELLE COMMERCIAL, FABRICANT et  REFERENCE.
- Encodage de ces 5 colonnes en tfidf
- FineTunning des paramètres des tfidf avec la librairie Optuna
- Application du SVM calibré (cf approche 1.1)

## 0) Chargement des librairies

In [25]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd



import clean_text

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score,f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.calibration import CalibratedClassifierCV
import spacy
nlp =spacy.load('fr')
from spacy.lang.fr.stop_words import STOP_WORDS

import joblib
import xgboost as xgb

import optuna
from optuna import Trial

from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score,f1_score

## 1) Chargement des données et constructions du jeux d'évaluation
On met de coté environ 20% du dataset pour l'évaluation et on ne garde pour l'entrainement seulement les classes avec plus de 10 observations

In [26]:
%%time
df_declaration_mrv = pd.read_csv("data/data_mrv/declaration_mrv_complet.csv")#delimiter=';',encoding='ISO-8859-1')
id_to_dco = pd.read_csv("data/ref_MRV/referentiel_dispositif.csv",delimiter=';',encoding='ISO-8859-1')

#Charegement des colonnes utiles
df = df_declaration_mrv[['DESCRIPTION_INCIDENT','TYPE_VIGILANCE','LIBELLE_COMMERCIAL',
                         'REFERENCE_COMMERCIALE','ETAT_PATIENT','FABRICANT','DCO_ID','CLASSIFICATION']]
# On complète les NaN avec du vide
df['ETAT_PATIENT'] = df['ETAT_PATIENT'].fillna("")
df['DESCRIPTION_INCIDENT'] = df['DESCRIPTION_INCIDENT'].fillna("")
df['LIBELLE_COMMERCIAL'] = df['LIBELLE_COMMERCIAL'].fillna("")
df['FABRICANT'] = df['FABRICANT'].fillna("")
df["REFERENCE_COMMERCIALE"] = df['REFERENCE_COMMERCIALE'].fillna("")
df['TYPE_VIGILANCE'] = df['TYPE_VIGILANCE'].fillna("")
df['CLASSIFICATION'] = df['CLASSIFICATION'].fillna('')


# On ajoute des collones pertinentes
df['des_lib'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['DESCRIPTION_INCIDENT']
df['fab_lib'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['FABRICANT']
df['com'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['REFERENCE_COMMERCIALE']
df['Text'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['FABRICANT'] + "" + df['DESCRIPTION_INCIDENT']

# On nettoie les données :
for col in  ['DESCRIPTION_INCIDENT','LIBELLE_COMMERCIAL','ETAT_PATIENT','Text',"des_lib","fab_lib"] :
    df[col] = df[col].map(lambda x: clean_text.preprocess_text(x))

n = 15
# On filtre pour a voir plus de n observations par classse
df_n = df.groupby("DCO_ID").filter(lambda x: len(x) > n)

# On encode les labels
le = LabelEncoder()
df_n.DCO_ID = le.fit_transform(df_n.DCO_ID.values)
#On encode le type de vigilance
df_n.TYPE_VIGILANCE = le.fit_transform(df_n.TYPE_VIGILANCE.values)
#On encode la classifcation 
df_n.CLASSIFICATION = le.fit_transform(df_n.CLASSIFICATION.values)

# On selection les variables de test en faisant attention aux doublons
train_index,test_index = next(GroupShuffleSplit(random_state=1029).split(df_n, groups=df_n['DESCRIPTION_INCIDENT']))
df_train, df_test = df_n.iloc[train_index], df_n.iloc[test_index]



CPU times: user 38.6 s, sys: 380 ms, total: 39 s
Wall time: 39 s


## 2) Construction du pipeline

In [27]:
%%time
#Pipeline
preprocess = ColumnTransformer(
    [('reference_tfidf', TfidfVectorizer(sublinear_tf=True, stop_words=STOP_WORDS), 'REFERENCE_COMMERCIALE'),
     
     ('libelle_tfidf', TfidfVectorizer(sublinear_tf=True, stop_words=STOP_WORDS), 'LIBELLE_COMMERCIAL'),
     
     ('description_tfidf',TfidfVectorizer(sublinear_tf=True,
                            stop_words=STOP_WORDS), 'DESCRIPTION_INCIDENT'),
     
    ('fabricant_tfidf',TfidfVectorizer(sublinear_tf=True, stop_words=STOP_WORDS), 'FABRICANT')],
    
    remainder='passthrough')


pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', LinearSVC(class_weight='balanced')),
])

X = df_train[['DESCRIPTION_INCIDENT','FABRICANT','REFERENCE_COMMERCIALE','LIBELLE_COMMERCIAL']] # 
y = df_train.DCO_ID




CPU times: user 32 ms, sys: 4 ms, total: 36 ms
Wall time: 32.3 ms


## 2.1) Optimisation des paramètres de TFIDF

In [18]:
def objective(trial):    
    
    joblib.dump(study, 'study.pkl')
    train_index,test_index = next(GroupShuffleSplit(random_state=1029).split(X, groups=X['DESCRIPTION_INCIDENT']))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #reference
    vect__reference_tfidf__analyzer = trial.suggest_categorical('vect__reference_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__reference_tfidf__max_features = trial.suggest_int('vect__reference_tfidf__max_features', 500, 10_000)
    vect__reference_tfidf__min_df =  trial.suggest_int('vect__reference_tfidf__min_df', 1,5)
    vect__reference_tfidf__norm = trial.suggest_categorical('vect__reference_tfidf__norm', ('l1', 'l2'))
    #Description
    #vect__description_tfidf__analyzer = trial.suggest_categorical('vect__description_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__description_tfidf__max_features = trial.suggest_int('vect__description_tfidf__max_features', 1500, 60_000)
    vect__description_tfidf__min_df =  trial.suggest_int('vect__description_tfidf__min_df', 1,5)
    vect__description_tfidf__norm = trial.suggest_categorical('vect__description_tfidf__norm', ('l1', 'l2'))
    #Fabricant
    vect__fabricant_tfidf__analyzer = trial.suggest_categorical('vect__fabricant_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__fabricant_tfidf__max_features = trial.suggest_int('vect__fabricant_tfidf__max_features', 500, 10_000)
    vect__fabricant_tfidf__min_df =  trial.suggest_int('vect__fabricant_tfidf__min_df', 1,5)
    vect__fabricant_tfidf__norm = trial.suggest_categorical('vect__fabricant_tfidf__norm', ('l1', 'l2'))
    #Libelle
    vect__libelle_tfidf__analyzer = trial.suggest_categorical('vect__libelle_tfidf__analyzer', ['word', 'char', 'char_wb']) 
    vect__libelle_tfidf__max_features = trial.suggest_int('vect__libelle_tfidf__max_features', 500, 10_000)
    vect__libelle_tfidf__min_df =  trial.suggest_int('vect__libelle_tfidf__min_df', 1,5)
    vect__libelle_tfidf__norm = trial.suggest_categorical('vect__libelle_tfidf__norm', ('l1', 'l2'))
    
    #clf__C =trial.suggest_loguniform('svr_c', 1e-5, 1e5)
    

    
    params = {
        'vect__reference_tfidf__analyzer':vect__reference_tfidf__analyzer,
        'vect__reference_tfidf__max_features': vect__reference_tfidf__max_features,
        'vect__reference_tfidf__min_df':vect__reference_tfidf__min_df,
        'vect__reference_tfidf__norm':vect__reference_tfidf__norm,
        
        #'vect__description_tfidf__analyzer':vect__description_tfidf__analyzer,
        'vect__description_tfidf__max_features': vect__description_tfidf__max_features,
        'vect__description_tfidf__min_df':vect__description_tfidf__min_df,
        'vect__description_tfidf__norm':vect__description_tfidf__norm,
        
        'vect__fabricant_tfidf__analyzer':vect__fabricant_tfidf__analyzer,
        'vect__fabricant_tfidf__max_features': vect__fabricant_tfidf__max_features,
        'vect__fabricant_tfidf__min_df':vect__fabricant_tfidf__min_df,
        'vect__fabricant_tfidf__norm':vect__fabricant_tfidf__norm,
        
        'vect__libelle_tfidf__analyzer':vect__libelle_tfidf__analyzer,
        'vect__libelle_tfidf__max_features': vect__libelle_tfidf__max_features,
        'vect__libelle_tfidf__min_df':vect__libelle_tfidf__min_df,
        'vect__libelle_tfidf__norm':vect__libelle_tfidf__norm,
        
        #'clf__C':clf__C
    }
    
    pipeline.set_params(**params)
    pipeline.fit(X_train,y_train)
    y_pred = pipeline.predict(X_test)
    
    return balanced_accuracy_score(y_pred,y_test)

In [19]:
time_out = 75*100
studyName = 'optimisation_svm'
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=time_out)

#Suvegarde du resultat
df = study.trials_dataframe()
df.to_json(studyName+'.json')
print(study.best_trial)

[32m[I 2020-06-04 12:06:58,511][0m Finished trial#0 with value: 0.6724148828271783 with parameters: {'vect__reference_tfidf__analyzer': 'char_wb', 'vect__reference_tfidf__max_features': 4443, 'vect__reference_tfidf__min_df': 5, 'vect__reference_tfidf__norm': 'l1', 'vect__description_tfidf__max_features': 28242, 'vect__description_tfidf__min_df': 4, 'vect__description_tfidf__norm': 'l2', 'vect__fabricant_tfidf__analyzer': 'char', 'vect__fabricant_tfidf__max_features': 2136, 'vect__fabricant_tfidf__min_df': 3, 'vect__fabricant_tfidf__norm': 'l2', 'vect__libelle_tfidf__analyzer': 'char', 'vect__libelle_tfidf__max_features': 4872, 'vect__libelle_tfidf__min_df': 3, 'vect__libelle_tfidf__norm': 'l2'}. Best is trial#0 with value: 0.6724148828271783.[0m
[32m[I 2020-06-04 12:08:48,248][0m Finished trial#1 with value: 0.6492288202679524 with parameters: {'vect__reference_tfidf__analyzer': 'word', 'vect__reference_tfidf__max_features': 3899, 'vect__reference_tfidf__min_df': 2, 'vect__referen

FrozenTrial(number=95, value=0.7652911910054219, datetime_start=datetime.datetime(2020, 6, 4, 14, 8, 55, 862307), datetime_complete=datetime.datetime(2020, 6, 4, 14, 10, 18, 293696), params={'vect__reference_tfidf__analyzer': 'word', 'vect__reference_tfidf__max_features': 5805, 'vect__reference_tfidf__min_df': 4, 'vect__reference_tfidf__norm': 'l1', 'vect__description_tfidf__max_features': 18294, 'vect__description_tfidf__min_df': 1, 'vect__description_tfidf__norm': 'l2', 'vect__fabricant_tfidf__analyzer': 'char', 'vect__fabricant_tfidf__max_features': 2387, 'vect__fabricant_tfidf__min_df': 2, 'vect__fabricant_tfidf__norm': 'l2', 'vect__libelle_tfidf__analyzer': 'word', 'vect__libelle_tfidf__max_features': 8655, 'vect__libelle_tfidf__min_df': 1, 'vect__libelle_tfidf__norm': 'l2'}, distributions={'vect__reference_tfidf__analyzer': CategoricalDistribution(choices=('word', 'char', 'char_wb')), 'vect__reference_tfidf__max_features': IntUniformDistribution(high=10000, low=500, step=1), 'vec

## Conclusion


In [None]:
params={'vect__reference_tfidf__analyzer': 'word', 'vect__reference_tfidf__max_features': 5805, 'vect__reference_tfidf__min_df': 4, 'vect__reference_tfidf__norm': 'l1',
        'vect__description_tfidf__max_features': 18294, 'vect__description_tfidf__min_df': 1, 'vect__description_tfidf__norm': 'l2', 
        'vect__fabricant_tfidf__analyzer': 'char', 'vect__fabricant_tfidf__max_features': 2387, 'vect__fabricant_tfidf__min_df': 2, 'vect__fabricant_tfidf__norm': 'l2', 
        'vect__libelle_tfidf__analyzer': 'word', 'vect__libelle_tfidf__max_features': 8655, 'vect__libelle_tfidf__min_df': 1, 'vect__libelle_tfidf__norm': 'l2'}

## 3) Applicattion du SVM sur le jeu de test

In [21]:
%%time
#Pipeline
preprocess = ColumnTransformer(
    [('reference_tfidf', TfidfVectorizer(sublinear_tf=True,
                                         analyzer='word',
                                         min_df=4,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 5804,
                                         norm = 'l1'), 'REFERENCE_COMMERCIALE'),
     
     ('libelle_tfidf', TfidfVectorizer(sublinear_tf=True,
                                         analyzer='word',
                                         min_df=1,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 8655,
                                         norm = 'l2'), 'LIBELLE_COMMERCIAL'),
     
     ('description_tfidf',TfidfVectorizer(sublinear_tf=True,
                                         min_df=1,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 18294,
                                         norm = 'l2'), 'DESCRIPTION_INCIDENT'),
     
    ('fabricant_tfidf',TfidfVectorizer(sublinear_tf=True,
                                         analyzer='char',
                                         min_df=2,
                                         ngram_range=(1, 1),
                                         stop_words=STOP_WORDS,
                                         max_features = 2387,
                                         norm = 'l2'), 'FABRICANT')],
    
    remainder='passthrough')


pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', CalibratedClassifierCV(LinearSVC(class_weight='balanced'),cv=5, method='isotonic')),
])

X = df_train[['DESCRIPTION_INCIDENT','FABRICANT','REFERENCE_COMMERCIALE','LIBELLE_COMMERCIAL']] # 
y = df_train.DCO_ID


pipeline.fit(X,y)
X_test = df_test[['DESCRIPTION_INCIDENT','FABRICANT','REFERENCE_COMMERCIALE','LIBELLE_COMMERCIAL']]
y_test = df_test.DCO_ID

y_pred = pipeline.predict(X_test)

print("Justesse", accuracy_score(y_test,y_pred) )
print("Balanced_accuracy : ", balanced_accuracy_score(y_test,y_pred))
print("f1-weighted : ", f1_score(y_test, y_pred, average='weighted'))

précison: 0.8467366372419546
Balanced_accuracy :  0.7248460354275542
f1-weighted :  0.8332177155633408
CPU times: user 8min 46s, sys: 5.48 s, total: 8min 51s
Wall time: 8min 51s
