# Inférence de la GRAVITE à partir des données de MRV (Stratégie ML 2)
La variable GRAVITE représente la grvité de l'évenement avec 5 echelons




**Stratégie ML 2**

Classifier Ordinal et XGboost




## 0) Chargement des librairies

In [1]:
import warnings
warnings.filterwarnings('ignore')

from pprint import pprint
from time import time
import logging

import pandas as pd

import numpy as np
import sklearn as sk
import seaborn as sns

import nltk
from nltk import word_tokenize
lang ='french'

import clean_text





import matplotlib.pyplot as plt


from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score, cross_validate

from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score,f1_score
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD,IncrementalPCA,SparsePCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.calibration import CalibratedClassifierCV

import spacy
nlp =spacy.load('fr')
from spacy.lang.fr.stop_words import STOP_WORDS

## 0.1 Chargement et exploration des données
### 0.1.1 voir le notebook exploration pour l'analyse du champs de gravité

In [2]:
%time
df_declaration_mrv = pd.read_csv("data/data_mrv/declaration_mrv_complet.csv")#delimiter=';',encoding='ISO-8859-1')
id_to_dco = pd.read_csv("data/ref_MRV/referentiel_dispositif.csv",delimiter=';',encoding='ISO-8859-1')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.78 µs


## 1) Constructions du jeux d'évaluation
On met de coté environ 20% du dataset pour l'évaluation et on ne garde pour l'entrainement seulement les classes avec plus de 10 observations

In [3]:
%%time
df_declaration_mrv = pd.read_csv("data/data_mrv/declaration_mrv_complet.csv")#delimiter=';',encoding='ISO-8859-1')
id_to_dco = pd.read_csv("data/ref_MRV/referentiel_dispositif.csv",delimiter=';',encoding='ISO-8859-1')

#Supression des NaN

#Charegement des colonnes utiles et suppression des NaN

df = df_declaration_mrv[['DESCRIPTION_INCIDENT','TYPE_VIGILANCE','LIBELLE_COMMERCIAL','DCO_ID',
                         'REFERENCE_COMMERCIALE','ETAT_PATIENT','ACTION_PATIENT','FABRICANT',
                          'GRAVITE','CLASSIFICATION']][df_declaration_mrv['GRAVITE'].notna()]

# On complète les NaN avec du vide
df['ETAT_PATIENT'] = df['ETAT_PATIENT'].fillna("")
df['DESCRIPTION_INCIDENT'] = df['DESCRIPTION_INCIDENT'].fillna("")
df['LIBELLE_COMMERCIAL'] = df['LIBELLE_COMMERCIAL'].fillna("")
df['FABRICANT'] = df['FABRICANT'].fillna("")
df["REFERENCE_COMMERCIALE"] = df['REFERENCE_COMMERCIALE'].fillna("")
df['TYPE_VIGILANCE'] = df['TYPE_VIGILANCE'].fillna("")
df['CLASSIFICATION'] = df['CLASSIFICATION'].fillna('')
df['ACTION_PATIENT'] = df['ACTION_PATIENT'].fillna('')
df['DCO_ID'] = df['DCO_ID'].fillna(-1)

# On ajoute des collones pertinentes
df['des_lib'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['DESCRIPTION_INCIDENT']
df['fab_lib'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['FABRICANT']
df['com'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['REFERENCE_COMMERCIALE']
df['Text'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['FABRICANT'] + "" + df['DESCRIPTION_INCIDENT']

# On nettoie les données :
for col in  ['DESCRIPTION_INCIDENT','LIBELLE_COMMERCIAL','ETAT_PATIENT','Text',"des_lib","fab_lib"] :
        df[col] = df[col].map(lambda x: clean_text.preprocess_text(x))

n = 15
# On filtre pour a voir plus de n observations par classse
df_n = df.groupby("GRAVITE").filter(lambda x: len(x) > n)

# On encode les labels
def GRAVITE_ENC(x):
    if x =='NULLE':
        return 0
    elif x == 'MINEU':
        return 1
    elif x == 'MOYEN':
        return 2
    elif x== 'SEVER':
        return 3
    elif x== 'CRITI':
        return 4
df_n.GRAVITE = df_n.GRAVITE.map(lambda x:GRAVITE_ENC(x))

#OnEncode les autres varibles
le = LabelEncoder()
#On encode le type de vigilance
#df_n.TYPE_VIGILANCE = le.fit_transform(df_n.TYPE_VIGILANCE.values)
#On encode la classifcation 
#df_n.CLASSIFICATION = le.fit_transform(df_n.CLASSIFICATION.values)
#on encode le DCO
#df_n.DCO_ID = le.fit_transform(df_n.DCO_ID.values)

# On selection les variables de test en faisant attention aux doublons
train_index,test_index = next(GroupShuffleSplit(random_state=1029,test_size=0.2).split(df_n, groups=df_n['DESCRIPTION_INCIDENT']))


df_train, df_test = df_n.iloc[train_index], df_n.iloc[test_index]


CPU times: user 37.7 s, sys: 252 ms, total: 37.9 s
Wall time: 37.9 s


## 2) Construction du pipeline  pour la gravité ordonnée

In [7]:
%%time
preprocess = ColumnTransformer(
    [('etat_pat_tfidf', TfidfVectorizer(sublinear_tf=True, min_df=3,ngram_range=(1, 1),
                                       stop_words=STOP_WORDS,
                                       max_features = 10000,norm = 'l2'), 'ETAT_PATIENT'),
     
     ('description_tfidf',TfidfVectorizer(sublinear_tf=True, min_df=5,
                            ngram_range=(1, 1),
                            stop_words=STOP_WORDS,
                            max_features = 50000,norm = 'l2'), 'DESCRIPTION_INCIDENT'),
     
     ('action_pat_tfidf',TfidfVectorizer(sublinear_tf=True, min_df=3,
                           ngram_range=(1, 1),
                           stop_words=STOP_WORDS,
                           max_features = 10000,norm = 'l2'), 'ACTION_PATIENT'),
     
     ('fabricant_tfidf',TfidfVectorizer(sublinear_tf=True, min_df=3,
                           ngram_range=(1, 1),
                           stop_words=STOP_WORDS,
                           max_features = 5000,norm = 'l2'), 'FABRICANT'),
     ('classification_enc', TfidfVectorizer(sublinear_tf=True, min_df=5,
                            ngram_range=(1, 1),
                            stop_words=STOP_WORDS,
                            max_features = 100,norm = 'l2'),'CLASSIFICATION')
     
     #('vigilance_enc', OneHotEncoder(handle_unknown='ignore'),'TYPE_VIGILANCE')
     
    ],
    #
    remainder='passthrough')


pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', CalibratedClassifierCV(LinearSVC(class_weight='balanced'),cv=3, method='isotonic')),
])

X = df_train[['DESCRIPTION_INCIDENT','ETAT_PATIENT','ACTION_PATIENT','FABRICANT','CLASSIFICATION']] # 
y = df_train.GRAVITE
CV = 5

X_ = preprocess.fit_transform(X)

#result= cross_validate(pipeline, X, y, scoring=['accuracy','balanced_accuracy','f1_weighted' ], cv=CV)

CPU times: user 7.78 s, sys: 44 ms, total: 7.83 s
Wall time: 7.83 s


In [6]:
from xgboost import XGBClassifier, DMatrix
import xgboost as xgb

X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.1, random_state=23)

d_valid = DMatrix(X_valid, y_valid)
d_train = xgb.DMatrix(X_train, y_train)

watchlist = [(d_valid, 'valid')]


xgb_params = {'eta': 0.3, 
            'max_depth': 6, 
            'objective': 'multi:softmax', 
            'eval_metric': 'mlogloss', 
            'seed': 23,
            'num_class':5
            }


xgb_model = xgb.train(xgb_params, d_train, 15, watchlist, verbose_eval=True, early_stopping_rounds=30)

ValueError: DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields DESCRIPTION_INCIDENT, ETAT_PATIENT, ACTION_PATIENT, FABRICANT, CLASSIFICATION