# Inférence de la variable DCO à partir des données de MRV (Stratégie ML 1.1)
La variable DCO représente le nom des dispositifs.Il y en a 1500 de différents et notre objectif est d'identifier le dispositif impliqué dans l'incident à partir de deux variables:
- DESCRIPTION DE L'INCIDENT
- LIBELLE COMMERCIAL

Dans ce problème de classification de texte multiclasse, le pipeline sera le suivant :
1. Nettoyer les données textuelles (orthographe, ponctuation, majuscule, tokénisation et/ou lemmatisation etc.)
2. Nettoyage des observations (suppression des NaN, des targets ou le nombre d'observations est trop faible)
3. Construction des features textuelles (CountVectorizer, tf-idf, Word2vec, Fastext, Camenbert etc.)
4. Entrainement de l'algorithme de classification
5. Evaluation du modèle

A travers ce pipeline les choix sont nombreux et nous allons devellopper différentes startégies.
L'objectif de cette stratégie est d'identifier les élements qui fonctionnent bien et de comprendre pourquoi :
* Quel est l'impact des bigrams ?
* Quel est l'impact de la lemmatisation ?
* Quel est l'impact des paramètres du SVM sur les performances ?
* Quel est l'impact des doublons sur nos performances ?

Pui de tester de nouvelles intuitions :

**Stratégie ML 2**
* A définir ?

In [55]:
from pprint import pprint
from time import time
import logging

import pandas as pd
import gensim
import numpy as np
import sklearn as sk
import seaborn as sns

import nltk
from nltk import word_tokenize
lang ='french'

import clean_text



from scipy.stats import randint
from scipy.sparse import csr_matrix


import matplotlib.pyplot as plt


from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD,IncrementalPCA,SparsePCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer

import spacy
nlp =spacy.load('fr')
from spacy.lang.fr.stop_words import STOP_WORDS


## 1) Chargement et nettoyage des données

In [2]:
df_declaration_mrv = pd.read_csv("data/data_mrv/declaration_mrv.csv",delimiter=';',encoding='ISO-8859-1')
id_to_dco = pd.read_csv("data/ref_MRV/referentiel_dispositif.csv",delimiter=';',encoding='ISO-8859-1')

df = df_declaration_mrv[['DESCRIPTION_INCIDENT','LIBELLE_COMMERCIAL','DCO_ID']]


df['Text'] = df['LIBELLE_COMMERCIAL']+ ' ' + df['DESCRIPTION_INCIDENT']

df = df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [67]:
len(df.groupby('Text'))

36463

### Remarques :  
--> En fesant la jointure des collones descripton et libelle, nous avons 36 000 ligne unique au lieu de 33 000 environ.

In [3]:
%%time
df.Text = df.Text.map(lambda x: clean_text.preprocess_text(x))

CPU times: user 9.42 s, sys: 28 ms, total: 9.45 s
Wall time: 9.48 s


In [13]:
%%time
def select_raw_by_nb_obs(df:pd.DataFrame, seuil:int)->pd.DataFrame :
    """
    Renvoie les lignes ou le nombre d'observations est supérieur au seuil entrée
    """
    S = df.groupby('DCO_ID').count()>seuil
    liste_DCO =S[S['Text']==True].index
    df_utilisable= df[df['DCO_ID'].isin(liste_DCO)]
    #df_reduit = df_utilisale[df_utilisale['DCO_ID']>2900]
    #print(len(df_reduit))
    return(df_utilisable)

df_utilisable_10 = select_raw_by_nb_obs(df,10)

CPU times: user 52 ms, sys: 8 ms, total: 60 ms
Wall time: 59.1 ms


## 2) Construction du pipeline et optimisation des paramètres
### Impact des paramètres de la tf-idf et de la vectorisation

In [9]:
#Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(class_weight='balanced')),
])

# Paramètres
parameters = {
    #'max_df': [0.75],#(0.5, 0.75, 1.0),
    'vect__max_features': [5000,10000],#, 10000, 50000),
    'vect__min_df': [5],
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2')
    #'clf__C': [1, 10, 100]
}
#Grid search
if __name__ == "__main__":
   
    grid_search = GridSearchCV(pipeline, parameters, cv=3,n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(df_utilisable_10.Text, df_utilisable_10.DCO_ID)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_features': [5000, 10000],
 'vect__min_df': [5],
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 40.9min finished


done in 2607.827s

Best score: 0.769
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_features: 10000
	vect__min_df: 5
	vect__ngram_range: (1, 1)


### Commentaire : 
 * Les bigrammes n'apporte pas de meilleurs performances
 * le calcul de l'idf améliore les performances
 * Il faudrai essayer avec un plus grand nombre de features (max_features>=10000 ?)
 

## 3) Normalisation des données
Nous regardons ici, l'impact de la lemmatisation sur les données
Attention, il faut avoir le fichier df_text_Libelle_Descr_clean.csv dans le même répertoire

In [10]:
doc = nlp(df_utilisable_10.Text[0])
print('un exemple de donnée :', df_utilisable_10.Text[0],'\n')
print('un exemple de donnée lemmatisée : ', " ".join([elt.lemma_ for elt in doc]),'\n')
#print('un exemple de donnée Normalisée : ',"".join([elt.lemma_+'_'+elt.pos_ for elt in doc]))

un exemple de donnée : sonde attain performa le guide est resté coincé à intérieur de la sonde , on ne peut plus le bouger . changement de sonde 

un exemple de donnée lemmatisée :  sond attain performer le guide être resté coincer à intérieur de le sonde , on ne pouvoir plus le bouger . changement de sonde 



In [18]:
#Le texte contient égalemen le pos_tag, cette fonction permet de ne se servir que des lemmes
def get_lem(x):

    lem = ""
    for elt in x.split() :
        lem = lem +" " +elt.split('_')[0]
    return(lem)


# Pour obtenir quelque statistiques sur les nombres de mots suite à la lemmaisation
def nltk_tokenisation(text,sw=True):
    """
    Transforme le texte en liste de tokens, en miniscule, en ayant suprimé la  ponctuatiuon et les mots frequents
    Entrées
    - x::type:str
    Sortie:
    - tokens::type:list(str) liste de tokens
    - lemmas::type:list(str) liste des lemmes
    Exemple : Entrée = "je suis heureux aujourd'hui"; Sortie : ['je', 'suis', 'heureux', "aujourd'hui"]
    """
    if type(text)!= str :
        return ([])
    txt = text.lower()
    if sw==False :
        tokens = nltk.word_tokenize(txt, language=lang, preserve_line=False)
    else : 
        words = nltk.word_tokenize(txt, language=lang, preserve_line=False)
        tokens = [word for word in words  if word not in STOP_WORDS]
    return(tokens)

def nb_mots(x):
    try :
        return(len(x))
    except :
        return(0)

In [None]:
# Chargement des donnée normalisé
df_utilisable_10_norm = pd.read_csv('df_text_Libelle_Descr_clean.csv')
df_utilisable_10_norm = df_utilisable_10_norm.dropna()
df_utilisable_10_norm['Lem'] = df_utilisable_10_norm.Text.map(lambda x:get_lem(x))

In [19]:
%%time
df_utilisable_10_norm['Lem'+'_token'] = df_utilisable_10_norm['Lem'].map(lambda x: nltk_tokenisation(x,sw=False)) # Les mots
df_utilisable_10_norm['Lem'+'_nb_mots'] = df_utilisable_10_norm['Lem'+'_token'].map(lambda x:nb_mots(x)) # On renvoie 0 pour les NaN


CPU times: user 1min 10s, sys: 604 ms, total: 1min 11s
Wall time: 1min 12s


In [21]:
df_utilisable_10_norm['Lem'+'_nb_mots'].describe()

count    76753.000000
mean        75.383829
std         87.174306
min          2.000000
25%         28.000000
50%         49.000000
75%         87.000000
max        884.000000
Name: Lem_nb_mots, dtype: float64

In [23]:


# création du pipeline avec les paramètres précédents

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(class_weight='balanced')),
])

# Paramètres
parameters = {
    #'max_df': [0.75],#(0.5, 0.75, 1.0),
    'vect__max_features': [10000],#, 10000, 50000),
    'vect__min_df': [5,10],
    'vect__ngram_range': [(1, 1)],  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2')
    #'clf__C': [1, 10, 100]
}

#Grid search
if __name__ == "__main__":
   
    grid_search = GridSearchCV(pipeline, parameters, cv=3,n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(df_utilisable_10_norm.Lem, df_utilisable_10_norm.DCO_ID)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_features': [10000],
 'vect__min_df': [5, 10],
 'vect__ngram_range': [(1, 1)]}
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  6.6min remaining:  6.6min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  7.1min finished


done in 925.888s

Best score: 0.026
Best parameters set:
	vect__max_features: 10000
	vect__min_df: 10
	vect__ngram_range: (1, 1)




## 4) L'impact des doublons :  changeons le train et test split

In [85]:
df_10 = select_raw_by_nb_obs(df,10)
train_index,test_index = next(GroupShuffleSplit(random_state=1029).split(df_10, groups=df_10['DESCRIPTION_INCIDENT']))
df_train, df_test = df_10.iloc[train_index], df_10.iloc[test_index]

X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(df_10.Text,df_10.DCO_ID,test_size=0.25)

In [25]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                            ngram_range=(1, 2),
                            stop_words=STOP_WORDS,
                            max_features = 10000)

vect_tf= tfidf.fit(df_train.Text)

X_train = vect_tf.transform(df_train.Text)
X_test = vect_tf.transform(df_test.Text)

y_train = df_train.DCO_ID
y_test = df_test.DCO_ID


  'stop_words.' % sorted(inconsistent))


In [29]:
model = LinearSVC(class_weight='balanced')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [32]:
print('\t\t Métriques de CLASSIFICATIION \n')
print(metrics.classification_report(y_test, y_pred))

		 Métriques de CLASSIFICATIION 

              precision    recall  f1-score   support

      2288.0       0.00      0.00      0.00         2
      2291.0       1.00      1.00      1.00         6
      2293.0       1.00      0.60      0.75         5
      2294.0       1.00      1.00      1.00         2
      2296.0       0.50      1.00      0.67         1
      2297.0       1.00      0.40      0.57         5
      2298.0       1.00      0.80      0.89         5
      2300.0       0.79      1.00      0.88        11
      2306.0       1.00      0.71      0.83         7
      2309.0       0.33      0.75      0.46         4
      2310.0       0.71      1.00      0.83         5
      2312.0       0.50      0.50      0.50         2
      2315.0       0.65      0.79      0.71        14
      2316.0       0.80      0.67      0.73         6
      2319.0       0.00      0.00      0.00         3
      2320.0       1.00      0.80      0.89         5
      2321.0       0.00      0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
print("précison:", accuracy_score(y_test,y_pred) )
print("Balanced_accuracy : ", balanced_accuracy_score(y_test,y_pred))

précison: 0.7675035721575831
Balanced_accuracy :  0.5774459012789024


In [50]:
%%time
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                            ngram_range=(1, 2),
                            stop_words=STOP_WORDS,
                            max_features = 10000)

vect_tf= tfidf.fit(X_train_a.values)

X_train_a = vect_tf.transform(X_train_a.values)
X_test_a = vect_tf.transform(X_test_a.values)


model = LinearSVC(class_weight='balanced')
model.fit(X_train_a, y_train_a)
y_pred_a = model.predict(X_test_a)

print("précison:", accuracy_score(y_test_a,y_pred_a) )
print("Balanced_accuracy : ", balanced_accuracy_score(y_test_a,y_pred_a))

  'stop_words.' % sorted(inconsistent))


précison: 0.8860996627950544
Balanced_accuracy :  0.7440963277648411
CPU times: user 1min 45s, sys: 440 ms, total: 1min 45s
Wall time: 1min 45s




## Commentaire :
Le choix du train_test est important pour évaluer correctement notre modèle
Lors de nos études, nous avions procédés par cross validation (5 folds) ce qui avait éffacé l'effet que l'on observe si dessus

## 5) Si on  utilisait seulement ~33 000 lignes

In [64]:
df_10_unique = df_10.drop_duplicates('DESCRIPTION_INCIDENT') 

In [66]:
%%time
X_train_u, X_test_u, y_train_u, y_test_u = train_test_split(df_10_unique.Text,df_10_unique.DCO_ID,test_size=0.25)
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                            ngram_range=(1, 2),
                            stop_words=STOP_WORDS,
                            max_features = 10000)

vect_tf= tfidf.fit(X_train_u.values)

X_train_u = vect_tf.transform(X_train_u.values)
X_test_u = vect_tf.transform(X_test_u.values)


model = LinearSVC(class_weight='balanced')
model.fit(X_train_u, y_train_u)
y_pred_u = model.predict(X_test_u)

print("précison:", accuracy_score(y_test_u,y_pred_u) )
print("Balanced_accuracy : ", balanced_accuracy_score(y_test_u,y_pred_u))

  'stop_words.' % sorted(inconsistent))


précison: 0.7563855730971634
Balanced_accuracy :  0.600639466198176
CPU times: user 21.2 s, sys: 116 ms, total: 21.3 s
Wall time: 21.3 s




## COmmentaire : 
Nous observons une baisse sensible de la performance de notre modèle. Toutefois,...

## 6) Créer deux tfidf différents

In [60]:
preprocess = ColumnTransformer(
    [('libelle_tfidf', TfidfVectorizer(sublinear_tf=True, min_df=3,ngram_range=(1, 3),
                                       stop_words=STOP_WORDS,
                                       max_features = 10000), 'LIBELLE_COMMERCIAL'),
     ('description_tfidf',TfidfVectorizer(sublinear_tf=True, min_df=5,
                            ngram_range=(1, 1),
                            stop_words=STOP_WORDS,
                            max_features = 10000), 'DESCRIPTION_INCIDENT')],
    remainder='passthrough')


pipeline = Pipeline([
    ('vect', preprocess),
    ('clf', LinearSVC(class_weight='balanced')),
])

In [61]:
X_train = df_train[['DESCRIPTION_INCIDENT','LIBELLE_COMMERCIAL']]
X_test  = df_test[['DESCRIPTION_INCIDENT','LIBELLE_COMMERCIAL']]
y_train = df_train.DCO_ID
y_test = df_test.DCO_ID

In [62]:
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)

print("précison:", accuracy_score(y_test,y_pred) )
print("Balanced_accuracy : ", balanced_accuracy_score(y_test,y_pred))

  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


précison: 0.8313941620738926
Balanced_accuracy :  0.682057446796152




## Commentaire : 
Faire une transformation pour chaque variable améliore sensiblement le résultat. Par exemple, le résultat s'améliore d'environ 4%.

## 7) Une moyenne d'un embedding ? 
Fait dans n autre notebook, comme attendu moyener les embeddings des mots n'apporte pas de bon résultats
0.20 appliqué à df.Text
     appliqué à df.LIBELLE_COMMERCIALE

## 8 ) Un modèle avec des scores

In [88]:
%%time
from sklearn.calibration import CalibratedClassifierCV

pipeline = Pipeline([
    ('vect', preprocess),
    ('clf',CalibratedClassifierCV(LinearSVC(class_weight='balanced'),cv=3, method='isotonic'))
])


df_10 = select_raw_by_nb_obs(df,20)
train_index,test_index = next(GroupShuffleSplit(random_state=1029).split(df_10, groups=df_10['DESCRIPTION_INCIDENT']))
df_train, df_test = df_10.iloc[train_index], df_10.iloc[test_index]

X_train = df_train[['DESCRIPTION_INCIDENT','LIBELLE_COMMERCIAL']]
X_test  = df_test[['DESCRIPTION_INCIDENT','LIBELLE_COMMERCIAL']]
y_train = df_train.DCO_ID
y_test = df_test.DCO_ID

pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)

print("précison:", accuracy_score(y_test,y_pred) )
print("Balanced_accuracy : ", balanced_accuracy_score(y_test,y_pred))

  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


précison: 0.8331945600888149
Balanced_accuracy :  0.6892819196582894
CPU times: user 1min 50s, sys: 636 ms, total: 1min 51s
Wall time: 1min 51s




In [128]:
pipeline.predict_proba(X_test)[0]



array([0.00000000e+00, 0.00000000e+00, 2.52375699e-05, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.33958916e-05, 6.53236519e-05,
       0.00000000e+00, 0.00000000e+00, 6.74504386e-05, 6.15025935e-04,
       0.00000000e+00, 0.00000000e+00, 2.35467604e-05, 3.00416184e-05,
       0.00000000e+00, 1.87375497e-04, 0.00000000e+00, 7.91395822e-05,
       2.85927372e-04, 0.00000000e+00, 1.47905782e-05, 6.92042091e-04,
       4.37566211e-05, 0.00000000e+00, 1.68017359e-04, 0.00000000e+00,
       4.57381385e-05, 3.49301755e-04, 4.70061856e-05, 0.00000000e+00,
       3.53314427e-04, 0.00000000e+00, 1.22685049e-03, 0.00000000e+00,
       1.90707073e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       5.71980269e-05, 7.31995772e-05, 4.34283358e-05, 0.00000000e+00,
       1.45459719e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.83088456e-05, 6.12604196e-05, 2.89650911e-05, 1.50996898e-05,
       4.91077672e-05, 2.65398138e-04, 6.42498384e-05, 3.01007119e-05,
      

## 9) UTilisation de XGboost

In [131]:
from xgboost.sklearn import XGBClassifier

In [134]:
pipeline_xgb = Pipeline([
    ('vect', preprocess),
    ('clf', XGBClassifier())
]) 

In [135]:
%%time
pipeline_xgb.fit(X_train,y_train)
y_pred = pipeline_xgb.predict(X_test)

print("précison:", accuracy_score(y_test,y_pred) )
print("Balanced_accuracy : ", balanced_accuracy_score(y_test,y_pred))

  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


précison: 0.7422980849292257
Balanced_accuracy :  0.5458820929051504
CPU times: user 19h 59min 5s, sys: 5min 20s, total: 20h 4min 26s
Wall time: 1h 47min 17s




# Autres tests...

In [76]:
%%time
import spacy 
from sklearn.base import BaseEstimator, TransformerMixin

class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = 300

    def fit(self, X, y):
        return self

    def transform(self, X):
        # Doc.vector defaults to an average of the token vectors.
        # https://spacy.io/api/doc#vector
        return [self.nlp(text).vector for text in X]



embeddings_pipeline = Pipeline(
    steps=[
        ("mean_embeddings", SpacyVectorTransformer(nlp)),
        ("classifier", LinearSVC(class_weight='balanced')),
    ]
)
X_train_u, X_test_u, y_train_u, y_test_u = train_test_split(df_10_unique.Text,df_10_unique.DCO_ID,test_size=0.25)
embeddings_pipeline.fit(X_train_u, y_train_u)
y_pred_u = embeddings_pipeline.predict(X_test_u)

print("précison:", accuracy_score(y_test_u,y_pred_u) )
print("Balanced_accuracy : ", balanced_accuracy_score(y_test_u,y_pred_u))



précison: 0.14580926710306763
Balanced_accuracy :  0.1034307203391295
CPU times: user 32min 55s, sys: 3.29 s, total: 32min 58s
Wall time: 32min 59s




In [None]:
embeddings_pipeline

In [None]:
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression

model = Pipeline([
    ('parser', HTMLParser()),
    ('text_union', FeatureUnion(
        transformer_list = [
            ('entity_feature', Pipeline([
                ('entity_extractor', EntityExtractor()),
                ('entity_vect', CountVectorizer()),
            ])),
            ('keyphrase_feature', Pipeline([
                ('keyphrase_extractor', KeyphraseExtractor()),
                ('keyphrase_vect', TfidfVectorizer()),
            ])),
        ],
        transformer_weights= {
            'entity_feature': 0.6,
            'keyphrase_feature': 0.2,
        }
    )),
    ('clf', LogisticRegression()),
])

In [None]:
clf = linear_svc.fit(X_train_transformed,y_train_lables_trf)

calibrated_svc = CalibratedClassifierCV(base_estimator=linear_svc,
                                        cv="prefit")

calibrated_svc.fit(X_train_transformed,y_train_lables_trf)
predicted = calibrated_svc.predict(X_test_transformed)
    
to_predict = ["I have outdated information on my credit report that I have previously disputed that has yet to be removed this information is more then seven years old and does not meet credit reporting requirements"]
p_count = count_vect.transform(to_predict)
p_tfidf = tf_transformer.transform(p_count)
print('Average accuracy on test set={}'.format(np.mean(predicted == labels.transform(y_test))))
print('Predicted probabilities of demo input string are')
print(calibrated_svc.predict_proba(p_tfidf))