In [1]:
import numpy as np
import pandas as pd
import pickle
import joblib
from time import time
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

#import nltk
#nltk.download('stopwords')

In [2]:
## Preprocessing functions 
def convert_lower_case(data):
    """Lower-case ``data`` element-wise with ``np.char.lower``.

    Accepts anything ``np.char.lower`` accepts (a single string or an array of
    strings) and returns the NumPy array it produces.
    """
    lowered = np.char.lower(data)
    return lowered

def remove_stop_words(data):
    """Drop French stop words and one-character tokens from ``data``.

    ``data`` is converted with ``str()`` and tokenised with NLTK's French
    tokenizer. Tokens that appear in NLTK's French stop-word list, or that are
    a single character long, are discarded.

    Returns:
        str: the kept tokens, each preceded by one space (so a non-empty
        result starts with a space); ``""`` when no token is kept.
    """
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    stop_words = set(stopwords.words('french'))
    words = word_tokenize(str(data), language="french", preserve_line=True)
    kept = [w for w in words if w not in stop_words and len(w) > 1]
    # Same " w1 w2 ..." layout as before, built in one pass.
    return "".join(" " + w for w in kept)

def remove_punctuation(data):
    """Replace punctuation in ``data`` with spaces and delete commas.

    Each listed symbol (including backslash and newline) is replaced by a
    space; after every symbol one pass collapses double spaces into single
    ones. Commas are removed last, without inserting a space.

    Returns:
        The NumPy array produced by ``np.char.replace`` (0-d for a single
        input string).
    """
    # The backslash is written as "\\": the previous bare "\]" was an invalid
    # escape sequence that only worked because Python currently tolerates it
    # with a warning. The resulting characters are unchanged.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    """Delete every typographic apostrophe (’) from ``data``.

    Only the curly apostrophe is targeted; the ASCII ``'`` is left alone.
    Returns the NumPy array produced by ``np.char.replace``.
    """
    apostrophe = "’"
    return np.char.replace(data, apostrophe, "")

def stemming(data):
    """Stem every token of ``data`` and return them as one string.

    ``data`` is converted with ``str()``, tokenised with NLTK's French
    tokenizer and each token is passed through a ``PorterStemmer``.

    Returns:
        str: the stemmed tokens, each preceded by one space.
    """
    # NOTE(review): PorterStemmer implements the English Porter algorithm while
    # the tokenizer is configured for French -- confirm this is intended.
    porter = PorterStemmer()
    stemmed = [
        " " + porter.stem(token)
        for token in word_tokenize(str(data), language="french", preserve_line=True)
    ]
    return "".join(stemmed)

def convert_numbers(data):
    """Spell out integer tokens of ``data`` in French words.

    ``data`` is converted with ``str()`` and tokenised with NLTK's French
    tokenizer. Every token that parses as an integer is replaced by its
    ``num2words`` French spelling; all other tokens are kept unchanged.
    Hyphens in the rebuilt text are then replaced by spaces.

    Returns:
        The NumPy array produced by ``np.char.replace`` (0-d string array),
        holding the tokens each preceded by one space.
    """
    tokens = word_tokenize(str(data), language="french", preserve_line=True)
    converted = []
    for token in tokens:
        try:
            token = num2words(int(token), lang='fr')
        except (ValueError, OverflowError):
            # ValueError: the token is not an integer literal.
            # OverflowError: the number is too large to be spelled out.
            # Either way the token is kept as-is. Other exceptions (including
            # KeyboardInterrupt) are no longer silently swallowed.
            pass
        converted.append(token)
    new_text = "".join(" " + token for token in converted)
    return np.char.replace(new_text, "-", " ")

def preprocess(data):
    """Run the full text-cleaning pipeline on one sentence.

    The steps are applied strictly in the order listed below; several are
    repeated on purpose because number spelling can re-introduce hyphens,
    commas and stop words. The value returned is whatever the final
    ``remove_stop_words`` call returns.
    """
    steps = (
        convert_lower_case,
        remove_punctuation,   # commas are removed separately inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        stemming,
        remove_punctuation,
        convert_numbers,
        stemming,             # stem again after the second number conversion
        remove_punctuation,   # spelled-out numbers may contain hyphens and commas
        remove_stop_words,    # spelled-out numbers may contain stop words
    )
    for step in steps:
        data = step(data)
    return data

In [3]:
## Load the labelled data
# Semicolon-separated, cp1252-encoded CSV; the 'num' column is discarded
# straight away so the frame is built in one expression.
data_df = (
    pd.read_csv('DonneesPedoPsy/labeled_data.csv', sep=";", encoding='cp1252')
    .drop(columns='num')
)
data_df.shape

(1648, 200)

In [4]:
## Label processing
## Build one aggregate column per label from the raw label columns
## (every column from the third one onwards).
labels_list = ['+', '-', '0', 'i', 'j', 'f', 's', 'p', 'm', 'a', 't']
raw_labels_names = data_df.columns[2:]

# For each label, take the row-wise maximum over every raw column whose name
# contains the label key. The matching columns are resolved once per label
# and the maximum is computed column-wise, instead of re-scanning the column
# names and writing single cells for every row.
# NOTE(review): DataFrame.max skips missing values by default -- confirm the
# raw label columns hold no NaN, or that skipping them is acceptable.
for key in labels_list:
    matching_columns = [c for c in raw_labels_names if key in c]
    if not matching_columns:
        # The row-wise max() used previously raised ValueError in this case.
        raise ValueError("no raw label column contains key %r" % key)
    data_df[key] = data_df[matching_columns].max(axis=1)

print(data_df.shape)
all_labels = data_df[labels_list]
print(all_labels.shape)

(1648, 211)
(1648, 11)


In [5]:
# Run every sentence of the "phrases" column through preprocess() and keep the
# result as a plain string, in the original row order.
corpus = [str(preprocess(phrase)) for phrase in data_df["phrases"]]

In [6]:
# TF-IDF features over the preprocessed corpus. min_df=10 is passed so that
# rare terms are ignored by the vectoriser.
# (A CountVectorizer used to be created here and was overwritten on the very
# next line without ever being used; that dead assignment is removed.)
# NOTE(review): the vectoriser is fitted on the whole corpus, before the
# train/test split is applied -- confirm this is acceptable for evaluation.
vectorizer = TfidfVectorizer(min_df=10)
X = vectorizer.fit_transform(corpus)
features_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [7]:
# Reuse a persisted train/test split: the row labels of each subset are loaded
# from disk and used to select rows from the features and the labels.
# (This replaces the random 80/20 train_test_split that was commented out here.)
# NOTE(review): joblib.load unpickles the files -- only load trusted ones.
train_indices = joblib.load("train_indices.sav")
test_indices = joblib.load("test_indices.sav")

X_train = features_tfidf.loc[train_indices]
X_test = features_tfidf.loc[test_indices]
y_train = all_labels.loc[train_indices]
y_test = all_labels.loc[test_indices]

for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(1318, 379)
(330, 379)
(1318, 11)
(330, 11)


In [8]:
# Cross-validation scheme passed to the grid searches: 5 shuffled folds with a
# fixed seed.
nb_folds = 5
cv = KFold(n_splits=nb_folds, shuffle=True, random_state=109)

In [9]:
# Elastic-net logistic regression, one binary classifier per label, tuned over
# the inverse regularisation strength C and the L1/L2 mixing ratio.
# random_state is pinned because the saga solver is stochastic; without a seed
# two runs of this cell are not guaranteed to give the same scores.
# NOTE(review): no `scoring` is passed, so the search ranks candidates with the
# estimator's default score -- presumably exact-match accuracy over all
# labels; confirm that this is the intended selection metric.
model_to_set = OneVsRestClassifier(
    LogisticRegression(solver='saga',
                       penalty='elasticnet',
                       max_iter=4000,
                       random_state=109))
parameters = [{'estimator__C': [4, 3.5, 3, 2.5, 2, 1.5, 1, 0.75, 0.5],
               'estimator__l1_ratio': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}]
enet_model = GridSearchCV(model_to_set,
                          param_grid=parameters,
                          cv=cv,
                          verbose=3,
                          n_jobs=100)
# Wall-clock duration of the whole search, in seconds, is kept in `result`.
start = time()
enet_model.fit(X_train, y_train)
end = time()
result = end - start

Fitting 5 folds for each of 99 candidates, totalling 495 fits
[CV 3/5] END estimator__C=4, estimator__l1_ratio=0;, score=0.299 total time=   1.9s
[CV 1/5] END estimator__C=4, estimator__l1_ratio=0;, score=0.186 total time=   1.9s
[CV 2/5] END estimator__C=4, estimator__l1_ratio=0;, score=0.261 total time=   1.9s
[CV 3/5] END estimator__C=3.5, estimator__l1_ratio=0;, score=0.277 total time=   1.7s
[CV 1/5] END estimator__C=3.5, estimator__l1_ratio=0.1;, score=0.186 total time=   2.0s
[CV 4/5] END estimator__C=3.5, estimator__l1_ratio=0;, score=0.247 total time=   2.0s
[CV 5/5] END estimator__C=3.5, estimator__l1_ratio=0;, score=0.221 total time=   1.8s
[CV 4/5] END estimator__C=4, estimator__l1_ratio=0;, score=0.243 total time=   2.3s
[CV 5/5] END estimator__C=4, estimator__l1_ratio=0;, score=0.221 total time=   2.1s
[CV 2/5] END estimator__C=3.5, estimator__l1_ratio=0.3;, score=0.280 total time=   2.2s
[CV 1/5] END estimator__C=4, estimator__l1_ratio=0.1;, score=0.186 total time=   2.5

In [10]:
# Summary of the elastic-net search: fit duration, then the best grid point
# and its cross-validated score.
print('%.3f seconds' % result,
      'Evaluation of OneVsRestClassifier/elasticnet :',
      sep='\n')
print('best parameters: ', enet_model.best_params_)
print('best score: ', enet_model.best_score_)

20.091 seconds
Evaluation of OneVsRestClassifier/elasticnet :
best parameters:  {'estimator__C': 1.5, 'estimator__l1_ratio': 1}
best score:  0.2814696393593732


In [11]:
# Per-label precision / recall / F1 of the tuned elastic-net model on the
# held-out test rows.
y_pred = enet_model.predict(X_test)
enet_report = classification_report(y_test, y_pred, target_names=labels_list)
print('classification par enet \n', enet_report)

classification par enet 
               precision    recall  f1-score   support

           +       0.69      0.49      0.57        72
           -       0.75      0.10      0.17        63
           0       0.33      0.02      0.04        47
           i       0.61      0.75      0.67       146
           j       0.81      1.00      0.89       263
           f       0.79      0.64      0.71        36
           s       0.98      0.81      0.88        57
           p       0.93      0.85      0.89       124
           m       0.87      0.85      0.86       123
           a       0.81      0.64      0.71        72
           t       0.75      0.52      0.62        23

   micro avg       0.79      0.73      0.76      1026
   macro avg       0.76      0.60      0.64      1026
weighted avg       0.78      0.73      0.72      1026
 samples avg       0.80      0.71      0.73      1026



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# LightGBM (gradient-boosted trees, binary objective), one classifier per
# label, tuned over learning rate and number of trees.
model_to_set = OneVsRestClassifier(
    lgb.LGBMClassifier(boosting_type='gbdt', objective='binary'))
parameters = [{
    'estimator__learning_rate': [0.025, 0.05, 0.1],
    'estimator__n_estimators': [650, 700, 750],
}]
lgb_model = GridSearchCV(model_to_set,
                         param_grid=parameters,
                         cv=cv,
                         verbose=3,
                         n_jobs=50)
# Display the tunable parameter names (the `estimator__...` keys usable in the grid).
model_to_set.get_params().keys()

dict_keys(['estimator__boosting_type', 'estimator__class_weight', 'estimator__colsample_bytree', 'estimator__importance_type', 'estimator__learning_rate', 'estimator__max_depth', 'estimator__min_child_samples', 'estimator__min_child_weight', 'estimator__min_split_gain', 'estimator__n_estimators', 'estimator__n_jobs', 'estimator__num_leaves', 'estimator__objective', 'estimator__random_state', 'estimator__reg_alpha', 'estimator__reg_lambda', 'estimator__silent', 'estimator__subsample', 'estimator__subsample_for_bin', 'estimator__subsample_freq', 'estimator', 'n_jobs'])

In [13]:
# Fit the LightGBM grid search; the wall-clock duration in seconds is stored
# in `result` for the summary cell.
start = time()
lgb_model.fit(X_train, y_train)
end = time()
result = end - start

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 4/5] END estimator__learning_rate=0.05, estimator__n_estimators=700;, score=0.209 total time=   9.0s
[CV 1/5] END estimator__learning_rate=0.1, estimator__n_estimators=650;, score=0.242 total time=   9.6s
[CV 5/5] END estimator__learning_rate=0.1, estimator__n_estimators=700;, score=0.194 total time=   9.6s
[CV 2/5] END estimator__learning_rate=0.1, estimator__n_estimators=750;, score=0.246 total time=   9.6s
[CV 2/5] END estimator__learning_rate=0.1, estimator__n_estimators=650;, score=0.250 total time=   9.5s
[CV 5/5] END estimator__learning_rate=0.1, estimator__n_estimators=650;, score=0.190 total time=   9.9s
[CV 5/5] END estimator__learning_rate=0.1, estimator__n_estimators=750;, score=0.194 total time=  10.2s
[CV 4/5] END estimator__learning_rate=0.1, estimator__n_estimators=750;, score=0.202 total time=   9.9s
[CV 5/5] END estimator__learning_rate=0.05, estimator__n_estimators=650;, score=0.217 total time=  10.3s
[C

In [14]:
# Summary of the LightGBM search: fit duration, then the best grid point and
# its cross-validated score.
print('%.3f seconds' % result,
      'Evaluation of OneVsRestClassifier/lightgbm :',
      sep='\n')
print('best parameters: ', lgb_model.best_params_)
print('best score: ', lgb_model.best_score_)

109.994 seconds
Evaluation of OneVsRestClassifier/lightgbm :
best parameters:  {'estimator__learning_rate': 0.025, 'estimator__n_estimators': 650}
best score:  0.2420353727387948


In [15]:
# Per-label precision / recall / F1 of the tuned LightGBM model on the
# held-out test rows.
y_pred = lgb_model.predict(X_test)
lgb_report = classification_report(y_test, y_pred, target_names=labels_list)
print('classification par lgb \n', lgb_report)

classification par lgb 
               precision    recall  f1-score   support

           +       0.65      0.49      0.56        72
           -       0.44      0.11      0.18        63
           0       0.45      0.11      0.17        47
           i       0.64      0.66      0.65       146
           j       0.80      0.95      0.87       263
           f       0.78      0.81      0.79        36
           s       0.92      0.86      0.89        57
           p       0.93      0.85      0.89       124
           m       0.85      0.90      0.88       123
           a       0.73      0.64      0.68        72
           t       0.83      0.43      0.57        23

   micro avg       0.78      0.73      0.75      1026
   macro avg       0.73      0.62      0.65      1026
weighted avg       0.75      0.73      0.72      1026
 samples avg       0.79      0.70      0.72      1026



  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Grid search for random forest, one forest per label.
# Only max_features is actually varied; n_estimators and bootstrap are single
# values. (A wider max_features grid spanning up to X_train.shape[1] was
# previously sketched here as a commented-out alternative.)
parameters = [{'estimator__max_features': [10, 25, 50, 75, 100],
               'estimator__n_estimators': [500],
               'estimator__bootstrap': [True]}]
# random_state is pinned because forest construction is randomised (bootstrap
# samples, feature sub-sampling); without a seed the scores change from run
# to run.
model_to_set = OneVsRestClassifier(RandomForestClassifier(n_jobs=150, random_state=109))
rf_model = GridSearchCV(model_to_set, param_grid=parameters, cv=cv, verbose=3)
# Wall-clock duration of the whole search, in seconds, is kept in `result`.
start = time()
rf_model.fit(X_train, y_train)
end = time()
result = end - start

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END estimator__bootstrap=True, estimator__max_features=10, estimator__n_estimators=500;, score=0.193 total time=  22.4s
[CV 2/5] END estimator__bootstrap=True, estimator__max_features=10, estimator__n_estimators=500;, score=0.242 total time=  18.8s
[CV 3/5] END estimator__bootstrap=True, estimator__max_features=10, estimator__n_estimators=500;, score=0.284 total time=  19.1s
[CV 4/5] END estimator__bootstrap=True, estimator__max_features=10, estimator__n_estimators=500;, score=0.205 total time=  18.6s
[CV 5/5] END estimator__bootstrap=True, estimator__max_features=10, estimator__n_estimators=500;, score=0.194 total time=  18.2s
[CV 1/5] END estimator__bootstrap=True, estimator__max_features=25, estimator__n_estimators=500;, score=0.246 total time=  18.4s
[CV 2/5] END estimator__bootstrap=True, estimator__max_features=25, estimator__n_estimators=500;, score=0.254 total time=  18.6s
[CV 3/5] END estimator__bootstrap=Tru

In [17]:
# Summary of the random-forest search: fit duration, then the best grid point
# and its cross-validated score.
print('%.3f seconds' % result,
      'Evaluation of OneVsRestClassifier/RandomForest :',
      sep='\n')
print('best parameters: ', rf_model.best_params_)
print('best score: ', rf_model.best_score_)

505.617 seconds
Evaluation of OneVsRestClassifier/RandomForest :
best parameters:  {'estimator__bootstrap': True, 'estimator__max_features': 50, 'estimator__n_estimators': 500}
best score:  0.2564264316165457


In [18]:
# Per-label precision / recall / F1 of the tuned random-forest model on the
# held-out test rows.
y_pred = rf_model.predict(X_test)
rf_report = classification_report(y_test, y_pred, target_names=labels_list)
print('classification par rf \n', rf_report)

classification par rf 
               precision    recall  f1-score   support

           +       0.70      0.43      0.53        72
           -       0.73      0.17      0.28        63
           0       0.33      0.04      0.08        47
           i       0.64      0.59      0.61       146
           j       0.80      0.92      0.86       263
           f       0.72      0.86      0.78        36
           s       0.92      0.95      0.93        57
           p       0.93      0.92      0.92       124
           m       0.89      0.93      0.91       123
           a       0.77      0.71      0.74        72
           t       0.67      0.52      0.59        23

   micro avg       0.80      0.73      0.76      1026
   macro avg       0.74      0.64      0.66      1026
weighted avg       0.77      0.73      0.73      1026
 samples avg       0.79      0.69      0.72      1026



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# Grid search for the support-vector classifier (SVC), one classifier per
# label, over C, gamma and the kernel type.
model_to_set = OneVsRestClassifier(SVC())
parameters = [{
    'estimator__C': [0.1, 25, 50, 75, 100, 150],
    'estimator__gamma': [1, 0.1, 0.01, 0.001],
    'estimator__kernel': ['rbf', 'poly', 'sigmoid'],
}]
svc_model = GridSearchCV(model_to_set,
                         param_grid=parameters,
                         cv=cv,
                         verbose=3,
                         n_jobs=150)
# Wall-clock duration of the whole search, in seconds, is kept in `result`.
start = time()
svc_model.fit(X_train, y_train)
end = time()
result = end - start

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END estimator__C=0.1, estimator__gamma=0.001, estimator__kernel=poly;, score=0.000 total time=   1.7s
[CV 4/5] END estimator__C=25, estimator__gamma=0.01, estimator__kernel=poly;, score=0.004 total time=   1.5s
[CV 5/5] END estimator__C=25, estimator__gamma=0.001, estimator__kernel=poly;, score=0.000 total time=   1.5s
[CV 1/5] END estimator__C=25, estimator__gamma=1, estimator__kernel=sigmoid;, score=0.155 total time=   1.7s
[CV 3/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=poly;, score=0.000 total time=   1.9s
[CV 3/5] END estimator__C=25, estimator__gamma=0.01, estimator__kernel=poly;, score=0.000 total time=   1.7s[CV 5/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=poly;, score=0.000 total time=   1.9s

[CV 2/5] END estimator__C=0.1, estimator__gamma=0.001, estimator__kernel=rbf;, score=0.000 total time=   1.9s
[CV 5/5] END estimator__C=0.1, estimator__gamma=0.001, estimat

In [20]:
# Summary of the SVC search: fit duration, then the best grid point and its
# cross-validated score.
print('%.3f seconds' % result,
      'Evaluation of OneVsRestClassifier/SVC :',
      sep='\n')
print('best parameters: ', svc_model.best_params_)
print('best score: ', svc_model.best_score_)

10.846 seconds
Evaluation of OneVsRestClassifier/SVC :
best parameters:  {'estimator__C': 100, 'estimator__gamma': 0.01, 'estimator__kernel': 'sigmoid'}
best score:  0.25718688789030997


In [21]:
# Per-label precision / recall / F1 of the tuned SVC model on the held-out
# test rows.
y_pred = svc_model.predict(X_test)
svc_report = classification_report(y_test, y_pred, target_names=labels_list)
print('classification par SVC \n', svc_report)

classification par SVC 
               precision    recall  f1-score   support

           +       0.70      0.46      0.55        72
           -       1.00      0.05      0.09        63
           0       0.00      0.00      0.00        47
           i       0.66      0.73      0.69       146
           j       0.81      1.00      0.89       263
           f       0.79      0.64      0.71        36
           s       0.94      0.86      0.90        57
           p       0.92      0.78      0.84       124
           m       0.89      0.89      0.89       123
           a       0.79      0.58      0.67        72
           t       0.79      0.48      0.59        23

   micro avg       0.81      0.72      0.76      1026
   macro avg       0.75      0.59      0.62      1026
weighted avg       0.78      0.72      0.72      1026
 samples avg       0.82      0.69      0.73      1026



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
