# Fiche TP N° 02: Feature extraction and embeddings

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score


### A. Préparation de données

In [2]:
data = pd.read_csv('out.csv')
data.drop(['Unnamed: 0'],axis=1,inplace=True)
data

Unnamed: 0,id,text,author
0,id26305,this proces however afforded me no means of as...,EAP
1,id17569,it never once occurred to me that the fumbling...,HPL
2,id11008,in his left hand was a gold snuff box from whi...,EAP
3,id27763,how lovely is spring as we looked from windsor...,MWS
4,id12958,finding nothing else not even gold the superin...,HPL
...,...,...,...
19574,id17718,i could have fancied while i looked at it that...,EAP
19575,id08973,the lids clenched themselves together as if in...,EAP
19576,id05267,mais il faut agir that is to say a frenchman n...,EAP
19577,id17513,for an item of news like this it strikes us it...,EAP


### B. Encodage de la variable à prédire

In [3]:
le = LabelEncoder()
data['author_encoded'] = le.fit_transform(data['author'])
data

Unnamed: 0,id,text,author,author_encoded
0,id26305,this proces however afforded me no means of as...,EAP,0
1,id17569,it never once occurred to me that the fumbling...,HPL,1
2,id11008,in his left hand was a gold snuff box from whi...,EAP,0
3,id27763,how lovely is spring as we looked from windsor...,MWS,2
4,id12958,finding nothing else not even gold the superin...,HPL,1
...,...,...,...,...
19574,id17718,i could have fancied while i looked at it that...,EAP,0
19575,id08973,the lids clenched themselves together as if in...,EAP,0
19576,id05267,mais il faut agir that is to say a frenchman n...,EAP,0
19577,id17513,for an item of news like this it strikes us it...,EAP,0


### C. Construction des bases d’entraînement et de test

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data['text'].values, 
                                                    data['author_encoded'].values, 
                                                    test_size=0.2, 
                                                    random_state=33,
                                                    stratify = data['author_encoded'].values)

In [5]:
print(100*y_train.tolist().count(0)/(len(y_train)))
print(100*y_test.tolist().count(0)/(len(y_test)))

40.349869118304284
40.34729315628192


### D. Méthodes de vectorisation

##### 1. Utiliser la méthode de fréquence lexicale et one-hot encoding pour vectoriser le dataset d’entrainement et du test.

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# création d'un objet CountVectorizer pour effectuer la fréquence lexicale et l'encodage one-hot 
# (
#    si binary=False => fréquence lexicale
#    si binary=True => one-hot encoding
#)
vectorizer = CountVectorizer(binary=False,analyzer= 'word', stop_words='english')
vectorizer_onehot = CountVectorizer(binary=True , analyzer= 'word', stop_words='english')

vectorizer.fit(X_train)
vectorizer_onehot.fit(X_train)

train_fl = vectorizer.transform(X_train)
test_fl = vectorizer.transform(X_test)

train_oneh = vectorizer_onehot.transform(X_train)
test_oneh = vectorizer_onehot.transform(X_test)

In [10]:
count_array_cv = train_cv.toarray()
df = pd.DataFrame(data=count_array_cv,columns = vectorizer.get_feature_names_out())
df

Unnamed: 0,ab,aback,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashed,abashment,...,zobnarian,zodiac,zodiacal,zoilus,zokar,zone,zones,zory,zubmizion,zuro
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15658,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15659,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15660,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15661,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### 2 et 3. Entrainer un modèle de vectorisation TF-IDF sur la partie d’entrainement et vectorisez-le. (et de test)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# création d'un objet TfidfVectorizer pour effectuer la vectorisation TF-IDF
tfidf_vectorizer = TfidfVectorizer(analyzer= 'word', stop_words='english')

# ajustement du vectorizer sur le texte d'entraînement
tfidf_vectorizer.fit(X_train)

# transformation du texte d'entraînement et de test en vecteurs TF-IDF
train_tfidf = tfidf_vectorizer.transform(X_train)
test_tfidf = tfidf_vectorizer.transform(X_test)

In [12]:
count_array_tfidf = train_tfidf.toarray()
df = pd.DataFrame(data=count_array_tfidf,columns = tfidf_vectorizer.get_feature_names_out())
df

Unnamed: 0,ab,aback,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashed,abashment,...,zobnarian,zodiac,zodiacal,zoilus,zokar,zone,zones,zory,zubmizion,zuro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### E. Entrainement

#### 1. Créer trois modèles du type MLPClassifier. (Vous pouvez changer l’algorithme d’apprentissage : utiliser les autres algorithmes de scikit-learn)

In [13]:
from sklearn.neural_network import MLPClassifier

# Modèle 1 : 1 couche cachée avec 100 neurones
model1 = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, solver='adam', random_state=1)

# Modèle 2 : 2 couches cachées avec 50 et 25 neurones, respectivement
model2 = MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=200, solver='adam', random_state=1)

# Modèle 3 : 3 couches cachées avec 100, 50 et 25 neurones, respectivement
model3 = MLPClassifier(hidden_layer_sizes=(100, 50, 25), max_iter=200, solver='adam', random_state=1)


#### 2,3 et 4. 
 - Entrainer ces trois modèles sur les trois représentations vectorielles 
 - Prédire les classes en appliquant les trois modèles sur les trois représentations d’entrainement.
 - Afficher le rapport de classification en utilisant les mesures de performance (accuracy, precision, recall…).


In [16]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pickle



# création des modèles avec différentes architectures de couches cachées
models = [
    model1,
    model2,
    model3
]

# entrainement des modèles sur les différentes représentations
representations = [(train_cv, test_cv),(train_tfidf, test_tfidf)]
for i, model in enumerate(models):
    print(f"Model {i+1}")
    for j, rep in enumerate(representations):
        
        # entraînement du modèle
        model.fit(rep[0], y_train)
        
        # prédiction sur le jeu de train
        y_pred = model.predict(rep[0])
        
        # évaluation de la performance du modèle
        
        print(f"Representation {j+1}:")
        print(classification_report(y_train, y_pred))

        
        # save the model to disk
        filename = f'models/finalized_model{i+1}.{j+1}.sav'
        pickle.dump(model, open(filename, 'wb'))


Model 1
Representation 1:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6320
           1       1.00      1.00      1.00      4508
           2       1.00      1.00      1.00      4835

    accuracy                           1.00     15663
   macro avg       1.00      1.00      1.00     15663
weighted avg       1.00      1.00      1.00     15663

Representation 2:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6320
           1       1.00      1.00      1.00      4508
           2       1.00      1.00      1.00      4835

    accuracy                           1.00     15663
   macro avg       1.00      1.00      1.00     15663
weighted avg       1.00      1.00      1.00     15663

Model 2
Representation 1:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6320
           1       1.00      1.00      1.00      4508
     

In [34]:
#import pickle
#from sklearn.metrics import accuracy_score

# load the model from disk
#filename = "models/finalized_model1.2.sav"
#loaded_model = pickle.load(open(filename, 'rb'))
#y_pred = loaded_model.predict(train_cv)

#result = accuracy_score(y_pred, y_train)
#print(result)

### F. Test


##### Pour Model1 et La Representation 1 (CountVect ) et 2 (TF-IDF)

In [35]:
import time
import pickle
from sklearn.metrics import classification_report

for i in range(3):
    print('\n\n************************************************************')
    print(f"Pour Model N°{i+1} et La Representation 1 (CountVect ) et 2 (TF-IDF)")
    representations = [test_cv, test_tfidf]

    for j, rep in enumerate(representations):
        print(f"Representation {j+1}:")

        # load the model from disk
        filename = f"models/finalized_model{i+1}.{j+1}.sav"
        loaded_model = pickle.load(open(filename, 'rb'))
        
        # Model prediction time
        start_time = time.time()
        y_pred = loaded_model.predict(rep)
        end_time = time.time()

        result = classification_report(y_pred, y_test)
        print(result)
        print(f"Model {i+1} prediction time: {(end_time - start_time)} seconds")



************************************************************
Pour Model N°1 et La Representation 1 (CountVect ) et 2 (TF-IDF)
Representation 1:
              precision    recall  f1-score   support

           0       0.72      0.76      0.74      1511
           1       0.70      0.73      0.72      1084
           2       0.77      0.71      0.74      1321

    accuracy                           0.73      3916
   macro avg       0.73      0.73      0.73      3916
weighted avg       0.73      0.73      0.73      3916

Model 1 prediction time: 0.009311914443969727 seconds
Representation 2:
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      1532
           1       0.73      0.78      0.76      1054
           2       0.80      0.73      0.77      1330

    accuracy                           0.77      3916
   macro avg       0.77      0.77      0.76      3916
weighted avg       0.77      0.77      0.77      3916

Model 1 prediction t

### G. Vectorisations basées sur les embeddings de mots