**IMPORTS :** 

In [2]:
import os
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

**Récupération du corpus** (pour qu'on puisse l'utiliser avec sklearn)

In [3]:
dossier_train = "../../deft09/corpus_train"
dossier_test = "../../deft09/corpus_test"

In [4]:
def recup_train_data(dossier) : 
    """
    extraction du texte et des étiquettes des documents XML d'un répertoire.
    """
    textes = []
    labels = []
    for fichier in os.listdir(dossier) : 
        if fichier.endswith(".xml"):
            chemin_fichier = os.path.join(dossier, fichier)
            tree = ET.parse(chemin_fichier) #analyse et parsing du fichier
            root = tree.getroot() #obtient l'élément racine de l'arbre XML (le noeud principal de l'arbre)

            for doc in root.findall("doc") : 
                intervention = doc.find(".//texte/p")
                if intervention is not None and intervention.text is not None : 
                 texte = intervention.text
                else : 
                    texte = ""
                parti = doc.find(".//EVALUATION/EVAL_PARTI/PARTI")
                if parti is not None and "valeur" in parti.attrib : 
                    label = parti.get("valeur")
                else : 
                    label = "Unknown"

                textes.append(texte)
                labels.append(label)
                
    return textes, labels


def recup_test_data(dossier) : 
    """
    extraction du texte des documents XML d'un répertoire.
    """
    textes = []
    for fichier in os.listdir(dossier) : 
        if fichier.endswith(".xml"):
            chemin_fichier = os.path.join(dossier, fichier)
            tree = ET.parse(chemin_fichier) #analyse et parsing du fichier
            root = tree.getroot() #obtient l'élément racine de l'arbre XML (le noeud principal de l'arbre)

            for doc in root.findall("doc") : 
                intervention = doc.find(".//texte/p")
                if intervention is not None and intervention.text is not None : 
                 texte = intervention.text
                else : 
                    texte = ""

                textes.append(texte)
    
    return textes

In [5]:
textes_train, labels_train = recup_train_data(dossier_train)
textes_test = recup_test_data(dossier_test)

**Vectorisation TFIDF** (+enlever les stopwords)

In [6]:
stopwords_list = stopwords.words("english") + stopwords.words("french") + stopwords.words("italian")

In [7]:
tfidf = TfidfVectorizer(stop_words=stopwords_list)
X_train_tfidf = tfidf.fit_transform(textes_train)
X_test_tfidf = tfidf.transform(textes_test)

Validation croisée

In [8]:
# Division de l'ensemble d'entraînement en sous-ensembles pour effectuer une validation croisée
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, labels_train, test_size=0.2, random_state=42)

## KNeighbors Classifier

In [9]:
param_gridKNN = {
    "n_neighbors" : [1, 2, 10, 15],
    "weights" : ["uniform", "distance"], 
}

#Initialisation
KNN = KNeighborsClassifier()
KNN_grid = GridSearchCV(KNN, param_gridKNN, cv=5, verbose=2, n_jobs=-1)

#Entraînement du modèle
KNN_grid.fit(X_train, y_train)

#Afficher les meilleurs paramètres
print("Meilleurs paramètres : ", KNN_grid.best_params_)

#Prédiction avec les meilleurs paramètres obtenus
best_KNN = KNN_grid.best_estimator_
y_pred_best_KNN = best_KNN.predict(X_test)

#Résultats
print("Rapport de Classification KNeighbors Classifier:\n")
print(classification_report(y_test, y_pred_best_KNN))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END .....................n_neighbors=1, weights=uniform; total time=  57.9s
[CV] END .....................n_neighbors=1, weights=uniform; total time=  58.6s
[CV] END .....................n_neighbors=1, weights=uniform; total time= 1.1min
[CV] END .....................n_neighbors=1, weights=uniform; total time= 1.1min
[CV] END .....................n_neighbors=1, weights=uniform; total time=  42.5s
[CV] END ....................n_neighbors=1, weights=distance; total time=  50.4s
[CV] END ....................n_neighbors=1, weights=distance; total time=  48.7s
[CV] END ....................n_neighbors=1, weights=distance; total time=  49.9s
[CV] END ....................n_neighbors=1, weights=distance; total time=  45.1s
[CV] END ....................n_neighbors=1, weights=distance; total time=  44.4s
[CV] END .....................n_neighbors=2, weights=uniform; total time=  47.5s
[CV] END .....................n_neighbors=2, wei

  _data = np.array(data, dtype=dtype, copy=copy,


Meilleurs paramètres :  {'n_neighbors': 15, 'weights': 'distance'}
Rapport de Classification KNeighbors Classifier:

              precision    recall  f1-score   support

        ELDR       0.16      0.03      0.05      1221
     GUE-NGL       0.75      0.00      0.00      1645
      PPE-DE       0.69      0.01      0.01      4100
         PSE       0.28      0.98      0.44      3251
   Verts-ALE       0.36      0.00      0.01      1405

    accuracy                           0.28     11622
   macro avg       0.45      0.20      0.10     11622
weighted avg       0.49      0.28      0.13     11622



## Multinomial Naive Bayes

In [10]:
param_gridMultiNB = {
    "alpha": [0.05, 0.2, 0.7, 1.0, 2],
    "fit_prior": [True, False]
}

#Initialisation
MultiNB = MultinomialNB()
MultiNB_grid = GridSearchCV(MultiNB, param_gridMultiNB, cv=5, verbose=2, n_jobs=-1)

#Entraînement du modèle
MultiNB_grid.fit(X_train, y_train)

#Afficher les meilleurs paramètres
print("Meilleurs paramètres : ", MultiNB_grid.best_params_)

#Prédiction avec les meilleurs paramètres obtenus
best_MultiNB = MultiNB_grid.best_estimator_
y_pred_best_MultiNB = best_MultiNB.predict(X_test)

#Résultats
print("Rapport de Classification Multinomial Naive Bayes:\n")
print(classification_report(y_test, y_pred_best_MultiNB))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END .........................alpha=0.05, fit_prior=True; total time=   0.3s
[CV] END .........................alpha=0.05, fit_prior=True; total time=   0.3s
[CV] END .........................alpha=0.05, fit_prior=True; total time=   0.3s
[CV] END .........................alpha=0.05, fit_prior=True; total time=   0.3s
[CV] END .........................alpha=0.05, fit_prior=True; total time=   0.4s
[CV] END ........................alpha=0.05, fit_prior=False; total time=   0.4s
[CV] END ........................alpha=0.05, fit_prior=False; total time=   0.4s
[CV] END ........................alpha=0.05, fit_prior=False; total time=   0.4s
[CV] END ........................alpha=0.05, fit_prior=False; total time=   0.3s
[CV] END ........................alpha=0.05, fit_prior=False; total time=   0.4s
[CV] END ..........................alpha=0.2, fit_prior=True; total time=   0.4s
[CV] END ..........................alpha=0.2, fi

## Complement Naive Bayes

In [11]:
param_gridComplNB = {
    "alpha": [0.05, 0.2, 0.7, 1.0, 2],
    "fit_prior": [True, False],
    "norm": [True, False]
}

#Initialisation
ComplNB = ComplementNB()
ComplNB_grid = GridSearchCV(ComplNB, param_gridComplNB, cv=5, verbose=2, n_jobs=-1)

#Entraînement du modèle
ComplNB_grid.fit(X_train, y_train)

#Afficher les meilleurs paramètres
print("Meilleurs paramètres : ", ComplNB_grid.best_params_)

#Prédiction avec les meilleurs paramètres obtenus
best_ComplNB = ComplNB_grid.best_estimator_
y_pred_best_ComplNB = best_ComplNB.predict(X_test)

#Résultats
print("Rapport de Classification Complement Naive Bayes :\n")
print(classification_report(y_test, y_pred_best_ComplNB))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ..............alpha=0.05, fit_prior=True, norm=True; total time=   0.5s
[CV] END ..............alpha=0.05, fit_prior=True, norm=True; total time=   0.4s
[CV] END ..............alpha=0.05, fit_prior=True, norm=True; total time=   0.5s
[CV] END ..............alpha=0.05, fit_prior=True, norm=True; total time=   0.5s
[CV] END ..............alpha=0.05, fit_prior=True, norm=True; total time=   0.4s
[CV] END .............alpha=0.05, fit_prior=True, norm=False; total time=   0.4s
[CV] END .............alpha=0.05, fit_prior=True, norm=False; total time=   0.4s
[CV] END .............alpha=0.05, fit_prior=True, norm=False; total time=   0.6s
[CV] END .............alpha=0.05, fit_prior=True, norm=False; total time=   1.0s
[CV] END .............alpha=0.05, fit_prior=False, norm=True; total time=   1.1s
[CV] END .............alpha=0.05, fit_prior=True, norm=False; total time=   1.3s
[CV] END .............alpha=0.05, fit_prior=Fal

## SVM

In [12]:
param_gridSVM = { 
    "C" : [0.2, 1, 10],
    "dual": [True, False],
    "class_weight" : ["balanced", None],
    "max_iter" : [700, 1000]
}

#Initialisation
svm = LinearSVC()
svm_grid = GridSearchCV(svm, param_gridSVM, cv=5, verbose=2, n_jobs=-1)

#Entraînement du modèle
svm_grid.fit(X_train, y_train)

#Afficher les meilleurs paramètres
print("Meilleurs paramètres : ", svm_grid.best_params_)

#Prédiction avec les meilleurs paramètres obtenus
best_svm = svm_grid.best_estimator_
y_pred_best_svm = best_svm.predict(X_test)

#Résultats
print("Rapport de Classification SVM :\n")
print(classification_report(y_test, y_pred_best_svm))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END C=0.2, class_weight=balanced, dual=True, max_iter=1000; total time=   7.0s
[CV] END C=0.2, class_weight=balanced, dual=True, max_iter=1000; total time=   7.2s
[CV] END C=0.2, class_weight=balanced, dual=True, max_iter=1000; total time=   7.2s
[CV] END C=0.2, class_weight=balanced, dual=True, max_iter=1000; total time=   7.1s
[CV] END C=0.2, class_weight=balanced, dual=True, max_iter=2000; total time=   7.3s
[CV] END C=0.2, class_weight=balanced, dual=True, max_iter=1000; total time=   7.4s
[CV] END C=0.2, class_weight=balanced, dual=True, max_iter=2000; total time=   7.4s
[CV] END C=0.2, class_weight=balanced, dual=True, max_iter=2000; total time=   7.4s
[CV] END C=0.2, class_weight=balanced, dual=True, max_iter=2000; total time=   5.6s
[CV] END C=0.2, class_weight=balanced, dual=True, max_iter=2000; total time=   5.7s
[CV] END C=0.2, class_weight=balanced, dual=False, max_iter=1000; total time=  14.5s
[CV] END C=0.

## Logistic Regression 

In [13]:
param_gridLR = { 
    "C" : [0.2, 1, 10],
    "class_weight" : ["balanced", None],
    "max_iter" : [1000, 2000]
}

#Initialisation
LR = LinearSVC()
LR_grid = GridSearchCV(LR, param_gridLR, cv=5, verbose=1, n_jobs=-1)

#Entraînement du modèle
LR_grid.fit(X_train, y_train)

#Afficher les meilleurs paramètres
print("Meilleurs paramètres : ", LR_grid.best_params_)

#Prédiction avec les meilleurs paramètres obtenus
best_LR = LR_grid.best_estimator_
y_pred_best_LR = best_LR.predict(X_test)

#Résultats
print("Rapport de Classification SVM :\n")
print(classification_report(y_test, y_pred_best_LR))

Fitting 5 folds for each of 12 candidates, totalling 60 fits


KeyboardInterrupt: 

## Random Forest ??
(pas sûre que ça vaille le coup, niveau temps/résultats)

## COMPARAISON DES PERFORMANCES DES MODÈLES

In [18]:
resultats = pd.DataFrame({
    "Modèle": ["KNeighbors", "Multinomial Naive Bayes", "Complement Naive Bayes", "SVM"],
    "Accuracy" : 
        [accuracy_score(y_test, y_pred_best_KNN),
        accuracy_score(y_test, y_pred_best_MultiNB),
        accuracy_score(y_test, y_pred_best_ComplNB),
        accuracy_score(y_test, y_pred_best_svm)],
    "Précision": 
        [precision_score(y_test, y_pred_best_KNN, average="weighted"),
        precision_score(y_test, y_pred_best_MultiNB, average="weighted"),
        precision_score(y_test, y_pred_best_ComplNB, average="weighted"),
        precision_score(y_test, y_pred_best_svm, average="weighted")],
    "Rappel" : 
        [recall_score(y_test, y_pred_best_KNN, average="weighted"),
        recall_score(y_test, y_pred_best_MultiNB, average="weighted"),
        recall_score(y_test, y_pred_best_ComplNB, average="weighted"),
        recall_score(y_test, y_pred_best_svm, average="weighted")],
    "F-mesure" : 
        [f1_score(y_test, y_pred_best_KNN, average="weighted"),
        f1_score(y_test, y_pred_best_MultiNB, average="weighted"),
        f1_score(y_test, y_pred_best_ComplNB, average="weighted"),
        f1_score(y_test, y_pred_best_svm, average="weighted")]
})

print(resultats)

                    Modèle  Accuracy  Précision    Rappel  F-mesure
0               KNeighbors  0.280158   0.487558  0.280158  0.132033
1  Multinomial Naive Bayes  0.441490   0.453733  0.441490  0.403205
2   Complement Naive Bayes  0.446911   0.436986  0.446911  0.420170
3                      SVM  0.441920   0.430441  0.441920  0.429741
