<a href="https://colab.research.google.com/github/nas961/blank-app/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import io
import sys
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier
from google.colab import drive
import os
import matplotlib.pyplot as plt
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import time
from itertools import cycle
from sklearn.utils.class_weight import compute_class_weight
import joblib
import seaborn as sns
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset


# Vérifier si CUDA est disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Téléchargement des stop words pour toutes les langues disponibles
nltk.download('stopwords', quiet=True)
all_stopwords = set()
for lang in stopwords.fileids():
    all_stopwords.update(stopwords.words(lang))

# Monter Google Drive
drive.mount('/content/drive')

# Chemins vers les fichiers dans Google Drive
csv_dir = '/content/drive/MyDrive/DATA'
save_dir = '/content/drive/MyDrive/Text_Classification'

# Créer le dossier de sauvegarde s'il n'existe pas
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

def load_and_prepare_data():
    try:
        X_train = pd.read_csv(os.path.join(csv_dir, 'X_train.csv'))
        Y_train = pd.read_csv(os.path.join(csv_dir, 'Y_train.csv'))
        X_test = pd.read_csv(os.path.join(csv_dir, 'X_test.csv'))

        print("Colonnes dans X_train:", X_train.columns)
        print("Colonnes dans Y_train:", Y_train.columns)

        if 'productid' in X_train.columns and 'productid' in Y_train.columns:
            data = pd.merge(X_train, Y_train, on='productid')
        else:
            print("Attention: 'productid' n'est pas présent dans les dataframes. On suppose qu'ils sont déjà alignés.")
            X_train = X_train.reset_index(drop=True)
            Y_train = Y_train.reset_index(drop=True)
            data = pd.concat([X_train, Y_train], axis=1)

        required_columns = ['designation', 'description', 'prdtypecode']
        for col in required_columns:
            if col not in data.columns:
                raise ValueError(f"La colonne '{col}' n'est pas présente dans les données fusionnées.")

        # Gestion des valeurs NaN
        data['designation'] = data['designation'].fillna('')
        data['description'] = data['description'].fillna('')

        # Combiner 'designation' et 'description' pour une meilleure représentation du texte
        X = data['designation'] + " " + data['description']
        y = data['prdtypecode']

        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        print("Vectorisation TF-IDF...")
        tfidf = TfidfVectorizer(max_features=10000, stop_words=list(all_stopwords), ngram_range=(1, 2))
        X_tfidf = tfidf.fit_transform(X)

        print("Réduction de dimensionnalité avec TruncatedSVD...")
        svd = TruncatedSVD(n_components=300, random_state=42)
        X_svd = svd.fit_transform(X_tfidf)

        print("Standardisation des données...")
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_svd)

        X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

        # Calcul des poids des classes
        class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
        class_weight_dict = dict(zip(np.unique(y_train), class_weights))

        # Convertir les données en tensors PyTorch et les déplacer sur le GPU
        X_train_tensor = torch.FloatTensor(X_train).to(device)
        y_train_tensor = torch.LongTensor(y_train).to(device)
        X_val_tensor = torch.FloatTensor(X_val).to(device)
        y_val_tensor = torch.LongTensor(y_val).to(device)

        return X_train_tensor, X_val_tensor, y_train_tensor, y_val_tensor, le.classes_, class_weight_dict, tfidf, svd, scaler
    except Exception as e:
        print(f"Erreur lors du chargement et de la préparation des données : {str(e)}")
        raise

class SVMWithGPU(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(SVMWithGPU, self).__init__()
        self.linear = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        return self.linear(x)

def create_models(input_dim, num_classes, class_weight_dict):
    svm_model = SVMWithGPU(input_dim, num_classes).to(device)
    rf_model = RandomForestClassifier(n_estimators=200, random_state=42, class_weight=class_weight_dict, n_jobs=-1)
    lr_model = LogisticRegression(random_state=42, max_iter=1000, class_weight=class_weight_dict, n_jobs=-1)
    xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, tree_method='hist', device='cuda')

    return [
        (svm_model, "SVM"),
        (rf_model, "Random Forest"),
        (lr_model, "Logistic Regression"),
        (xgb_model, "XGBoost")
    ]

def train_and_evaluate_model(model, X_train, y_train, X_val, y_val, model_name):
    print(f"Entraînement du modèle {model_name}...")

    start_time = time.time()

    try:
        if model_name == "SVM":
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters())
            num_epochs = 10
            batch_size = 64

            train_dataset = TensorDataset(X_train, y_train)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

            for epoch in range(num_epochs):
                for batch_X, batch_y in train_loader:
                    optimizer.zero_grad()
                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y)
                    loss.backward()
                    optimizer.step()

            model.eval()
            with torch.no_grad():
                y_train_pred = model(X_train).argmax(dim=1)
                y_val_pred = model(X_val).argmax(dim=1)
        elif model_name == "XGBoost":
            X_train_cpu = X_train.cpu().numpy()
            y_train_cpu = y_train.cpu().numpy()
            X_val_cpu = X_val.cpu().numpy()
            y_val_cpu = y_val.cpu().numpy()

            model.fit(X_train_cpu, y_train_cpu, eval_set=[(X_val_cpu, y_val_cpu)], verbose=False)
            y_train_pred = model.predict(X_train_cpu)
            y_val_pred = model.predict(X_val_cpu)
        else:
            X_train_cpu = X_train.cpu().numpy()
            y_train_cpu = y_train.cpu().numpy()
            X_val_cpu = X_val.cpu().numpy()

            model.fit(X_train_cpu, y_train_cpu)
            y_train_pred = model.predict(X_train_cpu)
            y_val_pred = model.predict(X_val_cpu)

        end_time = time.time()
        training_time = end_time - start_time

        y_train_cpu = y_train.cpu().numpy()
        y_val_cpu = y_val.cpu().numpy()

        if isinstance(y_train_pred, torch.Tensor):
            y_train_pred = y_train_pred.cpu().numpy()
        if isinstance(y_val_pred, torch.Tensor):
            y_val_pred = y_val_pred.cpu().numpy()

        train_accuracy = accuracy_score(y_train_cpu, y_train_pred)
        train_f1 = f1_score(y_train_cpu, y_train_pred, average='weighted')

        val_accuracy = accuracy_score(y_val_cpu, y_val_pred)
        val_f1 = f1_score(y_val_cpu, y_val_pred, average='weighted')

        print(f"\nRésultats pour {model_name}:")
        print(f"Temps d'entraînement: {training_time:.2f} secondes")
        print(f"Accuracy d'entraînement: {train_accuracy:.4f}")
        print(f"F1 Score d'entraînement: {train_f1:.4f}")
        print(f"Accuracy de validation: {val_accuracy:.4f}")
        print(f"F1 Score de validation: {val_f1:.4f}")
        print("\nRapport de classification (validation):")
        print(classification_report(y_val_cpu, y_val_pred))

        return model, val_accuracy, val_f1
    except Exception as e:
        print(f"Erreur lors de l'entraînement du modèle {model_name}: {str(e)}")
        return None, 0, 0

def plot_roc_curve(model, X_val, y_val, model_name, class_names):
    try:
        n_classes = len(np.unique(y_val.cpu().numpy()))

        if n_classes == 2:
            if hasattr(model, "predict_proba"):
                y_score = model.predict_proba(X_val.cpu().numpy())[:, 1]
            else:
                y_score = model(X_val).detach().cpu().numpy()[:, 1]

            fpr, tpr, _ = roc_curve(y_val.cpu().numpy(), y_score)
            roc_auc = auc(fpr, tpr)

            plt.figure()
            plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title(f'ROC Curve - {model_name}')
            plt.legend(loc="lower right")
        else:
            fpr = dict()
            tpr = dict()
            roc_auc = dict()

            if hasattr(model, "predict_proba"):
                y_score = model.predict_proba(X_val.cpu().numpy())
            else:
                y_score = model(X_val).detach().cpu().numpy()

            y_test_bin = label_binarize(y_val.cpu().numpy(), classes=np.arange(n_classes))

            for i in range(n_classes):
                fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
                roc_auc[i] = auc(fpr[i], tpr[i])

            plt.figure(figsize=(10, 8))
            colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red', 'green', 'yellow', 'purple'])
            for i, color in zip(range(n_classes), colors):
                plt.plot(fpr[i], tpr[i], color=color, lw=2,
                         label=f'ROC curve of class {class_names[i]} (AUC = {roc_auc[i]:.2f})')

            plt.plot([0, 1], [0, 1], 'k--', lw=2)
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title(f'Multi-class ROC Curve - {model_name}')
            plt.legend(loc="lower right")

        plt.savefig(os.path.join(save_dir, f'{model_name}_ROC_curve.png'))
        plt.close()
    except Exception as e:
        print(f"Erreur lors de la création de la courbe ROC pour {model_name}: {str(e)}")

def plot_confusion_matrix(y_true, y_pred, class_names, model_name):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(os.path.join(save_dir, f'{model_name}_confusion_matrix.png'))
    plt.close()

def plot_feature_importance(model, feature_names, model_name):
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]
        plt.figure(figsize=(10, 8))
        plt.title(f"Feature Importances - {model_name}")
        plt.bar(range(10), importances[indices][:10])
        plt.xticks(range(10), [feature_names[i] for i in indices[:10]], rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(os.path.join(save_dir, f'{model_name}_feature_importance.png'))
        plt.close()

def train_and_evaluate_all_models(models, X_train, X_val, y_train, y_val, class_names, feature_names):
    results = []
    total_models = len(models)
    start_time = time.time()

    for i, (model, name) in enumerate(models, 1):
        print(f"\nEntraînement du modèle {i}/{total_models}: {name}")
        trained_model, val_accuracy, val_f1 = train_and_evaluate_model(model, X_train, y_train, X_val, y_val, name)
        results.append((name, val_accuracy, val_f1))

        if trained_model is not None:
            plot_roc_curve(trained_model, X_val, y_val, name, class_names)

            if name != "SVM":  # SVM personnalisé n'a pas de méthode predict
                y_val_pred = trained_model.predict(X_val.cpu().numpy())
            else:
                y_val_pred = trained_model(X_val).argmax(dim=1).cpu().numpy()

            plot_confusion_matrix(y_val.cpu().numpy(), y_val_pred, class_names, name)

            if name in ["Random Forest", "XGBoost"]:
                plot_feature_importance(trained_model, feature_names, name)

        elapsed_time = time.time() - start_time
        avg_time_per_model = elapsed_time / i
        eta = avg_time_per_model * (total_models - i)

        print(f"Progression : {i}/{total_models} modèles")
        print(f"Temps écoulé : {elapsed_time:.2f} secondes")
        print(f"Temps estimé restant : {eta:.2f} secondes")

    print("\nEntraînement de tous les modèles terminé!")
    return results

def plot_model_comparison(results):
    plt.figure(figsize=(12, 8))
    names = [r[0] for r in results]
    accuracies = [r[1] for r in results]
    f1_scores = [r[2] for r in results]

    x = range(len(names))
    width = 0.35

    fig, ax = plt.subplots()
    rects1 = ax.bar([i - width/2 for i in x], accuracies, width, label='Accuracy', color='skyblue')
    rects2 = ax.bar([i + width/2 for i in x], f1_scores, width, label='F1 Score', color='lightgreen')

    ax.set_ylabel('Scores')
    ax.set_title('Comparaison des performances des modèles')
    ax.set_xticks(x)
    ax.set_xticklabels(names, rotation=45, ha='right')
    ax.legend()

    ax.set_ylim(0.70, 1.00)

    def autolabel(rects):
        for rect in rects:
            height = rect.get_height()
            ax.annotate(f'{height:.3f}',
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom', rotation=90)

    autolabel(rects1)
    autolabel(rects2)

    fig.tight_layout()
    plt.savefig(os.path.join(save_dir, 'model_comparison.png'))
    plt.close()

def plot_training_time(training_times):
    plt.figure(figsize=(10, 6))
    names = list(training_times.keys())
    times = list(training_times.values())

    plt.bar(names, times, color='lightcoral')
    plt.title('Temps d\'entraînement par modèle')
    plt.xlabel('Modèles')
    plt.ylabel('Temps (secondes)')
    plt.xticks(rotation=45, ha='right')

    for i, v in enumerate(times):
        plt.text(i, v, f'{v:.2f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'training_time_comparison.png'))
    plt.close()

def create_description_file(results, training_times):
    current_date = datetime.now().strftime("%Y-%m-%d")
    with open(os.path.join(save_dir, f'{current_date}-ML_pipeline_description.txt'), 'w', encoding='utf-8') as f:
        f.write("Description de la pipeline de Machine Learning\n")
        f.write("===========================================\n\n")
        f.write("1. Chargement et préparation des données\n")
        f.write("2. Gestion des valeurs NaN dans les colonnes textuelles\n")
        f.write("3. Combinaison de 'designation' et 'description' pour une meilleure représentation du texte\n")
        f.write("4. Prétraitement du texte avec TfidfVectorizer, utilisation de bi-grammes et suppression des stop words de toutes les langues\n")
        f.write("5. Réduction de dimensionnalité avec TruncatedSVD\n")
        f.write("6. Standardisation des données avec StandardScaler\n")
        f.write("7. Encodage des labels avec LabelEncoder\n")
        f.write("8. Calcul et application des poids des classes pour gérer le déséquilibre des classes\n")
        f.write("9. Utilisation du GPU pour l'entraînement des modèles\n")
        f.write("10. Entraînement de quatre modèles : SVM (personnalisé avec PyTorch), Random Forest, Logistic Regression, XGBoost\n")
        f.write("11. Évaluation des modèles avec des métriques de performance (Accuracy, F1 Score)\n")
        f.write("12. Génération de courbes ROC pour chaque modèle\n")
        f.write("13. Création de matrices de confusion pour chaque modèle\n")
        f.write("14. Visualisation de l'importance des caractéristiques pour Random Forest et XGBoost\n")
        f.write("15. Comparaison visuelle des performances des modèles\n")
        f.write("16. Analyse des temps d'entraînement\n")
        f.write("\nRésultats :\n")
        for name, accuracy, f1 in results:
            f.write(f"{name}: Accuracy = {accuracy:.4f}, F1 Score = {f1:.4f}, Temps d'entraînement = {training_times[name]:.2f} secondes\n")
        f.write("\nNote: SVM est implémenté avec PyTorch pour utiliser le GPU. XGBoost est configuré pour utiliser le GPU.\n")
        f.write("Tous les résultats et visualisations sont sauvegardés dans le dossier 'Text_Classification' sur Google Drive.\n")

    print(f"Les résultats et visualisations ont été sauvegardés dans le dossier 'Text_Classification' sur Google Drive.")
    print(f"Un fichier de description '{current_date}-ML_pipeline_description.txt' a été créé dans le même dossier.")

if __name__ == "__main__":
    try:
        X_train, X_val, y_train, y_val, class_names, class_weight_dict, tfidf, svd, scaler = load_and_prepare_data()
        input_dim = X_train.shape[1]
        num_classes = len(class_names)
        models = create_models(input_dim, num_classes, class_weight_dict)

        feature_names = tfidf.get_feature_names_out()

        training_times = {}
        results = []

        for model, name in models:
            start_time = time.time()
            trained_model, val_accuracy, val_f1 = train_and_evaluate_model(model, X_train, y_train, X_val, y_val, name)
            end_time = time.time()
            training_time = end_time - start_time
            training_times[name] = training_time
            results.append((name, val_accuracy, val_f1))

            if trained_model is not None:
                plot_roc_curve(trained_model, X_val, y_val, name, class_names)

                if name != "SVM":
                    y_val_pred = trained_model.predict(X_val.cpu().numpy())
                else:
                    y_val_pred = trained_model(X_val).argmax(dim=1).cpu().numpy()

                plot_confusion_matrix(y_val.cpu().numpy(), y_val_pred, class_names, name)

                if name in ["Random Forest", "XGBoost"]:
                    plot_feature_importance(trained_model, feature_names, name)

        plot_model_comparison(results)
        plot_training_time(training_times)
        create_description_file(results, training_times)

        # Sauvegarder les meilleurs modèles
        best_model = max(results, key=lambda x: x[2])[0]  # Choisir le modèle avec le meilleur F1 score
        for model, name in models:
            if name == best_model:
                print(f"Sauvegarde du meilleur modèle : {name}")
                model_filename = os.path.join(save_dir, f'{name}_best_model.joblib')
                joblib.dump(model, model_filename)
                print(f"Modèle sauvegardé : {model_filename}")

        # Sauvegarder le TfidfVectorizer, TruncatedSVD et le StandardScaler
        tfidf_filename = os.path.join(save_dir, 'tfidf_vectorizer.joblib')
        svd_filename = os.path.join(save_dir, 'truncated_svd.joblib')
        scaler_filename = os.path.join(save_dir, 'standard_scaler.joblib')
        joblib.dump(tfidf, tfidf_filename)
        joblib.dump(svd, svd_filename)
        joblib.dump(scaler, scaler_filename)
        print(f"TfidfVectorizer sauvegardé : {tfidf_filename}")
        print(f"TruncatedSVD sauvegardé : {svd_filename}")
        print(f"StandardScaler sauvegardé : {scaler_filename}")

        # Prédiction sur l'ensemble de test
        X_test = pd.read_csv(os.path.join(csv_dir, 'X_test.csv'))
        X_test['designation'] = X_test['designation'].fillna('')
        X_test['description'] = X_test['description'].fillna('')
        X_test_text = X_test['designation'] + " " + X_test['description']
        X_test_tfidf = tfidf.transform(X_test_text)
        X_test_svd = svd.transform(X_test_tfidf)
        X_test_scaled = scaler.transform(X_test_svd)
        X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)

        best_model_name = best_model
        best_model = joblib.load(os.path.join(save_dir, f'{best_model_name}_best_model.joblib'))

        if best_model_name == "SVM":
            y_test_pred = best_model(X_test_tensor).argmax(dim=1).cpu().numpy()
        else:
            y_test_pred = best_model.predict(X_test_scaled)

        # Convertir les prédictions en catégories originales
        le = LabelEncoder()
        le.classes_ = class_names
        y_test_pred_categories = le.inverse_transform(y_test_pred)

        # Créer un DataFrame avec les prédictions
        predictions_df = pd.DataFrame({
            'productid': X_test['productid'],
            'predicted_category': y_test_pred_categories
        })

        # Sauvegarder les prédictions dans un fichier CSV
        predictions_filename = os.path.join(save_dir, 'test_predictions.csv')
        predictions_df.to_csv(predictions_filename, index=False)
        print(f"Prédictions sur l'ensemble de test sauvegardées : {predictions_filename}")

    except Exception as e:
        print(f"Une erreur s'est produite lors de l'exécution du script : {str(e)}")

Using device: cuda
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Colonnes dans X_train: Index(['Unnamed: 0', 'designation', 'description', 'productid', 'imageid'], dtype='object')
Colonnes dans Y_train: Index(['Unnamed: 0', 'prdtypecode'], dtype='object')
Attention: 'productid' n'est pas présent dans les dataframes. On suppose qu'ils sont déjà alignés.
Vectorisation TF-IDF...




Réduction de dimensionnalité avec TruncatedSVD...
Standardisation des données...
Entraînement du modèle SVM...

Résultats pour SVM:
Temps d'entraînement: 17.35 secondes
Accuracy d'entraînement: 0.7516
F1 Score d'entraînement: 0.7543
Accuracy de validation: 0.7350
F1 Score de validation: 0.7377

Rapport de classification (validation):
              precision    recall  f1-score   support

           0       0.31      0.61      0.41       623
           1       0.68      0.54      0.60       502
           2       0.73      0.67      0.70       336
           3       0.86      0.78      0.82       166
           4       0.70      0.67      0.69       534
           5       0.86      0.83      0.84       791
           6       0.54      0.38      0.44       153
           7       0.60      0.52      0.56       974
           8       0.53      0.41      0.47       414
           9       0.80      0.86      0.83      1009
          10       0.90      0.88      0.89       161
          11   

Parameters: { "use_label_encoder" } are not used.




Résultats pour XGBoost:
Temps d'entraînement: 40.60 secondes
Accuracy d'entraînement: 0.9741
F1 Score d'entraînement: 0.9750
Accuracy de validation: 0.7561
F1 Score de validation: 0.7567

Rapport de classification (validation):
              precision    recall  f1-score   support

           0       0.36      0.50      0.42       623
           1       0.67      0.57      0.62       502
           2       0.80      0.73      0.76       336
           3       0.96      0.77      0.86       166
           4       0.72      0.70      0.71       534
           5       0.89      0.86      0.88       791
           6       0.70      0.38      0.49       153
           7       0.55      0.57      0.56       974
           8       0.53      0.44      0.48       414
           9       0.80      0.87      0.83      1009
          10       0.94      0.81      0.87       161
          11       0.74      0.64      0.68       498
          12       0.71      0.69      0.70       648
          13  

<Figure size 1200x800 with 0 Axes>