In [None]:
import pickle
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_score

In [None]:
# Load the TF-IDF
with open('tfidf_vectorizer.pkl', 'rb') as file:
    tfidf = pickle.load(file)

# Load the transformed training data
with open('X_train_tfidf.pkl', 'rb') as file:
    X_train_tfidf = pickle.load(file)

# Load the transformed testing data
with open('X_test_tfidf.pkl', 'rb') as file:
    X_test_tfidf = pickle.load(file)

# Load the feature names
with open('features_names_tfidf.pkl', 'rb') as file:
    features_names_tfidf = pickle.load(file)

y_train = pd.read_csv("C:/Users/lzeferino/Documents/GitHub/NLP-Multi-Label-Text-Classification-for-Stack-Overflow-Tag-Prediction/y_train.csv")
y_test = pd.read_csv("C:/Users/lzeferino/Documents/GitHub/NLP-Multi-Label-Text-Classification-for-Stack-Overflow-Tag-Prediction/y_test.csv")

## Approche supervisée classique

In [61]:
def j_score(y_true, y_pred):
    jaccard= np.minimum(y_true, y_pred).sum(axis=1)/np.maximum(y_true, y_pred).sum(axis=1)
    return jaccard.mean() * 100

def print_score(y_pred, clf):
    print('clf:', clf.__class__.__name__)
    print('Jaccard socre: {}'.format(j_score(y_test, y_pred)))
    print('----')

In [None]:
sgd = SGDClassifier()
lr= LogisticRegression(solver= 'lbfgs')
svc = LinearSVC(dual=False, C=1.5, penalty='l1')

for classifier in [sgd, lr, svc]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)
    print_score(y_pred, classifier)
    #print(jaccard_score(y_test, y_pred, average='weighted'))

In [5]:
# Modèles à tester
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "SGDClassifier": SGDClassifier(max_iter=1000),
    "RandomForestClassifier": RandomForestClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss')    
}

# Commencez une nouvelle expérience MLFlow
mlflow.set_experiment("Tag Suggestion Classification")

input_example = X_train_tfidf[0:1]

for model_name, model in models.items():
    with mlflow.start_run(run_name=f"{model_name}_OneVsRest"):
        
        # Construction du pipeline pour appliquer OneVsRestClassifier avec le modèle
        pipeline = Pipeline([
            ('classifier', OneVsRestClassifier(model))
        ])

        # Entraînement du modèle
        pipeline.fit(X_train_tfidf, y_train)

        # Prédiction sur le jeu de test
        y_pred = pipeline.predict(X_test_tfidf)

        # Calcul du score Jaccard
        jaccard = jaccard_score(y_test, y_pred, average='samples')

        # Logging dans MLFlow
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("jaccard_score", jaccard)

        # Enregistrement du modèle dans MLFlow       
        mlflow.sklearn.log_model(pipeline, artifact_path=f"models/{model_name}", input_example=input_example)

        print(f"Model {model_name} - Jaccard Score: {jaccard}")


  _warn_prf(average, modifier, msg_start, len(result))


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model LogisticRegression - Jaccard Score: 0.2831666666666666


  _warn_prf(average, modifier, msg_start, len(result))


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model SGDClassifier - Jaccard Score: 0.37403333333333333


In [None]:
# Analyse de stabilité mensuelle
for month in range(1, 13):
    with mlflow.start_run(run_name=f"Stability_Test_Month_{month}"):
        
        # Partition du dataset pour chaque mois
        X_test_month, _, y_test_month, _ = train_test_split(X_test_tfidf, y_test, test_size=0.8, random_state=month)

        # Prédiction avec `predict_with_proba`
        y_pred_month = predict_with_proba(pipeline, X_test_month)

        # Calcul du jaccard score pour le mois spécifique
        jaccard_month = jaccard_score(y_test_month, y_pred_month, average='samples')

        # Logging des résultats
        mlflow.log_param("month", month)
        mlflow.log_metric("jaccard_score_monthly", jaccard_month)
        
        print(f"Month {month} - Jaccard Score: {jaccard_month}")