## Import des bibliothèques 

In [8]:
import pandas as pd

import optuna
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Télécharger les ressources NLTK nécessaires
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/neilmarteau/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/neilmarteau/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/neilmarteau/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Chargement des dataframe pour le projet

In [2]:
x_train = pd.read_csv(r"./data/x_train.csv")
y_train = pd.read_csv(r"./data/y_train.csv")
x_test = pd.read_csv(r"./data/x_test.csv")

x_train = x_train.set_index('ID')
y_train = y_train.set_index('ID')
x_test = x_test.set_index('ID')

train = pd.concat([x_train, y_train], axis=1)
data = pd.concat([train, x_test], axis=0)

### Début de l'analyse des dataframes

In [3]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)

(8028, 1)
(8028, 1)
(2035, 1)


# Pré-traitements des données

### Pre-processing : 

In [3]:
def preprocess_text(text):
    # Convertir le texte en minuscules
    text = text.lower()
    
    # Supprimer les balises HTML (le cas échéant)
    text = re.sub(r'<.*?>', '', text)
    
    # Supprimer la ponctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenization (division en mots)
    tokens = nltk.word_tokenize(text)
    
    # Supprimer les mots vides
    stop_words = set(stopwords.words('french'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Rejoindre les mots traités en une seule chaîne de caractères
    processed_text = ' '.join(tokens)
    
    return processed_text

# Exemple d'utilisation
data.question = data.question.apply(preprocess_text)

### Vectorizer : 

In [4]:
vectorizers = {
    "HashingVectorizer": HashingVectorizer(),
    "TfidfVectorizer": TfidfVectorizer(),
    "CountVectorizer": CountVectorizer()
}

param_grids = {
    "HashingVectorizer": {'vectorizer__n_features': [1000, 5000, 10000]},
    "TfidfVectorizer": {'vectorizer__max_features': [5000, 10000, 15000]},
    "CountVectorizer": {'vectorizer__max_features': [5000, 10000, 15000]}
}

logistic_regression_model = LogisticRegression(max_iter=1000)

for vec_name, vectorizer in vectorizers.items():
    print(f"Vectorizing using {vec_name}...")
    
    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', logistic_regression_model)
    ])
    
    grid_search = GridSearchCV(pipeline, param_grids[vec_name], cv=5, scoring='accuracy')
    grid_search.fit(x_train['question'], y_train)
    
    print("Best parameters found:")
    print(grid_search.best_params_)
    print("Best accuracy found:", grid_search.best_score_)


Vectorizing using HashingVectorizer...
Best parameters found:
{'vectorizer__n_features': 10000}
Best accuracy found: 0.5546844194085264
Vectorizing using TfidfVectorizer...
Best parameters found:
{'vectorizer__max_features': 5000}
Best accuracy found: 0.5858274461423865
Vectorizing using CountVectorizer...
Best parameters found:
{'vectorizer__max_features': 10000}
Best accuracy found: 0.6438727047714373


In [4]:
#vectorizer = CountVectorizer(max_features=10000)
#vectorizer.fit(data['question'])

# Initialize the HashingVectorizer
vectorizer = HashingVectorizer(n_features=5000)  # You can adjust the number of features (n_features) as needed
#vectorizer.fit(data['question'])

#vectorizer = TfidfVectorizer(max_features=10000)  # You can adjust the number of max features as needed
vectorizer.fit(data['question'])

# Transform x_train and x_test using the same vectorizer
x_train_transformed = vectorizer.transform(x_train['question'])
x_test_transformed = vectorizer.transform(x_test['question'])


# Etapes de modélisations : 


In [5]:
# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(x_train_transformed, y_train, test_size=0.2, random_state=42)

### Modèles moins performants :

In [8]:
models = {
    "SVC": SVC(),
    "MultinomialNB": MultinomialNB(),
    "RandomForest": RandomForestClassifier()
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_val_pred)
    print(f"{model_name} Accuracy:", accuracy)


SVC Accuracy: 0.5417185554171855
MultinomialNB Accuracy: 0.3026151930261519
RandomForest Accuracy: 0.5859277708592777


### Meilleurs modèles : 

In [7]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "MLPClassifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000),
}

for model_name, model in models.items():

    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_val_pred)
    print(f"{model_name} Accuracy:", accuracy)

LogisticRegression Accuracy: 0.5616438356164384
MLPClassifier Accuracy: 0.6257783312577833


### Optimisation des modèles avec Optuna : 

In [18]:
# Définir la fonction objectif
def objectif(trial):
    # Définir les paramètres à optimiser
    C = trial.suggest_loguniform('C', 1e-5, 100)
    penalty = trial.suggest_categorical('penalty', ['l2'])

    # Initialiser le modèle de Régression Logistique avec les paramètres suggérés
    modele_regression_logistique = LogisticRegression(C=C, penalty=penalty, max_iter=1000)

    # Entraîner le modèle sur les données d'entraînement
    modele_regression_logistique.fit(X_train, y_train)

    # Prédictions sur l'ensemble de validation
    y_val_pred = modele_regression_logistique.predict(X_test)

    # Calculer la précision
    precision = accuracy_score(y_test, y_val_pred)

    return precision

# Créer un objet d'étude et optimiser la fonction objectif
etude = optuna.create_study(direction='maximize')
etude.optimize(objectif, n_trials=40, show_progress_bar=True)

# Afficher les meilleurs paramètres et la meilleure précision
print("Meilleurs paramètres trouvés:", etude.best_params)
print("Meilleure précision trouvée:", etude.best_value)


[I 2024-04-02 13:04:33,998] A new study created in memory with name: no-name-7683150e-9a82-4a5e-9618-226e9098422d


  0%|          | 0/40 [00:00<?, ?it/s]

[I 2024-04-02 13:04:40,633] Trial 0 finished with value: 0.6419676214196762 and parameters: {'C': 16.30548135294488, 'penalty': 'l2'}. Best is trial 0 with value: 0.6419676214196762.
[I 2024-04-02 13:04:41,905] Trial 1 finished with value: 0.23785803237858033 and parameters: {'C': 0.006193271041066651, 'penalty': 'l2'}. Best is trial 0 with value: 0.6419676214196762.
[I 2024-04-02 13:04:45,878] Trial 2 finished with value: 0.23785803237858033 and parameters: {'C': 4.044468213486067e-05, 'penalty': 'l2'}. Best is trial 0 with value: 0.6419676214196762.
[I 2024-04-02 13:04:47,047] Trial 3 finished with value: 0.3094645080946451 and parameters: {'C': 0.06314054027410404, 'penalty': 'l2'}. Best is trial 0 with value: 0.6419676214196762.
[I 2024-04-02 13:04:50,699] Trial 4 finished with value: 0.23785803237858033 and parameters: {'C': 4.929834370134826e-05, 'penalty': 'l2'}. Best is trial 0 with value: 0.6419676214196762.
[I 2024-04-02 13:04:57,178] Trial 5 finished with value: 0.2378580323

In [7]:
# Définir la fonction objectif
def objectif(trial):
    # Définir les paramètres à optimiser
    hidden_layer_sizes = trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (200,)])
    alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
    
    # Initialiser le modèle de réseau de neurones avec les paramètres suggérés
    modele_reseau_neurones = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, alpha=alpha, max_iter=1000)
    
    # Entraîner le modèle sur les données d'entraînement
    modele_reseau_neurones.fit(X_train, y_train)
    
    # Prédictions sur l'ensemble de validation
    y_val_pred = modele_reseau_neurones.predict(X_test)
    
    # Calculer la précision
    precision = accuracy_score(y_test, y_val_pred)
    
    return precision

# Diviser les données en ensembles d'entraînement et de validation
# X_train, X_val, y_train, y_val = train_test_split(x_train_transformed, y_train, test_size=0.2, random_state=42)

# Créer un objet d'étude et optimiser la fonction objectif
etude = optuna.create_study(direction='maximize')
etude.optimize(objectif, n_trials=25, show_progress_bar=True)

# Afficher les meilleurs paramètres et la meilleure précision
print("Meilleurs paramètres trouvés:", etude.best_params)
print("Meilleure précision trouvée:", etude.best_value)

[I 2024-03-27 20:09:51,949] A new study created in memory with name: no-name-42fe31fa-087e-40e5-9c2b-388983946de7


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-03-27 20:11:45,939] Trial 0 finished with value: 0.6500622665006227 and parameters: {'hidden_layer_sizes': (200,), 'alpha': 6.86597408288579e-05}. Best is trial 0 with value: 0.6500622665006227.
[I 2024-03-27 20:12:53,231] Trial 1 finished with value: 0.6469489414694894 and parameters: {'hidden_layer_sizes': (100,), 'alpha': 5.0357153648691513e-05}. Best is trial 0 with value: 0.6500622665006227.
[I 2024-03-27 20:14:21,232] Trial 2 finished with value: 0.6625155666251556 and parameters: {'hidden_layer_sizes': (100,), 'alpha': 0.001026569834824792}. Best is trial 2 with value: 0.6625155666251556.
[I 2024-03-27 20:28:37,643] Trial 3 finished with value: 0.6531755915317559 and parameters: {'hidden_layer_sizes': (50,), 'alpha': 0.00021235175310644282}. Best is trial 2 with value: 0.6625155666251556.
[I 2024-03-27 20:38:08,973] Trial 4 finished with value: 0.6432129514321295 and parameters: {'hidden_layer_sizes': (100,), 'alpha': 1.1301390310065813e-05}. Best is trial 2 with value: 

### Modèle final : 

In [21]:
models = {
    
    "MLPClassifier": MLPClassifier(hidden_layer_sizes=(50,), alpha=0.09158614968227348, max_iter=1000),
    "LogisticRegression": LogisticRegression(C=25.791116488879055, max_iter=1000)
    
}

# Loop through each model
for model_name, model in models.items():
    # Train the model on the training data
    model.fit(X_train, y_train)

    # Predictions on the validation set
    y_val_pred = model.predict(X_test)

    # Evaluate the performance of the model
    accuracy = accuracy_score(y_test, y_val_pred)
    print(f"{model_name} Accuracy:", accuracy)

MLPClassifier Accuracy: 0.6407222914072229
LogisticRegression Accuracy: 0.6469489414694894


In [22]:
model

# Submissions : 

In [11]:
y_pred = model.predict(x_test_transformed)

In [12]:
submission = pd.DataFrame({"ID" : list(x_test.index), "intention" : y_pred})
submission.set_index("ID", inplace=True)
submission.to_csv("Submission_LR.csv")