In [46]:
import pandas as pd
from sklearn.model_selection import  GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report , accuracy_score




In [42]:
# Load the training and held-out test data.
# Paths use forward slashes so they resolve on Windows, Linux and macOS alike;
# backslash-separated literals ('..\\data\\...') only resolve on Windows.
df = pd.read_csv('../data/processed/train_optimized_v3.csv')

# NOTE(review): the test file is read from data/raw and its "text" column is used
# as-is for "text_cleaned", while the training file is read from data/processed.
# If the training text went through a cleaning step, the same step should be
# applied here so both sets are vectorized consistently — TODO confirm.
# .assign adds the column without mutating the loaded frame in place, which keeps
# this cell idempotent when it is re-run.
df_test = pd.read_csv('../data/raw/test_tweets.csv').assign(
    text_cleaned=lambda frame: frame["text"]
)

# read_csv parses empty fields as NaN and TfidfVectorizer rejects NaN documents,
# so missing text is replaced by an empty string. "keyword" is left untouched.
X = df[["text_cleaned", "keyword"]].fillna({"text_cleaned": ""})
y = df['target']
X_test = df_test[["text_cleaned", "keyword"]].fillna({"text_cleaned": ""})
y_test = df_test['target']



In [43]:
# Feature extraction: TF-IDF on the tweet text, one-hot encoding on the keyword.
# The text column is selected by name (string) so the vectorizer receives a 1-D
# series of documents; the keyword column is selected as a list so the encoder
# receives a 2-D frame.
text_step = ("tfidf", TfidfVectorizer(), "text_cleaned")
keyword_step = (
    "ohe",
    OneHotEncoder(handle_unknown="ignore", sparse_output=True),
    ["keyword"],
)
preprocessor = ColumnTransformer(transformers=[text_step, keyword_step])

# Full pipeline: feature extraction followed by a logistic-regression classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)

# Hyperparameter grid to search. Keys follow the "<step>__<param>" convention,
# so they must match the step names used above.
param_grid = {
    "preprocessing__tfidf__ngram_range": [(1, 1), (1, 2)],
    "classifier__C": [0.1, 1, 10],
    "classifier__penalty": ["l2"],
}




In [44]:
# Exhaustive search over param_grid with 3-fold cross-validation, scored on
# accuracy and parallelized across all available cores (n_jobs=-1).
# fit() returns the fitted search object, so construction and fitting are chained.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
).fit(X, y)

# Report the best parameter combination and its mean cross-validated score.
print("Meilleurs paramètres :", grid.best_params_)
print("Score de validation :", grid.best_score_)

Meilleurs paramètres : {'classifier__C': 1, 'classifier__penalty': 'l2', 'preprocessing__tfidf__ngram_range': (1, 1)}
Score de validation : 0.7898863818210914


### Maintenant que nous avons les hyperparamètres optimaux (accuracy de validation croisée d'environ 0,79), nous allons récupérer le modèle de régression logistique ré-entraîné avec ces paramètres et l'évaluer sur le jeu de test

In [47]:
# Retrieve the pipeline that the grid search refitted with the best
# hyperparameters, then evaluate it on the held-out test set.
model = grid.best_estimator_
y_pred = model.predict(X_test)

# Overall accuracy on the test set.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy sur le test : {acc:.4f}")

# Per-class precision / recall / F1, printed on the line after its header.
print(
    "Rapport de classification :",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

Accuracy sur le test : 0.7995
Rapport de classification :
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       435
           1       0.85      0.65      0.74       328

    accuracy                           0.80       763
   macro avg       0.81      0.78      0.79       763
weighted avg       0.81      0.80      0.79       763

