In [15]:
import pandas as pd
from sklearn.model_selection import  GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report , accuracy_score , confusion_matrix




In [3]:
# Charger les données
df = pd.read_csv('..\\data\\processed\\train_optimized_v3.csv')

df_test = pd.read_csv('..\\data\\raw\\test_tweets.csv')
df_test["text_cleaned"] = df_test["text"] 

X = df[["text_cleaned", "keyword"]]
y = df['target']
X_test = df_test[["text_cleaned", "keyword"]]
y_test = df_test['target']



In [4]:
# Prétraitement : Tfidf sur text, OneHot sur keyword
preprocessor = ColumnTransformer([
    ("tfidf", TfidfVectorizer(), "text_cleaned"),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=True), ["keyword"])
])

# Pipeline complet
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Grille d’hyperparamètres à tester
param_grid = {
    "preprocessing__tfidf__ngram_range": [(1,1), (1,2)],
    "classifier__C": [0.1, 1, 10],
    "classifier__penalty": ["l2"]
}




In [5]:
# GridSearchCV
grid = GridSearchCV(pipeline, param_grid, cv=3, scoring="accuracy", n_jobs=-1)
grid.fit(X, y)

# Résultats
print("Meilleurs paramètres :", grid.best_params_)
print("Score de validation :", grid.best_score_)

Meilleurs paramètres : {'classifier__C': 1, 'classifier__penalty': 'l2', 'preprocessing__tfidf__ngram_range': (1, 1)}
Score de validation : 0.7898863818210914


### Maintenant que nous avons les paramètres les plus optimisés ( accuracy de 0,78), nous allons entrainer le modèle de regression logistique 

In [51]:
#création du model entrainé avec les meilleurs hyperparamètres 

model = grid.best_estimator_

y_pred = model.predict(X_test)
y_pred2 = (model.predict_proba(X_test)[:,1] >= 0.25).astype(int)

acc = accuracy_score(y_test, y_pred)
acc2 = accuracy_score(y_test, y_pred)
print(f"Accuracy sur le test : {acc:.4f}")

print("Rapport de classification :")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))  
print(f"Accuracy sur le test : {acc2:.4f}")
print(classification_report(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2, labels=[0, 1]))  

Accuracy sur le test : 0.7995
Rapport de classification :
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       435
           1       0.85      0.65      0.74       328

    accuracy                           0.80       763
   macro avg       0.81      0.78      0.79       763
weighted avg       0.81      0.80      0.79       763

Confusion Matrix:
[[396  39]
 [114 214]]
Accuracy sur le test : 0.7995
              precision    recall  f1-score   support

           0       0.87      0.62      0.72       435
           1       0.63      0.88      0.74       328

    accuracy                           0.73       763
   macro avg       0.75      0.75      0.73       763
weighted avg       0.77      0.73      0.73       763

[[269 166]
 [ 40 288]]


In [47]:
text = "tsunami destroyed everything in the city stade" 
keyword = "destroyed" 

test_tweet = pd.DataFrame({
    "text_cleaned": [text],
    "keyword": [keyword]
})
prediction = (model.predict_proba(X_test)[:,1] >= 0.25).astype(int)
# prediction = model.predict(test_tweet)
if prediction[0] == 1:
    print("Le tweet est une catastrophe naturelle")
else:
    print("Le tweet n'est pas une catastrophe naturelle")

Le tweet est une catastrophe naturelle
