In [4]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [17]:
# Load the processed train and test CSVs. Forward slashes keep these relative
# paths usable on Windows as well as on POSIX systems (the backslash form only
# resolves on Windows).
df = pd.read_csv('../data/processed/train_optimized.csv')
df_test = pd.read_csv('../data/processed/test_cleaned.csv')

# Training features are every column except 'id' and 'target'; 'target' is the label.
X = df.drop(columns=['id', 'target'])
y = df['target']

# Take an explicit copy of the two feature columns so the assignment below
# writes to an independent frame. Assigning into the bare column selection is
# what raises pandas' SettingWithCopyWarning.
X_test = df_test[['text_cleaned', 'keyword']].copy()
y_test = df_test['target']

# Fill NaN values in 'keyword' column with empty string
X_test['keyword'] = X_test['keyword'].fillna('')

# Kept aside for later analysis
text_data = df['text_cleaned']
metadata = df[['id', 'text_cleaned']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['keyword'] = X_test['keyword'].fillna('')


In [6]:
# Vectorise the two text columns independently: each column gets its own
# TF-IDF transformer, registered under the column's name.
preprocessor = ColumnTransformer(transformers=[
    ('text_cleaned', TfidfVectorizer(), 'text_cleaned'),
    ('keyword', TfidfVectorizer(), 'keyword')
])

# Full model: column-wise TF-IDF features feeding an SVM classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('svm', SVC(probability=True))
])

# Search space. Nested parameters are addressed as
# <pipeline step>__<transformer name>__<parameter>. The pipeline has no step
# called 'tfidf', so the TF-IDF settings must be routed through
# 'preprocessor' -> 'text_cleaned'; otherwise the search fails with an
# invalid-parameter error.
param_grid = {
    'preprocessor__text_cleaned__max_df': [0.9, 1.0],
    'preprocessor__text_cleaned__ngram_range': [(1, 1), (1, 2)],
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf'],
    'svm__class_weight': [None, 'balanced']
}

In [8]:
# Reduced search space: linear kernel only, sweeping the regularisation
# strength C and the class weighting. This replaces any earlier `param_grid`.
param_grid = dict(
    svm__C=[0.1, 1, 10],
    svm__kernel=['linear'],
    svm__class_weight=[None, 'balanced'],
)

# 5-fold cross-validated grid search scored on F1, using all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X, y)

# Report the winning configuration and its mean cross-validated F1.
print("Meilleurs paramètres :", grid.best_params_)
print("Meilleur F1-score :", grid.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Meilleurs paramètres : {'svm__C': 1, 'svm__class_weight': 'balanced', 'svm__kernel': 'linear'}
Meilleur F1-score : 0.7279646815862899


In [20]:


# Evaluate the best pipeline found by the grid search on the test set.
# `classification_report` and `confusion_matrix` are imported in the
# notebook's top import cell rather than in the middle of this one.
model = grid.best_estimator_
y_pred = model.predict(X_test)

# Fraction of test rows whose predicted label equals the true label.
accuracy = (y_pred == y_test).mean()
print(f"Accuracy sur le jeu de test : {accuracy:.4f}")

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Rows are true labels, columns are predicted labels.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy sur le jeu de test : 0.7971
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       430
           1       0.75      0.78      0.77       319

    accuracy                           0.80       749
   macro avg       0.79      0.79      0.79       749
weighted avg       0.80      0.80      0.80       749

Confusion Matrix:
[[349  81]
 [ 71 248]]
