# Data loading

Data download from Kaggle : https://www.kaggle.com/datasets/rajnathpatel/multilingual-spam-data?resource=download

⚠️ Note :  

The french mails of this dataset were obtened with automatic translation.  
We use this datatset because we didn't find similar data in native French.  
However the quality is sufficient for this project not intended for production.

In [2]:
import pandas as pd
data_path = "../data/data-en-hi-de-fr.csv"

df = pd.read_csv(data_path)[['text_fr', 'labels']]
df = df.rename(columns={"text_fr": "text", "labels": "label"})
df = df.drop_duplicates(subset=["text"])
df = df.replace({"label": {"spam": 1, "ham": 0}})

In [3]:
print(df.shape)
df[df.label == 1].sample(10)

(5134, 2)


Unnamed: 0,text,label
4068,Vous êtes contacté par notre service Rencontre...,1
4061,"Ces semaines, les offres de membres SavaMob so...",1
4436,Ne pas b floppy... b snappy & happy! Seul le s...,1
579,"notre numéro mobile a gagné £5000, pour réclam...",1
3132,LookAtMe!: Merci pour votre achat d'un clip vi...,1
831,U ont un admirateur secret. REVEAL qui pense U...,1
358,Le solde de trésorerie de l'Ur est actuellemen...,1
2941,Vous avez 1 nouveau message. S'il vous plaît a...,1
4968,"Vous pouvez faire un don de 2,50 £ au Fonds as...",1
3885,Un lien vers votre photo a été envoyé. Vous po...,1


## Data splitting

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.text.values,
    df.label.values,
    test_size=0.3,
    random_state=123,
    stratify=df.label.values
)

print(X_train.shape)
print(X_test.shape)


(3593,)
(1541,)


# Model training

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

model = Pipeline([
    ('vect',
         FeatureUnion([
             ('vect_1', TfidfVectorizer(
                analyzer='word',
                token_pattern=r'(?u)\b\w+\b',
                lowercase=True,
                ngram_range=(1, 3))),
             ('vect_2', TfidfVectorizer(
                analyzer='char_wb',
                lowercase=True,
                ngram_range=(2, 3))),
         ]),
    ),  
    ('clf',
        CalibratedClassifierCV(
            SGDClassifier(loss="hinge"),
            # LinearSVC(),
        ))
    ])

params = {
    'clf__method': ["sigmoid", "isotonic"],
    'clf__base_estimator__class_weight': [None, "balanced"],
    'clf__base_estimator__penalty': ["l2", "l1", "elasticnet"]
    }

gs = GridSearchCV(model, params, scoring="f1_macro")
model = gs.fit(X_train, y_train).best_estimator_


print("Best config :")
print(model)

print()
print("Scores :")
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

Best config :
Pipeline(steps=[('vect',
                 FeatureUnion(transformer_list=[('vect_1',
                                                 TfidfVectorizer(ngram_range=(1,
                                                                              3),
                                                                 token_pattern='(?u)\\b\\w+\\b')),
                                                ('vect_2',
                                                 TfidfVectorizer(analyzer='char_wb',
                                                                 ngram_range=(2,
                                                                              3)))])),
                ('clf',
                 CalibratedClassifierCV(base_estimator=SGDClassifier()))])

Scores :
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1349
           1       0.98      0.91      0.94       192

    accuracy                           0.99      1541
 

# Model testing

In [9]:
import numpy as np
from pprint import pprint


features_names = model.named_steps["vect"].get_feature_names_out()

def get_scores(feats, features_names, n_feats=10):
    idx = np.nonzero(feats)[1]
    names = features_names[idx]
    coefs = feats.toarray()[0][idx]
    scores = {name: round(coef, 3) for name, coef in zip(names, coefs)}
    scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:n_feats]
    return scores

text = """
Bonjour à toutes et à tous,

Étant désormais officiellement positive au covid, la séance de cette
semaine se déroulera malheureusement une nouvelle fois en visio. Les
informations de connexion seront mises en ligne sur la page iCampus du
cours juste avant le cours.

Bien cordialement
"""

feats = model.named_steps["vect"].transform([text])
proba = model.named_steps["clf"].predict_proba(feats)[0]
pred = 1 if proba[1] > proba[0] else 0
score = round(proba[pred] * 100, 1)

print("SPAM !!!!" if pred else "OK")
print(f"confidence: {score} %")

if pred:
    print()
    scores = get_scores(feats, features_names, n_feats=20)
    for name, coef in scores:
        print(f"{name}\t\t{coef}")

OK
confidence: 79.3 %


# Model saving

In [10]:
from time import gmtime, strftime
import os
import joblib

model_to_save = model.fit(df.text.values, df.label.values)

def save(
    model, prefix="", directory='../app/ml_models'
):
    current_time = strftime("%Y%m%d-%H%M%S", gmtime())
    dir_name = f"{prefix}-{current_time}"
    dir_path = os.path.join(directory, dir_name)
    os.makedirs(dir_path, exist_ok=True)
    joblib.dump(model, os.path.join(dir_path, "model.joblib"))

save(model_to_save, prefix="anti-spam")