In [4]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Let displayed text columns show up to 120 characters before pandas truncates them.
pd.options.display.max_colwidth = 120

In [8]:
# Remote tab-separated file, read without a header row: the two columns are
# named explicitly as the class label and the message text.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
column_names = ["label", "message"]
df = pd.read_csv(url, names=column_names, header=None, sep="\t")

# Preview the first rows.
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std t...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [9]:
# Sanity checks on the loaded frame: dimensions first, then class balance.
frame_shape = df.shape
print(frame_shape)

label_counts = df["label"].value_counts()
print(label_counts)

# Missing values per column (shown as the cell's result).
df.isna().sum()

(5572, 2)
label
ham     4825
spam     747
Name: count, dtype: int64


label      0
message    0
dtype: int64

In [10]:
# Encode the string labels as integers for the classifier: ham -> 0, spam -> 1.
label_to_target = {"ham": 0, "spam": 1}
df["target"] = df["label"].map(label_to_target)

# Series.map leaves NaN for any label missing from the mapping; fail here with
# a clear message instead of later inside the stratified split.
unmapped = df.loc[df["target"].isna(), "label"].unique()
assert len(unmapped) == 0, f"unexpected label values: {unmapped.tolist()}"

x = df["message"]
y = df["target"]

# Hold out 20% of the rows for testing. stratify=y keeps the spam share equal
# in both parts, and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Size of each part, and the spam fraction in each (mean of the 0/1 target).
len(x_train), len(x_test), y_train.mean().round(3), y_test.mean().round(3)


(4457, 1115, np.float64(0.134), np.float64(0.134))

In [11]:
# Two-step pipeline: token counts from the raw text, then multinomial Naive Bayes.
vectorizer = CountVectorizer()
classifier = MultinomialNB()
model = make_pipeline(vectorizer, classifier)

# Fit both steps on the training messages only.
model.fit(x_train, y_train)


In [12]:
# Score the fitted pipeline on the held-out messages.
y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# ".3f" prints three decimals. The previous spec "3f" only set a minimum width
# of 3 and left the precision at the default six digits.
print(f"accuracy : {acc:.3f}")
print("\nMatrice de confusion (lignes = vrai, colonnes = predit) :\n", cm)
# target_names are applied in sorted class order, i.e. 0 then 1.
print("\nRapport de classification :\n", classification_report(y_test, y_pred, target_names=["ham","spam"]))

accuracy : 0.987444

Matrice de confusion (lignes = vrai, colonnes = predit) :
 [[964   2]
 [ 12 137]]

Rapport de classification :
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       0.99      0.92      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [16]:
# Hand-written messages used as a quick smoke test of the fitted pipeline.
# NOTE(review): these are French, while the rows previewed from the training
# file are English. CountVectorizer ignores tokens it did not see during fit,
# so these predictions may rest on very few known words - confirm the intent.
exemples = [
    "Vous avez gagné 1000€ ! Cliquez ici pour réclamer votre prix : http://bit.ly/xxxx",
    "Salut, on se voit demain à 14h pour le cours de ML ?",
    "URGENT!!! Votre colis est retenu. Payez des frais ici: http://fraude.xyz",
    "Bonjour, Pense à apporter le chargeur stp.",
]

pred = model.predict(exemples)

# Print each message prefixed with its predicted class (1 -> SPAM, otherwise HAM).
for idx, msg in enumerate(exemples):
    tag = "SPAM" if pred[idx] == 1 else "HAM"
    print(f"[{tag}] {msg}")


[SPAM] Vous avez gagné 1000€ ! Cliquez ici pour réclamer votre prix : http://bit.ly/xxxx
[HAM] Salut, on se voit demain à 14h pour le cours de ML ?
[SPAM] URGENT!!! Votre colis est retenu. Payez des frais ici: http://fraude.xyz
[SPAM] Bonjour, Pense à apporter le chargeur stp Cordialement.
