In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [5]:

# Load the SMS dataset from disk.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
data = pd.read_csv("E:/dataset/SMSCollection.csv", sep=',')

# Show the column names so the selections below can be checked against the file.
print(data.columns)

# "sms" is used as the message text and "Class" as the label to predict.
x, y = data["sms"], data["Class"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

Index(['Class', 'sms'], dtype='object')


In [6]:
# Missing-value check: print a heading, then display the number of missing
# entries in each column (axis=0 sums down the rows, one total per column).
print('savoir le nombre de valeurs manquantes')
data.isna().sum(axis=0)

savoir le nombre de valeurs manquantes


Class    0
sms      0
dtype: int64

In [7]:
# TF-IDF features (originally noted as the input intended for the Gaussian model).
# The vectorizer is fitted on the training texts only; the already-fitted
# vectorizer is then reused to transform the test texts.
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf= tfidf_vect.transform(X_test)

In [9]:
# Word-frequency (count) features (originally noted as the input intended for
# the multinomial model).
# The vectorizer is fitted on the training texts only; the already-fitted
# vectorizer is then reused to transform the test texts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts= count_vect.transform(X_test)


In [10]:
# Word-presence features (originally noted as the input intended for the
# Bernoulli model): the count vectorizer is created with binary=True.
# The vectorizer is fitted on the training texts only; the already-fitted
# vectorizer is then reused to transform the test texts.
binary_vect = CountVectorizer(binary=True)
X_train_binary = binary_vect.fit_transform(X_train)
X_test_binary= binary_vect.transform(X_test)

In [11]:
# Multinomial naive Bayes trained on the TF-IDF features.
clf_mnb = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out test messages.
y_pred_mnb = clf_mnb.predict(X_test_tfidf)

# Compute each evaluation metric on the test set and print it right away.
acc_mnb = accuracy_score(y_true=y_test, y_pred=y_pred_mnb)
print("Accuracy MultinomialNB:", acc_mnb)

cm_mnb = confusion_matrix(y_true=y_test, y_pred=y_pred_mnb)
print("Confusion Matrix MultinomialNB:\n", cm_mnb)

cr_mnb = classification_report(y_true=y_test, y_pred=y_pred_mnb)
print("Classification Report MultinomialNB:\n", cr_mnb)

Accuracy MultinomialNB: 0.9668161434977578
Confusion Matrix MultinomialNB:
 [[966   0]
 [ 37 112]]
Classification Report MultinomialNB:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.75      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [12]:
# Multinomial naive Bayes trained on the word-count features.
clf_mnb = MultinomialNB().fit(X_train_counts, y_train)

# Predict labels for the held-out test messages.
y_pred_mnb = clf_mnb.predict(X_test_counts)

# Compute each evaluation metric on the test set and print it right away.
acc_mnb = accuracy_score(y_true=y_test, y_pred=y_pred_mnb)
print("Accuracy MultinomialNB:", acc_mnb)

cm_mnb = confusion_matrix(y_true=y_test, y_pred=y_pred_mnb)
print("Confusion Matrix MultinomialNB:\n", cm_mnb)

cr_mnb = classification_report(y_true=y_test, y_pred=y_pred_mnb)
print("Classification Report MultinomialNB:\n", cr_mnb)

Accuracy MultinomialNB: 0.9919282511210762
Confusion Matrix MultinomialNB:
 [[966   0]
 [  9 140]]
Classification Report MultinomialNB:
               precision    recall  f1-score   support

         ham       0.99      1.00      1.00       966
        spam       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [13]:
# Multinomial naive Bayes trained on the word-presence (binary) features.
clf_mnb = MultinomialNB().fit(X_train_binary, y_train)

# Predict labels for the held-out test messages.
y_pred_mnb = clf_mnb.predict(X_test_binary)

# Compute each evaluation metric on the test set and print it right away.
acc_mnb = accuracy_score(y_true=y_test, y_pred=y_pred_mnb)
print("Accuracy MultinomialNB:", acc_mnb)

cm_mnb = confusion_matrix(y_true=y_test, y_pred=y_pred_mnb)
print("Confusion Matrix MultinomialNB:\n", cm_mnb)

cr_mnb = classification_report(y_true=y_test, y_pred=y_pred_mnb)
print("Classification Report MultinomialNB:\n", cr_mnb)

Accuracy MultinomialNB: 0.9910313901345291
Confusion Matrix MultinomialNB:
 [[966   0]
 [ 10 139]]
Classification Report MultinomialNB:
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       1.00      0.93      0.97       149

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [14]:

# Gaussian naive Bayes trained on the TF-IDF features.
# The feature matrices are converted with .toarray() before being handed to
# the model, for both training and prediction.
clf_gnb = GaussianNB().fit(X_train_tfidf.toarray(), y_train)

# Predict labels for the held-out test messages.
y_pred_gnb = clf_gnb.predict(X_test_tfidf.toarray())

# Compute each evaluation metric on the test set and print it right away.
acc_gnb = accuracy_score(y_true=y_test, y_pred=y_pred_gnb)
print("Accuracy GaussianNB:", acc_gnb)

cm_gnb = confusion_matrix(y_true=y_test, y_pred=y_pred_gnb)
print("Confusion Matrix GaussianNB:\n", cm_gnb)

cr_gnb = classification_report(y_true=y_test, y_pred=y_pred_gnb)
print("Classification Report GaussianNB:\n", cr_gnb)


Accuracy GaussianNB: 0.9049327354260089
Confusion Matrix GaussianNB:
 [[874  92]
 [ 14 135]]
Classification Report GaussianNB:
               precision    recall  f1-score   support

         ham       0.98      0.90      0.94       966
        spam       0.59      0.91      0.72       149

    accuracy                           0.90      1115
   macro avg       0.79      0.91      0.83      1115
weighted avg       0.93      0.90      0.91      1115



In [15]:
# Gaussian naive Bayes trained on the word-count features.
# The feature matrices are converted with .toarray() before being handed to
# the model, for both training and prediction.
clf_gnb = GaussianNB().fit(X_train_counts.toarray(), y_train)

# Predict labels for the held-out test messages.
y_pred_gnb = clf_gnb.predict(X_test_counts.toarray())

# Compute each evaluation metric on the test set and print it right away.
acc_gnb = accuracy_score(y_true=y_test, y_pred=y_pred_gnb)
print("Accuracy GaussianNB:", acc_gnb)

cm_gnb = confusion_matrix(y_true=y_test, y_pred=y_pred_gnb)
print("Confusion Matrix GaussianNB:\n", cm_gnb)

cr_gnb = classification_report(y_true=y_test, y_pred=y_pred_gnb)
print("Classification Report GaussianNB:\n", cr_gnb)

Accuracy GaussianNB: 0.9067264573991032
Confusion Matrix GaussianNB:
 [[873  93]
 [ 11 138]]
Classification Report GaussianNB:
               precision    recall  f1-score   support

         ham       0.99      0.90      0.94       966
        spam       0.60      0.93      0.73       149

    accuracy                           0.91      1115
   macro avg       0.79      0.91      0.84      1115
weighted avg       0.94      0.91      0.91      1115



In [16]:
# Gaussian naive Bayes trained on the word-presence (binary) features.
# The feature matrices are converted with .toarray() before being handed to
# the model, for both training and prediction.
clf_gnb = GaussianNB().fit(X_train_binary.toarray(), y_train)

# Predict labels for the held-out test messages.
y_pred_gnb = clf_gnb.predict(X_test_binary.toarray())

# Compute each evaluation metric on the test set and print it right away.
acc_gnb = accuracy_score(y_true=y_test, y_pred=y_pred_gnb)
print("Accuracy GaussianNB:", acc_gnb)

cm_gnb = confusion_matrix(y_true=y_test, y_pred=y_pred_gnb)
print("Confusion Matrix GaussianNB:\n", cm_gnb)

cr_gnb = classification_report(y_true=y_test, y_pred=y_pred_gnb)
print("Classification Report GaussianNB:\n", cr_gnb)

Accuracy GaussianNB: 0.9067264573991032
Confusion Matrix GaussianNB:
 [[873  93]
 [ 11 138]]
Classification Report GaussianNB:
               precision    recall  f1-score   support

         ham       0.99      0.90      0.94       966
        spam       0.60      0.93      0.73       149

    accuracy                           0.91      1115
   macro avg       0.79      0.91      0.84      1115
weighted avg       0.94      0.91      0.91      1115



In [17]:

# Bernoulli naive Bayes trained on the TF-IDF features.
# NOTE(review): BernoulliNB binarizes its input by default (binarize=0.0), so
# the TF-IDF weights are presumably reduced to presence/absence here — confirm
# that this is the intended comparison.
clf_bnb = BernoulliNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out test messages.
y_pred_bnb = clf_bnb.predict(X_test_tfidf)

# Compute each evaluation metric on the test set and print it right away.
acc_bnb = accuracy_score(y_true=y_test, y_pred=y_pred_bnb)
print("Accuracy BernoulliNB:", acc_bnb)

cm_bnb = confusion_matrix(y_true=y_test, y_pred=y_pred_bnb)
print("Confusion Matrix BernoulliNB:\n", cm_bnb)

cr_bnb = classification_report(y_true=y_test, y_pred=y_pred_bnb)
print("Classification Report BernoulliNB:\n", cr_bnb)

Accuracy BernoulliNB: 0.9820627802690582
Confusion Matrix BernoulliNB:
 [[966   0]
 [ 20 129]]
Classification Report BernoulliNB:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.87      0.93       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [18]:

# Bernoulli naive Bayes trained on the word-count (frequency) features.
# NOTE(review): BernoulliNB binarizes its input by default (binarize=0.0), so
# the counts are presumably reduced to presence/absence here — confirm that
# this is the intended comparison.
clf_bnb = BernoulliNB().fit(X_train_counts, y_train)

# Predict labels for the held-out test messages.
y_pred_bnb = clf_bnb.predict(X_test_counts)

# Compute each evaluation metric on the test set and print it right away.
acc_bnb = accuracy_score(y_true=y_test, y_pred=y_pred_bnb)
print("Accuracy BernoulliNB:", acc_bnb)

cm_bnb = confusion_matrix(y_true=y_test, y_pred=y_pred_bnb)
print("Confusion Matrix BernoulliNB:\n", cm_bnb)

cr_bnb = classification_report(y_true=y_test, y_pred=y_pred_bnb)
print("Classification Report BernoulliNB:\n", cr_bnb)

Accuracy BernoulliNB: 0.9820627802690582
Confusion Matrix BernoulliNB:
 [[966   0]
 [ 20 129]]
Classification Report BernoulliNB:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.87      0.93       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [19]:

# Bernoulli naive Bayes trained on the word-presence (binary) features.
clf_bnb = BernoulliNB().fit(X_train_binary, y_train)

# Predict labels for the held-out test messages.
y_pred_bnb = clf_bnb.predict(X_test_binary)

# Compute each evaluation metric on the test set and print it right away.
acc_bnb = accuracy_score(y_true=y_test, y_pred=y_pred_bnb)
print("Accuracy BernoulliNB:", acc_bnb)

cm_bnb = confusion_matrix(y_true=y_test, y_pred=y_pred_bnb)
print("Confusion Matrix BernoulliNB:\n", cm_bnb)

cr_bnb = classification_report(y_true=y_test, y_pred=y_pred_bnb)
print("Classification Report BernoulliNB:\n", cr_bnb)

Accuracy BernoulliNB: 0.9820627802690582
Confusion Matrix BernoulliNB:
 [[966   0]
 [ 20 129]]
Classification Report BernoulliNB:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.87      0.93       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

