In [13]:
# 1. Load the data set, keeping only the first two CSV columns.
# 2. Rename the columns so they are easier to work with.
# 3. Display the head of the data.

import pandas as pd

# Location of the raw CSV; edit this single constant when the file moves.
DATA_PATH = r'D:\eziline\spam.csv'

# Select the first two columns by position instead of dropping
# 'Unnamed: 2' .. 'Unnamed: 4' by name afterwards: the resulting frame is
# the same when those trailing columns exist, and the load no longer raises
# a KeyError on a copy of the file that lacks them.
df = pd.read_csv(DATA_PATH, encoding='latin1', usecols=[0, 1])

df.columns = ['label', 'message']

print(df.head())


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [14]:
# 1. Report how many missing values each column contains.
# 2. Drop every row that has a missing value and preview the cleaned data.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# axis=0 / how='any' are the dropna() defaults, spelled out for the reader.
df = df.dropna(axis=0, how='any')

print(df.head())


label      0
message    0
dtype: int64
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [9]:
# 1. Import the feature-extraction and label-encoding helpers.
# 2. Build TF-IDF (term frequency-inverse document frequency) features, which
#    give more weight to terms that are distinctive for a message.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Cap the vocabulary at 1000 terms.
vectorizer = TfidfVectorizer(max_features=1000)

# NOTE(review): the vectorizer is fitted on every message before the
# train/test split happens, so the vocabulary and IDF weights also see the
# messages that later end up in the test set. Fit on the training messages
# only if a leak-free evaluation is required.
tfidf_matrix = vectorizer.fit_transform(df['message'])
X = tfidf_matrix.toarray()  # dense copy of the TF-IDF matrix

# Encode the string labels as integers.
le = LabelEncoder()
y = le.fit_transform(df['label'])

print(X.shape, y.shape)


(5572, 1000) (5572,)


In [10]:
# 1. Import the splitting helper.
# 2. Split the data into 70% training data and 30% test data.
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so the split is the same on every run.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Shapes in the order: X_train, X_test, y_train, y_test.
print(*(part.shape for part in split_parts))


(3900, 1000) (1672, 1000) (3900,) (1672,)


In [15]:
# 1. Import the classifier and the evaluation metrics.
# 2. Train the model and predict labels for the test set.
# 3. Report the accuracy score and the confusion matrix.
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# fit() returns the fitted estimator, so construction and training chain.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Compute both metrics first, then print them.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')
print('Confusion Matrix:\n', conf_matrix)


Accuracy: 97.49%
Confusion Matrix:
 [[1449    4]
 [  38  181]]


In [17]:
# 1. Import and initialize the models to compare.
# 2. Train each model and report its accuracy on the test set.

# accuracy_score and MultinomialNB are imported here as well so this cell
# does not depend on names leaked from an earlier cell.
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Initialize models
models = {
    'MultinomialNB': MultinomialNB(),
    'GaussianNB': GaussianNB(),
    # Seeded: the tree permutes features at every split, so without a fixed
    # random_state its accuracy can differ from one run to the next.
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'SVC': SVC()
}

# NOTE: the loop rebinds the notebook-level names `model`, `y_pred` and
# `accuracy`; after this cell they refer to the last entry in `models`.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name} Accuracy: {accuracy * 100:.2f}%')


MultinomialNB Accuracy: 97.49%
GaussianNB Accuracy: 80.92%
DecisionTree Accuracy: 96.77%
SVC Accuracy: 97.97%
