In [9]:
import pandas as pd

# Load the dataset, select relevant columns and rename them
df = pd.read_csv('/spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# Display the first few rows and the info of the dataframe
display(df.head())
display(df.info())

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


None

In [10]:
import string

# Convert text to lowercase
df['message'] = df['message'].str.lower()

# Remove punctuation
df['message'] = df['message'].str.replace('[%s]' % string.punctuation, '', regex=True)

# Display the first few rows after cleaning
display(df.head())

Unnamed: 0,label,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # Consider the top 5000 features

# Fit and transform the message data
X = tfidf_vectorizer.fit_transform(df['message']).toarray()

# Display the shape of the feature matrix
print("Shape of feature matrix:", X.shape)

Shape of feature matrix: (5572, 5000)


In [12]:
from sklearn.model_selection import train_test_split

# Create the target variable
y = df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Display the shapes of the training and testing sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (4179, 5000)
Shape of X_test: (1393, 5000)
Shape of y_train: (4179,)
Shape of y_test: (1393,)


In [13]:
from sklearn.naive_bayes import MultinomialNB

# Initialize and train the Multinomial Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the test data
y_pred = model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
f1 = f1_score(y_test, y_pred, pos_label='spam')

# Display the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9669777458722182
Precision: 0.9931972789115646
Recall: 0.7643979057591623
F1 Score: 0.863905325443787
