In [13]:
#Loading libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, KFold  
from sklearn.metrics import accuracy_score,f1_score
import pickle


In [9]:
#Loading dataset and creating dataframe and transform the output and printing first 5 rows
data = pd.read_csv('spamDataset.csv', encoding='latin-1')
data = data.replace('ham','Not Spam')
data = data.replace('spam','Spam')
data.head()
data.value_counts('Category')


Category
Not Spam    3841
Spam         746
dtype: int64

In [7]:
# Split the data into features (X) - input and target variable (y) - output
X1 = data['Message']
y1 = data['Category']

In [8]:
#Convert text into numerical vectors
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X1)
y = y1

#Saving the vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Initialize KFold with 10 folds
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [15]:
# Import the MultinomialNB classifier
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

print("MultinomialNB")
#Training the model with KFold
# Initialize variables to track the best model and its performance
best_accuracy = 0.0
best_model = None
accuracy_scores = []

# Iterate over the folds
for train_index, test_index in kfold.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model and making predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate f1_score for the current fold
    accuracy = f1_score(y_test, y_pred, pos_label='Spam')
    # accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

    # Check if the current model performs better than the previous best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Print the best accuracy achieved
print("Best accuracy using KFold:", best_accuracy)

# Save the best model for future predictions
print('Average Accuracy using KFold:',np.mean(accuracy_scores))
pickle.dump(best_model, open('MultinomialNB_KFold.pkl', 'wb'))


#Training the model with train test split
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = f1_score(y_test, y_pred, pos_label='Spam')
# accuracy = accuracy_score(y_test, y_pred)
print("Accuracy using Train Test:", accuracy)

#save the model
import pickle
pickle.dump(model, open('MultinomialNB_Train_Test.pkl','wb'))

# Load the best model for future predictions
best_model = pickle.load(open('MultinomialNB_KFold.pkl', 'rb'))

# Predict the category of a new message using the best model
#new_message = ["Hi Mom, Please let me know if you need anything. I am always there for you. Love you."]
new_message = ["Congratulations! You have won a cash prize of $1,000,000. To claim your prize, reply to this email with your full name, address, and bank account details. Act quickly to secure your winnings."]
new_message_transformed = vectorizer.transform(new_message)
print(best_model.predict(new_message_transformed))



MultinomialNB
Best accuracy using KFold: 0.972972972972973
Average Accuracy using KFold: 0.9409272234673132
Accuracy using Train Test: 0.9707602339181286
['Spam']


In [16]:
# Import the support vector classifier
from sklearn.svm import SVC
model = SVC(kernel='linear')

print("Support Vector Classifier (SVC)")
#Training the model with KFold
# Initialize variables to track the best model and its performance
best_accuracy = 0.0
best_model = None
accuracy_scores = []

# Iterate over the folds
for train_index, test_index in kfold.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model and making predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy for the current fold
    accuracy = f1_score(y_test, y_pred, pos_label='Spam')
    accuracy_scores.append(accuracy)

    # Check if the current model performs better than the previous best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Print the best accuracy achieved
print("Best accuracy using KFold:", best_accuracy)

# Save the best model for future predictions
print('Average Accuracy using KFold:',np.mean(accuracy_scores))
pickle.dump(best_model, open('SVC_KFold.pkl', 'wb'))


#Training the model with train test split
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = f1_score(y_test, y_pred, pos_label='Spam')
print("Accuracy using Train Test:", accuracy)

#save the model
import pickle
pickle.dump(model, open('SVC_Train_Test.pkl','wb'))

Support Vector Classifier (SVC)
Best accuracy using KFold: 0.9714285714285714
Average Accuracy using KFold: 0.946709309987944
Accuracy using Train Test: 0.9325153374233129


In [17]:
# Import the logistic regression algorithm
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

print("Logistic Regression Algorithm")
#Training the model with KFold
# Initialize variables to track the best model and its performance
best_accuracy = 0.0
best_model = None
accuracy_scores = []

# Iterate over the folds
for train_index, test_index in kfold.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model and making predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy for the current fold
    accuracy = f1_score(y_test, y_pred, pos_label='Spam')
    accuracy_scores.append(accuracy)

    # Check if the current model performs better than the previous best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Print the best accuracy achieved
print("Best accuracy using KFold:", best_accuracy)

# Save the best model for future predictions
print('Average Accuracy using KFold:',np.mean(accuracy_scores))
pickle.dump(best_model, open('Logistic_Regression_KFold.pkl', 'wb'))


#Training the model with train test split
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = f1_score(y_test, y_pred, pos_label='Spam')
print("Accuracy using Train Test:", accuracy)

#save the model
import pickle
pickle.dump(model, open('Logistic_Regression_Train_Test.pkl','wb'))

Logistic Regression Algorithm
Best accuracy using KFold: 0.9496402877697842
Average Accuracy using KFold: 0.930588614538326
Accuracy using Train Test: 0.9316770186335402


In [18]:
# Import the decision tree algorithm
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

print("Decision Tree Algorithm")
#Training the model with KFold
# Initialize variables to track the best model and its performance
best_accuracy = 0.0
best_model = None
accuracy_scores = []

# Iterate over the folds
for train_index, test_index in kfold.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model and making predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy for the current fold
    accuracy = f1_score(y_test, y_pred, pos_label='Spam')
    accuracy_scores.append(accuracy)

    # Check if the current model performs better than the previous best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Print the best accuracy achieved
print("Best accuracy using KFold:", best_accuracy)

# Save the best model for future predictions
print('Average Accuracy using KFold:',np.mean(accuracy_scores))
pickle.dump(best_model, open('Decision_Tree_KFold.pkl', 'wb'))


#Training the model with train test split
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = f1_score(y_test, y_pred, pos_label='Spam')
print("Accuracy using Train Test:", accuracy)

#save the model
import pickle
pickle.dump(model, open('Decision_Tree_Train_Test.pkl','wb'))

Decision Tree Algorithm
Best accuracy using KFold: 0.9461077844311377
Average Accuracy using KFold: 0.888449198355473
Accuracy using Train Test: 0.923076923076923


In [19]:
# Import the random forest algorithm
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

print("Random Forest Algorithm")
#Training the model with KFold
# Initialize variables to track the best model and its performance
best_accuracy = 0.0
best_model = None
accuracy_scores = []

# Iterate over the folds
for train_index, test_index in kfold.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model and making predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy for the current fold
    accuracy = f1_score(y_test, y_pred, pos_label='Spam')
    accuracy_scores.append(accuracy)

    # Check if the current model performs better than the previous best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Print the best accuracy achieved
print("Best accuracy using KFold:", best_accuracy)

# Save the best model for future predictions
print('Average Accuracy using KFold:',np.mean(accuracy_scores))
pickle.dump(best_model, open('Random_Forest_KFold.pkl', 'wb'))


#Training the model with train test split
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = f1_score(y_test, y_pred, pos_label='Spam')
print("Accuracy using Train Test:", accuracy)

#save the model
import pickle
pickle.dump(model, open('Random_Forest_Train_Test.pkl','wb'))

Random Forest Algorithm
Best accuracy using KFold: 0.950354609929078
Average Accuracy using KFold: 0.9125240470860664
Accuracy using Train Test: 0.925


In [20]:
# Import the KNN algorithm
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

print("KNN Algorithm")
#Training the model with KFold
# Initialize variables to track the best model and its performance
best_accuracy = 0.0
best_model = None
accuracy_scores = []

# Iterate over the folds
for train_index, test_index in kfold.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model and making predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy for the current fold
    accuracy = f1_score(y_test, y_pred, pos_label='Spam')
    accuracy_scores.append(accuracy)

    # Check if the current model performs better than the previous best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Print the best accuracy achieved
print("Best accuracy using KFold:", best_accuracy)

# Save the best model for future predictions
print('Average Accuracy using KFold:',np.mean(accuracy_scores))
pickle.dump(best_model, open('KNN_KFold.pkl', 'wb'))


#Training the model with train test split
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = f1_score(y_test, y_pred, pos_label='Spam')
print("Accuracy using Train Test:", accuracy)

#save the model
import pickle
pickle.dump(model, open('KNN_Train_Test.pkl','wb'))

KNN Algorithm


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Best accuracy using KFold: 0.6782608695652175
Average Accuracy using KFold: 0.6056313636464152
Accuracy using Train Test: 0.671875


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [21]:
# Import the gradient boosting algorithm
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier()

print("Gradient Boosting Classifier Algorithm")
#Training the model with KFold
# Initialize variables to track the best model and its performance
best_accuracy = 0.0
best_model = None
accuracy_scores = []

# Iterate over the folds
for train_index, test_index in kfold.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model and making predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy for the current fold
    accuracy = f1_score(y_test, y_pred, pos_label='Spam')
    accuracy_scores.append(accuracy)

    # Check if the current model performs better than the previous best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Print the best accuracy achieved
print("Best accuracy using KFold:", best_accuracy)

# Save the best model for future predictions
print('Average Accuracy using KFold:',np.mean(accuracy_scores))
pickle.dump(best_model, open('Gradient_Boosting_Classifier.pkl', 'wb'))


#Training the model with train test split
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = f1_score(y_test, y_pred, pos_label='Spam')
print("Accuracy using Train Test:", accuracy)

#save the model
import pickle
pickle.dump(model, open('Gradient_Boosting_Classifier_Train_Test.pkl','wb'))

Gradient Boosting Classifier Algorithm
Best accuracy using KFold: 0.9253731343283582
Average Accuracy using KFold: 0.8888185288245334
Accuracy using Train Test: 0.8903225806451613


In [22]:
# Import the bagging classifier algorithm
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()

print("Bagging Classifier Algorithm")
#Training the model with KFold
# Initialize variables to track the best model and its performance
best_accuracy = 0.0
best_model = None
accuracy_scores = []

# Iterate over the folds
for train_index, test_index in kfold.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model and making predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy for the current fold
    accuracy = f1_score(y_test, y_pred, pos_label='Spam')
    accuracy_scores.append(accuracy)

    # Check if the current model performs better than the previous best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Print the best accuracy achieved
print("Best accuracy using KFold:", best_accuracy)

# Save the best model for future predictions
print('Average Accuracy using KFold:',np.mean(accuracy_scores))
pickle.dump(best_model, open('Bagging_Classifier_KFold.pkl', 'wb'))


#Training the model with train test split
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = f1_score(y_test, y_pred, pos_label='Spam')
print("Accuracy using Train Test:", accuracy)

#save the model
import pickle
pickle.dump(model, open('Bagging_Classifier_Train_Test.pkl','wb'))

Bagging Classifier Algorithm
Best accuracy using KFold: 0.9343065693430657
Average Accuracy using KFold: 0.8999526254224405
Accuracy using Train Test: 0.9390243902439025


In [30]:
# Import the adaboost classifier algorithm
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier()

print("AdaBoost Classifier Algorithm")
#Training the model with KFold
# Initialize variables to track the best model and its performance
best_accuracy = 0.0
best_model = None
accuracy_scores = []

# Iterate over the folds
for train_index, test_index in kfold.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model and making predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy for the current fold
    accuracy = f1_score(y_test, y_pred, pos_label='Spam')
    accuracy_scores.append(accuracy)

    # Check if the current model performs better than the previous best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Print the best accuracy achieved
print("Best accuracy using KFold:", best_accuracy)

# Save the best model for future predictions
print('Average Accuracy using KFold:',np.mean(accuracy_scores))
pickle.dump(best_model, open('AdaBoost_Classifier_KFold.pkl', 'wb'))


#Training the model with train test split
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = f1_score(y_test, y_pred, pos_label='Spam')
print("Accuracy using Train Test:", accuracy)

#save the model
import pickle
pickle.dump(model, open('AdaBoost_Classifier_Train_Test.pkl','wb'))

AdaBoost Classifier Algorithm
Best accuracy using KFold: 0.9387755102040817
Average Accuracy using KFold: 0.903901829490575
Accuracy using Train Test: 0.9192546583850931


In [34]:
# Import the necessary libraries
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, train_test_split
import numpy as np
import pickle

# Create an instance of the Gaussian Naive Bayes model
model = GaussianNB()

print("Gaussian Naive Bayes Algorithm")

# Training the model with KFold
# Initialize variables to track the best model and its performance
best_accuracy = 0.0
best_model = None
accuracy_scores = []

# Initialize your X and y variables

# Assuming you have already defined your data, X, and labels, y

# Convert sparse matrix X to dense numpy array
X = X.toarray()

# Iterate over the folds
for train_index, test_index in kfold.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model and make predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy for the current fold
    accuracy = f1_score(y_test, y_pred, pos_label='Spam')
    accuracy_scores.append(accuracy)

    # Check if the current model performs better than the previous best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Print the best accuracy achieved
print("Best accuracy using KFold:", best_accuracy)

# Save the best model for future predictions
print('Average Accuracy using KFold:', np.mean(accuracy_scores))
pickle.dump(best_model, open('GaussianNB_KFold.pkl', 'wb'))


# Training the model with train test split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = f1_score(y_test, y_pred, pos_label='Spam')
print("Accuracy using Train Test:", accuracy)

# Save the model
pickle.dump(model, open('GaussianNB_TrainTest.pkl', 'wb'))


Gaussian Naive Bayes Algorithm
Best accuracy using KFold: 0.7959183673469389
Average Accuracy using KFold: 0.7522234372561601
Accuracy using Train Test: 0.7322404371584699
