In [None]:
import numpy as np
import pandas as pd

from time import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Load the labelled mail dataset used for both TRAINING & TESTING.
raw_mail_data = pd.read_csv('mail_data.csv')

# Keep only the label and text columns, then replace missing values with ''.
# The fill is applied to the selected columns; deriving it from raw_mail_data
# again would silently discard the column selection.
mail_data = raw_mail_data[['Category', 'Message']]
mail_data = mail_data.where(pd.notnull(mail_data), '')

# Encode the labels: spam = 0 & ham = 1.
# An explicit mapping avoids writing ints into a string column in place and
# lets us fail with a clear message when an unexpected label shows up.
category_codes = mail_data['Category'].map({'spam': 0, 'ham': 1})
if category_codes.isna().any():
    unexpected = sorted(set(mail_data.loc[category_codes.isna(), 'Category'].astype(str)))
    raise ValueError(f"Unexpected Category labels: {unexpected}")
mail_data = mail_data.assign(Category=category_codes.astype(int))

X = mail_data['Message']
Y = mail_data['Category']

# test_size=0.2 sets the TESTING SET to 20 PERCENT of X and Y (80 percent is
# used for training). random_state pins the shuffle so that every model cell
# sees the same split after Restart Kernel -> Run All.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# ======== Support Vector Machine on raw token counts ========
# The vocabulary is learned from the training messages only; the test messages
# are encoded with that same vocabulary.
cv = CountVectorizer()
X_train_features, X_test_features = cv.fit_transform(X_train), cv.transform(X_test)

model_svm = svm.SVC()

# Time the fit call alone.
start = time()
model_svm.fit(X_train_features, Y_train)
end = time()
train_time_svm = end - start

# Predict on both splits, then score each against its true labels.
prediction_train = model_svm.predict(X_train_features)
prediction_test = model_svm.predict(X_test_features)

accuracy_svm_train = accuracy_score(Y_train, prediction_train)
print('Accuracy on training data: ', accuracy_svm_train)

accuracy_svm_test = accuracy_score(Y_test, prediction_test)
print('\nAccuracy on testing data: ', accuracy_svm_test)


In [None]:
# ======== Logistic Regression on TF-IDF features ========
# Make sure the label series are integer typed before fitting.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

# TF-IDF weighting with lowercasing and English stop-word removal; the
# vectorizer is fitted on the training messages and reused for the test ones.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_train_features, X_test_features = (
    feature_extraction.fit_transform(X_train),
    feature_extraction.transform(X_test),
)

model_lg = LogisticRegression()

# Time the fit call alone.
start = time()
model_lg.fit(X_train_features, Y_train)
end = time()
train_time_lg = end - start

# Predict on both splits, then score each against its true labels.
prediction_train = model_lg.predict(X_train_features)
prediction_test = model_lg.predict(X_test_features)

accuracy_lg_train = accuracy_score(Y_train, prediction_train)
print('Accuracy on training data: ', accuracy_lg_train)

accuracy_lg_test = accuracy_score(Y_test, prediction_test)
print('\nAccuracy on testing data: ', accuracy_lg_test)


In [None]:
# ======== Multinomial Naive Bayes on raw token counts ========
# Bag-of-words counts: vocabulary fitted on the training messages, then
# applied unchanged to the test messages.
vectorizer = CountVectorizer()
X_train_features, X_test_features = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

model_v = MultinomialNB()

# Time the fit call alone.
start = time()
model_v.fit(X_train_features, Y_train)
end = time()
train_time_v = end - start

# Predict on both splits, then score each against its true labels.
prediction_train = model_v.predict(X_train_features)
prediction_test = model_v.predict(X_test_features)

accuracy_v_train = accuracy_score(Y_train, prediction_train)
print('Accuracy on training data: ', accuracy_v_train)

accuracy_v_test = accuracy_score(Y_test, prediction_test)
print('\nAccuracy on testing data: ', accuracy_v_test)


In [None]:
# ======== Random Forest on raw token counts ========
# Build the features inside this cell rather than reusing whatever
# X_train_features / X_test_features the previously executed cell left in the
# kernel; otherwise the forest is trained on count or TF-IDF features
# depending on which cell happened to run last.
vectorizer_rf = CountVectorizer()
X_train_features = vectorizer_rf.fit_transform(X_train)
X_test_features = vectorizer_rf.transform(X_test)

# random_state pins the bootstrap samples and feature sub-sampling so the
# reported accuracy is reproducible across runs.
model_rf = RandomForestClassifier(random_state=42)

# Time the fit call alone.
start = time()
model_rf.fit(X_train_features, Y_train)
end = time()
train_time_rf = end - start

# Evaluate the model on the training data
prediction_train_rf = model_rf.predict(X_train_features)
accuracy_rf_train = accuracy_score(Y_train, prediction_train_rf)
print('Accuracy on training data: ', accuracy_rf_train)

# Evaluate the model on the testing data
prediction_test_rf = model_rf.predict(X_test_features)
accuracy_rf_test = accuracy_score(Y_test, prediction_test_rf)
print('\nAccuracy on testing data: ', accuracy_rf_test)

In [None]:
# Comparison table: one row per model with its train accuracy, test accuracy
# and the time spent in fit().
models_cross = pd.DataFrame(
    [
        ('LogisticRegression', accuracy_lg_train, accuracy_lg_test, train_time_lg),
        ('Multinomial Naive Bayes classifier', accuracy_v_train, accuracy_v_test, train_time_v),
        ('SVM', accuracy_svm_train, accuracy_svm_test, train_time_svm),
        ('RandomForestClassifier', accuracy_rf_train, accuracy_rf_test, train_time_rf),
    ],
    columns=['Model', 'Score of training', 'Score of testing', 'Time'],
)

# Show the best test accuracy first; the sorted copy is the cell's output.
models_cross.sort_values(by='Score of testing', ascending=False)