In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score



















In [None]:
# Read the raw mail dataset from disk.
raw_mail_data = pd.read_csv('/content/mail_data.csv')

# Keep every non-null cell and substitute an empty string for each null one.
mail_data = raw_mail_data.where(raw_mail_data.notnull(), '')

# Encode the text labels in place: 'spam' becomes 0 and 'ham' becomes 1.
for label_text, label_code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label_text, 'Category'] = label_code

# Model input (message text) and target (encoded category).
X, Y = mail_data['Message'], mail_data['Category']

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

# Cast both label series to an integer dtype.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

# TF-IDF vectorizer (English stop words dropped, text lower-cased). It is fitted on
# the training messages only and then reused, unchanged, to transform the test messages.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [None]:
# Fit a default-configured logistic regression on the training features.
logistic_model = LogisticRegression().fit(X_train_features, Y_train)

# 5-fold cross-validation over the training features; report the mean fold score.
logistic_cv_scores = cross_val_score(
    estimator=logistic_model,
    X=X_train_features,
    y=Y_train,
    cv=5,
)
print('Cross-validation mean score for Logistic Regression: ', logistic_cv_scores.mean())


Cross-validation mean score for Logistic Regression:  0.9403199911421138


In [None]:
# Multinomial naive Bayes with default settings.
nb_model = MultinomialNB()

# 5-fold cross-validation on the training features. cross_val_score fits clones of
# the estimator, so it neither needs nor alters the fitted state of nb_model itself.
nb_cv_scores = cross_val_score(nb_model, X_train_features, Y_train, cv=5)
nb_cv_mean = nb_cv_scores.mean()

# Fit the model that later cells use for prediction.
nb_model.fit(X_train_features, Y_train)

print('Cross-validation mean score for Naive Bayes: ', nb_cv_mean)

Cross-validation mean score for Naive Bayes:  0.9685867141771476


In [None]:
# Random forest trained on the training features.
# The forest is built with randomness, so random_state is pinned; without it the
# fitted model, its cross-validation score and every downstream prediction change
# on each re-run of the notebook. The value 3 matches the seed used for the
# train/test split in this notebook.
rf_model = RandomForestClassifier(random_state=3)
rf_model.fit(X_train_features, Y_train)

# 5-fold cross-validation over the training features; report the mean fold score.
rf_cv_scores = cross_val_score(rf_model, X_train_features, Y_train, cv=5)
print('Cross-validation mean score for Random Forest: ', rf_cv_scores.mean())


Cross-validation mean score for Random Forest:  0.973748697739729


In [None]:
# Linear SVM for text classification, fitted on the training features.
svm_model = SVC(kernel='linear').fit(X_train_features, Y_train)

# 5-fold cross-validation over the training features; report the mean fold score.
svm_cv_scores = cross_val_score(
    svm_model,
    X_train_features,
    Y_train,
    cv=5,
)
print('Cross-validation mean score for SVM: ', np.mean(svm_cv_scores))

Cross-validation mean score for SVM:  0.9759916051395873


In [None]:

# Step 1: predict labels for the training and the held-out features.
logistic_train_predictions = logistic_model.predict(X_train_features)
logistic_test_predictions = logistic_model.predict(X_test_features)

# Step 2: score each set of predictions against its true labels.
logistic_train_accuracy = accuracy_score(y_true=Y_train, y_pred=logistic_train_predictions)
logistic_test_accuracy = accuracy_score(y_true=Y_test, y_pred=logistic_test_predictions)

# Step 3: report both accuracies, training first.
print('Accuracy on training data (Logistic Regression): ', logistic_train_accuracy)
print('Accuracy on test data (Logistic Regression): ', logistic_test_accuracy)

Accuracy on training data (Logistic Regression):  0.9676912721561588
Accuracy on test data (Logistic Regression):  0.9668161434977578


In [None]:
# A single example message, wrapped in a list because the vectorizer expects an iterable of documents.
input_mail = [
    "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"
]

# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit).
input_data_features = feature_extraction.transform(input_mail)

# Classify the message; a predicted 1 is reported as ham, anything else as spam.
logistic_prediction = logistic_model.predict(input_data_features)
if logistic_prediction[0] == 1:
    logistic_label = 'Ham mail'
else:
    logistic_label = 'Spam mail'
print('Logistic Regression Prediction:', logistic_label)

Logistic Regression Prediction: Ham mail


In [None]:
# Run the same vectorized input through the three remaining classifiers.
nb_prediction, rf_prediction, svm_prediction = (
    classifier.predict(input_data_features)
    for classifier in (nb_model, rf_model, svm_model)
)

# Report each verdict; a predicted 1 is reported as ham, anything else as spam.
for heading, predicted in (
    ('Naive Bayes Prediction:', nb_prediction),
    ('Random Forest Prediction:', rf_prediction),
    ('SVM Prediction:', svm_prediction),
):
    print(heading, 'Ham mail' if predicted[0] == 1 else 'Spam mail')


Naive Bayes Prediction: Ham mail
Random Forest Prediction: Ham mail
SVM Prediction: Ham mail


In [None]:
# Take the first 100 held-out messages (positional slice) and vectorize them
# with the already-fitted TF-IDF vectorizer.
new_data = X_test.iloc[:100]
new_data_features = feature_extraction.transform(new_data)

# Predict with every fitted classifier, in the order: logistic, NB, RF, SVM.
new_data_predictions, nb_prediction, rf_prediction, svm_prediction = (
    classifier.predict(new_data_features)
    for classifier in (logistic_model, nb_model, rf_model, svm_model)
)

# Assemble messages, true labels and each model's predictions side by side.
# 'Actual' is passed as a plain array so it lines up by position with 'Message'.
result_columns = {
    'Message': new_data,
    'Actual': Y_test.iloc[:100].values,
    'Predicted': new_data_predictions,
    'nb_predicted': nb_prediction,
    'rf_predicted': rf_prediction,
    'svm_predicted': svm_prediction,
}
prediction_results = pd.DataFrame(result_columns)

# Persist the table without the index column.
prediction_results.to_csv('/content/prediction_results.csv', index=False)

print('Predictions for 100 new mails have been saved to prediction_results.csv')


Predictions for 100 new mails have been saved to prediction_results.csv


In [None]:
# Named base estimators that make up the ensemble.
ensemble_members = [
    ('logistic', logistic_model),
    ('naive_bayes', nb_model),
    ('random_forest', rf_model),
    ('svm', svm_model),
]

# Build the hard-voting ensemble and fit it on the training features.
voting_classifier = VotingClassifier(
    estimators=ensemble_members,
    voting='hard',
).fit(X_train_features, Y_train)

# Score the ensemble on the held-out test features.
voting_predictions = voting_classifier.predict(X_test_features)
voting_accuracy = accuracy_score(y_true=Y_test, y_pred=voting_predictions)
print('Accuracy with Voting Classifier: ', voting_accuracy)


Accuracy with Voting Classifier:  0.9802690582959641
