<a href="https://colab.research.google.com/github/oshmita26/E-mail-Spam-Classifier-using-ensemble-methods/blob/main/Email_spam_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Import the Bagging, RandomForest, and AdaBoost Classifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
df = pd.read_table('/content/gdrive/MyDrive/SMSSpamCollection', sep='\t', 
                   header=None, 
                   names=['label', 'sms_message'])

In [7]:
# Fix our desired response
df['label'] = df.label.map({'spam':1, 'ham':0})

In [8]:
# Split our dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], 
                                                    df['label'], 
                                                    random_state=1)

# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

In [9]:
# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

In [10]:
# Instantiate a BaggingClassifier with:
# 200 weak learners (n_estimators) and everything else as default values

bagging=BaggingClassifier(n_estimators=200)

# Instantiate a RandomForestClassifier with:
# 200 weak learners (n_estimators) and everything else as default values
rforest = RandomForestClassifier(n_estimators=200)

# Instantiate an a AdaBoostClassifier with:
# With 300 weak learners (n_estimators) and a learning_rate of 0.2
adaboost = AdaBoostClassifier(n_estimators=300, learning_rate=0.2)

In [11]:
# Fit your BaggingClassifier to the training data
bagging.fit(training_data, y_train)

# Fit your RandomForestClassifier to the training data
rforest.fit(training_data, y_train)

# Fit your AdaBoostClassifier to the training data
adaboost.fit(training_data, y_train)


AdaBoostClassifier(learning_rate=0.2, n_estimators=300)

In [12]:
# Predict using BaggingClassifier on the test data
pred_bag = bagging.predict(testing_data)

# Predict using RandomForestClassifier on the test data
pred_for = rforest.predict(testing_data)

# Predict using AdaBoostClassifier on the test data
pred_ada = adaboost.predict(testing_data)

In [13]:
def print_metrics(y_true, preds, model_name=None):
    '''
    INPUT:
    y_true - the y values that are actually true in the dataset (NumPy array or pandas series)
    preds - the predictions for those values from some model (NumPy array or pandas series)
    model_name - (str - optional) a name associated with the model if you would like to add it to the print statements 
    
    OUTPUT:
    None - prints the accuracy, precision, recall, and F1 score
    '''
    if model_name == None:
        print('Accuracy score: ', format(accuracy_score(y_true, preds)))
        print('Precision score: ', format(precision_score(y_true, preds)))
        print('Recall score: ', format(recall_score(y_true, preds)))
        print('F1 score: ', format(f1_score(y_true, preds)))
        print('\n\n')
    
    else:
        print('Accuracy score for ' + model_name + ' :' , format(accuracy_score(y_true, preds)))
        print('Precision score ' + model_name + ' :', format(precision_score(y_true, preds)))
        print('Recall score ' + model_name + ' :', format(recall_score(y_true, preds)))
        print('F1 score ' + model_name + ' :', format(f1_score(y_true, preds)))
        print('\n\n')

In [14]:
# Print Bagging scores
print_metrics(y_test, pred_bag)

# Print Random Forest scores
print_metrics(y_test, pred_for)

# Print AdaBoost scores
print_metrics(y_test, pred_ada)

Accuracy score:  0.9763101220387652
Precision score:  0.9222222222222223
Recall score:  0.8972972972972973
F1 score:  0.9095890410958904



Accuracy score:  0.9820531227566404
Precision score:  1.0
Recall score:  0.8648648648648649
F1 score:  0.927536231884058



Accuracy score:  0.9770279971284996
Precision score:  0.9693251533742331
Recall score:  0.8540540540540541
F1 score:  0.9080459770114943



