In [1]:
import pandas as pd
import numpy as np
import re
import nltk
# Make sure the NLTK stop-word corpus is available locally; the preprocessing
# cell below reads it through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manmi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the labelled messages; later cells read the 'Label' and 'Message' columns.
# NOTE(review): presumably this CSV is the output of an earlier EDA step (judging by the file name) — confirm.
data = pd.read_csv('data/spamHam_eda.csv')
# Preview the first five rows.
data.head()


Unnamed: 0,Label,Message
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [3]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Load the English stop-word list ONCE and keep it in a set. The list is
# loop-invariant, and set membership is O(1), whereas calling
# stopwords.words('english') per token rebuilds and linearly scans the list.
english_stopwords = set(stopwords.words('english'))

# One cleaned, stemmed, space-joined string per message, in row order.
corpus = []

# Iterate over the column values directly instead of looking rows up by the
# label data['Message'][i], which only works when the index is exactly 0..n-1.
for message in data['Message']:
    # Replace every non-letter character with a space, lowercase, split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words and reduce the remaining tokens to their Porter stems.
    stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: fit the vocabulary on the cleaned corpus and count
# term occurrences per message.
cv = CountVectorizer()
# .toarray() turns the vectorizer's result into a dense NumPy array. The
# matrix is large (its shape is shown in the output of the next cell), so
# this step is memory-hungry.
x = cv.fit_transform(corpus).toarray()

In [5]:
# Display the document-term matrix (NumPy abbreviates the repr and reports the shape).
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(10162, 40272))

In [6]:
# Encode the target as integers: ham -> 0, spam -> 1.
# Any label other than 'ham'/'spam' would become NaN under .map — TODO confirm none exist.
y = data['Label'].map({'ham' : 0, 'spam' : 1})

In [7]:
# Display the encoded target Series.
y

0        0
1        0
2        0
3        1
4        0
        ..
10157    1
10158    0
10159    0
10160    0
10161    0
Name: Label, Length: 10162, dtype: int64

In [8]:
# Candidate Naive Bayes models to compare.
# NOTE(review): SVC and GridSearchCV are imported here, but SVC is not added
# to the dictionary below, so only the two Naive Bayes variants are evaluated.
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# Maps a short display name to an unfitted estimator instance.
models = {
    'mnb': MultinomialNB(),
    'gnb': GaussianNB()
}

In [9]:
# Create a function which can evaluate models and return a report 

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
def evaluate_models(X, y, models):
    '''
    Fit and score every model in ``models`` on one shared train/test split.

    The data is split 80/20 with a fixed seed, each model is trained on the
    training part, and its accuracy on the held-out part is printed and recorded.

    Parameters
    ----------
    X : feature matrix, passed to ``train_test_split``.
    y : target labels aligned with ``X``.
    models : dict mapping a display name to an estimator that exposes
        ``fit`` and ``predict``. The estimators are fitted in place.

    Returns
    -------
    pandas.DataFrame with one row per model, in dictionary order, and the
    columns ``Model_name`` and ``Score`` (test-set accuracy).

    Relies on ``train_test_split``, ``accuracy_score`` and ``pd`` being
    imported at notebook level.
    '''
    # Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models_list = []
    scores = []

    # Walk name/estimator pairs directly rather than rebuilding
    # list(models.keys()) / list(models.values()) on every iteration.
    for model_name, model in models.items():
        model.fit(X_train, y_train)  # Train model

        # Make predictions on the held-out split and score them.
        y_pred = model.predict(X_test)
        score = accuracy_score(y_test, y_pred)

        print(f'---- Score for --- {model_name} ----')
        print(f"{score}")
        models_list.append(model_name)
        scores.append(score)

    print()

    # Build the report in one step from the collected columns.
    return pd.DataFrame({'Model_name': models_list, 'Score': scores})

In [10]:
# Train and score both Naive Bayes models on the dense bag-of-words features;
# evaluate_models prints each accuracy and returns them as a DataFrame.
report = evaluate_models(x, y, models)

---- Score for --- mnb ----
0.9394982784062961
---- Score for --- gnb ----
0.926709296606001



In [11]:
# Show the models ordered by accuracy, ascending (best model last).
report.sort_values('Score')

Unnamed: 0,Model_name,Score
1,gnb,0.926709
0,mnb,0.939498


In [12]:
from sklearn.model_selection import train_test_split

# Materialise the train/test split at notebook level for the tuning cells below.
# Uses the same test_size and random_state as the split inside evaluate_models.
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [14]:
from sklearn.naive_bayes import GaussianNB
import numpy as np

# Candidate var_smoothing values: 20 draws from an exponential distribution
# with scale 1e-9. They come from a *seeded* generator so the grid, and
# therefore the selected hyperparameter and reported score, are identical on
# every Restart & Run All (the unseeded global np.random gave a different
# grid each run).
params = {'var_smoothing': np.random.default_rng(42).exponential(.000000001, 20)
         }
gnb_model = GaussianNB()
# Exhaustive search over the candidates with 10-fold cross-validation on the training split.
gnb_cv = GridSearchCV(gnb_model, params, cv = 10)
gnb_cv.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best candidate.
print("tuned hpyerparameters :(best parameters) ",gnb_cv.best_params_)
print("accuracy :",gnb_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'var_smoothing': np.float64(4.161705673975432e-10)}
accuracy : 0.9193024073097874


In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Build a fresh GaussianNB from the grid-search winner's hyperparameters and
# train it on the training split (fit returns the estimator itself).
spam_detect_model = GaussianNB(**gnb_cv.best_params_).fit(X_train, y_train)

# Score the held-out split: overall accuracy and the confusion matrix.
y_pred = spam_detect_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusion_m = confusion_matrix(y_test, y_pred)

print(f"Accuracy of the model is {accuracy}")
print(f"The confusion matrix is: \n{confusion_m}")



Accuracy of the model is 0.926709296606001
The confusion matrix is: 
[[1536   71]
 [  78  348]]


In [None]:
# Therefore, the accuracy of the model is 92.67%, which is on the better side.