In [1]:
import pandas as pd
import string
import nltk

from nltk import FreqDist
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Import libraries for predcton
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc,f1_score

In [2]:
df = pd.read_csv('spam.csv',encoding='latin-1', sep=',', engine='python')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df_sample = df.copy()
df_sample.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1, inplace=True)
df_sample.rename(columns={'v1':'label','v2':'text'}, inplace=True)
df_sample.describe()

Unnamed: 0,label,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [4]:
# remove punctuation, stopwords and perform word stemming
def pre_process(text):
    
    text = text.translate(str.maketrans('', '', string.punctuation))  #remove punctuation
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]  #remove stopwords
    words = ""
    for i in text:  #word stemming
            stemmer = SnowballStemmer("english")
            words += (stemmer.stem(i))+" "
    return words

In [5]:
# Text transformation
# to lowercase and remove numbers
df_sample['text'] = df_sample['text'].str.lower()
df_sample['text'] = df_sample['text'].str.replace(r'\d+','')

# pre process text
textFeatures = df_sample['text'].copy()
textFeatures = textFeatures.apply(pre_process)

In [6]:
# Document-term-matrix creation
#create TfidVectorizer to create the numerical values
vectorizer = TfidfVectorizer(stop_words="english",decode_error='ignore', lowercase = True, min_df=2)

#numericalize the textFeatures
features = vectorizer.fit_transform(textFeatures.values.astype('U'))

#create the test and train datasets
features_train, features_test, labels_train, labels_test = train_test_split(features, df_sample['label'], test_size=0.2, random_state=42)

print('Train size: ', features_train.shape)
print('Test size: ', features_test.shape)

Train size:  (4457, 3523)
Test size:  (1115, 3523)


In [8]:
# Create model
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

mnb = MultinomialNB(alpha=0.2)
mnb.fit(features_train, labels_train)
prediction = mnb.predict(features_test)
print('Accuracy of Model: ', accuracy_score(labels_test,prediction))

Accuracy of Model:  0.9829596412556054


In [9]:
from sklearn.pipeline import Pipeline
import joblib
pipeline = Pipeline([('vectorizer', vectorizer), ('classifier', mnb)])

# save the model to disk
joblib.dump(pipeline, 'sms-spam.pkl')

['sms-spam.pkl']