In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [23]:
# Importing the data into a data frame
df = pd.read_table('SMS_SPAM_FOLDER', sep='\t', names=['label', 'sms_message'])
df

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [24]:
# Preprocessing the data into a matrix filled with 0s and 1s
df['label'] = df.label.map({'ham': 0, 'spam': 1})

In [25]:
# Splitting the data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(df['sms_message'],
                                                    df['label'],
                                                    random_state = 1)

In [26]:
# Aplpying Bag of words processing to our dataset
count_vector = CountVectorizer()
training_data = count_vector.fit_transform(X_train) # teaches our machine learning and turns it into a Matrix
testing_data = count_vector.transform(X_test) # Just converts it to a 0, 1 matrix

In [31]:
#Naive Bayes implementation using scikit-learn
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)
predictions = naive_bayes.predict(testing_data)

In [33]:
# Evaluating our model
print('Accuracy Score: {}'.format(accuracy_score(predictions, y_test)))
print('Precision Score: {}'.format(precision_score(predictions, y_test)))
print('Recall Score: {}'.format(recall_score(predictions, y_test)))
print('f1 Score: {}'.format(f1_score(predictions, y_test)))

Accuracy Score: 0.9885139985642498
Precision Score: 0.9405405405405406
Recall Score: 0.9720670391061452
f1 Score: 0.9560439560439562
