In [2]:
import string
from collections import Counter

import pandas as pd
# Read the tab-separated SMS corpus (the file has no header row, so the column
# names are supplied here) and encode the text labels as integers in the same
# chained expression: 'ham' -> 0, 'spam' -> 1.
df = pd.read_table(
    'dataset/SMSSpamCollection',
    header=None,
    sep='\t',
    names=['label', 'message'],
).assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Implementing Bag of Words from scratch

In [4]:
# Four-sentence toy corpus used to walk through bag of words by hand.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

In [5]:
# Normalise every document: lower-case it, then delete each character listed in
# string.punctuation. The translation table does not depend on the line, so it
# is built once and reused.
punctuation_remover = str.maketrans("", "", string.punctuation)
lower_case_documents = [
    line.lower().translate(punctuation_remover) for line in documents
]

In [6]:
# Sanity check: every document should now be lower-case with punctuation removed.
print(lower_case_documents)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [30]:
# Hand-rolled bag of words over the cleaned documents.
#
# split() with no argument splits on runs of whitespace and never yields empty
# strings. split(' ') would turn doubled, leading or trailing spaces (easily
# left behind once punctuation is stripped) into '' tokens that would then be
# counted as a vocabulary word.
tokenized_documents = [line.split() for line in lower_case_documents]

# Corpus-wide word totals as a plain dict. Keys are kept in first-seen order
# (insertion-ordered dicts, Python 3.7+), which fixes the order of the
# per-document entries built below.
word_bag = dict(
    Counter(word for tokens in tokenized_documents for word in tokens)
)

# One list per document, holding a single-entry {word: count} dict for every
# vocabulary word (count is 0 when the word does not occur in that document).
# NOTE: the variable keeps its original spelling because a later cell displays
# it under this name.
frequecy_list = []
for tokens in tokenized_documents:
    # Count the document once instead of rescanning it for every vocabulary word.
    counts_in_line = Counter(tokens)
    frequecy_list.append([{word: counts_in_line[word]} for word in word_bag])


In [31]:
# Display the per-document counts: one inner list per document, one
# single-entry {word: count} dict per vocabulary word.
frequecy_list

[[{'hello': 1},
  {'how': 1},
  {'are': 1},
  {'you': 1},
  {'win': 0},
  {'money': 0},
  {'from': 0},
  {'home': 0},
  {'call': 0},
  {'me': 0},
  {'now': 0},
  {'tomorrow': 0}],
 [{'hello': 0},
  {'how': 0},
  {'are': 0},
  {'you': 0},
  {'win': 2},
  {'money': 1},
  {'from': 1},
  {'home': 1},
  {'call': 0},
  {'me': 0},
  {'now': 0},
  {'tomorrow': 0}],
 [{'hello': 0},
  {'how': 0},
  {'are': 0},
  {'you': 0},
  {'win': 0},
  {'money': 0},
  {'from': 0},
  {'home': 0},
  {'call': 1},
  {'me': 1},
  {'now': 1},
  {'tomorrow': 0}],
 [{'hello': 2},
  {'how': 0},
  {'are': 0},
  {'you': 1},
  {'win': 0},
  {'money': 0},
  {'from': 0},
  {'home': 0},
  {'call': 1},
  {'me': 0},
  {'now': 0},
  {'tomorrow': 1}]]

# Now using scikit-learn

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
# The same toy corpus as in the hand-rolled example, redefined here so this
# section can be read on its own.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Vectorizer created with its default settings.
count_vector = CountVectorizer()

In [43]:
# Print the vectorizer to inspect the parameter values it was created with.
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [44]:
# Fit the vectorizer on the toy corpus; the fitted vectorizer is echoed as the cell output.
count_vector.fit(documents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [45]:
# List the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement get_feature_names_out() whenever the installed version
# provides it, and fall back to the old method on older releases.
vocabulary_terms = (
    count_vector.get_feature_names_out()
    if hasattr(count_vector, 'get_feature_names_out')
    else count_vector.get_feature_names()
)
# Convert to plain Python strings so the display is a simple list either way
# (the newer method returns a NumPy array rather than a list).
[str(term) for term in vocabulary_terms]

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [46]:
# Turn each document into a row of word counts and convert the result to a
# dense array so it can be displayed in full (4 documents x 12 vocabulary words here).
doc_array = count_vector.transform(documents).toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]])

In [47]:
# Label the count matrix with the learned vocabulary so each column is readable.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement get_feature_names_out() when the installed version
# provides it, and fall back to the old method on older releases.
if hasattr(count_vector, 'get_feature_names_out'):
    feature_names = count_vector.get_feature_names_out()
else:
    feature_names = count_vector.get_feature_names()

frequency_matrix = pd.DataFrame(doc_array, columns=feature_names)
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


# Now use this to do SMS spam detection

In [51]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old module only on installations too old to have the new one.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [52]:
# Split messages (features) and labels (targets) into train and test parts.
# random_state=1 fixes the split so re-runs give the same rows; no test size is
# passed, and the row counts printed in the next cell show a 75% / 25% split.
X_tr, X_te, y_tr, y_te = train_test_split(df['message'], df['label'], random_state=1)

In [53]:
# Report how many rows ended up in the full data set and in each split.
total_rows = len(df)
training_rows = len(X_tr)
test_rows = len(X_te)
print('Number of rows in the total set: {}'.format(total_rows))
print('Number of rows in the training set: {}'.format(training_rows))
print('Number of rows in the test set: {}'.format(test_rows))


Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [56]:
# Fresh vectorizer for the SMS data (this rebinds the name used for the toy corpus earlier).
count_vector = CountVectorizer()
# Fit on the training messages only and turn them into a count matrix ...
training_data = count_vector.fit_transform(X_tr)
# ... then encode the test messages with that same fitted vectorizer, without
# refitting, so nothing is learned from the test set.
testing_data = count_vector.transform(X_te)

In [82]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default parameters, trained on the
# training count matrix and its 0/1 labels.
naive_bayes = MultinomialNB()
# fit() returns the estimator, which is why its repr is echoed as the cell output.
naive_bayes.fit(training_data, y_tr)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [85]:
# Predict a label for every held-out test message.
predictions = naive_bayes.predict(testing_data)

In [87]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Compare the predictions with the true test labels using four metrics, each
# called with its default settings.
# NOTE(review): labels are 0/1 with 'spam' mapped to 1 when the data was loaded;
# presumably the scorers treat 1 as the positive class by default, so precision,
# recall and F1 describe spam detection. Confirm against the scikit-learn docs.
# format(x) is the builtin with no format spec, i.e. the default string form of
# the score; print()'s default separator adds the second space after each colon
# seen in the output.
print('Accuracy score: ', format(accuracy_score(y_te, predictions)))
print('Precision score: ', format(precision_score(y_te, predictions)))
print('Recall score: ', format(recall_score(y_te, predictions)))
print('F1 score: ', format(f1_score(y_te, predictions)))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562
