In [1]:
import pandas as pd

original_df = pd.read_csv('spam.csv', encoding='latin-1')
# ゴミカラムの除去
original_df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

original_df['v1'].value_counts()
# ham     4825
# spam     747

X = pd.DataFrame(original_df['v2'])
y = original_df['v1'].apply(lambda s: 1 if s == 'spam' else 0) # 目的変数(スパムなら1)

In [2]:
print(original_df)

        v1                                                 v2
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will Ì_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# 学習データ(557サンプル)とテストデータ(5015サンプル)の分離
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1)

# CoutVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(X_train['v2'])

print('Vocabulary size: {}'.format(len(vectorizer.vocabulary_)))
print('Vocabulary content: {}'.format(vectorizer.vocabulary_))
# Vocabulary size: 2284
# Vocabulary content: {'buy': 414, 'space': 1832, 'invaders': 1048, 'chance': 463, 'win': 2199, 'orig': 1445, 'arcade': 276, ...

# 文章を特徴ベクトル化
X_train_bow = vectorizer.transform(X_train['v2'])
X_test_bow = vectorizer.transform(X_test['v2'])

print('X_train_bow:\n{}'.format(repr(X_train_bow)))
print('X_test_bow:\n{}'.format(repr(X_test_bow)))
# X_train_bow:
# <557x2284 sparse matrix of type '<class 'numpy.int64'>'
#  with 7471 stored elements in Compressed Sparse Row format>
# X_test_bow:
# <5015x2284 sparse matrix of type '<class 'numpy.int64'>'
#  with 53413 stored elements in Compressed Sparse Row format>

Vocabulary size: 2349
Vocabulary content: {'you': 2334, 'know': 1166, 'my': 1400, 'old': 1483, 'dom': 680, 'told': 2068, 'about': 188, 'yesterday': 2327, 'his': 1019, 'name': 1405, 'is': 1111, 'roger': 1721, 'he': 994, 'got': 947, 'in': 1083, 'touch': 2089, 'with': 2277, 'me': 1318, 'last': 1182, 'night': 1434, 'and': 251, 'wants': 2214, 'to': 2063, 'meet': 1324, 'him': 1016, 'today': 2065, 'at': 298, 'pm': 1580, 'so': 1871, 'love': 1263, 'excited': 785, 'each': 718, 'day': 611, 'spend': 1894, 'make': 1293, 'happy': 986, 'wake': 2204, 'up': 2148, 'long': 1249, 'ago': 224, 'already': 240, 'dunno': 714, 'what': 2246, 'other': 1507, 'thing': 2025, 'spending': 1895, 'new': 1428, 'years': 2323, 'brother': 407, 'family': 805, 'lets': 1207, 'plan': 1565, 'next': 1430, 'week': 2233, 'are': 277, 'ready': 1670, 'be': 340, 'spoiled': 1901, 'hello': 1005, 'wats': 2223, 'talks': 1977, 'email': 743, 'address': 206, 'yo': 2331, 'call': 434, 'when': 2249, 'get': 917, 'the': 2015, 'chance': 471, 'frien

In [4]:
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()
model.fit(X_train_bow, y_train)

print('Train accuracy: {:.3f}'.format(model.score(X_train_bow, y_train)))
print('Test accuracy: {:.3f}'.format(model.score(X_test_bow, y_test)))
# Train accuracy: 0.978
# Test accuracy: 0.920

Train accuracy: 0.971
Test accuracy: 0.896
