In [75]:
#データセット
#データセットを読み込む
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

df = pd.read_table("SMSSpamCollection",names=["label","message"])
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label      5572 non-null object
message    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [77]:
#前処理
df["label"]=df["label"].map({"ham":0,"spam":1})
df

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [78]:
#Bag of wordsを体験する
from sklearn.feature_extraction.text import CountVectorizer
count_vec_sample = CountVectorizer()
count_vec_sample

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [79]:

messages = ['Thank you for calling.',
            'Thank you for your inquiry',
            'Thanks for keeping in touch.',
            'Thanks for getting in touch with me?']


In [80]:
count_vec_sample.fit(messages)
count_vec_sample.vocabulary_


{'calling': 0,
 'for': 1,
 'getting': 2,
 'in': 3,
 'inquiry': 4,
 'keeping': 5,
 'me': 6,
 'thank': 7,
 'thanks': 8,
 'touch': 9,
 'with': 10,
 'you': 11,
 'your': 12}

In [81]:
data = count_vec_sample.transform(messages)
data.todense()

matrix([[1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1],
        [0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0],
        [0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0]])

In [82]:
#データセットを分割する

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['message'],df['label'],
                                                    random_state=1)

In [83]:
#X_train、X_testをBag of wordsに置き換える
count_vector = CountVectorizer()
count_vector.fit(X_train)
count_vector.vocabulary_

{'4mths': 509,
 'half': 3181,
 'price': 5193,
 'orange': 4781,
 'line': 3971,
 'rental': 5479,
 'latest': 3880,
 'camera': 1572,
 'phones': 4987,
 'free': 2864,
 'had': 3170,
 'your': 7424,
 'phone': 4983,
 '11mths': 264,
 'call': 1552,
 'mobilesdirect': 4375,
 'on': 4743,
 '08000938767': 50,
 'to': 6656,
 'update': 6892,
 'now': 4662,
 'or2stoptxt': 4779,
 'cs': 2022,
 'did': 2222,
 'you': 7420,
 'stitch': 6218,
 'his': 3316,
 'trouser': 6758,
 'hope': 3362,
 'enjoyed': 2502,
 'new': 4580,
 'content': 1916,
 'text': 6514,
 'stop': 6228,
 '61610': 563,
 'unsubscribe': 6882,
 'help': 3276,
 '08712400602450p': 98,
 'provided': 5255,
 'by': 1538,
 'tones2you': 6683,
 'co': 1810,
 'uk': 6829,
 'not': 4647,
 'heard': 3255,
 'from': 2899,
 'u4': 6823,
 'while': 7199,
 'rude': 5612,
 'chat': 1691,
 'private': 5206,
 '01223585334': 5,
 'cum': 2040,
 'wan': 7075,
 '2c': 374,
 'pics': 5002,
 'of': 4704,
 'me': 4238,
 'gettin': 3002,
 'shagged': 5804,
 'then': 6552,
 'pix': 5023,
 '8552': 660,
 '

In [84]:
#transformメソッドを使用して、X_train,X_testを変換しましょう。
X_train = count_vector.transform(X_train)
X_test = count_vector.transform(X_test)

In [89]:
#モデル実装・学習
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
predict = clf.predict(X_test)

In [93]:
#評価
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
accuracy= accuracy_score(predict,y_test)
precision= precision_score(predict,y_test)
recall = recall_score(predict,y_test)
f1 = f1_score(predict,y_test)

In [105]:
print('accuracy_score:{0:.2f}'.format(accuracy))
print('precision_score:{0:.2f}'.format(precision))
print('recall_score:{0:.2f}'.format(recall))
print('f1_score:{0:.2f}'.format(f1))

accuracy_score:0.99
precision_score:0.94
recall_score:0.97
f1_score:0.96
