In [11]:
import numpy as np

In [12]:
# 단어집

vocabulary = {}
with open('./dataset/SMSSpamCollection') as file_handle:
    for line in file_handle:
        splits = line.split()
        label = splits[0]
        text = splits[1:]
        
        for word in text:
            lower = word.lower()
            if not word in vocabulary:
                vocabulary[word] = len(vocabulary) # 각 단어에 넘버링

In [13]:
# Number of words
len(vocabulary)

15733

In [14]:
# 단어 빈도 피처

features = []
with open('./dataset/SMSSpamCollection')as file_handle:
    for line in file_handle:
        feature = np.zeros(len(vocabulary))
        text = splits[1:]
        for word in text:
            if word == "Rofl.": # 에러 처리
                word = word[:-1]
            lower = word.lower()
            feature[vocabulary[lower]] += 1
        
        feature = feature / sum(feature)
        features.append(feature)

In [15]:
len(features)

5574

In [16]:
labels = []
with open('./dataset/SMSSpamCollection') as file_handle:
    for line in file_handle:
        splits = line.split()
        label = splits[0]
        if label == "spam":
            labels.append(1) # 1 for words begin with spam
        else:
            labels.append(0)

In [29]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

spam_header = "spam\t"
no_spam_header = "ham\t"
documents = []
labels = []

with open ('./dataset/SMSSpamCollection') as file_handle:
    for line in file_handle:
        if line.startswith(spam_header):
            labels.append(1)
            documents.append(line[len(spam_header):])
        elif line.startswith(no_spam_header):
            labels.append(0)
            documents.append(line[len(no_spam_header):])
            
vectorizer = CountVectorizer()
term_counts = vectorizer.fit_transform(documents)
vocabulary = vectorizer.get_feature_names()

tf_transformer = TfidfTransformer(use_idf=False).fit(term_counts)
features = tf_transformer.transform(term_counts)

with open ('./dataset/processed.pickle', 'wb') as file_handle:
    pickle.dump((vocabulary, features, labels), file_handle)

In [30]:
features.shape

(5574, 8713)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

with open('./dataset/processed.pickle', 'rb') as file_handle:
    vocabulary, features, labels = pickle.load(file_handle)
    
labels = np.array(labels)
X_train, X_test, y_train, y_test = train_test_split(features, labels)

lrclf = LogisticRegression()
lrclf.fit(X_train, y_train)
print("Train Score: {:.4f}".format(lrclf.score(X_train, y_train)))
print("Test Score: {:.4f}".format(lrclf.score(X_test, y_test)))

Train Score: 0.9775
Test Score: 0.9663


In [32]:
weights = lrclf.coef_[0]
pairs = []
for idx, value in enumerate(weights):
    pairs.append((abs(value), vocabulary[idx]))
pairs.sort(key=lambda x: x[0], reverse=True)
for pair in pairs[:20]:
    print("score: {:4f} word: {}".format(pair[0], pair[1]))

score: 4.854228 word: call
score: 4.687929 word: txt
score: 3.723260 word: free
score: 3.289794 word: text
score: 3.060820 word: stop
score: 3.036895 word: www
score: 2.829750 word: 150p
score: 2.818706 word: uk
score: 2.755340 word: claim
score: 2.659738 word: reply
score: 2.640322 word: mobile
score: 2.559195 word: from
score: 2.391166 word: or
score: 2.385622 word: to
score: 2.335529 word: service
score: 2.318410 word: your
score: 2.259466 word: my
score: 2.243783 word: me
score: 2.202420 word: com
score: 2.200597 word: prize
