## spam 구분하는 머신러닝 모델

목표: SMS 메시지 텍스트를 전처리하고, 스팸(spam)과 정상(ham) 메시지를 분류하는 모델을 만든다.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# column names are supplied explicitly.
# NOTE(review): with the default quote handling, a message containing an
# unbalanced double quote may be merged with following lines — confirm the
# row count against the raw file.
message_df = pd.read_csv(
    "data-files/SMSSpamCollection",
    sep="\t",
    header=None,
    names=['label', 'message'],
)

In [3]:
# Summarise column dtypes and non-null counts to check for missing values.
message_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
# Preview the first rows: one label column and one raw message column.
message_df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Class balance of the raw string labels.
message_df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [6]:
# Categorical encoding: replace the string labels with integer codes.
# The counts shown before and after this cell indicate ham -> 0 and spam -> 1.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
message_df['label'] = label_encoder.fit_transform(message_df['label'])

In [7]:
# Re-check the class balance after encoding; the counts should be unchanged.
message_df['label'].value_counts()

0    4825
1     747
Name: label, dtype: int64

In [8]:
# Preprocessing: lowercase every message, then strip each character that is
# neither a word character nor whitespace (i.e. punctuation).
#
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a plain
# literal by default, which would silently leave all punctuation in place.
# The raw string keeps "\w" / "\s" from being flagged as invalid escapes.
message_df['message'] = message_df['message'].str.lower()
message_df['message'] = message_df['message'].str.replace(r'[^\w\s]', '', regex=True)

message_df['message']

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in 2 a wkly comp to win fa cup fina...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                  will ü b going to esplanade fr home
5569    pity  was in mood for that soany other suggest...
5570    the guy did some bitching but i acted like id ...
5571                            rofl its true to its name
Name: message, Length: 5572, dtype: object

In [9]:
import nltk

# nltk.word_tokenize (used in the next step) needs the Punkt tokenizer data.
# Older NLTK releases load it from the 'punkt' package, while recent releases
# look for 'punkt_tab' instead, so fetch both to keep the notebook runnable
# regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/nahyeonan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# Split each cleaned message into a list of word tokens.
# The column is overwritten in place (strings become lists), so this cell is
# meant to be run once per fresh kernel.
message_df['message'] = message_df['message'].map(nltk.word_tokenize)

In [11]:
# Inspect the token lists produced by the previous step.
message_df['message'].head()

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, dont, think, he, goes, to, usf, he, l...
Name: message, dtype: object

In [12]:
# Stemming: reduce every token to its Porter stem, then glue the stems back
# into one space-separated string per message so it can be vectorized.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

message_df['message'] = message_df['message'].map(
    lambda tokens: ' '.join([stemmer.stem(token) for token in tokens])
)

In [13]:
# Words -> numbers (absolute frequency): build a sparse document-term matrix
# of raw term counts, one row per message.
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): the vocabulary is fitted on every message, i.e. before the
# train/test split further down — confirm that is acceptable for evaluation.
vectorizer = CountVectorizer()
count = vectorizer.fit_transform(message_df['message'])

count

<5572x8169 sparse matrix of type '<class 'numpy.int64'>'
	with 72500 stored elements in Compressed Sparse Row format>

In [14]:
# Words -> numbers (relative frequency): re-weight the raw count matrix with TF-IDF.
from sklearn.feature_extraction.text import TfidfTransformer

# NOTE(review): `count2` is only printed afterwards; the features used for
# modelling are taken from `count`, not from this TF-IDF matrix.
transformer = TfidfTransformer()
count2 = transformer.fit_transform(count)

count2

<5572x8169 sparse matrix of type '<class 'numpy.float64'>'
	with 72500 stored elements in Compressed Sparse Row format>

In [15]:
# Print the stored entries of the sparse count matrix as (row, column) value triples.
print(count)

  (0, 1146)	1
  (0, 1340)	1
  (0, 1748)	1
  (0, 1750)	1
  (0, 2029)	1
  (0, 2248)	1
  (0, 3336)	1
  (0, 3388)	1
  (0, 3425)	1
  (0, 3872)	1
  (0, 4128)	1
  (0, 4273)	1
  (0, 5292)	1
  (0, 5635)	1
  (0, 7130)	1
  (0, 7497)	1
  (0, 7715)	1
  (0, 7925)	1
  (1, 4094)	1
  (1, 4308)	1
  (1, 5257)	1
  (1, 5289)	1
  (1, 7835)	1
  (2, 71)	1
  (2, 433)	1
  :	:
  (5570, 1777)	1
  (5570, 2492)	1
  (5570, 2760)	1
  (5570, 3105)	1
  (5570, 3148)	1
  (5570, 3255)	1
  (5570, 3477)	1
  (5570, 3559)	1
  (5570, 3823)	1
  (5570, 3872)	1
  (5570, 3940)	1
  (5570, 3987)	1
  (5570, 4396)	1
  (5570, 5048)	1
  (5570, 6587)	1
  (5570, 6596)	1
  (5570, 7109)	1
  (5570, 7236)	1
  (5570, 7534)	1
  (5570, 7754)	1
  (5571, 3987)	2
  (5571, 4970)	1
  (5571, 6114)	1
  (5571, 7236)	1
  (5571, 7366)	1


In [16]:
# Print the stored entries of the TF-IDF matrix as (row, column) weight triples.
print(count2)

  (0, 7925)	0.22378642176936625
  (0, 7715)	0.18293604147358436
  (0, 7497)	0.232012730496152
  (0, 7130)	0.15808501470085967
  (0, 5635)	0.22485506312666312
  (0, 5292)	0.1588008730270491
  (0, 4273)	0.2781965206152583
  (0, 4128)	0.32930301835453774
  (0, 3872)	0.10860920003212803
  (0, 3425)	0.18328548053939198
  (0, 3388)	0.15280952404957904
  (0, 3336)	0.132266862568599
  (0, 2248)	0.255022519528138
  (0, 2029)	0.2781965206152583
  (0, 1750)	0.2781965206152583
  (0, 1748)	0.31435532599420324
  (0, 1340)	0.2504083119963028
  (0, 1146)	0.32930301835453774
  (1, 7835)	0.44483654514496557
  (1, 5289)	0.5633498837724461
  (1, 5257)	0.2825014776211812
  (1, 4308)	0.42081977871680865
  (1, 4094)	0.4773478663822099
  (2, 7883)	0.18653623125647448
  (2, 7848)	0.14242759355834578
  :	:
  (5570, 6587)	0.19054252105358732
  (5570, 5048)	0.21643786562194572
  (5570, 4396)	0.16284308112975754
  (5570, 3987)	0.11780359009346424
  (5570, 3940)	0.27149395792904457
  (5570, 3872)	0.1156240697440695

In [17]:
# Separate features and target.
# Features are the raw term-count matrix; the target is the integer-encoded label.
X = count
y = message_df['label']

In [18]:
# Train / test split: hold out 20% of the messages for evaluation.
# stratify=y keeps the class ratio the same in both parts, and the fixed
# random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [19]:
# (rows, vocabulary size) of the training part.
X_train.shape

(4457, 8169)

In [20]:
# (rows, vocabulary size) of the held-out part.
X_test.shape

(1115, 8169)

### LogisticRegression

In [21]:
# Model: logistic regression with default hyper-parameters, trained on the
# term-count features.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(X_train, y_train)
logreg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
# Score: mean accuracy on the training part and on the held-out part.
# The classes are imbalanced (far more ham than spam), so read accuracy with care.
logreg_train_acc = logreg.score(X_train, y_train)
logreg_test_acc = logreg.score(X_test, y_test)

logreg_train_acc, logreg_test_acc

(0.9973076060130133, 0.9838565022421525)

In [23]:
# Prediction: compare the model output with the true labels for the first
# ten held-out messages (predictions first, true labels second).
prediction = logreg.predict(X_test[:10])
print(prediction, y_test[:10], sep='\n')

[0 0 0 1 0 0 0 0 0 0]
2825    0
3695    0
3904    0
576     1
2899    0
3456    0
5128    0
919     0
2505    0
17      0
Name: label, dtype: int64


### Naive Bayes

In [24]:
# Bernoulli naive Bayes with default settings, trained on the same
# term-count features as the logistic regression above.
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB().fit(X_train, y_train)
bnb

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [25]:
# Mean accuracy of the naive Bayes model on the training and held-out parts.
bnb_train_acc = bnb.score(X_train, y_train)
bnb_test_acc = bnb.score(X_test, y_test)

bnb_train_acc, bnb_test_acc

(0.9851918330715728, 0.9730941704035875)

In [26]:
# Compare naive Bayes predictions with the true labels for the first ten
# held-out messages (predictions first, true labels second).
prediction = bnb.predict(X_test[:10])
print(prediction, y_test[:10], sep='\n')

[0 0 0 1 0 0 0 0 0 0]
2825    0
3695    0
3904    0
576     1
2899    0
3456    0
5128    0
919     0
2505    0
17      0
Name: label, dtype: int64
