In [49]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [30]:
df = pd.read_csv('smsspamcollection.tsv', sep='\t')

In [5]:
df.head(10)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
5,spam,FreeMsg Hey there darling it's been 3 week's n...,147,8
6,ham,Even my brother is not like to speak with me. ...,77,2
7,ham,As per your request 'Melle Melle (Oru Minnamin...,160,6
8,spam,WINNER!! As a valued network customer you have...,157,6
9,spam,Had your mobile 11 months or more? U R entitle...,154,2


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 4 columns):
label      5572 non-null object
message    5572 non-null object
length     5572 non-null int64
punct      5572 non-null int64
dtypes: int64(2), object(2)
memory usage: 130.6+ KB


In [8]:
df.describe()

Unnamed: 0,length,punct
count,5572.0,5572.0
mean,80.48995,4.177495
std,59.942907,4.623919
min,2.0,0.0
25%,36.0,2.0
50%,62.0,3.0
75%,122.0,6.0
max,910.0,133.0


In [9]:
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [11]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [12]:
X = df[['length', 'punct']]
y = df['label']

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
X_train.shape

(3900, 2)

In [17]:
X_test.shape

(1672, 2)

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
lr_model = LogisticRegression(solver='lbfgs')

In [20]:
lr_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [21]:
from sklearn import metrics

In [22]:
predictions = lr_model.predict(X_test)

In [23]:
print(metrics.confusion_matrix(y_test, predictions))

[[1404   44]
 [ 219    5]]


In [25]:
df = pd.DataFrame(metrics.confusion_matrix(y_test,predictions), index=['ham','spam'], columns=['ham','spam'])
df

Unnamed: 0,ham,spam
ham,1404,44
spam,219,5


In [27]:
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

   micro avg       0.84      0.84      0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [28]:
print(metrics.accuracy_score(y_test, predictions))

0.8427033492822966


In [32]:
X = df['message']
y = df['label']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
count_vect = CountVectorizer()

In [37]:
X_train_count = count_vect.fit_transform(X_train)

In [38]:
X_train_count

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [40]:
X_train_count.shape

(3733, 7082)

In [41]:
from sklearn.feature_extraction.text import TfidfTransformer

In [42]:
tfidf_transformer = TfidfTransformer()

In [43]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [46]:
tfidf_vector = TfidfVectorizer()

In [47]:
X_train_tfidf_vector = tfidf_vector.fit_transform(X_train)

In [50]:
X_train_tfidf_vector

<3733x7082 sparse matrix of type '<class 'numpy.float64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [53]:
text_clf = Pipeline([('tfidf_vector', TfidfVectorizer()),
                     ('clf', LinearSVC())])

In [54]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf_vector', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_id...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [55]:
predictions = text_clf.predict(X_test)

In [56]:
print(metrics.confusion_matrix(predictions, y_test))
print('\n\n')
print(metrics.classification_report(predictions, y_test))
print('\n\n')
print(metrics.accuracy_score(predictions, y_test))

[[1586   12]
 [   7  234]]



              precision    recall  f1-score   support

         ham       1.00      0.99      0.99      1598
        spam       0.95      0.97      0.96       241

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.97      0.98      0.98      1839
weighted avg       0.99      0.99      0.99      1839




0.989668297988037


In [57]:
print(metrics.confusion_matrix(y_test, predictions))
print('\n\n')
print(metrics.classification_report(y_test, predictions))
print('\n\n')
print(metrics.accuracy_score(y_test, predictions))

[[1586    7]
 [  12  234]]



              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839




0.989668297988037


In [58]:
text_clf.predict(['Hii, It was nice meeting you!'])

array(['ham'], dtype=object)