In [17]:
# Perform imports and load the dataset:
import numpy as np
import pandas as pd

# The corpus is a tab-separated file; show the first rows as a sanity check.
DATA_FILE = 'smsspamcollection.tsv'
df = pd.read_csv(DATA_FILE, sep='\t')
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [18]:
# Count missing values per column (isna is the pandas alias of isnull).
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [19]:
# Class balance: how many messages carry each label.
label_counts = df['label'].value_counts()
label_counts

ham     4825
spam     747
Name: label, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
# Features are the raw message text; the target is the ham/spam label.
X, y = df['message'], df['label']

In [20]:
# Preview the feature Series (the `message` column selected above).
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object

In [21]:
# Preview the target Series (the `label` column selected above).
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [None]:
# Hold out 33% of the messages for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training messages and encode each message
# as a row of raw token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Show the matrix dimensions, then its stored entries, one after the other.
print(X_train_counts.shape, X_train_counts, sep='\n')

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw token counts produced by CountVectorizer into TF-IDF scores.
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# A bare expression is only echoed when it is the last statement of a cell,
# so the shape has to be printed explicitly to appear in the output.
print(X_train_tfidf.shape)
print(X_train_tfidf)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer goes from raw text straight to TF-IDF features.
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
# A bare expression is only echoed when it is the last statement of a cell,
# so the shape has to be printed explicitly to appear in the output.
print(X_train_tfidf.shape)

print(X_train_tfidf)

  (0, 1736)	0.7135046738275388
  (0, 4415)	0.35852876712053044
  (0, 7069)	0.6019702680143677
  (1, 4519)	0.19824007486838768
  (1, 2472)	0.2174695059369183
  (1, 3726)	0.2019445301153727
  (1, 6219)	0.0743459370365268
  (1, 4018)	0.08517903405827133
  (1, 4518)	0.12552667891184652
  (1, 3416)	0.08402534579064597
  (1, 5436)	0.17424313877010142
  (1, 3116)	0.13652411260636216
  (1, 4489)	0.23669893700544894
  (1, 3501)	0.20622100985162556
  (1, 1797)	0.15290569110510804
  (1, 849)	0.15094660409003705
  (1, 1835)	0.1625121573959421
  (1, 4513)	0.09521739789951614
  (1, 5243)	0.2254504409201562
  (1, 938)	0.23669893700544894
  (1, 4470)	0.09310113083315118
  (1, 6250)	0.1869915787830949
  (1, 957)	0.09696276613143018
  (1, 7048)	0.062208359241353935
  (1, 3280)	0.10494370111466803
  :	:
  (3728, 2926)	0.1855922660561245
  (3728, 7048)	0.10087994978401515
  (3728, 3280)	0.17018155482163447
  (3729, 3674)	0.5570538854722596
  (3729, 3794)	0.4832600441125685
  (3729, 5795)	0.547487444568536

With `TfidfTransformer`, you work in separate steps: first compute the word counts using `CountVectorizer`, then compute the Inverse Document Frequency (IDF) values, and only then compute the TF-IDF scores.

With `TfidfVectorizer`, by contrast, you do all three steps at once. Under the hood, it computes the word counts, the IDF values, and the TF-IDF scores, all from the same dataset.

In [23]:
from sklearn.svm import LinearSVC
# Train a linear support-vector classifier on the TF-IDF training features.
# `fit` returns the estimator itself, so the bare name on the last line
# echoes the fitted model.
clf = LinearSVC().fit(X_train_tfidf, y_train)
clf

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [24]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification so raw text can be fed in directly.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
]
text_clf = Pipeline(steps=pipeline_steps)

# Fit both stages on the raw training messages and their labels.
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [25]:
# Form a prediction set: pass the held-out raw messages through the fitted
# pipeline (TF-IDF vectorizer followed by the LinearSVC classifier).
predictions = text_clf.predict(X_test)

In [26]:
# Report the confusion matrix
from sklearn import metrics
# Compare the true test labels against the pipeline's predictions.
conf_matrix = metrics.confusion_matrix(y_test, predictions)
print(conf_matrix)

[[1586    7]
 [  12  234]]


In [27]:
# Per-class precision, recall and F1 on the held-out test set
report = metrics.classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

    accuracy                           0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [28]:
# Overall fraction of test messages whose predicted label matches the truth
accuracy = metrics.accuracy_score(y_test, predictions)
print(accuracy)

0.989668297988037
