In [4]:
# Text classification: feature extraction.
# Count vectorization converts a list of strings into a document-term matrix
# (DTM): one row per message, one column per distinct token, and each cell
# holding how often that token occurs in that message.

# scikit-learn (rather than spaCy) supplies the count vectorizer used here.
from sklearn.feature_extraction.text import CountVectorizer

messages = [
    'Hey, lets go to the game toaay!',
    'Call your friends',
    'want to go walk your dogs?',
]

vect = CountVectorizer()
# fit_transform learns the vocabulary and builds the sparse DTM in one pass.
dtm = vect.fit_transform(messages)
# Printing a sparse matrix lists only its non-zero cells as "(row, column) count".
print(dtm)


  (0, 5)	1
  (0, 6)	1
  (0, 4)	1
  (0, 8)	1
  (0, 7)	1
  (0, 3)	1
  (0, 9)	1
  (1, 0)	1
  (1, 12)	1
  (1, 2)	1
  (2, 4)	1
  (2, 8)	1
  (2, 12)	1
  (2, 11)	1
  (2, 10)	1
  (2, 1)	1


In [5]:
# TfidfVectorizer is the TF-IDF counterpart of CountVectorizer: it yields the
# same sparse document-term layout, but each cell holds a TF-IDF weight rather
# than a raw count, so a term occurring in several messages is weighted lower
# than a term unique to one message.
# NOTE: this rebinds `vect` and `dtm`, replacing the count-based objects
# created by the CountVectorizer cell.

from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer()
dtm = vect.fit_transform(messages)
print(dtm)

  (0, 9)	0.40301621080355077
  (0, 3)	0.40301621080355077
  (0, 7)	0.40301621080355077
  (0, 8)	0.3065042162415877
  (0, 4)	0.3065042162415877
  (0, 6)	0.40301621080355077
  (0, 5)	0.40301621080355077
  (1, 2)	0.6227660078332259
  (1, 12)	0.4736296010332684
  (1, 0)	0.6227660078332259
  (2, 1)	0.45954803293870056
  (2, 10)	0.45954803293870056
  (2, 11)	0.45954803293870056
  (2, 12)	0.3494981241087058
  (2, 8)	0.3494981241087058
  (2, 4)	0.3494981241087058


In [1]:
# CountVectorizer already performs the basic text preprocessing (lowercasing
# and tokenization). Stop-word removal is NOT applied by default; it has to be
# requested through the `stop_words` parameter.
# TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer.
#
# Goal of the following cells: classify an SMS text as 'spam' or 'ham' with a
# two-step pipeline:
#   1. TfidfVectorizer turns the raw text into TF-IDF features
#   2. LinearSVC is trained on those features

import pandas as pd

# The dataset is tab-separated, hence the explicit separator.
df = pd.read_csv('./smsspamcollection.tsv', sep='\t')

# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
from sklearn.model_selection import train_test_split

# Features are the raw message texts; targets are the 'ham'/'spam' labels.
X = df['message']
y = df['label']

# test_size=0.25 spells out scikit-learn's default hold-out fraction, so the
# split sizes are unchanged. random_state pins the shuffle so that a
# "Restart & Run All" reproduces the same train/test partition, and therefore
# the same metrics downstream, instead of a different split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Display the training messages as a quick sanity check of the split.
X_train

157                           I'm leaving my house now...
3436    Hi darlin i cantdo anythingtomorrow as myparen...
4401                      Juz go google n search 4 qet...
1686                  Cramps stopped. Going back to sleep
883     I love to give massages. I use lots of baby oi...
                              ...                        
19      England v Macedonia - dont miss the goals/team...
544       4 oclock at mine. Just to bash out a flat plan.
1926    We don call like  &lt;#&gt;  times oh. No give...
1368    I don't know, same thing that's wrong everyso ...
1531                        I think chennai well settled?
Name: message, Length: 4179, dtype: object

In [5]:
# Build the text-classification pipeline: raw strings go in, labels come out.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([
    # Step 1: convert each message into a TF-IDF feature vector.
    ('tfidf', TfidfVectorizer()),
    # Step 2: linear support-vector classifier trained on those vectors.
    # random_state fixes the solver's pseudo-random data shuffling so that
    # repeated fits on the same training data give the same model.
    ('clf', LinearSVC(random_state=42)),
])

# Fitting the pipeline fits the vectorizer and the classifier in sequence.
text_clf.fit(X_train, y_train)

# Predict on the held-out messages; the pipeline vectorizes them with the
# vocabulary learned from the training set.
predictions = text_clf.predict(X_test)


In [7]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, predictions)
# Per-class precision / recall / F1 plus the overall accuracy.
report = classification_report(y_test, predictions)

print(cm, '\n', report)

[[1199    2]
 [  17  175]] 
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1201
        spam       0.99      0.91      0.95       192

    accuracy                           0.99      1393
   macro avg       0.99      0.95      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [9]:
# Probe the fitted pipeline with an everyday greeting. predict() expects an
# iterable of raw strings, hence the one-element list.
casual_text = ['hey how are you doing']
text_clf.predict(casual_text)

array(['ham'], dtype=object)

In [12]:
# Probe the fitted pipeline with a prize-announcement style message.
# NOTE(review): 'congratulatons' is misspelled in this probe. If that is
# unintentional, the token will not match the correctly spelled word in the
# vocabulary learned from the training data. Confirm whether the typo is
# deliberate.
prize_text_with_typo = ['congratulatons, you have won $6 million. call us at 12:00']
text_clf.predict(prize_text_with_typo)

array(['ham'], dtype=object)

In [14]:
# Probe the fitted pipeline with a second prize-style message, phrased
# differently from the previous probe.
prize_text = ['you have won 1 million dollars, please call our office to receive it']
text_clf.predict(prize_text)

array(['spam'], dtype=object)