In [4]:
import pandas as pd
import numpy as np
import sklearn 

In [5]:
# Read the tab-separated SMS collection and show its first rows.
doc = pd.read_csv(
    "smsspamcollection.tsv",
    sep='\t',
)
preview = doc.head()
print(preview)

  label                                            message  length  punct
0   ham  Go until jurong point, crazy.. Available only ...     111      9
1   ham                      Ok lar... Joking wif u oni...      29      6
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155      6
3   ham  U dun say so early hor... U c already then say...      49      6
4   ham  Nah I don't think he goes to usf, he lives aro...      61      2


In [6]:
# Give every distinct lower-cased, whitespace-separated token of 1.txt an
# integer id in order of first appearance, counting from 1.
vocab = {}
i = 1
with open('1.txt') as f:
    x = f.read().lower().split()
for word in x:
    if word not in vocab:
        vocab[word] = i
        i += 1
print(vocab)

{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12}


In [7]:
# Tokenise 2.txt the same way and append any unseen tokens to the existing
# vocabulary, continuing the id counter `i` from where it stopped.
with open('2.txt') as f:
    x = f.read().lower().split()
print(x)
for word in x:
    if word not in vocab:
        vocab[word] = i
        i += 1
print(vocab)

['this', 'story', 'is', 'about', 'surfing', 'catching', 'waves', 'is', 'fun', 'surfing', 'is', 'a', 'popular', 'water', 'sport']
{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12, 'surfing': 13, 'catching': 14, 'waves': 15, 'fun': 16, 'popular': 17, 'water': 18, 'sport': 19}


In [8]:
# Count vector for 1.txt: slot 0 holds the file name, followed by one zeroed
# counter per vocabulary entry.
one = ['1.txt']
one.extend([0] * len(vocab))
print(one)

['1.txt', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [9]:
# Count how often each vocabulary word occurs in 1.txt.
with open('1.txt') as f:
    x = f.read().lower().split()
# Zero the counters first: the loop below adds in place, so without this reset
# re-running the cell would accumulate the counts a second time.
one[1:] = [0] * len(vocab)
for word in x:
    # vocab ids start at 1, so they index straight past the file-name slot.
    one[vocab[word]] += 1
print(one)

['1.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]


In [10]:
# Count vector for 2.txt: file name in slot 0, then one counter per vocab id.
two = ['2.txt']
two.extend([0] * len(vocab))
print(two)
with open('2.txt') as f:
    x = f.read().lower().split()
for word in x:
    slot = vocab[word]
    two[slot] += 1
print(two)

['2.txt', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['2.txt', 1, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1]


In [11]:
# Show both count vectors, one per line, for side-by-side comparison.
vectors = f"{one}\n{two}"
print(vectors)

['1.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
['2.txt', 1, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1]


By comparing the vectors we see that some words are common to both, some appear only in `1.txt`, others only in `2.txt`. Extending this logic to tens of thousands of documents, we would see the vocabulary dictionary grow to hundreds of thousands of words. Vectors would contain mostly zero values, making them *sparse matrices*.

In [12]:
# Load the tab-separated SMS collection into `df` and show its first rows.
df = pd.read_csv(
    "smsspamcollection.tsv",
    sep='\t',
)
head_rows = df.head()
print(head_rows)

  label                                            message  length  punct
0   ham  Go until jurong point, crazy.. Available only ...     111      9
1   ham                      Ok lar... Joking wif u oni...      29      6
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155      6
3   ham  U dun say so early hor... U c already then say...      49      6
4   ham  Nah I don't think he goes to usf, he lives aro...      61      2


In [29]:
from sklearn.model_selection import train_test_split

# Inputs are the message column, targets are the label column.
x, y = df['message'], df['label']
# A fixed random_state makes the shuffle (and therefore the split) repeatable.
splits = train_test_split(x, y, random_state=42)
x_train, x_test, y_train, y_test = splits
print(x_test)


3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
                              ...                        
668                 This pay is  &lt;DECIMAL&gt;  lakhs:)
218             Easy ah?sen got selected means its good..
5536    Aiyah ok wat as long as got improve can alread...
1657    Yes I posted a couple of pics on fb. There's s...
3875       No. Did you multimedia message them or e-mail?
Name: message, Length: 1393, dtype: object


## Scikit-learn's CountVectorizer
Text preprocessing, tokenizing and the ability to filter out stopwords are all included in [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html), which builds a dictionary of features and transforms documents to feature vectors.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages and convert them to a
# document-term count matrix in a single step.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(
    x_train,
)
print(x_train_counts)

  (0, 7278)	1
  (0, 1047)	1
  (0, 7004)	1
  (0, 4583)	1
  (0, 2052)	1
  (0, 7454)	1
  (0, 3231)	1
  (0, 1245)	1
  (0, 5804)	1
  (0, 6691)	2
  (0, 5447)	1
  (0, 698)	1
  (0, 5244)	1
  (0, 5587)	1
  (0, 1758)	2
  (0, 1550)	1
  (0, 193)	1
  (0, 1815)	1
  (0, 3805)	1
  (0, 7000)	1
  (0, 268)	1
  (0, 3377)	1
  (0, 4772)	1
  (1, 7454)	1
  (1, 6080)	1
  :	:
  (4174, 7329)	1
  (4174, 2426)	1
  (4174, 7313)	1
  (4175, 1550)	1
  (4175, 4009)	1
  (4175, 3882)	1
  (4175, 6129)	1
  (4176, 2895)	1
  (4176, 3252)	1
  (4176, 5433)	1
  (4176, 5172)	1
  (4176, 6134)	1
  (4176, 6133)	1
  (4177, 4675)	1
  (4177, 5833)	1
  (4177, 6707)	1
  (4177, 3700)	1
  (4178, 3240)	1
  (4178, 3729)	1
  (4178, 3601)	1
  (4178, 2221)	1
  (4178, 5733)	1
  (4178, 6475)	1
  (4178, 6095)	1
  (4178, 4526)	1


Transforming counts to frequencies with the TF-IDF transformer


In [77]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the transformer on the raw count matrix and re-weight it in one call.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(
    x_train_counts,
)
print(x_train_tfidf)

  (0, 7454)	0.07296508854525731
  (0, 7278)	0.2175165614381031
  (0, 7004)	0.22534708380553428
  (0, 7000)	0.20760083145563224
  (0, 6691)	0.141539967341859
  (0, 5804)	0.19868987802008992
  (0, 5587)	0.2509482628533065
  (0, 5447)	0.28070018637607863
  (0, 5244)	0.1742914275865926
  (0, 4772)	0.13923802451667316
  (0, 4583)	0.19868987802008992
  (0, 3805)	0.28070018637607863
  (0, 3377)	0.20760083145563224
  (0, 3231)	0.10882678981142475
  (0, 2052)	0.18108062992459154
  (0, 1815)	0.2043511255475615
  (0, 1758)	0.3260359616382075
  (0, 1550)	0.1074686845391386
  (0, 1245)	0.15433293650012758
  (0, 1047)	0.14830270818246083
  (0, 698)	0.24502828917509248
  (0, 268)	0.22844180154286198
  (0, 193)	0.28070018637607863
  (1, 7459)	0.1385459253783691
  (1, 7454)	0.09615771392941105
  :	:
  (4174, 2221)	0.221180227428022
  (4174, 1533)	0.20143400946858403
  (4174, 1150)	0.2072389743122357
  (4175, 6129)	0.5503632832222988
  (4175, 4009)	0.4849140455629369
  (4175, 3882)	0.5535979814892258
  

Combine Steps with TfidfVectorizer:
This combines the count vectorizer and the TF-IDF transformer into a single step.

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# The vectorizer takes the raw text (x_train), not the count matrix.
x_train_tfidf = vectorizer.fit_transform(
    x_train,
)
# Bare last expression so the notebook displays the matrix summary.
x_train_tfidf

<4179x7491 sparse matrix of type '<class 'numpy.float64'>'
	with 55879 stored elements in Compressed Sparse Row format>

Train a Classifier

## Train a Classifier
Here we'll introduce an SVM classifier that's similar to SVC, called [LinearSVC](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html). LinearSVC handles sparse input better, and scales well to large numbers of samples.

In [79]:
from sklearn.svm import LinearSVC

# Train a linear SVM on the TF-IDF features of the training messages.
clf = LinearSVC()
clf.fit(
    x_train_tfidf,
    y_train,
)



Build a Pipeline

In [19]:
from sklearn.pipeline import Pipeline

# Chain vectorisation and classification so raw text can be passed directly
# to fit/predict. Step names are kept exactly as originally written.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clif', LinearSVC()),
]
text_clf = Pipeline(pipeline_steps)
text_clf.fit(x_train, y_train)




In [20]:
# Predict labels for the held-out messages with the fitted pipeline.
y_pred = text_clf.predict(
    x_test,
)


Result Prediction

In [49]:
from sklearn import metrics

# Compare the true test labels against the pipeline's predictions.
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

[[1205    2]
 [  11  175]]


In [22]:
# Fraction of test messages whose predicted label matches the true label.
accuracy = metrics.accuracy_score(y_test, y_pred)
print(accuracy)

0.990667623833453


In [75]:
# Classify the messages stored in a separate tab-separated file.
doc = pd.read_csv("test.tsv", sep='\t')
messages = doc['message']
print(messages)
# Bare last expression so the notebook displays the predicted labels.
text_clf.predict(messages)

0    Hurry up! Registration opens for Company Secre...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
Name: message, dtype: object


array(['ham', 'ham', 'spam', 'ham', 'ham', 'spam'], dtype=object)

In [84]:
# Classify one hand-supplied message. predict() expects an iterable of
# documents, so the single string is wrapped in a list.
# The trailing "<TAB>147<TAB>8" that used to end this string were the `length`
# and `punct` columns copied from the TSV row, not part of the SMS text;
# they are removed so only the message itself is vectorised.
sample_sms = "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
text_clf.predict([sample_sms])

array(['spam'], dtype=object)