In [1]:
import string
from nltk.corpus import stopwords

In [7]:
# peek at the first ten entries of NLTK's English stopword list
stopwords.words("english")[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [8]:
# peek at the first ten entries of the "hinglish" stopword list
# NOTE(review): "hinglish" does not appear to be a stock NLTK stopwords language —
# presumably a custom word list was added to the local corpus; confirm before sharing
stopwords.words("hinglish")[:10]

['a', 'aadi', 'aaj', 'aap', 'aapne', 'aata', 'aati', 'aaya', 'aaye', 'ab']

In [14]:
# sample sentence used below to demonstrate punctuation and stopword removal
test_sentence = 'This is my first string. Wow!! We are doing just fine'

In [15]:
# string.punctuation holds the ASCII punctuation characters that get filtered out below
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [16]:
# keep every character of the sentence that is not an ASCII punctuation mark;
# the result is a list of single-character strings (spaces included)
no_punctuations = list(filter(lambda ch: ch not in string.punctuation, test_sentence))
no_punctuations

['T',
 'h',
 'i',
 's',
 ' ',
 'i',
 's',
 ' ',
 'm',
 'y',
 ' ',
 'f',
 'i',
 'r',
 's',
 't',
 ' ',
 's',
 't',
 'r',
 'i',
 'n',
 'g',
 ' ',
 'W',
 'o',
 'w',
 ' ',
 'W',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 'd',
 'o',
 'i',
 'n',
 'g',
 ' ',
 'j',
 'u',
 's',
 't',
 ' ',
 'f',
 'i',
 'n',
 'e']

In [17]:
# glue the surviving characters back together; note that this rebinds
# no_punctuations from a list of characters to a single str
no_punctuations = ''.join(no_punctuations)
no_punctuations  # final string with all the punctuations removed.

'This is my first string Wow We are doing just fine'

In [23]:
# Remove English stopwords from the punctuation-free string.
# The stopword list is loaded once and stored as a set: calling
# stopwords.words("english") inside the comprehension would re-read the list
# for every token and scan it linearly on each membership test.
english_stopwords = set(stopwords.words("english"))
# comparison is case-insensitive (word.lower()), but the kept tokens retain their original casing
clean_sentence = [word for word in no_punctuations.split() if word.lower() not in english_stopwords]

In [24]:
# show the tokens left after stopword filtering (original casing is kept)
clean_sentence

['first', 'string', 'Wow', 'fine']

**Count vectorization**

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
# CountVectorizer with all-default settings (no arguments are overridden here)
vectorizer = CountVectorizer()

In [29]:
# three toy documents; document2 repeats "very" and "fun" so some counts exceed 1
# NOTE(review): 'pleasent' looks like a typo for 'pleasant' — left untouched
# because the string feeds the fitted vocabulary
document1 = 'Hi How are you'
document2 = 'Today is a very very very pleasent day and we can have some fun fun fun'
document3 = 'This was an amazing experience!'

In [30]:
# the corpus handed to the vectorizer: one string per document
list_of_documents = [document1, document2, document3]

In [31]:
# learn the vocabulary from the documents
# NOTE: fit() returns the fitted vectorizer itself, not a document-term matrix,
# so at this point bag_of_words refers to the same object as `vectorizer`
bag_of_words = vectorizer.fit(list_of_documents)

In [32]:
# displays whatever vectorizer.fit(...) returned in the previous cell
bag_of_words

In [33]:
# encode each document as a row of token counts using the learned vocabulary
bag_of_words = vectorizer.transform(list_of_documents)


In [34]:
# summary repr of the transformed result: shape, dtype and number of stored entries
bag_of_words

<3x20 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [35]:
# printing the sparse matrix lists each stored entry as (row, column) followed by its count
print(bag_of_words)

  (0, 3)	1
  (0, 9)	1
  (0, 10)	1
  (0, 19)	1
  (1, 2)	1
  (1, 4)	1
  (1, 5)	1
  (1, 7)	3
  (1, 8)	1
  (1, 11)	1
  (1, 12)	1
  (1, 13)	1
  (1, 15)	1
  (1, 16)	3
  (1, 18)	1
  (2, 0)	1
  (2, 1)	1
  (2, 6)	1
  (2, 14)	1
  (2, 17)	1


In [37]:
# look up the column index assigned to each of the repeated words;
# .get() yields None for a token that is not in the vocabulary
for repeated_word in ("very", "fun"):
    print(vectorizer.vocabulary_.get(repeated_word))

16
7


**Pipeline and Grid Search**

In [38]:
import pandas as pd
import string
from pprint import pprint
from time import time

In [39]:
# tab-separated file; column labels are supplied via names=
# NOTE(review): relative path — assumes 'SpamCollection' sits in the notebook's working directory
df_spam_collection = pd.read_csv('SpamCollection',sep='\t',names=['response', 'message'])

In [40]:
# peek at the first five rows: 'response' is the ham/spam label, 'message' the raw text
df_spam_collection.head()

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [41]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [52]:
# text-classification pipeline: raw text -> token counts -> TF-IDF weights -> linear classifier
# random_state is pinned because SGDClassifier shuffles the training data between
# epochs; without a seed the fitted model (and any scores derived from it) can
# differ from one run to the next
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42)),
])


In [53]:
# grid over a single switch: TF-IDF with vs. without inverse-document-frequency weighting
# (the 'tfidf__' prefix routes the setting to the pipeline step named 'tfidf')
parameters = {'tfidf__use_idf':(True,False)}

In [54]:
# exhaustive search over `parameters`; n_jobs=-1 uses every available core
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

# announce what is about to be searched
print("performing grid search now...")
print("parameters: ")
pprint(parameters)

# fit on the raw message text against the ham/spam labels, timing the whole search
spam_messages = df_spam_collection['message']
spam_labels = df_spam_collection['response']
t0 = time()
grid_search.fit(spam_messages, spam_labels)
elapsed_seconds = time() - t0
print('Done in  %0.3fs' % elapsed_seconds)
print()

performing grid search now...
parameters: 
{'tfidf__use_idf': (True, False)}
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Done in  1.784s

