### Eliminate punctuation and stopwords from the sentence

In [2]:
# import required libraries
import string
from nltk.corpus import stopwords

In [10]:
# View first 10 stopwords present in the english corpus
stopwords.words('english')[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [35]:
# for i in stopwords.words('english'):
#     if i=='fine':
#         print(i)

[i for i in stopwords.words('english') if i=='me']  #for searching 'me' inside the corpus 'english'

['me']

In [11]:
# Create a test sentence
test_sentence = 'This is my first string. Wow! we are doing just fine'

In [12]:
# Eliminate the punctuation in form of characters and print them
no_punctuation = [char for char in test_sentence if char not in string.punctuation]
no_punctuation

['T',
 'h',
 'i',
 's',
 ' ',
 'i',
 's',
 ' ',
 'm',
 'y',
 ' ',
 'f',
 'i',
 'r',
 's',
 't',
 ' ',
 's',
 't',
 'r',
 'i',
 'n',
 'g',
 ' ',
 'W',
 'o',
 'w',
 ' ',
 'w',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 'd',
 'o',
 'i',
 'n',
 'g',
 ' ',
 'j',
 'u',
 's',
 't',
 ' ',
 'f',
 'i',
 'n',
 'e']

In [14]:
# Now eliminate the punctuation and print them as a whole sentence
no_punctuation = ''.join(no_punctuation)
no_punctuation

'This is my first string Wow we are doing just fine'

In [15]:
# Split each words present in the new sentence
no_punctuation.split()

['This',
 'is',
 'my',
 'first',
 'string',
 'Wow',
 'we',
 'are',
 'doing',
 'just',
 'fine']

In [17]:
# Now eliminate stopwords
clean_sentence = [word for word in no_punctuation.split() if word.lower() not in stopwords.words('english')]

In [18]:
# Print the final cleaned sentence
clean_sentence

['first', 'string', 'Wow', 'fine']

##### eliminated strings - ~~This~~ ~~is~~ ~~my~~ ~~we~~ ~~are~~ ~~doing~~ ~~just~~

In [37]:
# load the dataset
from sklearn.datasets import load_digits

In [38]:
# create object of the dataset
digit_dataset = load_digits()

In [40]:
# use built-in DESCR function to describe dataset
digit_dataset.DESCR

".. _digits_dataset:\n\nOptical recognition of handwritten digits dataset\n--------------------------------------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 1797\n    :Number of Attributes: 64\n    :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n    :Missing Attribute Values: None\n    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n    :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttps://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixel

In [41]:
type(digit_dataset)

sklearn.utils.Bunch

In [42]:
digit_dataset.data

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [43]:
digit_dataset.target

array([0, 1, 2, ..., 8, 9, 8])

## Bag of words

In [54]:
# import required library
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
# instantiate the vectotizer
vectorizer = CountVectorizer()

In [64]:
# create 3 documents
doc1 = "This is first document"
doc2 = "This is second document"
doc3 = "This is third document"

In [65]:
# put them together
listofdocument = [doc1, doc2, doc3]

In [66]:
# fit them as bag of words
bag_of_words = vectorizer.fit(listofdocument)

In [67]:
# check bag of words
bag_of_words

CountVectorizer()

In [68]:
# apply transform method
bag_of_words = vectorizer.transform(listofdocument)

In [69]:
# print bag of words
print(bag_of_words)

  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 5)	1
  (1, 0)	1
  (1, 2)	1
  (1, 3)	1
  (1, 5)	1
  (2, 0)	1
  (2, 2)	1
  (2, 4)	1
  (2, 5)	1


In [71]:
# verify the vocabulary for repeated word
print(vectorizer.vocabulary_.get('second'))
print(vectorizer.vocabulary_.get('document'))

3
0


In [72]:
# Check the type of bag of words
type(bag_of_words)

scipy.sparse.csr.csr_matrix

## Pipeline and grid search

In [80]:
# import required libraries
import pandas as pd
import string
from pprint import pprint
from time import time

In [None]:
# import the dataset
df_spam_collection = pd.read_csv('', sep='\t', names = ['response','message'])

In [76]:
# view first 5 records with head method
df_spam_collection.head()

In [83]:
# import text processing libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# import SGD classifier
from sklearn.linear_model import SGDClassifier

# import for gridsearch
from sklearn.model_selection import GridSearchCV

# import for pipeline
from sklearn.pipeline import Pipeline

# define the pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier())
])

In [84]:
# parameters for grid search
parameters = {'tfidf__use_idf': (True, False)}

In [86]:
# perform the gridsearch with pipeline and parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print('Performing grid search now ...')
print('parameters:')
pprint(parameters)
t0 = time()
grid_search.fit(df_spam_collection['message'],df_spam_collection['response'])
print("Done in %0.3fs" %(time()-t0))
print()

Performing grid search now ...
parameters:
{'tfidf__use_idf': (True, False)}


NameError: name 'df_spam_collection' is not defined