In [29]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score,recall_score,accuracy_score,f1_score
from sklearn.preprocessing import LabelBinarizer
lb=LabelBinarizer()
pipeline=Pipeline([('vect',TfidfVectorizer(stop_words="english")),('clf',LogisticRegression())])

In [30]:
parameters={'vect__max_df':(0.25,0.5,0.75),
            'vect__stop_words':('english',None),
            'vect__max_features':(2500,5000,1000,None),
            'vect__ngram_range':((1,1),(1,2)),'vect__use_idf':(True,False),
            'vect__norm':('l1','l2'),
            'clf__penalty':('l1','l2'),
            'clf__C':(0.01,0.1,1,10)}

In [31]:
grid_search=GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=-1,
                        scoring='accuracy',cv=3)

In [32]:
df=pd.read_csv('C:\\Users\\Dell\\Downloads\\smsspamcollection\\SMSSpamCollection.csv',delimiter='\t',header=None)
X,y=df.iloc[:,1],df.iloc[:,0]
x_train,x_test,y_train,y_test=train_test_split(X,y)
y_train=lb.fit_transform(y_train)
y_test=lb.transform(y_test)
print(y_train)
print(y_test)


[[0]
 [1]
 [0]
 ...
 [1]
 [0]
 [0]]
[[1]
 [0]
 [0]
 ...
 [0]
 [0]
 [1]]


In [33]:
grid_search.fit(x_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:   58.4s
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 3520 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 4600 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 4608 out of 4608 | elapsed:  8.7min finished
  return f(**kwargs)


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(stop_words='english')),
                                       ('clf', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'clf__C': (0.01, 0.1, 1, 10),
                         'clf__penalty': ('l1', 'l2'),
                         'vect__max_df': (0.25, 0.5, 0.75),
                         'vect__max_features': (2500, 5000, 1000, None),
                         'vect__ngram_range': ((1, 1), (1, 2)),
                         'vect__norm': ('l1', 'l2'),
                         'vect__stop_words': ('english', None),
                         'vect__use_idf': (True, False)},
             scoring='accuracy', verbose=-1)

In [35]:
print("Best score: %0.3f"%grid_search.best_score_)

Best score: 0.982


In [37]:
print("Best parameter set :")
best_parameters=grid_search.best_estimator_.get_params()
#print(best_parameters)
for param_name in sorted(parameters.keys()):
    print('\t%r : %r'%(param_name,best_parameters[param_name]))

Best parameter set :
	'clf__C' : 10
	'clf__penalty' : 'l2'
	'vect__max_df' : 0.25
	'vect__max_features' : 5000
	'vect__ngram_range' : (1, 2)
	'vect__norm' : 'l2'
	'vect__stop_words' : None
	'vect__use_idf' : False


In [42]:
predictions=grid_search.predict(x_test)
print('Accuracy',accuracy_score(y_test,predictions))
print('Precision',precision_score(y_test,predictions))
print('Recall',recall_score(y_test,predictions))
print('F1 Score',f1_score(y_test,predictions))

Accuracy 0.9885139985642498
Precision 0.9940119760479041
Recall 0.9171270718232044
F1 Score 0.9540229885057472
