In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score,recall_score, accuracy_score
#GridSearchCV被移到了model_selection模块中

In [2]:
# Two-step text-classification pipeline: TF-IDF features feed a logistic-regression classifier.
# scikit-learn's Pipeline chains several estimators so they are fitted and applied as one unit;
# a typical layout is: data normalisation -> feature extraction -> final predictor.
pipeline=Pipeline([
    ('vect',TfidfVectorizer(stop_words='english')),
    # The solver is pinned to liblinear because the grid searched over this pipeline tries both
    # the 'l1' and the 'l2' penalty. liblinear supports both; the lbfgs default used by newer
    # scikit-learn releases rejects 'l1', which would make half of the candidates fail.
    ('clf',LogisticRegression(solver='liblinear'))
])

In [3]:
# Hyper-parameter grid that is handed to GridSearchCV together with the pipeline.
# Every key has the form <step name>__<parameter name>; the separator is a DOUBLE underscore.
# 3 * 2 * 3 * 2 * 2 * 2 * 4 = 576 combinations in total.
parameters=dict(
    # TfidfVectorizer ('vect') settings
    vect__max_df=(0.25,0.5,0.75),
    vect__stop_words=('english',None),
    vect__max_features=(2500,5000,None),
    vect__ngram_range=((1,1),(1,2)),
    vect__use_idf=(True,False),
    # LogisticRegression ('clf') settings
    clf__penalty=('l1','l2'),
    clf__C=(0.01,0.1,1,10),
)

In [4]:
if __name__=="__main__":
    # The __main__ guard lets this code be imported as a module (e.g. from "P22 Grid search.py")
    # without triggering the search, while still running the analysis on this particular data
    # set when the file is executed directly.
    #
    # Workflow: load the data, encode the labels, hold out a test split, grid-search the
    # pipeline with 3-fold cross-validation, then report the best settings and test metrics.

    # Tab-separated file with no header row: column 0 holds the label, column 1 the text.
    df=pd.read_csv('SMSSpamCollection',sep='\t',header=None)
    X=df[1].values
    y=df[0].values

    # Map the labels to integer codes.
    # NOTE(review): presumably the labels are 'ham'/'spam', so 'spam' becomes 1 and is the
    # positive class for the precision/recall figures below -- confirm against the data file.
    label_encoder=LabelEncoder()
    y=label_encoder.fit_transform(y)

    # A fixed random_state keeps the train/test split -- and therefore every score printed
    # below -- identical from one run to the next. The split proportions stay at the default.
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)

    # Cross-validation is commonly combined with grid search as the way to score each
    # parameter combination ("grid search with cross validation"). GridSearchCV wraps that up
    # as an estimator of its own with fit/predict/score methods. Calling fit
    # (1) searches for the best parameter combination and
    # (2) instantiates an estimator configured with those best parameters.
    grid_search=GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=1,scoring='accuracy',cv=3)
    grid_search.fit(X_train,y_train)

    print('Best score: %0.3f' %grid_search.best_score_)
    print('Best parameter set:')
    # Full parameter dictionary of the winning pipeline; it is kept under this name because
    # later cells print it as well.
    best_parameters=grid_search.best_estimator_.get_params()
    # Only report the parameters that were part of the search. sorted() returns a new sorted
    # list of the grid's keys and leaves the dictionary itself untouched.
    for param_name in sorted(parameters.keys()):
        print('\t%s:%r'%(param_name ,best_parameters[param_name]))

    # Evaluate the best estimator on the held-out test split.
    predictions=grid_search.predict(X_test)
    print('Acurracy:%s'% accuracy_score(y_test,predictions))
    print('Precision:%s'% precision_score(y_test,predictions))
    print('Recall:%s'% recall_score(y_test,predictions))
    
    
    

Fitting 3 folds for each of 576 candidates, totalling 1728 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1728 out of 1728 | elapsed:  5.6min finished


Best score: 0.983
Best parameter set:
	clf__C:10
	clf__penalty:'l2'
	vect__max_df:0.5
	vect__max_features:5000
	vect__ngram_range:(1, 2)
	vect__stop_words:None
	vect__use_idf:True
Acurracy:0.990667623833
Precision:0.994082840237
Recall:0.933333333333


In [9]:
 # List, in sorted order, the name of every parameter exposed by the best pipeline
 # (the pipeline's own settings plus the nested 'vect__*' and 'clf__*' entries).
 print(sorted(best_parameters.keys()))


['clf', 'clf__C', 'clf__class_weight', 'clf__dual', 'clf__fit_intercept', 'clf__intercept_scaling', 'clf__max_iter', 'clf__multi_class', 'clf__n_jobs', 'clf__penalty', 'clf__random_state', 'clf__solver', 'clf__tol', 'clf__verbose', 'clf__warm_start', 'memory', 'steps', 'vect', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__norm', 'vect__preprocessor', 'vect__smooth_idf', 'vect__stop_words', 'vect__strip_accents', 'vect__sublinear_tf', 'vect__token_pattern', 'vect__tokenizer', 'vect__use_idf', 'vect__vocabulary']


In [10]:
 # Dump the complete parameter dictionary of the best pipeline, values included.
 print(repr(best_parameters))

{'memory': None, 'steps': [('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=5000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))], 'vect': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=5000, min_df=1,
        ngram_range=(1, 2),