In [1]:
import pandas as pd
import numpy as np
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
titanic.head()
titanic.info()
X = titanic[['pclass','age','sex']]
y = titanic['survived']
X['age'].fillna(X['age'].mean(),inplace=True)
X.info()
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=33)
X_train = X_train.to_dict(orient='record')
X_test = X_test.to_dict(orient='record')
#将非数值型数据转换为数值型数据
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
clf = Pipeline([('vecd',DictVectorizer(sparse=False)),('dtc',DecisionTreeClassifier())])
vec = DictVectorizer(sparse=False)
 
clf.fit(X_train,y_train)
y_predict = clf.predict(X_test)
from sklearn.metrics import classification_report
print (clf.score(X_test,y_test))
print(classification_report(y_predict,y_test,target_names=['died','survivied']))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 11 columns):
row.names    1313 non-null int64
pclass       1313 non-null object
survived     1313 non-null int64
name         1313 non-null object
age          633 non-null float64
embarked     821 non-null object
home.dest    754 non-null object
room         77 non-null object
ticket       69 non-null object
boat         347 non-null object
sex          1313 non-null object
dtypes: float64(1), int64(2), object(8)
memory usage: 112.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       1313 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 30.9+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


0.7811550151975684
              precision    recall  f1-score   support

        died       0.91      0.78      0.84       236
   survivied       0.58      0.80      0.67        93

    accuracy                           0.78       329
   macro avg       0.74      0.79      0.75       329
weighted avg       0.81      0.78      0.79       329



In [2]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
news = fetch_20newsgroups(subset='all')
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(news.data[:3000],news.target[:3000],test_size=0.25,random_state=33)
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X_count_train = vec.fit_transform(X_train)
X_count_test = vec.transform(X_test)
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
#使用pipeline简化系统搭建流程，将文本抽取与分类器模型串联起来
clf = Pipeline([
    ('vect',TfidfVectorizer(stop_words='english')),('svc',SVC())
])
# 注意，这里经pipeline进行特征处理、SVC模型训练之后，得到的直接就是训练好的分类器clf
 
parameters = {
    'svc__gamma':np.logspace(-2,1,4),
    'svc__C':np.logspace(-1,1,3),
    'vect__analyzer':['word']
}
 
#n_jobs=-1代表使用计算机的全部CPU
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(clf,parameters,verbose=2,refit=True,cv=3,n_jobs=-1)
 
%time _=gs.fit(X_train,y_train)
print (gs.best_params_,gs.best_score_)
print (gs.score(X_test,y_test))

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  36 | elapsed:   56.0s remaining:    6.9s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   56.6s finished


Wall time: 1min 17s
{'svc__C': 10.0, 'svc__gamma': 0.1, 'vect__analyzer': 'word'} 0.7906666666666666
0.8226666666666667


# 可加入Pipeline的自定义变换

## Part 1 步骤

首先，需要sklearn.base中的两个类BaseEstimator和TransformerMixin

其次，定义自定义变换继承上述两个类，如class CustomFunction(BaseEstimator,TransformerMixin)

最后，在自定义变换中定义fit和transform函数

## Part 2 实例

目标：对于某个含有10个特征的数据集，现在想找到最佳的特征组合使识别率最高，使识别率最高

说明：为了达到这个目标，就是采用GridSearchCV的方法，根据CV得分，直接挑选出最佳的特征组合。因此需要构建一个Pipeline能够放入GridSearchCV中。但是，光构建Pipeline还不够，还需要定义一个能够加入Pipeline的选取特定特征组合的变换

以下为代码实例：

- 构建特征组合变换SelectRowTransformer

- 生成数据make_classification

- 构建Pipeline

- 网格搜素GridSearchCV

In [3]:
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from itertools import combinations 

class SelectRowTransformer(BaseEstimator,TransformerMixin):
    def __init__(self,comb_idx = [0,]):
        self.comb_idx = comb_idx

    def fit(self,X,y=None):
        return self

    def transform(self,X,y =None):
        return X[:,self.comb_idx].copy()

if __name__ == '__main__': 
    __spec__ = None
    K = 10
    data, label = make_classification(n_samples=200, n_informative=4,n_redundant=0,
                                      random_state=223,n_features=K)
    print(data.shape, label.shape)
    pipe = Pipeline([
            ('Comb',SelectRowTransformer() ),
            ('SVC',LinearSVC() ),
            ])   
    param = { 
            'Comb__comb_idx':[i for j in range(K) for i in combinations(range(K),j+1)],
            'SVC__C':[2**(f-2) for f in range(5)]
            }   
    grid = GridSearchCV(pipe,param,cv=3,verbose=1,n_jobs=-1)
    grid.fit(data,label)
    print( '最佳CV得分:{0}, 最佳得分对应的特征组合:{1}, SVC-C:{2}'.format(grid.best_score_,
          grid.best_params_['Comb__comb_idx'],grid.best_params_['SVC__C']))

(200, 10) (200,)
Fitting 3 folds for each of 5115 candidates, totalling 15345 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 1689 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 6689 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 13689 tasks      | elapsed:   15.4s


最佳CV得分:0.82, 最佳得分对应的特征组合:(0, 1, 5, 9), SVC-C:2


[Parallel(n_jobs=-1)]: Done 15345 out of 15345 | elapsed:   16.8s finished
