In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("../data/SMSSpamCollection", sep="\t", 
                   header=None, names=["target", "document"])

In [4]:
print(data.shape)
data.head()

(5572, 2)


Unnamed: 0,target,document
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
from sklearn.cross_validation import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data["document"], data["target"], 
                                                    random_state=42, test_size=0.3)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3900,)
(1672,)
(3900,)
(1672,)


In [8]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [10]:
pipe = Pipeline(steps=[
        ("vect", CountVectorizer(stop_words="english")),
        ("model", RandomForestClassifier())
    ])
print(pipe)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
     ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])


In [11]:
X_train.apply(lambda val: len(val.split(" "))).values.reshape(-1, 1)

array([[10],
       [ 8],
       [13],
       ..., 
       [ 5],
       [ 5],
       [ 8]])

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

class WordCounter(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, x):
        counts = x.map(lambda val: len(val.split(" ")))
        return counts.values.reshape(-1, 1)

In [14]:
pipe_features = FeatureUnion([
        ("count_vect", CountVectorizer(stop_words="english")),
        ("dummy_word_count", Pipeline([
                ("word_count", WordCounter()),
                ("scalar", StandardScaler())
            ]))
    ])
print(pipe_features)

FeatureUnion(n_jobs=1,
       transformer_list=[('count_vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words...ord_count', WordCounter()), ('scalar', StandardScaler(copy=True, with_mean=True, with_std=True))]))],
       transformer_weights=None)


In [16]:
pipe = Pipeline(steps=[
        ("pipe_features", pipe_features),
        ("model", LogisticRegression())
    ])
print(pipe)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

Pipeline(steps=[('pipe_features', FeatureUnion(n_jobs=1,
       transformer_list=[('count_vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])




0.98444976076555024

In [18]:
vect = WordCounter()

vect.fit(X_train, y_train)
print(vect)

WordCounter()


In [19]:
vect.transform(X_train)

array([[10],
       [ 8],
       [13],
       ..., 
       [ 5],
       [ 5],
       [ 8]])

In [52]:
params = {
    "model__n_estimators": [10, 20, 100],
    "model__min_samples__leaf": [2, 5, 10],
    "vect__lowercase": [True, False]
}
grid = RandomizedSearchCV(pipe, params, verbose=True)

grid.fit(X_train, y_train)  # not sure what's wrong here...

Fitting 3 folds for each of 10 candidates, totalling 30 fits


ValueError: Invalid parameter vect for estimator Pipeline(steps=[('pipe_features', FeatureUnion(n_jobs=1,
       transformer_list=[('count_vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [24]:
grid.get_params().keys()

['n_jobs',
 'verbose',
 'estimator__pipe_features',
 'random_state',
 'estimator__pipe_features__count_vect__max_df',
 'estimator__pipe_features__count_vect',
 'estimator__model__warm_start',
 'estimator__steps',
 'estimator__pipe_features__transformer_list',
 'estimator__pipe_features__count_vect__tokenizer',
 'cv',
 'estimator__pipe_features__count_vect__input',
 'estimator__pipe_features__count_vect__analyzer',
 'estimator__model__penalty',
 'estimator__model__intercept_scaling',
 'estimator__pipe_features__dummy_word_count__scalar__copy',
 'estimator__pipe_features__dummy_word_count',
 'estimator__pipe_features__count_vect__lowercase',
 'estimator__pipe_features__count_vect__binary',
 'estimator__model__random_state',
 'estimator__pipe_features__count_vect__decode_error',
 'estimator__model__solver',
 'estimator__pipe_features__count_vect__encoding',
 'estimator__pipe_features__count_vect__dtype',
 'param_distributions',
 'estimator__pipe_features__count_vect__ngram_range',
 'estim

In [53]:
grid.get_params


<bound method RandomizedSearchCV.get_params of RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(steps=[('pipe_features', FeatureUnion(n_jobs=1,
       transformer_list=[('count_vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'vect__lowercase': [True, False], 'model__min_samples__leaf': [2, 5, 10], 'model__n_estimators': [10, 20, 100]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=True)>

In [32]:
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)



0.98444976076555024

In [33]:
pipe

Pipeline(steps=[('pipe_features', FeatureUnion(n_jobs=1,
       transformer_list=[('count_vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])