In [20]:
import pandas as pd

In [21]:
data = pd.read_csv("../data/SMSSpamCollection", sep="\t", 
                   header=None, names=["target", "document"])

In [22]:
data.head()

Unnamed: 0,target,document
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [23]:
from sklearn.cross_validation import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(data["document"], data["target"], 
                                                    random_state=42, test_size=0.3)

In [25]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [26]:
pipe = Pipeline(steps=[
        ("vect", CountVectorizer(stop_words="english")),
        ("model", RandomForestClassifier())
    ])

In [27]:
X_train.apply(lambda val: len(val.split(" "))).values.reshape(-1, 1)

array([[10],
       [ 8],
       [13],
       ..., 
       [ 5],
       [ 5],
       [ 8]])

In [28]:
from sklearn.base import BaseEstimator, TransformerMixin

class WordCounter(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, x):
        counts = x.map(lambda val: len(val.split(" ")))
        return counts.values.reshape(-1, 1)

In [29]:
pipe_features = FeatureUnion([
        ("count_vect", CountVectorizer(stop_words="english")),
        ("dummy_word_count", Pipeline([
                ("word_count", WordCounter()),
                ("scalar", StandardScaler())
            ]))
    ])

In [30]:
pipe = Pipeline(steps=[
        ("pipe_features", pipe_features),
        ("model", LogisticRegression())
    ])

pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)



0.98444976076555024

In [31]:
vect = WordCounter()

vect.fit(X_train, y_train)

WordCounter()

In [32]:
vect.transform(X_train)

array([[10],
       [ 8],
       [13],
       ..., 
       [ 5],
       [ 5],
       [ 8]])

In [16]:
params = {
    "model__n_estimators": [10, 20, 100],
    "model__min_samples_leaf": [2, 5, 10],
    "vect__lowercase": [True, False]
}
grid = RandomizedSearchCV(pipe, params, verbose=True)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


ValueError: Invalid parameter min_samples_leaf for estimator LogisticRegression. Check the list of available parameters with `estimator.get_params().keys()`.

In [33]:
grid.best_params_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

In [18]:
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)



0.98444976076555024

In [19]:
pipe

Pipeline(steps=[('pipe_features', FeatureUnion(n_jobs=1,
       transformer_list=[('count_vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])