In [17]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt


In [18]:
# In this section we study how different estimators can be chained together.
# A simple example:
#    feature extraction and selection before an estimator
# Feature extraction: vectorizer

# Some types of data (such as the raw text used here) must first go through
# feature extraction, which converts them into numerical features.

import os

# Location of the SMS spam file. Each record is "<label>\t<message>".
# Set the SMSSPAM_PATH environment variable to point at your own copy; the
# default is the original hard-coded location so existing setups keep working.
sms_path = os.environ.get(
    "SMSSPAM_PATH",
    os.path.join("F:\\", "machine_learning", "datasets", "smsspam", "SMSSpamCollection"),
)

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which differs between operating systems.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm for your copy.
with open(sms_path, encoding="utf-8") as f:
    # Iterate the file lazily and skip blank lines, which would otherwise
    # produce a one-element row and fail at x[1] below.
    stripped = (line.strip() for line in f)
    lines = [row.split("\t") for row in stripped if row]

# text: the message column; y: True where the label column is 'ham'.
text = [x[1] for x in lines]
y = [x[0] == 'ham' for x in lines]

In [19]:
from sklearn.model_selection import train_test_split

# Fix the seed so the train/test split -- and therefore every score computed
# from it -- is identical on each Restart & Run All. Without random_state a
# different split is drawn on every run and the results are not reproducible.
# (Outputs saved from an unseeded run will not match a re-run exactly.)
text_train, text_test, y_train, y_test = train_test_split(text, y, random_state=0)

In [20]:
# Feature extraction can be applied by hand: learn the vocabulary / idf weights
# on the training text, then reuse the fitted vectorizer on the test text.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer()

# fit_transform == fit followed by transform on the same data.
X_train = vectorizer.fit_transform(text_train)
# The test text is only transformed, never used for fitting.
X_test = vectorizer.transform(text_test)

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the held-out test set (displayed as the cell output).
clf.score(X_test, y_test)

0.96700143472022959

In [21]:
# Learning a transformation on the training data and then applying it to the
# test data is a very common pattern in machine learning, so scikit-learn
# provides a shortcut for it: pipelines.

from sklearn.pipeline import make_pipeline

# Vectorizer and classifier are bundled into a single estimator; fitting the
# pipeline fits each step in turn and returns the pipeline itself.
pipeline = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(),
).fit(text_train, y_train)

# Scoring pushes the raw test text through the fitted vectorizer before
# evaluating the classifier.
pipeline.score(text_test, y_test)

0.96700143472022959

In [22]:
# Pipelines make the code shorter and easier to handle: calling fit on the
# pipeline calls fit on each of its steps in turn.

# They are not just a convenience, though -- they also matter for model
# selection. Suppose we want to grid-search C for the logistic regression above:

"""This illustrates a common mistake, don't use this code!!!"""
from sklearn.model_selection import GridSearchCV

# The vectorizer is fitted on the full training text *before* cross-validation.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Only the classifier is handed to the grid search; the features were already
# computed outside of it.
clf = LogisticRegression()
grid = GridSearchCV(
    clf,
    param_grid={'C': [.1, 1, 10, 100]},
    cv=5,
)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [23]:
# What went wrong in the previous cell?
# The grid search cross-validated on X_train, but the TfidfVectorizer had
# already been fitted on all of text_train, not just on the training folds of
# each split. Its word-frequency statistics therefore included the held-out
# folds as well. This is called "contamination" (data leakage) of the held-out
# data. Putting the vectorizer inside the pipeline solves it, because the grid
# search then fits the whole pipeline on the training folds only.

from sklearn.model_selection import GridSearchCV

pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Step parameters are addressed as "<lowercased step class name>__<parameter>".
grid = GridSearchCV(
    pipeline,
    param_grid={'logisticregression__C': [.1, 1, 10, 100]},
    cv=5,
)

# Raw text goes in: vectorization now happens inside each cross-validation split.
grid.fit(text_train, y_train)
grid.score(text_test, y_test)


0.98637015781922521

In [24]:
# A further benefit of pipelines: GridSearchCV can now tune the parameters of
# the feature-extraction step together with those of the classifier.

from sklearn.model_selection import GridSearchCV

pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Search jointly over the regularisation strength and the n-gram range.
params = {
    'logisticregression__C': [.1, 1, 10, 100],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
}

grid = GridSearchCV(pipeline, param_grid=params, cv=5)
grid.fit(text_train, y_train)

# Report the winning combination, then display the test-set score of the
# refitted best pipeline as the cell output.
print(grid.best_params_)
grid.score(text_test, y_test)

{'tfidfvectorizer__ngram_range': (1, 2), 'logisticregression__C': 100}


0.98637015781922521