In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import Imputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import HashingVectorizer

In [2]:
# Read the sample data; the CSV's first column is used as the row index
sample_df = pd.read_csv(
    filepath_or_buffer='../datasets/School Budgets/sample.csv',
    index_col=0,
)

In [3]:
# Preview the first rows of the loaded frame
sample_df.head()

Unnamed: 0,numeric,text,with_missing,label
0,-10.856306,,4.43324,b
1,9.973454,foo,4.310229,b
2,2.829785,foo bar,2.469828,a
3,-15.062947,,2.852981,b
4,-5.786003,foo bar,1.826475,a


In [4]:
# Column dtypes and non-null counts; 'text' and 'with_missing' have gaps
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 4 columns):
numeric         1000 non-null float64
text            790 non-null object
with_missing    822 non-null float64
label           1000 non-null object
dtypes: float64(2), object(2)
memory usage: 39.1+ KB


In [5]:
# Replace missing values (NaN) in the 'text' column with empty strings.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: an in-place call on
# sample_df['text'] is chained assignment, which only updates sample_df when
# the selection happens to be a view and silently does nothing under pandas
# Copy-on-Write.
sample_df['text'] = sample_df['text'].fillna('')

## Instantiate pipeline

In [6]:
# Hold out a test split using only the 'numeric' column, which has no NaNs
numeric_only = sample_df[['numeric']]
label_dummies = pd.get_dummies(sample_df['label'])
X_train, X_test, y_train, y_test = train_test_split(
    numeric_only,
    label_dummies,
    random_state=22,
)

In [7]:
# Baseline pipeline with a single step: one-vs-rest logistic regression
ovr_logreg = OneVsRestClassifier(LogisticRegression())
pl = Pipeline(steps=[('clf', ovr_logreg)])

In [8]:
# Fit the single-step baseline pipeline on the training split
pl.fit(X_train, y_train)

Pipeline(steps=[('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])

In [9]:
# Score the fitted pipeline on the held-out test split; nothing is printed,
# the value is shown as the cell output
pl.score(X_test, y_test)

0.62

## Preprocessing numeric features

In [10]:
# Re-split using both numeric columns; 'with_missing' still contains NaNs
numeric_cols = ['numeric', 'with_missing']
label_dummies = pd.get_dummies(sample_df['label'])
X_train, X_test, y_train, y_test = train_test_split(
    sample_df[numeric_cols],
    label_dummies,
    random_state=456,
)

In [11]:
# Instantiate Pipeline object: fill in missing values, then classify
impute_then_classify = [
    ('imp', Imputer()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
]
pl = Pipeline(impute_then_classify)

In [12]:
# Fit the imputer + classifier pipeline on the training split
pl.fit(X_train, y_train)

Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])

In [13]:
# Score the imputer + classifier pipeline on the held-out test split
# (the value is shown as the cell output)
pl.score(X_test, y_test)

0.63600000000000001

## Preprocessing text features

In [14]:
# Re-split with the 'text' column (a Series) as the only feature
text_only = sample_df['text']
label_dummies = pd.get_dummies(sample_df['label'])
X_train, X_test, y_train, y_test = train_test_split(
    text_only,
    label_dummies,
    random_state=456,
)

In [15]:
# Instantiate Pipeline object: token counts from the text, then classify
vectorize_then_classify = [
    ('vec', CountVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
]
pl = Pipeline(vectorize_then_classify)

In [16]:
# Fit the vectorizer + classifier pipeline on the text training split
pl.fit(X_train, y_train)

Pipeline(steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])

In [17]:
# Score the text-only pipeline on the held-out test split
# (the value is shown as the cell output)
pl.score(X_test, y_test)

0.80800000000000005

## Multiple types of processing: FunctionTransformer

In [18]:
# Transformer that picks the 'text' column out of whatever it is given;
# validate=False so the input is handed to the function unchecked
get_text_data = FunctionTransformer(
    func=lambda frame: frame['text'],
    validate=False,
)

In [19]:
# Transformer that picks the two numeric columns out of whatever it is given;
# validate=False so the input is handed to the function unchecked
get_numeric_data = FunctionTransformer(
    func=lambda frame: frame[['numeric', 'with_missing']],
    validate=False,
)

In [20]:
# Run the text selector over the full frame to check what it returns
just_text_data = get_text_data.fit_transform(sample_df)

In [21]:
# Preview the selector's output: a Series named 'text'
just_text_data.head()

0           
1        foo
2    foo bar
3           
4    foo bar
Name: text, dtype: object

In [22]:
# Run the numeric selector over the full frame to check what it returns
just_numeric_data = get_numeric_data.fit_transform(sample_df)

In [23]:
# Preview the selector's output: the 'numeric' and 'with_missing' columns
just_numeric_data.head()

Unnamed: 0,numeric,with_missing
0,-10.856306,4.43324
1,9.973454,4.310229
2,2.829785,2.469828
3,-15.062947,2.852981
4,-5.786003,1.826475


## Multiple types of processing: FeatureUnion

In [24]:
# Re-split keeping every feature column (both numeric columns plus 'text')
all_feature_cols = ['numeric', 'with_missing', 'text']
label_dummies = pd.get_dummies(sample_df['label'])
X_train, X_test, y_train, y_test = train_test_split(
    sample_df[all_feature_cols],
    label_dummies,
    random_state=22,
)

In [25]:
# Build one preprocessing branch per data type, then join their outputs
numeric_branch = Pipeline([
    ('selector', get_numeric_data),
    ('imputer', Imputer()),
])
text_branch = Pipeline([
    ('selector', get_text_data),
    ('vectorizer', CountVectorizer()),
])
process_and_join_features = FeatureUnion(transformer_list=[
    ('numeric_features', numeric_branch),
    ('text_features', text_branch),
])

In [26]:
# Nested pipeline: the joined features feed a one-vs-rest logistic regression
union_then_classify = [
    ('union', process_and_join_features),
    ('clf', OneVsRestClassifier(LogisticRegression())),
]
pl = Pipeline(union_then_classify)

In [27]:
# Fit the nested (FeatureUnion + classifier) pipeline on the training split
pl.fit(X_train, y_train)

Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric_features', Pipeline(steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x118b5c730>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=False, validate=False)), ...=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])

In [28]:
# Score the nested pipeline on the held-out test split
# (the value is shown as the cell output)
pl.score(X_test, y_test)

0.92800000000000005

## Add a model to the pipeline

In [29]:
# Complete pipeline written out in one cell: per-type preprocessing branches
# joined by a FeatureUnion, followed by the classifier
numeric_branch = Pipeline([
    ('selector', get_numeric_data),
    ('imputer', Imputer()),
])
text_branch = Pipeline([
    ('selector', get_text_data),
    ('vectorizer', CountVectorizer()),
])
joined_features = FeatureUnion(transformer_list=[
    ('numeric_features', numeric_branch),
    ('text_features', text_branch),
])
pl = Pipeline([
    ('union', joined_features),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])

In [30]:
# Fit the complete pipeline on the training split
pl.fit(X_train, y_train)

Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric_features', Pipeline(steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x118b5c730>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=False, validate=False)), ...=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])

In [31]:
# Score the complete pipeline on the held-out test split
# (the value is shown as the cell output)
pl.score(X_test, y_test)

0.92800000000000005

## N-gram range

In [32]:
# Number of text features handed to SelectKBest in the pipelines below.
# It is currently 'all' (keep every feature), not 300; the 300 setting is
# left commented out - presumably because the sample text yields far fewer
# than 300 features, TODO confirm.
chi_k = 'all'  # alternative: chi_k = 300

In [33]:
# Pipeline with unigram + bigram counts, chi-squared feature selection on the
# text branch, and max-abs scaling of the joined features before the classifier
numeric_branch = Pipeline([
    ('selector', get_numeric_data),
    ('imputer', Imputer()),
])
text_branch = Pipeline([
    ('selector', get_text_data),
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('dim_red', SelectKBest(chi2, k=chi_k)),
])
joined_features = FeatureUnion(transformer_list=[
    ('numeric_features', numeric_branch),
    ('text_features', text_branch),
])
pl = Pipeline([
    ('union', joined_features),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])

In [34]:
# Fit the n-gram / feature-selection pipeline on the training split
pl.fit(X_train, y_train)

Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric_features', Pipeline(steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x118b5c730>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=False, validate=False)), ...=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])

In [35]:
# Score the n-gram pipeline on the held-out test split
# (the value is shown as the cell output)
pl.score(X_test, y_test)

0.93600000000000005

## Interaction modeling

In [36]:
# Sparse to dense conversion step for the pipelines below.
# .toarray() returns a plain numpy.ndarray, whereas .todense() returns a
# numpy.matrix; numpy.matrix is discouraged and newer scikit-learn releases
# refuse it as estimator input, so hand downstream steps an ndarray.
# validate=False lets the sparse input reach the function unchecked.
to_dense = FunctionTransformer(lambda x: x.toarray(), validate=False)

In [37]:
# Instantiate pipeline with interaction terms: the joined features are made
# dense, expanded with degree-2 polynomial features, scaled, then classified
numeric_branch = Pipeline([
    ('selector', get_numeric_data),
    ('imputer', Imputer()),
])
text_branch = Pipeline([
    ('selector', get_text_data),
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('dim_red', SelectKBest(chi2, k=chi_k)),
])
joined_features = FeatureUnion(transformer_list=[
    ('numeric_features', numeric_branch),
    ('text_features', text_branch),
])
pl = Pipeline([
    ('union', joined_features),
    ('to_dense', to_dense),
    ('poly', PolynomialFeatures(degree=2)),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])

In [38]:
# Fit the interaction-term pipeline on the training split
pl.fit(X_train, y_train)

Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric_features', Pipeline(steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x118b5c730>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=False, validate=False)), ...=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])

In [39]:
# Score the interaction-term pipeline on the held-out test split
# (the value is shown as the cell output)
pl.score(X_test, y_test)

0.93200000000000005

## Implementing the hashing trick

In [40]:
# Token pattern for the vectorizer: a run of ASCII letters/digits that is
# followed by whitespace OR by the end of the string. The lookahead must
# accept end-of-string; with whitespace alone the last token of every
# document is dropped ('foo' would yield no tokens, 'foo bar' only 'foo').
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+|$)'

In [41]:
# Instantiate the winning model pipeline
#
# HashingVectorizer output is far wider than the CountVectorizer vocabulary
# used in the earlier pipelines (2**20 columns unless n_features is set), and
# everything SelectKBest keeps is densified and then expanded by the degree-2
# PolynomialFeatures step. With k='all' that expansion cannot fit in memory,
# so fall back to the notebook's intended 300 features in that case; a numeric
# chi_k is used unchanged.
hashed_k = 300 if chi_k == 'all' else chi_k

pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    # chi2 needs non-negative inputs, hence non_negative=True.
                    # NOTE(review): later scikit-learn releases replace
                    # `non_negative` with `alternate_sign=False` - confirm
                    # against the installed version.
                    ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                     non_negative=True, norm=None, binary=False,
                                                     ngram_range=(1,2))),
                    # k is passed by keyword; bounded so the dense polynomial
                    # expansion below stays tractable
                    ('dim_red', SelectKBest(chi2, k=hashed_k))
                ]))
             ]
        )),
        ('to_dense', to_dense),
        ('poly', PolynomialFeatures(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

In [None]:
# Fit the hashing pipeline on the training split.
# NOTE(review): this cell was saved with no execution count and no output,
# so the fit has not been shown to complete - confirm it runs.
pl.fit(X_train, y_train)

In [None]:
# Score the hashing pipeline on the held-out test split.
# NOTE(review): this cell was saved with no execution count and no output,
# so there is no recorded score to compare against.
pl.score(X_test, y_test)