In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import MinMaxScaler, StandardScaler

[**Feature Union Documentation**](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html)

**FeatureUnion** is a very useful pipeline class that takes a data set and allows for the parallel processing of completely independent data transformation pipelines, the output from the pipelines is merged at the eng - hence the name feautre **union**. 

A potential use case is when you want to include both term-counts (i.e. CountVectorizer and TFIDF) and context vectors (i.e. spaCy's language model) into the same doc-term matrix that you pass into a ML classifier. The motivation is that our model would benefit from having both pieces of information about our data at training time. 

You can then use this to, hopefully, bump up your Kaggle score - yay!!!

____

### Our Mixed Data Set

In [15]:
# text data
reviews = ["I love dogs and coffee and dancing tango with beautiful women in Argentina.", 
           "I love cats and tea but dancing tango is difficult and I gave up.", 
           "I love dogs and cats and dancing but only dancing salsa."]

# numerical data
ratings = [100.,75., 25.]
visits = [10, 3, 6]
purchase_amount = [25.0, 5.50, 12.30]

data = [reviews, ratings, visits, purchase_amount]
columns = ["reviews", "ratings", "visits", "purchase_amount"]

df = pd.DataFrame(data=data).T
df.columns = columns

In [16]:
df.head()

Unnamed: 0,reviews,ratings,visits,purchase_amount
0,I love dogs and coffee and dancing tango with ...,100,10,25.0
1,I love cats and tea but dancing tango is diffi...,75,3,5.5
2,I love dogs and cats and dancing but only danc...,25,6,12.3


----
### Split features into multiple feature sets

FeatureUnion provides multiple feature transformation pipelines that run in parallel and in isolation of each other, however FeatureUnion passed the same data set through all pipelines. So we need to make sure that whatever features we pass in can be handeled by all transformations in all pipelines. 

In order words, create a FeatureUnion instance for text data, then create another FeatureUnion for numerical data, and so on. 

In [17]:
feat_set_1 = ["reviews"]
feat_set_2 = ['ratings', 'visits', "purchase_amount"]


text_data = df[feat_set_1].values.flatten()
numerical_data =  df[feat_set_2].values

In [22]:
text_data

array(['I love dogs and coffee and dancing tango with beautiful women in Argentina.',
       'I love cats and tea but dancing tango is difficult and I gave up.',
       'I love dogs and cats and dancing but only dancing salsa.'],
      dtype=object)

In [28]:
numerical_data

array([[100.0, 10, 25.0],
       [75.0, 3, 5.5],
       [25.0, 6, 12.3]], dtype=object)

----
### Create FeatureUnion pipeine for text data 

In [23]:
vect = CountVectorizer()
tfidf = TfidfVectorizer()
svd = TruncatedSVD(n_components = 2)

# there are 2 independent feature eng pipelines running in parallel 

# pipeline 1 applys 2 consequcative transformations, i.e. vect then svd
pipe_one = Pipeline([("tfidf", TfidfVectorizer()), 
                     ('svd', TruncatedSVD(n_components = 10))])


# pipeline 2 applys 1 transformation, i.e. count vectorizer 
pipe_two = Pipeline([("vect", CountVectorizer())])

transformer_list = [("tfidf_svd", pipe_one), 
                    ("vect", pipe_two) 
                   ]   

union = FeatureUnion(transformer_list, n_jobs=2, verbose=1)

In [29]:
text_feat_set = union.fit_transform(text_data)
text_feat_set

<3x23 sparse matrix of type '<class 'numpy.float64'>'
	with 39 stored elements in Compressed Sparse Row format>

----
### Create FeatureUnion pipeline for numerical data

In [26]:
# there are 2 independent feature eng pipelines running in parallel 

# pipeline 1 applys 2 consequcative transformations, i.e. vect then svd
pipe_one = Pipeline([("minmaxscaler", MinMaxScaler()), 
                     ('pca', PCA(n_components = 2))])


# pipeline 2 applys 1 transformation, i.e. count vectorizer 
pipe_two = Pipeline([("standardscaler", StandardScaler())])

transformer_list = [("scaler_pca", pipe_one), 
                    ("scaler", pipe_two) 
                   ]   

num_union = FeatureUnion(transformer_list, n_jobs=2, verbose=1)

In [30]:
num_feat_set = num_union.fit_transform(numerical_data)
num_feat_set

array([[ 0.87799794, -0.06322309,  1.06904497,  1.27872403,  1.32815411],
       [-0.5377671 , -0.38990404,  0.26726124, -1.16247639, -1.08479668],
       [-0.34023084,  0.45312713, -1.33630621, -0.11624764, -0.24335743]])

In [33]:
# sanity check: make sure each dataset as the same number of rows
# they are expected to have different number of features 
text_feat_set.todense().shape

(3, 23)

In [34]:
num_feat_set.shape

(3, 5)

In [36]:
# join numerical and text features together 
model_ready_data = np.concatenate((text_feat_set.todense(), num_feat_set), axis=1)

In [37]:
model_ready_data

matrix([[ 0.7059917 ,  0.68635474, -0.17462215,  2.        ,  1.        ,
          1.        ,  0.        ,  0.        ,  1.        ,  1.        ,
          0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
          1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
          0.        ,  1.        ,  1.        ,  0.87799794, -0.06322309,
          1.06904497,  1.27872403,  1.32815411],
        [ 0.77038018, -0.47103563, -0.42969735,  2.        ,  0.        ,
          0.        ,  1.        ,  1.        ,  0.        ,  1.        ,
          1.        ,  0.        ,  1.        ,  0.        ,  1.        ,
          1.        ,  0.        ,  0.        ,  1.        ,  1.        ,
          1.        ,  0.        ,  0.        , -0.5377671 , -0.38990404,
          0.26726124, -1.16247639, -1.08479668],
        [ 0.81834366, -0.14869576,  0.55516054,  2.        ,  0.        ,
          0.        ,  1.        ,  1.        ,  0.        ,  2.        ,
          0.  