## DictVectorizer pipeline 
Practicing with DictVectorizer and feature union

Importing packages

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

In [2]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{'length': len(text),
                 'num_sentences': text.count('.')}
                for text in posts]


class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
    """Extract the subject & body from a usenet post in a single pass.

    Takes a sequence of strings and produces a dict of sequences.  Keys are
    `subject` and `body`.
    """
    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        features = np.recarray(shape=(len(posts),),
                               dtype=[('subject', object), ('body', object)])
        for i, text in enumerate(posts):
            headers, _, bod = text.partition('\n\n')
            bod = strip_newsgroup_footer(bod)
            bod = strip_newsgroup_quoting(bod)
            features['body'][i] = bod

            prefix = 'Subject:'
            sub = ''
            for line in headers.split('\n'):
                if line.startswith(prefix):
                    sub = line[len(prefix):]
                    break
            features['subject'][i] = sub

        return features

Read in the DataFrame

In [3]:
df = pd.read_csv('imdb.csv', encoding = "ISO-8859-1")
print(df.head())

                                                text  score
0  A very, very, very slow-moving, aimless movie ...      0
1  Not sure who was more lost - the flat characte...      0
2  Attempting artiness with black & white and cle...      0
3       Very little music or anything to speak of.        0
4  The best scene in the movie was when Gerardo i...      1


Select features, then train test split. 

In [4]:
#list of features
features= 'text'
#target variable
target = 'score'


X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.25, random_state=42)
print(X_train.head())

394    The memories are murky but I can only say that...
327    And if that isn't enough of a mess of a movie ...
445    This movie is so mind-bendingly awful, it coul...
110                     I mean this in a terrible way.  
82     This movie is a pure disaster, the story is st...
Name: text, dtype: object


Building some pipelines

In [5]:
# Pipeline for pulling ad hoc features from review text
text_stats_pipeline = Pipeline([
                #('selector', ItemSelector(key='text')),
                ('stats', TextStats()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])

# Pipeline for standard bag-of-words model for body
bow_pipeline = Pipeline([
                #('selector', ItemSelector(key='text')),
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=50)),
            ])

In [6]:
fu = FeatureUnion(transformer_list=[('text_bow',bow_pipeline), ('text_stats', text_stats_pipeline)])
                  
# Use a SVC classifier on the combined features                  
fu_pipeline = Pipeline([('feature_union', fu), ('svc', SVC(kernel='linear'))])

Fit the pipeline to the training data

In [7]:
#fu_pipeline.fit_transform(X_train)
fu_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('feature_union', FeatureUnion(n_jobs=None,
       transformer_list=[('text_bow', Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True,...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [8]:
y_pred = fu_pipeline.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.65      0.73      0.69        82
           1       0.77      0.69      0.72       105

   micro avg       0.71      0.71      0.71       187
   macro avg       0.71      0.71      0.70       187
weighted avg       0.71      0.71      0.71       187

