## Pipelines in scikit-learn

In [5]:
import pandas as pd 
import numpy as np
import json

from sklearn.pipeline import Pipeline

In [6]:
# Load the StumbleUpon dataset from a tab-separated file.
# NOTE(review): this is an absolute path on one machine; point it at your local copy before running.
data = pd.read_csv("/Users/patricksmith/Desktop/stumbleupon.tsv", sep='\t')

In [7]:
# Peek at the first row to see which columns are available.
data.head(1)

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,1,1,24,0,5424,170,8,0.152941,0.07913,0


## Example with natural language processing (NLP)

In [8]:
# `boilerplate` holds one JSON document per row; decode each document once and
# reuse the result for both fields instead of parsing every row twice.
parsed_boilerplate = data.boilerplate.map(json.loads)
# A missing key falls back to ''; a key that is present but null still yields None.
data['title'] = parsed_boilerplate.map(lambda doc: doc.get('title', ''))
data['body'] = parsed_boilerplate.map(lambda doc: doc.get('body', ''))

In [9]:
# Swap missing titles/bodies for empty strings so only text reaches the vectorizers.
titles, body = (data[column].fillna('') for column in ('title', 'body'))

# Show the first three titles as a sanity check.
titles[0:3]

0    IBM Sees Holographic Calls Air Breathing Batte...
1    The Fully Electronic Futuristic Starting Gun T...
2    Fruits that Fight the Flu fruits that fight th...
Name: title, dtype: object

In [10]:
# Target vector: the `label` column.
Y = data['label']

In [11]:
# Share of rows carrying each label value (class balance).
Y.value_counts() / len(Y)

1    0.51332
0    0.48668
Name: label, dtype: float64

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Bag-of-words encoder for the article bodies: at most 1000 features,
# ngram_range (1, 2), English stop words removed, binary output.
vectorizer = CountVectorizer(
    max_features=1000,
    ngram_range=(1, 2),
    stop_words='english',
    binary=True,
)

# Learn the vocabulary from the body text.
vectorizer.fit(body)

CountVectorizer(analyzer=u'word', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
# List the vocabulary terms the fitted vectorizer kept.
vectorizer.get_feature_names()

[u'000',
 u'10',
 u'10 minutes',
 u'100',
 u'11',
 u'12',
 u'13',
 u'14',
 u'15',
 u'15 minutes',
 u'16',
 u'17',
 u'18',
 u'19',
 u'20',
 u'20 minutes',
 u'2008',
 u'2009',
 u'2010',
 u'2011',
 u'2012',
 u'21',
 u'22',
 u'23',
 u'24',
 u'25',
 u'26',
 u'28',
 u'30',
 u'30 minutes',
 u'35',
 u'350',
 u'350 degrees',
 u'40',
 u'400',
 u'45',
 u'50',
 u'60',
 u'ability',
 u'able',
 u'absolutely',
 u'according',
 u'actually',
 u'adapted',
 u'add',
 u'added',
 u'adding',
 u'addition',
 u'additional',
 u'advice',
 u'age',
 u'ago',
 u'ahead',
 u'air',
 u'allow',
 u'alternative',
 u'amazing',
 u'america',
 u'american',
 u'apart',
 u'apple',
 u'area',
 u'aren',
 u'art',
 u'article',
 u'aside',
 u'ask',
 u'asked',
 u'attention',
 u'author',
 u'available',
 u'average',
 u'avoid',
 u'away',
 u'awesome',
 u'baby',
 u'bacon',
 u'bad',
 u'bag',
 u'bake',
 u'baked',
 u'baking',
 u'baking powder',
 u'baking sheet',
 u'baking soda',
 u'ball',
 u'bar',
 u'base',
 u'based',
 u'basic',
 u'batter',
 u'bean

In [15]:
# Encode one short example string and expand the result to a dense matrix for display.
vectorizer.transform(['IBM Sees Holographic Air']).todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0

In [16]:
# Titles and bodies get separate vectorizers built from the same settings,
# so each text field can learn its own vocabulary.
shared_vectorizer_options = dict(
    max_features=1000,
    ngram_range=(1, 2),
    stop_words='english',
    binary=True,
)

title_vectorizer = CountVectorizer(**shared_vectorizer_options)
body_vectorizer = CountVectorizer(**shared_vectorizer_options)

In [17]:
# Fit each vectorizer on its own text field (all rows).
title_vectorizer.fit(titles)

# Only this last expression is echoed as the cell output.
body_vectorizer.fit(body)

CountVectorizer(analyzer=u'word', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [18]:
# Turn the raw text into feature matrices using the fitted vectorizers.
title_X = title_vectorizer.transform(titles)
body_X = body_vectorizer.transform(body)

## Combining steps together in a pipeline

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score

# Cross-validate a logistic regression on the pre-computed title features (cv=5).
# Note: title_X comes from a vectorizer fitted on every title, so the vocabulary
# was learned from rows that also land in the validation folds.
title_model = LogisticRegression()
title_scores = cross_val_score(title_model, title_X, Y, cv=5)

In [23]:
# Same evaluation for the body features (cv=5).
# Note: body_X comes from a vectorizer fitted on every body, so the vocabulary
# was learned from rows that also land in the validation folds.
body_model = LogisticRegression()
body_scores = cross_val_score(body_model, body_X, Y, cv=5) 

## Merging feature sets in pipelines

In [24]:
# Hold-out split by row slice: the first 6000 rows are used for training.
training_data = data[:6000]
title_X_train = training_data['title'].fillna('')
Y_train = training_data['label']

In [25]:
# Everything after the first 6000 rows supplies the test titles.
title_X_test = data[6000:]['title'].fillna('')

In [26]:
# Chain vectorizing and classification into a single estimator.
# The steps are the existing title_vectorizer and title_model objects, not fresh instances.
pipeline = Pipeline([('vec', title_vectorizer),('model', title_model)])

In [27]:
# Fit the whole pipeline on the raw training titles.
pipeline.fit(title_X_train, Y_train)

Pipeline(steps=[('vec', CountVectorizer(analyzer=u'word', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
       ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [28]:
# Class probabilities for the held-out titles, computed straight from raw text.
pipeline.predict_proba(title_X_test)

array([[ 0.46801259,  0.53198741],
       [ 0.28316885,  0.71683115],
       [ 0.00513415,  0.99486585],
       ..., 
       [ 0.2906378 ,  0.7093622 ],
       [ 0.60684131,  0.39315869],
       [ 0.66320386,  0.33679614]])

In [29]:
from sklearn.preprocessing import MaxAbsScaler

In [30]:
# Rebuild the pipeline with a MaxAbsScaler step between vectorizing and modelling.
ma_scaler = MaxAbsScaler()

scaled_steps = [
    ('vec', title_vectorizer),
    ('max_abs_scaler', ma_scaler),
    ('model', title_model),
]
pipeline = Pipeline(scaled_steps)

# Train on the raw training titles ...
pipeline.fit(title_X_train, Y_train)

# ... then predict labels for the held-out titles.
pipeline.predict(title_X_test)

array([1, 1, 1, ..., 1, 0, 0])

## make_pipeline() with preprocessing and modeling

In [32]:
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler

In [37]:
# make_pipeline names the steps automatically, so only the estimators are passed.
pipe1 = make_pipeline(
    CountVectorizer(
        max_features=1000,
        ngram_range=(1, 2),
        stop_words='english',
        binary=True,
    ),
    MaxAbsScaler(),
    LogisticRegression(),
)

In [38]:
# Fit the auto-named pipeline on the raw training titles.
pipe1.fit(title_X_train, Y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='engli...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [40]:
# Predictions on the training titles themselves (not held-out data).
pipe1.predict(title_X_train)

array([1, 0, 1, ..., 0, 0, 1])

In [35]:
# Show the pipeline's steps; the call form of print works on Python 2 and Python 3.
print(pipe1)

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


## Custom transformer classes

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class FeatureMultiplier(BaseEstimator, TransformerMixin):
    """Transformer that multiplies its input by a fixed factor.

    Nothing is learned from the data: `fit` is a no-op and `transform`
    returns `X * factor`.

    Parameters
    ----------
    factor : value multiplied with the input in `transform`.
    """

    def __init__(self, factor):
        # Stored under the same name as the constructor argument.
        self.factor = factor

    def transform(self, X, *_):
        """Return `X * self.factor`; extra positional arguments are ignored."""
        return X * self.factor

    def fit(self, *_):
        """Do nothing and return self; all positional arguments are ignored."""
        return self