In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases (before 0.18) only ship train_test_split in
    # the legacy cross_validation module. Catch ImportError specifically so
    # that unrelated failures are not silently swallowed.
    from sklearn.cross_validation import train_test_split
from sklearn import metrics

from sklearn.pipeline import Pipeline, FeatureUnion, make_union, make_pipeline

In [8]:
# Location of the tab-separated SMS Spam Collection file. Set the
# SMS_SPAM_DATA environment variable to point at another copy of the data;
# when it is unset the original hard-coded location is used unchanged.
import os

DATA_PATH = os.environ.get(
    "SMS_SPAM_DATA",
    "/Users/nicholasthomas/desktop/coding.local/Spam_ham_app/smsspamcollection/SMSSpamCollection",
)

# The file is read without a header row; the two columns are named here:
# "target" (the ham/spam label) and "text" (the message body).
# NOTE(review): messages may contain unbalanced double quotes, which the
# default CSV quoting can merge into a single row — consider checking the
# row count against the dataset's documented size (TODO confirm).
df = pd.read_csv(DATA_PATH, sep="\t", header=None, names=["target", "text"])

In [9]:
# Split the frame into the model input (message text) and the label column.
X, y = df["text"], df["target"]

In [10]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Peek at the first four training messages.
X_train.head(4)

708     Quite late lar... Ard 12 anyway i wun b drivin...
4338                        on a Tuesday night r u 4 real
5029    Go chase after her and run her over while she'...
4921     G says you never answer your texts, confirm/deny
Name: text, dtype: object

In [12]:
# Show only the first ten rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [13]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class CapitalDocTransfomer(BaseEstimator, TransformerMixin):
    """
    Flag each input document as all-uppercase (1) or not (0).

    A document is flagged 1 only when it contains at least one cased
    character and every cased character is uppercase. Documents without any
    letters (digits only, punctuation only, empty strings) are flagged 0.

    The transformer keeps no state, so ``fit`` does not learn anything.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (required by the transformer API)."""
        return self

    def transform(self, X, y=None):
        """
        Compute the all-uppercase flag for every document.

        Parameters
        ----------
        X : iterable of str
            The documents to inspect, one string per document.
        y : ignored
            Present only for API compatibility.

        Returns
        -------
        numpy.ndarray
            Integer array of shape (n_documents, 1) holding 1 for an
            all-uppercase document and 0 otherwise.
        """
        # str.isupper() is used instead of ``line == line.upper()`` because
        # the latter is also True for text with no letters at all (e.g. "123"
        # or ""), which would wrongly mark such documents as "all capitals".
        flags = np.array([line.isupper() for line in X], dtype=int)
        # Return a single feature column so the result can be stacked with
        # other features.
        return flags.reshape(-1, 1)

In [14]:
# Variant 1: name every step explicitly with Pipeline and FeatureUnion.
# The union places the all-caps flag next to the word counts, and the
# combined features are passed to a logistic regression.
log_reg_model = Pipeline(
    steps=[
        (
            "features",
            FeatureUnion(
                transformer_list=[
                    ("iscap", CapitalDocTransfomer()),
                    ("count", CountVectorizer()),
                ]
            ),
        ),
        ("model", LogisticRegression()),
    ]
)

In [15]:
# Variant 2: the feature steps are not tuned individually, so make_union
# can build the union without hand-written step names.
log_reg_model = Pipeline(
    steps=[
        ("features", make_union(CapitalDocTransfomer(), CountVectorizer())),
        ("model", LogisticRegression()),
    ]
)

In [16]:
# Variant 3: the shortest form, using make_pipeline together with make_union.
log_reg_model = make_pipeline(
    make_union(CapitalDocTransfomer(), CountVectorizer()),
    LogisticRegression(),
)

In [17]:
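# All three definitions above build functionally equivalent pipelines; only the step names differ (make_union / make_pipeline generate them automatically).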

In [18]:
# Fit on the training split, then report accuracy on the held-out split.
# fit() returns the pipeline itself, so the two calls can be chained.
log_reg_model.fit(X_train, y_train).score(X_test, y_test)

0.98504784688995217

In [19]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package and fall back to the bundled
# copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [23]:
# List the working directory (including hidden entries) to see which model
# artifacts already exist.
!ls -a

[34m.[m[m                                       [34msmsspamcollection[m[m
[34m..[m[m                                      spam_ham.pkl
.DS_Store                               spam_ham.pkl_01.npy
[34m.ipynb_checkpoints[m[m                      spam_ham.pkl_02.npy
[34mData[m[m                                    spam_ham.pkl_03.npy
[34mModels[m[m                                  spamham.pkl
feature-pipeline-example-spam-ham.ipynb


In [24]:
# Persist the fitted pipeline to disk. The target directory is created first
# so the cell also works on a fresh checkout where it does not exist yet.
import os

MODEL_PATH = "models/spam_ham.pkl"
model_dir = os.path.dirname(MODEL_PATH)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# joblib.dump returns the list of files it wrote, which is displayed as the
# cell output.
joblib.dump(log_reg_model, MODEL_PATH)

['models/spam_ham.pkl',
 'models/spam_ham.pkl_01.npy',
 'models/spam_ham.pkl_02.npy',
 'models/spam_ham.pkl_03.npy']

In [15]:
# List the working directory contents.
!ls

[34mdata[m[m                                    spam_ham.pkl
feature-pipeline-example-spam-ham.ipynb


In [26]:
# Reload the persisted pipeline from disk.
# NOTE: joblib.load is pickle-based and can execute arbitrary code while
# loading, so only load model files that come from a trusted source.
new_model = joblib.load("models/spam_ham.pkl")

In [30]:
# Smoke-test the reloaded pipeline on one hand-written message.
# In the recorded run this message was labelled 'ham'.
new_model.predict(["you won a cruise"])

array(['ham'], dtype=object)