In [None]:
# Download the following file

# ``` bash
# wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
# ```
#
# If you unzip it, you will see a tab seperated file called SMSSpamCollection.
# 
# The first column is the label, the second column is some text recieved by SMS. 
#
# keep a 30% holdout set, seed=42
# 
# Take 1:
# 1. Use Count Vectorizer to convert Text to features
# 2. Build a Model
# 3. Report Accuracy, Confusion Matrix in Slack Channel
#
# Take 2:
#
# 1. Build a simple pipeline with a count vectorizer feeding into a model of your choice
# 2. Report Classificication Report, Confusion Matrix
# 3. Plot ROC Curve
# 4. Plot Precision / Recall Curve for Detecting Spam
# 5. Can you choose a different Cutoff to get a Higher Accuracy

In [1]:
from __future__ import print_function, division, absolute_import  # Python 2/3

In [14]:
#data handling, model creation/evaluation
import pandas as pd
import numpy as np
import scipy.stats as stats

from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression, LogisticRegression, ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
try:
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:
    from sklearn.cross_validation import cross_val_score, train_test_split
    
# visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:

data = pd.read_csv("../data/SMSSpamCollection",sep='\t', header=None, names=['target','document'])

In [6]:
data.head()

Unnamed: 0,target,document
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
from sklearn.cross_validation import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(data["document"], data["target"], random_state=42, test_size=0.3)

In [11]:
X_train.shape

(3900,)

In [12]:
X_test.shape

(1672,)

In [15]:
vect = CountVectorizer(stop_words="english")
X_train_vect = vect.fit_transform(X_train)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()   # define the logistic regression
logreg.fit(X_train_vect, y_train)   # we fit it

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
X_test_vect = vect.transform(X_test) 

In [17]:
logreg.score(X_test_vect, y_test)

0.98444976076555024

In [18]:
y_pred=logreg.predict(X_test_vect)

In [19]:
from sklearn.metrics import confusion_matrix, classification_report
confusion_matrix(y_test, y_pred)   # What is the confusion matrix telling us here? Instructor explained but
                                   # I seemed to have missed it

array([[1448,    0],
       [  26,  198]])

In [22]:
# Another student's solution (posted on slac):
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
vect = TfidfVectorizer(stop_words="english")
# X_train, X_test, y_train, y_test = train_test_split(emails['msg'], emails['ham'], test_size=.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(data["document"], data["target"], random_state=42, test_size=0.3)
X_train_vect = vect.fit_transform(X_train)


from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train_vect, y_train)
X_test_vect = vect.transform(X_test)
print(logreg.score(X_test_vect, y_test))


y_pred = logreg.predict(X_test_vect)
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))

0.967703349282
[[1446    2]
 [  52  172]]


In [33]:
# Take 2:
#
# 1. Build a simple pipeline with a count vectorizer feeding into a model of your choice
# 2. Report Classificication Report, Confusion Matrix
# 3. Plot ROC Curve
# 4. Plot Precision / Recall Curve for Detecting Spam
# 5. Can you choose a different Cutoff to get a Higher Accuracy

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import RandomizedSearchCV
from sklearn.preprocessing

In [34]:
pipe=Pipeline(steps=[
        ("vect", CountVectorizer(stop_words='english')),
        ("model", RandomForestClassifier())
    ])

In [41]:
X_train.apply(lambda val: len(val.split(" "))).values.reshape[-1, 1]

array([10,  8, 13, ...,  5,  5,  8])

In [35]:
params = {
    "model__n_estimators": [10, 20, 100],
    "model__min_samples_leaf": [2, 5, 10],
    "vect__lowercase": [True, False]
}
grid = RandomizedSearchCV(pipe, params, verbose=True)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   10.3s finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
     ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'model__min_samples_leaf': [2, 5, 10], 'vect__lowercase': [True, False], 'model__n_estimators': [10, 20, 100]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=True)

In [36]:
grid.best_params_

{'model__min_samples_leaf': 2,
 'model__n_estimators': 20,
 'vect__lowercase': True}

In [40]:
pipe.fit(X_train,y_train)
pipe.score(X_test, y_test)

0.97009569377990434

In [45]:
from sklearn.base import BaseEstimator, TransformerMixin

class WordCounter(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, x):
        counts = x.map(lambda val: len(val.split(" ")))
        return counts.values.reshape[-1,1]

In [48]:
pipe_features = FeatureUnion([
        ("count_vect", CountVectorizer(stop_words='english')),
        ("dummy_word_count", WordCounter([
                    ('word_count', WordCounter()),
                    ('')
                ]))
                
    ])

In [51]:
pipe= Pipeline(steps=[
        ("pipe_features", pipe_features),
        ("model", LogisticRegression())
    ])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

TypeError: 'builtin_function_or_method' object has no attribute '__getitem__'

In [46]:
vect = WordCounter()
vect.fit(X_train, y_train)

WordCounter()

In [47]:
vect.transform(X_train)

TypeError: 'builtin_function_or_method' object has no attribute '__getitem__'