In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline

## Load the dataset into `pandas`
The file is in this directory (`SMSSpamCollection` with no file extension). This is a dataset of text messages, some are spam and some are not (also known as ham).

Original source: https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

1. You'll need to create your own column names: `label` and `message`
2. The data is tab separated, not comma separated

In [19]:
data = pd.read_table('SMSSpamCollection', sep='\t', names=['label','message'])
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Preprocessing
For the label column, covert `'ham'` to 0 and `'spam'` to 1

In [23]:
data['label'] = data['label'].map({'ham': 0, 'spam': 1})
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Baseline accuracy
What is the baseline accuracy of this dataset?

In [28]:
baseline = 1 - data['label'].mean()
print('Baseline: ', baseline)

Baseline:  0.8659368269921034


## Feature extraction

Using `sklearn`'s `FunctionTransformer` class, create several functions using the `str.contains()` method. One has been set up as an example for you below:

In [38]:
def has_forward_slash(data):
    return data['message'].str.contains('/').astype(int).to_frame()
has_forward_slash_tf = FunctionTransformer(has_forward_slash, validate=False)

In [59]:
def has_currency_symbol(data):
    return data['message'].str.contains('[$£€]').astype(int).to_frame()
has_currency_symbol_tf = FunctionTransformer(has_currency_symbol, validate=False)

def has_exclamation(data):
    return data['message'].str.contains('!').astype(int).to_frame()
has_exclamation_tf = FunctionTransformer(has_exclamation, validate=False)

def has_smut(data):
    return data['message'].str.contains('[(?i)xxx|(?i)sex]').astype(int).to_frame()
has_smut_tf = FunctionTransformer(has_smut, validate=False)

def has_yelling(data):
    return data['message'].str.contains('[A-Z]{4:}').astype(int).to_frame()
has_yelling_tf = FunctionTransformer(has_yelling, validate=False)

def contains_website(data):
    return data['message'].str.contains('http://').astype(int).to_frame()
contains_website_tf = FunctionTransformer(contains_website, validate=False)

def offers_free_stuff(data):
    return data['message'].str.contains('(?i)free').astype(int).to_frame()
offers_free_stuff_tf = FunctionTransformer(offers_free_stuff, validate=False)

def offers_prize(data):
    return data['message'].str.contains('(?i)prize|(?i)w[io]n|(?i)lottery|(?i)raffle|(?i)contest|(?i)draw').astype(int).to_frame()
offers_prize_tf = FunctionTransformer(offers_prize, validate=False)

## Feature union
Combine all your function transformers into a feature union

In [60]:
feat_un = FeatureUnion([
    ('has forward slash', has_forward_slash_tf),
    ('has currency symbol', has_currency_symbol_tf),
    ('has exclamation', has_exclamation_tf),
    ('has smut', has_smut_tf),
    ('has yelling', has_yelling_tf),
    ('contains website', contains_website_tf),
    ('offers free stuff', offers_free_stuff_tf),
    ('offers prize', offers_prize_tf)
])

## Pipeline
Create a pipeline with two components:
1. The `FeatureUnion` you set up in the previous step
2. The `LogisticRegression` class from `sklearn`

In [61]:
pipe = Pipeline([
    ('feature union', feat_un),
    ('log reg', LogisticRegression())
])

## Cross Validation
Using only the features you've created in the Feature Extraction step, see what accuracy score you can get with your **untuned** pipeline model from the previous step. You'll need the `cross_val_score` function for this step.

Some suggestions:
1. Look at a random sampling of spam messages and see what regex patterns you can glean from your observations.
2. If you're testing an idea, use `df.loc[]` and take the mean of the `label`. For example:

```python
# Testing percentage of spam messages that conain a forward slash
df.loc[df['message'].str.contains('/'), 'label'].mean()
```

In [62]:
cross_val_score(pipe, data, data['label']).mean()

0.93682753549008879

In [72]:
# Don't code here. See if you can beat my score!

0.97595111853847172