In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Read the tab-separated SMS file. Because explicit column names are passed,
# pandas treats the first line of the file as data rather than as a header.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# I converted the label column to binary so that I can perform some exploration on the data set

'spam' will be converted to 1, while 'ham' will be 0

In [3]:
# Encode the target as integers: 'spam' -> 1, 'ham' -> 0.
# Values that are not keys of the mapping (in particular labels that are
# already 0/1) are passed through unchanged, so re-running this cell does not
# turn the whole column into NaN the way a plain .map(dict) would.
label_codes = {'spam': 1, 'ham': 0}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))

# Fail loudly if anything other than the two expected classes is present.
assert df['label'].isin([0, 1]).all(), "unexpected value in 'label' column"
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# I decided to find what percentage of the messages containing at least 3 consecutive digits are spam (this pattern is common among spam messages)

In [4]:
# Mean of the 0/1 label over the rows whose message contains a run of three
# or more digits, i.e. the share of those messages that are spam.
has_digit_run = df['message'].str.contains('[0-9]{3,}')
df.loc[has_digit_run, 'label'].mean()

0.9429824561403509

94% of the messages that contain at least 3 consecutive digits are spam. This will be useful when building my models.

# Most spam messages also seem to contain 'www', so I would like to know what percentage of the messages in my data that contain 'www' are spam.

In [5]:
# Mean of the 0/1 label over the rows whose message contains 'www',
# i.e. the share of those messages that are spam.
contains_www = df['message'].str.contains('www')
df.loc[contains_www, 'label'].mean()

0.9696969696969697

About 97% of the messages that contain 'www' are spam.

# Characterize the messages by their length

In [6]:
# Character count of every message; only the first rows are displayed.
message_lengths = df['message'].apply(len)
message_lengths.head()

0    111
1     29
2    155
3     49
4     61
Name: message, dtype: int64

# Spam messages with runs of capital letters

In [15]:
# Mean of the 0/1 label over the rows whose message contains a run of four
# or more capital letters, i.e. the share of those messages that are spam.
has_caps_run = df['message'].str.contains('[A-Z]{4,}')
df.loc[has_caps_run, 'label'].mean()

0.5986666666666667

Almost 60% of the messages that contain a run of at least 4 consecutive capital letters are spam.

In [7]:
def pass_message_column(df):
    """Select the raw text column from the input frame.

    Parameters
    ----------
    df : column-indexable object (a pandas DataFrame in this notebook)
        Must provide a ``'message'`` column.

    Returns
    -------
    The ``'message'`` column, unchanged.
    """
    messages = df['message']
    return messages
# Wrap the selector as a transformer step; validate=False skips scikit-learn's
# input check so the DataFrame is handed to the function as-is.
pass_message_column_tf = FunctionTransformer(func=pass_message_column, validate=False)

In [8]:
def message_length(df):
    """Return the character count of each message as a one-column DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'message'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Single column (named ``'message'``) holding ``len()`` of every message.
    """
    lengths = df['message'].apply(len)
    return lengths.to_frame()
# Transformer wrapper around message_length; validate=False skips
# scikit-learn's input check so the DataFrame reaches the function unchanged.
message_length_tf = FunctionTransformer(func=message_length, validate=False)


In [9]:
def has_number(df):
    """Flag messages that contain a run of three or more digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'message'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Single column (named ``'message'``) of 1/0 integers: 1 where the
        message matches the digit-run pattern, 0 otherwise.
    """
    matches = df['message'].str.contains('[0-9]{3,}')
    flags = matches.astype(int)
    return flags.to_frame()
# Transformer wrapper around has_number; validate=False skips scikit-learn's
# input check so the DataFrame reaches the function unchanged.
has_number_tf = FunctionTransformer(func=has_number, validate=False)

In [10]:
def has_all_capLetters(df):
    """Flag messages that contain a run of four or more capital letters.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'message'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Single column (named ``'message'``) of 1/0 integers: 1 where the
        message contains at least four consecutive A-Z characters, 0 otherwise.
    """
    matches = df['message'].str.contains('[A-Z]{4,}')
    flags = matches.astype(int)
    return flags.to_frame()
# Transformer wrapper around has_all_capLetters; validate=False skips
# scikit-learn's input check so the DataFrame reaches the function unchanged.
has_all_capLetters_tf = FunctionTransformer(func=has_all_capLetters, validate=False)

In [11]:
def has_www(df):
    """Flag messages that contain the lowercase substring 'www'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'message'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Single column (named ``'message'``) of 1/0 integers: 1 where the
        message contains 'www' (case-sensitive), 0 otherwise.
    """
    matches = df['message'].str.contains('www')
    flags = matches.astype(int)
    return flags.to_frame()
# Transformer wrapper around has_www; validate=False skips scikit-learn's
# input check so the DataFrame reaches the function unchanged.
has_www_tf = FunctionTransformer(func=has_www, validate=False)

In [12]:
def change_from_sparse_to_array(sparse_matrix):
    """Convert a sparse matrix to a dense array.

    Parameters
    ----------
    sparse_matrix : object exposing ``.toarray()``
        For example a SciPy sparse matrix.

    Returns
    -------
    The result of ``sparse_matrix.toarray()``.
    """
    dense = sparse_matrix.toarray()
    return dense
# Transformer wrapper around change_from_sparse_to_array; validate=False skips
# scikit-learn's input check so the sparse matrix is passed through untouched.
change_from_sparse_to_array_tf = FunctionTransformer(func=change_from_sparse_to_array, validate=False)

In [13]:
# Count vectorizer created with its default settings. It is the 'vect' step of
# vector_pipeline, and its stop_words setting is what the grid search varies.
vect =  CountVectorizer()

In [16]:
def full_matrix(sparse_matrix):
    """Convert a sparse matrix to a dense array.

    Same body as ``change_from_sparse_to_array`` defined earlier in this
    notebook; this is the variant actually used by ``vector_pipeline``.

    Parameters
    ----------
    sparse_matrix : object exposing ``.toarray()``
        For example a SciPy sparse matrix.

    Returns
    -------
    The result of ``sparse_matrix.toarray()``.
    """
    dense = sparse_matrix.toarray()
    return dense
# Transformer wrapper around full_matrix; validate=False skips scikit-learn's
# input check so the sparse matrix is passed through untouched.
full_matrix_tf = FunctionTransformer(func=full_matrix, validate=False)

In [17]:
# Text branch of the feature set: select the message column, turn it into
# token counts, then convert the sparse count matrix into a dense array.
vector_pipeline = Pipeline(steps=[
    ('pass_message_column_tf', pass_message_column_tf),
    ('vect', vect),
    ('full_matrix_tf', full_matrix_tf),
])
vector_pipeline.fit_transform(df)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [18]:
# Concatenate the hand-crafted features with the token counts column-wise.
# The order of the entries fixes the column order of the resulting matrix.
fu = FeatureUnion(transformer_list=[
    ('message_length_tf', message_length_tf),
    ('has_number_tf', has_number_tf),
    ('vector_pipeline', vector_pipeline),
    ('has_all_capLetters_tf', has_all_capLetters_tf),
    ('has_www_tf', has_www_tf),
])

In [19]:
# Fit and transform in one call. A bare fu.transform(df) only works when the
# vectorizer inside vector_pipeline happens to have been fitted already by an
# earlier cell; fit_transform makes this cell independent of that hidden state.
fu.fit_transform(df)

array([[111,   0,   0, ...,   0,   0,   0],
       [ 29,   0,   0, ...,   0,   0,   0],
       [155,   1,   0, ...,   0,   0,   0],
       ..., 
       [ 57,   0,   0, ...,   0,   0,   0],
       [125,   0,   0, ...,   0,   0,   0],
       [ 26,   0,   0, ...,   0,   0,   0]])

In [20]:
# Logistic-regression classifier created with its default settings; it is the
# final 'lr' step of `pipe`.
lr = LogisticRegression()

In [21]:
# Full model: feature union followed by the classifier.
pipe = Pipeline(steps=[
    ('fu', fu),
    # ('ss', StandardScaler()),  # scaling step left disabled; StandardScaler is not imported in the import cell
    ('lr', lr),
])

In [22]:
# Hold out a test set. random_state pins the shuffle so the split, and the
# scores reported below, are the same on every Restart & Run All.
# X keeps every column of df, including 'label'; the feature functions in this
# notebook read only the 'message' column.
X_train, X_test, y_train, y_test = train_test_split(df, df['label'], random_state=42)

In [23]:
# Compare the count vectorizer with and without English stop-word removal.
# The key walks the nested estimators: pipe -> 'fu' -> 'vector_pipeline' -> 'vect'.
params = {
    'fu__vector_pipeline__vect__stop_words': [None, 'english'],
}
gs = GridSearchCV(pipe, param_grid=params)
gs.fit(X_train, y_train)

# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(gs.best_score_)

# Score of the refit best estimator on the held-out test set.
gs.score(X_test, y_test)

0.988274706868


0.9921033740129217