In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the tab-separated SMS corpus; column names are supplied explicitly.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Encode the target as integers: spam -> 1, ham -> 0.
# The mapping is guarded so that re-running this cell is a no-op: calling
# .map() on the already-encoded 0/1 column would turn every label into NaN.
label_codes = {'spam': 1, 'ham': 0}
if df['label'].isin(list(label_codes)).any():
    df['label'] = df['label'].map(label_codes)
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# What fraction of the messages are ham (the messages we want to keep)?

In [4]:
# The mean of the 0/1 label column is the share of rows equal to 1;
# its complement is the share of rows equal to 0.
share_of_ones = df['label'].mean()
1 - share_of_ones

0.8659368269921034

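About 86.6% of the messages are labeled ham, so a model that always predicts "ham" would already reach roughly 0.866 accuracy — that is the baseline to beat.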

In [5]:
# Three-document toy corpus for inspecting what CountVectorizer produces.
corpus = [
    'see spot',
    'see spot run',
    'The baby wolf looked for the wolves',
]
cv = CountVectorizer()
# Learn the vocabulary and build the document-term matrix in one step.
dtm = cv.fit_transform(corpus)

In [6]:
# Convert the document-term matrix to a dense array so the counts can be displayed
# (one row per document in `corpus`, one column per vocabulary term).
dtm.toarray()

array([[0, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 2, 1, 1]])

In [7]:
# Vocabulary terms learned by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back
# to the old one on older releases so the cell runs on both.
(cv.get_feature_names_out().tolist()
 if hasattr(cv, 'get_feature_names_out')
 else cv.get_feature_names())

[u'baby',
 u'for',
 u'looked',
 u'run',
 u'see',
 u'spot',
 u'the',
 u'wolf',
 u'wolves']

In [8]:
# Label the dense count matrix with the vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    toy_terms = cv.get_feature_names_out().tolist()
else:
    toy_terms = cv.get_feature_names()
dtm_df = pd.DataFrame(dtm.toarray(), columns=toy_terms)
dtm_df

Unnamed: 0,baby,for,looked,run,see,spot,the,wolf,wolves
0,0,0,0,0,1,1,0,0,0
1,0,0,0,1,1,1,0,0,0
2,1,1,1,0,0,0,2,1,1


In [9]:
# Build the document-term matrix for the real SMS corpus and view it as a DataFrame.
df_corpus = df.message
df_cv = CountVectorizer()
df_dtm = df_cv.fit_transform(df_corpus)
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on older releases.
# .tolist() keeps `columns` a plain list, as it was before.
if hasattr(df_cv, 'get_feature_names_out'):
    columns = df_cv.get_feature_names_out().tolist()
else:
    columns = df_cv.get_feature_names()
# NOTE: toarray() materialises the full dense matrix (documents x vocabulary).
X_dtm_df = pd.DataFrame(df_dtm.toarray(), columns=columns)
X_dtm_df.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Put the target next to the word counts so rows can be grouped by class.
X_dtm_df['label'] = df['label']
X_dtm_df.shape

(5572, 8714)

# Average number of times each word appears per spam message (a mean count, not a percentage).

In [11]:
# Mean count of every word within each label group; the second row of the
# grouped result (position 1) is taken, then sorted from most to least frequent.
(
    X_dtm_df
    .groupby('label')
    .mean()
    .iloc[1]
    .sort_values(ascending=False)
)  # Spam list

to             0.925033
call           0.475234
you            0.397590
your           0.353414
free           0.299866
the            0.275770
for            0.273092
now            0.266399
or             0.251673
txt            0.218206
is             0.211513
on             0.194110
ur             0.192771
have           0.180723
from           0.175368
mobile         0.170013
text           0.167336
stop           0.164659
and            0.163320
claim          0.151272
with           0.145917
reply          0.139224
www            0.131191
of             0.129853
prize          0.124498
this           0.119143
get            0.115127
our            0.113788
only           0.112450
in             0.107095
                 ...   
nevr           0.000000
ned            0.000000
necklace       0.000000
neck           0.000000
nauseous       0.000000
nanny          0.000000
nannys         0.000000
nap            0.000000
narcotics      0.000000
naseeb         0.000000
nationwide     0

# Average number of times each word appears per ham message (a mean count, not a percentage).

In [12]:
# Mean count of every word within each label group; the first row of the
# grouped result (position 0) is taken, then sorted from most to least frequent.
(
    X_dtm_df
    .groupby('label')
    .mean()
    .iloc[0]
    .sort_values(ascending=False)
)  # ham list

you            0.403731
to             0.323731
the            0.234819
and            0.177824
in             0.170570
me             0.161036
my             0.156269
is             0.153161
it             0.148808
that           0.116062
of             0.109016
for            0.105078
have           0.091813
can            0.091399
but            0.091399
so             0.090363
not            0.087254
your           0.086425
are            0.086010
on             0.081451
do             0.079793
at             0.078756
if             0.074197
we             0.073990
will           0.071710
be             0.069637
gt             0.065907
lt             0.065492
how            0.063212
get            0.063212
                 ...   
chatlines      0.000000
chat80155      0.000000
carlie         0.000000
shortcode      0.000000
shortbreaks    0.000000
cashbin        0.000000
cashto         0.000000
cast           0.000000
shinco         0.000000
cc             0.000000
cc100p         0

In [13]:
# Baseline: raw token counts from a default CountVectorizer fed to logistic regression.
# NOTE: the vectorizer is fitted on every message before cross-validation,
# so the vocabulary is learned from the held-out folds as well.
df_cv = CountVectorizer()
df_dtm = df_cv.fit_transform(df['message'])
lr = LogisticRegression()
params = {}  # empty grid: GridSearchCV only cross-validates the default model
gs = GridSearchCV(estimator=lr, param_grid=params)
gs.fit(X=df_dtm, y=df['label'])
gs.best_score_

0.98115577889447236

# Setting stop_words = 'english' (drop scikit-learn's built-in English stop words)

In [14]:
# Same experiment with the built-in English stop-word list removed from the vocabulary.
# NOTE: the vectorizer is fitted on every message before cross-validation.
df_cv = CountVectorizer(stop_words='english')
df_dtm = df_cv.fit_transform(df['message'])
lr = LogisticRegression()
params = {}  # empty grid: GridSearchCV only cross-validates the default model
gs = GridSearchCV(estimator=lr, param_grid=params)
gs.fit(X=df_dtm, y=df['label'])
gs.best_score_

0.97846374730796837

In [15]:
# Show the stop-word set the vectorizer is currently configured with.
df_cv.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

# ngram_range: the lower and upper bounds on n for the n-grams to extract, e.g. (1, 2) keeps both unigrams and bigrams.

In [16]:
# Count unigrams and bigrams instead of single tokens only.
# NOTE: the vectorizer is fitted on every message before cross-validation.
df_cv = CountVectorizer(ngram_range=(1, 2))
df_dtm = df_cv.fit_transform(df['message'])
lr = LogisticRegression()
params = {}  # empty grid: GridSearchCV only cross-validates the default model
gs = GridSearchCV(estimator=lr, param_grid=params)
gs.fit(X=df_dtm, y=df['label'])
gs.best_score_

0.98025843503230436

In [17]:
# Uni- to tri-grams, keeping only terms that occur in at least 5 and at most
# 250 messages (integer min_df / max_df are absolute document counts).
# NOTE: the vectorizer is fitted on every message before cross-validation.
df_cv = CountVectorizer(
    min_df=5,
    max_df=250,
    ngram_range=(1, 3),
)
df_dtm = df_cv.fit_transform(df['message'])
lr = LogisticRegression()
params = {}  # empty grid: GridSearchCV only cross-validates the default model
gs = GridSearchCV(estimator=lr, param_grid=params)
gs.fit(X=df_dtm, y=df['label'])
gs.best_score_

0.98205312275664036

In [18]:
# Cap the vocabulary at 2000 terms and score with 10-fold cross-validation.
# NOTE: the vectorizer is fitted on every message before cross-validation.
df_cv = CountVectorizer(max_features=2000)
df_dtm = df_cv.fit_transform(df['message'])
lr = LogisticRegression()
params = {}  # empty grid: GridSearchCV only cross-validates the default model
gs = GridSearchCV(estimator=lr, param_grid=params, cv=10)
gs.fit(X=df_dtm, y=df['label'])
gs.best_score_

0.98366834170854267

In [19]:
# Record only presence/absence of each term (binary=True) instead of counts.
# NOTE: the vectorizer is fitted on every message before cross-validation.
df_cv = CountVectorizer(binary=True)
df_dtm = df_cv.fit_transform(df['message'])
lr = LogisticRegression()
params = {}  # empty grid: GridSearchCV only cross-validates the default model
gs = GridSearchCV(estimator=lr, param_grid=params, cv=10)
gs.fit(X=df_dtm, y=df['label'])
gs.best_score_

0.98420674802584351

# TF-IDF
# TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
# IDF(t) = log_e(Total number of documents / Number of documents with term t in it).
# Value = TF * IDF

In [20]:
# Replace raw counts with TF-IDF weights, then cross-validate logistic regression.
# NOTE: the vectorizer is fitted on every message before cross-validation, so
# the vocabulary and IDF weights also reflect messages in the held-out folds.
df_cv = TfidfVectorizer()
df_dtm = df_cv.fit_transform(df['message'])
lr = LogisticRegression()
params = {}  # empty grid: GridSearchCV only cross-validates the default model
gs = GridSearchCV(estimator=lr, param_grid=params, cv=10)
gs.fit(X=df_dtm, y=df['label'])
gs.best_score_

0.96536252692031588

In [21]:
# Wrap vectorizer and classifier in a Pipeline so the raw text is passed to
# GridSearchCV and the vectorizer is re-fitted inside each training fold.
vect = CountVectorizer()
lr = LogisticRegression()
pipe = Pipeline(steps=[
    ('vect', vect),
    ('lr', lr),
])
params = {}  # empty grid: only cross-validate the default pipeline
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=10)
gs.fit(X=df['message'], y=df['label'])
gs.best_score_

0.98295046661880836

In [22]:
# Grid-search the vectorizer's n-gram range through the pipeline:
# unigrams only versus unigrams plus bigrams.
vect = CountVectorizer()
lr = LogisticRegression()
pipe = Pipeline(steps=[
    ('vect', vect),
    ('lr', lr),
])
params = {'vect__ngram_range': [(1, 1), (1, 2)]}
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=10)
gs.fit(X=df['message'], y=df['label'])
gs.best_score_

0.98295046661880836

In [23]:
# Pipeline with binary (presence/absence) features and a 2000-term vocabulary cap.
vect = CountVectorizer()
lr = LogisticRegression()
pipe = Pipeline([('vect', vect), ('lr', lr)])
params = {
    # `binary` is a boolean flag. The previous value 2 only worked because it
    # is truthy, and scikit-learn releases that validate constructor
    # parameters reject non-boolean values; True gives the same features.
    'vect__binary': [True],
    'vect__max_features': [2000],
}
gs = GridSearchCV(pipe, param_grid=params, cv=10)
gs.fit(df['message'], df['label'])
gs.best_score_

0.98492462311557794

In [24]:
# List the vectorizer's parameter names; inside the pipeline these are
# addressed in a param grid with the step prefix, e.g. 'vect__binary'.
vect.get_params().keys()

['binary',
 'lowercase',
 'stop_words',
 'decode_error',
 'vocabulary',
 'tokenizer',
 'encoding',
 'dtype',
 'analyzer',
 'ngram_range',
 'max_df',
 'min_df',
 'max_features',
 'input',
 'strip_accents',
 'token_pattern',
 'preprocessor']

In [25]:
# Pipeline with binary features and min_df=5, searching over the n-gram range.
vect = CountVectorizer()
lr = LogisticRegression()
pipe = Pipeline([('vect', vect), ('lr', lr)])
params = {
    # `binary` is a boolean flag. The previous value 2 only worked because it
    # is truthy, and scikit-learn releases that validate constructor
    # parameters reject non-boolean values; True gives the same features.
    'vect__binary': [True],
    'vect__min_df': [5],
    'vect__ngram_range': [(1, 1), (1, 2)],
}
gs = GridSearchCV(pipe, param_grid=params, cv=10)
gs.fit(df['message'], df['label'])
gs.best_score_

0.98510409188801151