# Libraries

In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")


import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.colors import n_colors
from plotly.subplots import make_subplots
%matplotlib inline

# Loading data

In [4]:
df = pd.read_csv('../data/spam.csv',encoding='Windows-1252')

In [5]:
df.shape

(5572, 5)

- Our dataset contains 5572 observation and 5 characterisitics (which one is dependant variable, and rest 4 are independant variables)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


- Most of the data in columns 2,3 and 4 are null values (they can be dropped)

In [7]:
df.sample(10,random_state=123456)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
1007,ham,I don't quite know what to do. I still can't get hold of anyone. I cud pick you up bout 7.30pm and we can see if they're in the pub?,,,
166,spam,URGENT! We are trying to contact you. Last weekends draw shows that you have won a å£900 prize GUARANTEED. Call 09061701939. Claim code S89. Valid 12hrs only,,,
2256,ham,"Just checked out, heading out to drop off my stuff now",,,
3862,spam,"Oh my god! I've found your number again! I'm so glad, text me back xafter this msgs cst std ntwk chg å£1.50",,,
4780,ham,Yup... Hey then one day on fri we can ask miwa and jiayin take leave go karaoke,,,
5439,ham,Hey i've booked the 2 lessons on sun liao...,,,
4374,spam,Ur TONEXS subscription has been renewed and you have been charged å£4.50. You can choose 10 more polys this month. www.clubzed.co.uk *BILLING MSG*,,,
2169,spam,"Shop till u Drop, IS IT YOU, either 10K, 5K, å£500 Cash or å£100 Travel voucher, Call now, 09064011000. NTT PO Box CR01327BT fixedline Cost 150ppm mobile vary",,,
1043,ham,Mmm thats better now i got a roast down me! iåÕd b better if i had a few drinks down me 2! Good indian?,,,
3342,ham,"I haven't forgotten you, i might have a couple bucks to send you tomorrow, k? I love ya too",,,


- We will be dropping columns 2,3 and 4 since they are not significative to our problem.

In [8]:
for column in ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']:
    del df[column]
df.sample(10,random_state=123456)

Unnamed: 0,v1,v2
1007,ham,I don't quite know what to do. I still can't get hold of anyone. I cud pick you up bout 7.30pm and we can see if they're in the pub?
166,spam,URGENT! We are trying to contact you. Last weekends draw shows that you have won a å£900 prize GUARANTEED. Call 09061701939. Claim code S89. Valid 12hrs only
2256,ham,"Just checked out, heading out to drop off my stuff now"
3862,spam,"Oh my god! I've found your number again! I'm so glad, text me back xafter this msgs cst std ntwk chg å£1.50"
4780,ham,Yup... Hey then one day on fri we can ask miwa and jiayin take leave go karaoke
5439,ham,Hey i've booked the 2 lessons on sun liao...
4374,spam,Ur TONEXS subscription has been renewed and you have been charged å£4.50. You can choose 10 more polys this month. www.clubzed.co.uk *BILLING MSG*
2169,spam,"Shop till u Drop, IS IT YOU, either 10K, 5K, å£500 Cash or å£100 Travel voucher, Call now, 09064011000. NTT PO Box CR01327BT fixedline Cost 150ppm mobile vary"
1043,ham,Mmm thats better now i got a roast down me! iåÕd b better if i had a few drinks down me 2! Good indian?
3342,ham,"I haven't forgotten you, i might have a couple bucks to send you tomorrow, k? I love ya too"


- We will rename the columns to make them more significative

In [9]:
df.columns = ['target', 'message']

In [10]:
df.sample(10,random_state=123456)

Unnamed: 0,target,message
1007,ham,I don't quite know what to do. I still can't get hold of anyone. I cud pick you up bout 7.30pm and we can see if they're in the pub?
166,spam,URGENT! We are trying to contact you. Last weekends draw shows that you have won a å£900 prize GUARANTEED. Call 09061701939. Claim code S89. Valid 12hrs only
2256,ham,"Just checked out, heading out to drop off my stuff now"
3862,spam,"Oh my god! I've found your number again! I'm so glad, text me back xafter this msgs cst std ntwk chg å£1.50"
4780,ham,Yup... Hey then one day on fri we can ask miwa and jiayin take leave go karaoke
5439,ham,Hey i've booked the 2 lessons on sun liao...
4374,spam,Ur TONEXS subscription has been renewed and you have been charged å£4.50. You can choose 10 more polys this month. www.clubzed.co.uk *BILLING MSG*
2169,spam,"Shop till u Drop, IS IT YOU, either 10K, 5K, å£500 Cash or å£100 Travel voucher, Call now, 09064011000. NTT PO Box CR01327BT fixedline Cost 150ppm mobile vary"
1043,ham,Mmm thats better now i got a roast down me! iåÕd b better if i had a few drinks down me 2! Good indian?
3342,ham,"I haven't forgotten you, i might have a couple bucks to send you tomorrow, k? I love ya too"


- The dataset consists of 5572 English messages, each entry is designated as being ham or spam. Dataframe has two columns: 
 - The first column `target` indicating the class of message as ham or spam.
 - The second column `message` is the content of the message.

In [11]:
df.groupby('target').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representative on FREEPHONE 0808 145 4742 between 9am-11pm as you have WON a guaranteed å£1000 cash or å£5000 prize!,4


- Our dataset contains 4516 unique (out of 4825 message) `ham` messages and 653 unique (out of 747 message) `spam` ones.
- `Sorry, I'll call later` is the most common `ham` message.
- `Please call our customer service representative on FREEPHONE...` is the most common `spam` message.


# Feature engineering

- We will be adding two new columns containing the number of words (`word_count`) and sentences (`sentence_count`) in each message.

In [12]:
import nltk
from nltk.tokenize import sent_tokenize
import string
def tokenize_words(message):
    # remove punctuation
    message = message.translate(str.maketrans('', '', string.punctuation))
    return nltk.word_tokenize(message)
def tokenize_sentences(message):
    return sent_tokenize(message)

In [13]:
df['word_count'] = df['message'].apply(lambda message : len(tokenize_words(message)))
df['sentence_count'] = df['message'].apply(lambda message : len(tokenize_sentences(message)))
df.sample(10,random_state=123456)

Unnamed: 0,target,message,word_count,sentence_count
1007,ham,I don't quite know what to do. I still can't get hold of anyone. I cud pick you up bout 7.30pm and we can see if they're in the pub?,30,3
166,spam,URGENT! We are trying to contact you. Last weekends draw shows that you have won a å£900 prize GUARANTEED. Call 09061701939. Claim code S89. Valid 12hrs only,27,6
2256,ham,"Just checked out, heading out to drop off my stuff now",11,1
3862,spam,"Oh my god! I've found your number again! I'm so glad, text me back xafter this msgs cst std ntwk chg å£1.50",22,3
4780,ham,Yup... Hey then one day on fri we can ask miwa and jiayin take leave go karaoke,17,1
5439,ham,Hey i've booked the 2 lessons on sun liao...,9,1
4374,spam,Ur TONEXS subscription has been renewed and you have been charged å£4.50. You can choose 10 more polys this month. www.clubzed.co.uk *BILLING MSG*,23,3
2169,spam,"Shop till u Drop, IS IT YOU, either 10K, 5K, å£500 Cash or å£100 Travel voucher, Call now, 09064011000. NTT PO Box CR01327BT fixedline Cost 150ppm mobile vary",28,2
1043,ham,Mmm thats better now i got a roast down me! iåÕd b better if i had a few drinks down me 2! Good indian?,24,3
3342,ham,"I haven't forgotten you, i might have a couple bucks to send you tomorrow, k? I love ya too",19,2


# EDA (Exploratory data analysis)

In [14]:
df.describe()

Unnamed: 0,word_count,sentence_count
count,5572.0,5572.0
mean,15.296482,1.991565
std,11.089235,1.501427
min,0.0,1.0
25%,7.0,1.0
50%,12.0,1.5
75%,22.0,2.0
max,171.0,38.0


- The average number of words is (`mean`) 15 word
- The average number of sentences is (`mean`) 2 sentences
- The mean values are close to medians
- There's a large difference  between the 75% and max value (there might be outliers in the dataset)

In [15]:
df.loc[df['word_count'] == df['word_count'].max()]

Unnamed: 0,target,message,word_count,sentence_count
1084,ham,For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later..,171,1


- The longest message contains 171 (`max`) word

In [16]:
df.loc[df['word_count'] < 2].sort_values('word_count').head(5)

Unnamed: 0,target,message,word_count,sentence_count
3374,ham,:),0,1
4822,ham,:-) :-),0,1
260,ham,Yup,1,1
3092,ham,staff.science.nus.edu.sg/~phyhcmk/teaching/pc1323,1,1
3154,ham,Ok...,1,1


- The shortest message contains either 1 word or emojis only (counted as 0 words)

## Detecting outliers using the 1.5 IQR (Interquartile Rule)

### Applying the 1.5 IQ rule to `word_count`
- IQR = Q3-Q1 = 16 | Q1 = 7 | Q3 = 23
- IQR * 1.5 = 24
- IQR * 1.5 - Q1 = 17
- Q3 + IQR * 1.5 = 47

In [17]:
wc_desc = df.describe()['word_count']
wc_Q1, wc_Q3 = wc_desc['25%'], wc_desc['75%']
IQR = wc_Q3 - wc_Q1
low, high = IQR * 1.5 - wc_Q1 , wc_Q3 + IQR * 1.5
print(f'Low outliers count : {len(df[df.word_count<low])}\nHigh outliers count : {len(df[df.word_count>high])}')

Low outliers count : 3362
High outliers count : 79


- Removing low outliers will result in removing more than **half** of the data, while high outliers count is low so it will not affect our data.

### Applying the 1.5 IQ rule to `sentence_count`
- IQR = Q3-Q1 = 16 | Q1 = 7 | Q3 = 23
- IQR * 1.5 = 24
- IQR * 1.5 - Q1 = 17
- Q3 + IQR * 1.5 = 47

In [18]:
wc_desc = df.describe()['sentence_count']
wc_Q1, wc_Q3 = wc_desc['25%'], wc_desc['75%']
IQR = wc_Q3 - wc_Q1
low, high = IQR * 1.5 - wc_Q1 , wc_Q3 + IQR * 1.5
print(f'Low outliers count : {len(df[df.sentence_count<low])}\nHigh outliers count : {len(df[df.sentence_count>high])}')

Low outliers count : 0
High outliers count : 662


- There are no lower outliers in the dataset, while there're 662 high outliers. Like `word_count`, high outliers count is low so they will not affect the data.

In [19]:
fig = px.scatter_matrix(df,
                        dimensions=["word_count", "sentence_count"],
                        color="target")
fig.show()

## Count of every class

In [20]:
labels = ['ham', 'spam']
df.groupby('target')['target'].agg('count')

target
ham     4825
spam     747
Name: target, dtype: int64

In [21]:
counts = df.groupby('target').count().reset_index()
counts = counts.rename(columns = {"message":"count"})
fig = px.bar(counts,
             x='target',
             y='count',
             color='target',
             width = 500, height=400)
fig.update_layout(title_text='Count of ham and spam messages in the dataset',
                  xaxis_title_text='Class',
                  yaxis_title_text='Count')
fig.show()

- As we can see, the classes are imbalanced.

## Word count distribution

In [22]:
fig_hist = px.histogram(df,
                        x="word_count",
                        color="target",
                        barmode="overlay",
                        marginal="violin", #box or rug
                        hover_data=df.columns,
                        range_x=["0","100"],
                        width=700)
fig_hist.update_layout(title_text='Message length distribution (Word count)',
                       xaxis_title_text='Word Count',
                       bargap=0.3)
fig_hist.show()

- As we can see, `ham` message lenght tends to be lower than `spam` message lenght.

## Sentence count distribution

In [23]:
fig_hist = px.histogram(df,
                        x="sentence_count",
                        color="target",
                        barmode="overlay",
                        marginal="violin", #box or rug
                        hover_data=df.columns,
                        range_x=["0","40"],
                        width=700)
fig_hist.update_layout(title_text='Message length distribution (Sentence count)',
                       xaxis_title_text='Sentence Count',
                       bargap=0.3)
fig_hist.show()