The goal of this notebook is to test SMS spam detection with various classifiers.

### Start

First of all, the necessary imports.

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import SnowballStemmer
%matplotlib inline

import tensorflow_hub as hub

import urllib.request
from tokenizers import BertWordPieceTokenizer

#model
import lightgbm as lgbm

from scipy import stats
from sklearn.preprocessing import StandardScaler

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

Read the data from the CSV file.

In [None]:
# Load the SMS spam collection, decoding the file as Latin-1, then preview it.
sms = pd.read_csv(
    '../input/sms-spam-collection-dataset/spam.csv',
    encoding='Latin-1',
)
sms.head()

Now drop the "Unnamed" columns and rename `v1` and `v2` to "label" and "message".

In [None]:
# Discard the three "Unnamed" columns and give v1/v2 descriptive names.
sms = (
    sms
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

Check the changed data

In [None]:
# Summary statistics of the remaining columns, computed separately for each label.
sms.groupby('label').describe()

Create a new feature, "message length", and plot it to see whether it is of any interest.

In [None]:
# Character count of every message, stored as a new 'length' column.
message_lengths = sms['message'].map(len)
sms['length'] = message_lengths
sms.head()

In [None]:
mpl.rcParams['patch.force_edgecolor'] = True
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so use whichever variant this install provides
# (falling back to the default style if neither is available).
bright_style = next(
    (name for name in ('seaborn-v0_8-bright', 'seaborn-bright')
     if name in plt.style.available),
    'default',
)
plt.style.use(bright_style)
# One histogram of message length per label; the trailing ';' hides the axes repr.
sms.hist(column='length', by='label', bins=50, figsize=(11, 5));

It looks like the longer a message is, the more likely it is to be spam. Let's keep this in mind.

### Outlier analysis: Approaches such as box-plot can be used to detect outliers and deal with them

In [None]:
def outliers_iqr(data, plot=True):
    """Find rows whose 'length' lies outside the 1.5 * IQR fences.

    Prints a small random sample of ``data``, optionally draws a box plot of
    the 'length' column, and returns the index labels of the outlying rows.

    Args:
        data: DataFrame with a numeric 'length' column.
        plot: When True (the default) a box plot of 'length' is shown.

    Returns:
        The pandas Index of rows whose 'length' is below ``q1 - 1.5 * iqr`` or
        above ``q3 + 1.5 * iqr``. Empty input yields an empty index.
    """
    lengths = data['length']

    # Nothing to sample, plot or measure on an empty frame.
    if lengths.empty:
        return lengths.index

    # Cap the sample size so frames with fewer than 10 rows do not raise.
    print(data.sample(min(10, len(data))))

    if plot:
        green_diamond = dict(markerfacecolor='g', marker='D')
        plt.figure(figsize=(12,8))
        plt.boxplot(lengths, flierprops=green_diamond)
        # The call parentheses matter: a bare `plt.show` does nothing.
        plt.show()

    # 25th and 75th percentiles of the lengths.
    q1, q3 = np.percentile(lengths, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr*1.5)
    upper_bound = q3 + (iqr*1.5)

    outlier_idx = lengths[(lengths < lower_bound) | (lengths > upper_bound)].index

    return outlier_idx

In [None]:
# Outliers are searched for within each class separately: spam first, then ham.
is_spam = sms['label'].eq('spam')
is_ham = sms['label'].eq('ham')
outliers_1 = outliers_iqr(sms.loc[is_spam])
outliers_2 = outliers_iqr(sms.loc[is_ham])

In [None]:
# Show the index labels flagged as outliers for spam, then for ham.
for flagged_index in (outliers_1, outliers_2):
    print(flagged_index)

In [None]:
# Plain-text dump of the first 50 rows, taken before any outlier rows are removed.
print(sms.head(50))

In [None]:
# Remove the spam and ham length outliers in a single pass.
# errors='ignore' keeps the cell idempotent: re-running it after the rows are
# already gone no longer raises KeyError, which the two in-place drops did.
sms = sms.drop(index=outliers_1.union(outliers_2), errors='ignore')

In [None]:
# Plain-text dump of the first 50 rows again, now that the outlier rows have been dropped.
print(sms.head(50))

### Text preprocessing

1. Lower casing (소문자화)
2. Text cleansing: remove special characters such as . , ! $ ( ) * % @
3. Removing stop words (불용어 제거)
4. Stemming (형태소분석)
5. Lemmatization (표제어) -- probably not needed (필요 없을듯)

**#1. Lower casing**

In [None]:
# Fold every message to lower case.
sms['message'] = sms['message'].map(str.lower)

**#2. Text Cleansing**

In [None]:
# Strip URLs, phone-number-like digit runs and e-mail addresses from the messages.
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, so without it nothing would be removed. Raw strings
# keep the backslashes intact without invalid-escape warnings.
url_pattern = r'\(?(http|https|ftp|ftps)?\:\/\/[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(\/\S*)?\)?'
phone_pattern = r'\d{2,3}-?\d{3,4}-?\d{4}'
email_pattern = r'([\w\.-]+)@([\w\.-]+)(\.[\w\.]+)'

sms['message'] = sms['message'].str.replace(url_pattern, '', regex=True)
sms['message'] = sms['message'].str.replace(phone_pattern, '', regex=True)
sms['message'] = sms['message'].str.replace(email_pattern, '', regex=True)

**#3. Stop Words & Stemmer**

In [None]:
# English Snowball stemmer, used by preprocess() when stemming is requested.
stemmer = SnowballStemmer("english")
# NLTK's English stop-word list, held in a set for fast membership checks.
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)

In [None]:
def preprocess(text, stem=False):
    """Remove stop words from ``text`` and optionally stem the remaining tokens.

    Args:
        text: Message to clean; it is split on whitespace.
        stem: When True, every kept token is run through the module-level
            ``stemmer``; when False the tokens are kept unchanged.

    Returns:
        The surviving tokens joined with single spaces.
    """
    kept_words = (word for word in text.split() if word not in stop_words)
    if stem:
        kept_words = map(stemmer.stem, kept_words)
    return " ".join(kept_words)

In [None]:
# Seed the 'cleansing_text' column from the current contents of 'message'.
sms['cleansing_text'] = sms['message']

In [None]:
%%time
sms['cleansing_text'] = sms['cleansing_text'].apply(lambda x: preprocess(x))

After preprocessing, the data looks like this:

In [None]:
# Preview the first rows, including the 'cleansing_text' column.
sms.head()

### Tokenizing & Embedding

In [None]:
# Hugging Face `tokenizers` BERT WordPiece tokenizer.
# lowercase=False: no extra case folding is requested here; the messages were
# already lower-cased in the preprocessing step.
tokenizer = BertWordPieceTokenizer(lowercase=False)

In [None]:
# Training corpus: the cleaned messages held in memory.
data_file = sms['cleansing_text']
vocab_size = 30000
limit_alphabet = 6000
min_frequency = 5

# `tokenizer.train(files=...)` expects paths to text files on disk, so passing the
# Series of messages there is wrong; `train_from_iterator` trains directly on
# in-memory strings instead.
tokenizer.train_from_iterator(data_file.tolist(),
                              vocab_size=vocab_size,
                              limit_alphabet=limit_alphabet,
                              min_frequency=min_frequency)

## 1. LightGBM

###  Classifiers and predictions

First of all, let's split our features into train and test sets.

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sms['cleansing_text'],
    sms['label'],
    test_size=0.2,
    random_state=111,
)