In [27]:
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,KFold,cross_val_score,cross_validate,GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc,roc_curve


import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_colwidth',None)

In [2]:
#read in text
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv('SMSSpamCollection.tsv',sep='\t',header=None)
data.columns = ['label','sms_text']
data['label_num']= data['label'].map({'ham': 0,'spam':1})

In [3]:
#create func to remove punctuation, tokenize, remove stopwords,stem

stopword = nltk.corpus.stopwords.words('english')
def clean_text (single_sms):
    #remove punctuation
    new_text = ''.join([char for char in single_sms if char not in string.punctuation])
    #tokenize sentence (will remove extra spaces as well)
    tokens = word_tokenize(new_text.lower())
    #remove stopwords and returned stemmed word
    new_text = [ps.stem(word) for word in tokens if word not in stopword]
    
    return new_text

In [4]:
#Feature Engineering : add colummns that might add more information

#length of text
def ret_len(text):
    return len(text)- text.count(' ')

data ['text_len'] = data['sms_text'].apply(ret_len)
#data.head()

#% of punctuation chars in a text
def ret_punc_per(text):
    punc_count = 0
    for char in text :
        if char in string.punctuation:
            punc_count += 1
            
    return round(punc_count/(len(text)- text.count(' ')),3)*100
data['punct%'] = data['sms_text'].apply(ret_punc_per)
#data.head()

#num of digits in a text
def ret_digit_per(text):
    digit_count = 0
    for char in text :
        if char.isdigit():
            digit_count += 1
            
    return round(digit_count/(len(text)- text.count(' ')),3)*100
data ['digit%'] = data['sms_text'].apply(ret_digit_per)
#data.head() 

#percentage of capital letters:
def ret_upper_per(text):
    upper_count = 0
    for char in text :
        if char.isupper():
            upper_count += 1
            
    return round(upper_count/(len(text)- text.count(' ')),3)*100
data['upper%'] = data['sms_text'].apply(ret_upper_per)
#data.head()

def ret_url_presence(text):
   
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    if len(urls) == 0:
        #empty list, no url
        return 0
    else :
        return 1 #text has urls
data ['url_present'] = data['sms_text'].apply(ret_url_presence)


In [5]:
#Split data : 

X_train, X_test, y_train, y_test = train_test_split(data[['sms_text','text_len','punct%','digit%','upper%','url_present']],data['label_num'],test_size=0.20, random_state=42)



In [6]:
#create vectorised dataset : convert sms_text col to a numeric form

#create vectorizer and pass clean func made above
tfid_vect = TfidfVectorizer(analyzer=clean_text)
tfid_vect_fit = tfid_vect.fit(X_train['sms_text'])

#create vectorised columns
tfid_train = tfid_vect_fit.transform(X_train['sms_text']) #sparse matrix
tfid_test = tfid_vect_fit.transform(X_test['sms_text']) #sparse matrix


new_train_df = X_train[['text_len','punct%','digit%','upper%','url_present']].reset_index(drop = True)
vect_train_df = pd.DataFrame(tfid_train.toarray()) #convert sparse matrix to array
vect_train_df.columns = tfid_vect.get_feature_names_out() #column names will be unique words 

new_test_df = X_test[['text_len','punct%','digit%','upper%','url_present']].reset_index(drop = True)
vect_test_df = pd.DataFrame(tfid_test.toarray()) #convert sparse matrix to array
vect_test_df.columns = tfid_vect.get_feature_names_out() #column names will be unique words 

X_train_vect = pd.concat([new_train_df,vect_train_df],axis=1)
X_test_vect = pd.concat([new_test_df,vect_test_df],axis=1)

X_train_vect.head()

Unnamed: 0,text_len,punct%,digit%,upper%,url_present,0,008704050406,0089mi,0121,01223585334,...,»,é,ü,üll,–,‘,’,“,…,…thank
0,94,6.4,2.1,3.2,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,104,5.8,1.9,3.8,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,6.1,0.0,4.1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,39,2.6,0.0,2.6,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,22,4.5,0.0,4.5,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#for all models, data should be balanced

In [26]:
#logistic regression : scaled and balanced data (for log reg, data should be scaled.)

#pass vectorised data to the model:

#resample 
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect,y_train)

#scale 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test_vect)

#apply model
lr = LogisticRegression(solver = 'newton-cg')

scores = cross_val_score(lr,X_train_scaled,y_resampled,cv = 5)
print(scores) #model is stable

lr.fit(X_train_scaled,y_resampled)

y_pred = lr.predict(X_test_vect)
print(classification_report(y_test,y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))


[0.99935191 1.         0.99870382 1.         0.99935149]
              precision    recall  f1-score   support

           0       1.00      0.00      0.00       965
           1       0.13      1.00      0.24       149

    accuracy                           0.13      1114
   macro avg       0.57      0.50      0.12      1114
weighted avg       0.88      0.13      0.03      1114

0.5005181347150259




In [25]:
#KNN : scaled and balanced data (for Knn, data should be scaled.)

#pass vectorised data to the model:

#oversample 
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect,y_train)

#scale 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test_vect)

#apply model
kn = KNeighborsClassifier()

scores = cross_val_score(kn,X_train_scaled,y_resampled,cv = 5)
print(scores) #model is stable

kn.fit(X_train_scaled,y_resampled)

y_pred = kn.predict(X_test_vect)
print(classification_report(y_test,y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))



[0.96824368 0.97083603 0.97861309 0.977965   0.98378729]




              precision    recall  f1-score   support

           0       0.87      0.97      0.92       965
           1       0.28      0.07      0.11       149

    accuracy                           0.85      1114
   macro avg       0.57      0.52      0.51      1114
weighted avg       0.79      0.85      0.81      1114

0.5200855443891921


In [24]:
#Decision Tree : needs balanced data , DT can scale itself

#pass vectorised data to the model:

#resample 
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect,y_train)

#apply model
dt = DecisionTreeClassifier()

scores = cross_val_score(dt,X_resampled,y_resampled,cv = 5)
print(scores) #model is stable

dt.fit(X_resampled,y_resampled)

y_pred = dt.predict(X_test_vect)
print(classification_report(y_test,y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))



[0.99157485 0.99092677 0.99805574 0.9889825  0.99610895]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       965
           1       0.89      0.89      0.89       149

    accuracy                           0.97      1114
   macro avg       0.94      0.94      0.94      1114
weighted avg       0.97      0.97      0.97      1114

0.9380185693918004


In [23]:
#Random Forest  : needs balanced data , RF can scale itself

#pass vectorised data to the model:

#oversample 
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect,y_train)

#apply model
rf = RandomForestClassifier()

scores = cross_val_score(rf,X_resampled,y_resampled,cv = 5)
print(scores) #model is stable

rf.fit(X_resampled,y_resampled)
y_pred = rf.predict(X_test_vect)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))



[0.99870382 1.         0.99935191 1.         1.        ]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.87      0.93       149

    accuracy                           0.98      1114
   macro avg       0.99      0.94      0.96      1114
weighted avg       0.98      0.98      0.98      1114

[[964   1]
 [ 19 130]]
0.935723476023229


In [28]:
#Gradient Boost  : needs balanced data , GB can scale itself

#pass vectorised data to the model:

#oversample 
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect,y_train)

#apply model
gb = GradientBoostingClassifier()

scores = cross_val_score(gb,X_resampled,y_resampled,cv = 5)
print(scores) #model is stable

gb.fit(X_resampled,y_resampled)
y_pred = gb.predict(X_test_vect)

print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))

[0.99157485 0.9837978  0.99481529 0.98833441 0.99286641]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.93      0.91      0.92       149

    accuracy                           0.98      1114
   macro avg       0.96      0.95      0.95      1114
weighted avg       0.98      0.98      0.98      1114

[[954  11]
 [ 13 136]]
0.9506763570608896


In [None]:
#RF is top model followed closely by GB