In [None]:
'''
check performance of all models - all features vs. digit%(most important feature)
'''

In [1]:
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import time

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,KFold,cross_val_score,cross_validate,GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc,roc_curve
from sklearn.feature_selection import SelectFromModel


import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_colwidth',None)

In [2]:
#read in text
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv('SMSSpamCollection.tsv',sep='\t',header=None)
data.columns = ['label','sms_text']
data['label_num']= data['label'].map({'ham': 0,'spam':1})

In [3]:
#create func to remove punctuation, tokenize, remove stopwords,stem

stopword = nltk.corpus.stopwords.words('english')
def clean_text (single_sms):
    #remove punctuation
    new_text = ''.join([char for char in single_sms if char not in string.punctuation])
    #tokenize sentence (will remove extra spaces as well)
    tokens = word_tokenize(new_text.lower())
    #remove stopwords and returned stemmed word
    new_text = [ps.stem(word) for word in tokens if word not in stopword]
    
    return new_text

In [4]:
#feature engineering

#length of text
def ret_len(text):
    return len(text)- text.count(' ')

data ['text_len'] = data['sms_text'].apply(ret_len)


#% of punctuation chars in a text
def ret_punc_per(text):
    punc_count = 0
    for char in text :
        if char in string.punctuation:
            punc_count += 1
            
    return round(punc_count/(len(text)- text.count(' ')),3)*100
data['punct%'] = data['sms_text'].apply(ret_punc_per)


#num of digits in a text
def ret_digit_per(text):
    digit_count = 0
    for char in text :
        if char.isdigit():
            digit_count += 1
            
    return round(digit_count/(len(text)- text.count(' ')),3)*100
data ['digit%'] = data['sms_text'].apply(ret_digit_per)


#percentage of capital letters:
def ret_upper_per(text):
    upper_count = 0
    for char in text :
        if char.isupper():
            upper_count += 1
            
    return round(upper_count/(len(text)- text.count(' ')),3)*100
data['upper%'] = data['sms_text'].apply(ret_upper_per)


def ret_url_presence(text):
   
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    if len(urls) == 0:
        #empty list, no url
        return 0
    else :
        return 1 #text has urls
data ['url_present'] = data['sms_text'].apply(ret_url_presence)


In [57]:
#Split data :

X_train, X_test, y_train, y_test = train_test_split(data[['sms_text','text_len','punct%','digit%','upper%','url_present']],data['label_num'],test_size=0.20, random_state=42)



In [58]:
#create vectorised dataset : convert sms_text col to a numeric form

#create vectorizer and pass clean func made above
tfid_vect = TfidfVectorizer(analyzer=clean_text)
tfid_vect_fit = tfid_vect.fit(X_train['sms_text'])

#create vectorised columns
tfid_train = tfid_vect_fit.transform(X_train['sms_text']) #sparse matrix
tfid_test = tfid_vect_fit.transform(X_test['sms_text']) #sparse matrix


new_train_df = X_train[['text_len','punct%','digit%','upper%','url_present']].reset_index(drop = True)
vect_train_df = pd.DataFrame(tfid_train.toarray()) #convert sparse matrix to array
vect_train_df.columns = tfid_vect.get_feature_names_out() #column names will be unique words 

new_test_df = X_test[['text_len','punct%','digit%','upper%','url_present']].reset_index(drop = True)
vect_test_df = pd.DataFrame(tfid_test.toarray()) #convert sparse matrix to array
vect_test_df.columns = tfid_vect.get_feature_names_out() #column names will be unique words 

X_train_vect = pd.concat([new_train_df,vect_train_df],axis=1)
X_test_vect = pd.concat([new_test_df,vect_test_df],axis=1)

X_test_vect.shape
X_test_vect.head()


Unnamed: 0,text_len,punct%,digit%,upper%,url_present,0,008704050406,0089mi,0121,01223585334,...,»,é,ü,üll,–,‘,’,“,…,…thank
0,128,7.8,6.2,13.3,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,41,2.4,0.0,4.9,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34,11.8,5.9,5.9,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35,5.7,0.0,2.9,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,41,2.4,0.0,4.9,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#compare performance with digit% column only.

In [59]:
%%time
#GB with all features :

#oversample 
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect,y_train) #all 7285 features

#apply model
gb = GradientBoostingClassifier() #default parameters
start = time.time()
gb.fit(X_resampled,y_resampled)
end = time.time()
fit_time = end-start

start = time.time()
y_pred = gb.predict(X_test_vect) #all 7285 features 
end = time.time()
predict_time = end-start

print(f'fit time is {fit_time} and Predict time is {predict_time}')
print(classification_report(y_test,y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))

fit time is 29.46201205253601 and Predict time is 0.1508159637451172
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       965
           1       0.90      0.92      0.91       149

    accuracy                           0.97      1114
   macro avg       0.94      0.95      0.95      1114
weighted avg       0.98      0.97      0.98      1114

0.9514413881837466
CPU times: user 30.2 s, sys: 1.24 s, total: 31.4 s
Wall time: 31.5 s


In [66]:
%%time
#GB with only digit% feature :

#https://stackoverflow.com/questions/51150153/valueerror-expected-2d-array-got-1d-array-instead
#https://www.edureka.co/community/66401/valueerror-expected-2d-array-got-1d-array-instead-array-4-7-9
X_train_vect_digit = X_train_vect['digit%'].values.reshape(-1,1)
X_test_vect_digit = X_test_vect['digit%'].values.reshape(-1,1)

#oversample 
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect_digit,y_train)

#apply model
gb = GradientBoostingClassifier() #default parameters
start = time.time()
gb.fit(X_resampled,y_resampled)
end = time.time()
fit_time = end-start

start = time.time()
y_pred = gb.predict(X_test_vect_digit) #only 1 feature - digit%
end = time.time()
predict_time = end-start

print(f'fit time is {fit_time} and Predict time is {predict_time}')
print(classification_report(y_test,y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))



fit time is 0.19907808303833008 and Predict time is 0.0008518695831298828
              precision    recall  f1-score   support

           0       0.98      0.95      0.97       965
           1       0.73      0.87      0.80       149

    accuracy                           0.94      1114
   macro avg       0.86      0.91      0.88      1114
weighted avg       0.95      0.94      0.94      1114

0.9118892791320373
CPU times: user 209 ms, sys: 2.91 ms, total: 211 ms
Wall time: 209 ms


In [67]:
%%time
#RF with only digit% feature :

#https://stackoverflow.com/questions/51150153/valueerror-expected-2d-array-got-1d-array-instead
#https://www.edureka.co/community/66401/valueerror-expected-2d-array-got-1d-array-instead-array-4-7-9
X_train_vect_digit = X_train_vect['digit%'].values.reshape(-1,1)
X_test_vect_digit = X_test_vect['digit%'].values.reshape(-1,1)

#oversample 
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect_digit,y_train)

#apply model
rf = RandomForestClassifier() #default parameters
start = time.time()
rf.fit(X_resampled,y_resampled)
end = time.time()
fit_time = end-start

start = time.time()
y_pred = rf.predict(X_test_vect_digit) #only 1 feature - digit%
end = time.time()
predict_time = end-start

print(f'fit time is {fit_time} and Predict time is {predict_time}')
print(classification_report(y_test,y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))



fit time is 0.25598978996276855 and Predict time is 0.008738040924072266
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       965
           1       0.76      0.84      0.80       149

    accuracy                           0.94      1114
   macro avg       0.87      0.90      0.88      1114
weighted avg       0.95      0.94      0.94      1114

0.8992558333623117
CPU times: user 272 ms, sys: 4.28 ms, total: 276 ms
Wall time: 274 ms


In [69]:
%%time
#DT with only digit% feature :

#https://stackoverflow.com/questions/51150153/valueerror-expected-2d-array-got-1d-array-instead
#https://www.edureka.co/community/66401/valueerror-expected-2d-array-got-1d-array-instead-array-4-7-9
X_train_vect_digit = X_train_vect['digit%'].values.reshape(-1,1)
X_test_vect_digit = X_test_vect['digit%'].values.reshape(-1,1)

#oversample 
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect_digit,y_train)

#apply model
dt = DecisionTreeClassifier() #default parameters
start = time.time()
dt.fit(X_resampled,y_resampled)
end = time.time()
fit_time = end-start

start = time.time()
y_pred = dt.predict(X_test_vect_digit) #only 1 feature - digit%
end = time.time()
predict_time = end-start

print(f'fit time is {fit_time} and Predict time is {predict_time}')
print(classification_report(y_test,y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))



fit time is 0.006953001022338867 and Predict time is 0.0003151893615722656
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       965
           1       0.76      0.84      0.80       149

    accuracy                           0.94      1114
   macro avg       0.87      0.90      0.88      1114
weighted avg       0.95      0.94      0.94      1114

0.8987376986472858
CPU times: user 21 ms, sys: 2.5 ms, total: 23.5 ms
Wall time: 21.7 ms


In [None]:
'''
Conclusion : The performance with all features is better than a single feature - digit%
'''