In [None]:
'''
Final Evaluation
'''

In [18]:
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import time

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,KFold,cross_val_score,cross_validate,GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc,roc_curve
from sklearn.feature_selection import SelectFromModel


import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_colwidth',None)

In [19]:
#read in text
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv('SMSSpamCollection.tsv',sep='\t',header=None)
data.columns = ['label','sms_text']
data['label_num']= data['label'].map({'ham': 0,'spam':1})

In [20]:
#create func to remove punctuation, tokenize, remove stopwords,stem

stopword = nltk.corpus.stopwords.words('english')
def clean_text (single_sms):
    #remove punctuation
    new_text = ''.join([char for char in single_sms if char not in string.punctuation])
    #tokenize sentence (will remove extra spaces as well)
    tokens = word_tokenize(new_text.lower())
    #remove stopwords and returned stemmed word
    new_text = [ps.stem(word) for word in tokens if word not in stopword]
    
    return new_text

In [22]:
#feature engineering

#length of text
def ret_len(text):
    return len(text)- text.count(' ')

data ['text_len'] = data['sms_text'].apply(ret_len)


#% of punctuation chars in a text
def ret_punc_per(text):
    punc_count = 0
    for char in text :
        if char in string.punctuation:
            punc_count += 1
            
    return round(punc_count/(len(text)- text.count(' ')),3)*100
data['punct%'] = data['sms_text'].apply(ret_punc_per)


#num of digits in a text
def ret_digit_per(text):
    digit_count = 0
    for char in text :
        if char.isdigit():
            digit_count += 1
            
    return round(digit_count/(len(text)- text.count(' ')),3)*100
data ['digit%'] = data['sms_text'].apply(ret_digit_per)


#percentage of capital letters:
def ret_upper_per(text):
    upper_count = 0
    for char in text :
        if char.isupper():
            upper_count += 1
            
    return round(upper_count/(len(text)- text.count(' ')),3)*100
data['upper%'] = data['sms_text'].apply(ret_upper_per)


def ret_url_presence(text):
   
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    if len(urls) == 0:
        #empty list, no url
        return 0
    else :
        return 1 #text has urls
data ['url_present'] = data['sms_text'].apply(ret_url_presence)


In [23]:
#Split data :

X_train, X_test, y_train, y_test = train_test_split(data[['sms_text','text_len','punct%','digit%','upper%','url_present']],data['label_num'],test_size=0.20, random_state=42)



In [24]:
#create vectorised dataset : convert sms_text col to a numeric form

#create vectorizer and pass clean func made above
tfid_vect = TfidfVectorizer(analyzer=clean_text)
tfid_vect_fit = tfid_vect.fit(X_train['sms_text'])

#create vectorised columns
tfid_train = tfid_vect_fit.transform(X_train['sms_text']) #sparse matrix
tfid_test = tfid_vect_fit.transform(X_test['sms_text']) #sparse matrix


new_train_df = X_train[['text_len','punct%','digit%','upper%','url_present']].reset_index(drop = True)
vect_train_df = pd.DataFrame(tfid_train.toarray()) #convert sparse matrix to array
vect_train_df.columns = tfid_vect.get_feature_names_out() #column names will be unique words 

new_test_df = X_test[['text_len','punct%','digit%','upper%','url_present']].reset_index(drop = True)
vect_test_df = pd.DataFrame(tfid_test.toarray()) #convert sparse matrix to array
vect_test_df.columns = tfid_vect.get_feature_names_out() #column names will be unique words 

X_train_vect = pd.concat([new_train_df,vect_train_df],axis=1)
X_test_vect = pd.concat([new_test_df,vect_test_df],axis=1)

In [30]:
#find best columns for Random Forest : Select features whose importance is greater than or equal to the mean 
#of all features

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect,y_train)

#SelectFromModel - select features based on importance weights
sel_r = SelectFromModel(RandomForestClassifier(random_state=42,n_jobs=-1)) #n_estimators = 100 is default
sel_r.fit(X_resampled, y_resampled)

selected_feat_rf = X_resampled.columns[sel_r.get_support()]
print(sel_r.threshold_) #pd.Series(rf.feature_importances_).mean()
print(len(selected_feat_rf))


0.00013726835964310226
656


In [None]:
'''
SelectFromModel : Extract best features of given dataset according to the importance of weights. 
The SelectFromModel is a meta-estimator that determines the weight importance by comparing to the given
threshold value.
The threshold value to use for feature selection. Features whose importance is greater or equal are kept while the 
others are discarded. If "median" (resp. "mean"), then the ``threshold`` value is the median (resp. the mean) of the
feature importances. A scaling factor (e.g., "1.25*mean") may also be used. If None and if the estimator has a 
parameter penalty set to l1, either explicitly or implicitly (e.g, Lasso), the threshold used is 1e-5.
Otherwise, "mean" is used by default. 

'''

In [27]:
#Find performance metrics for Random Forest with 642 features 

#oversample 
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect[selected_feat_rf],y_train) #642 cols

#apply model, chose best features from grid search cv
rf = RandomForestClassifier(n_estimators=100, max_depth=30,random_state=42,n_jobs=-1) #best params

start = time.time()
rf.fit(X_resampled,y_resampled)
end = time.time()
fit_time = end-start

start = time.time()
y_pred = rf.predict(X_test_vect[selected_feat_rf]) #642 cols
end = time.time()
predict_time = end-start

print(f'fit time is {fit_time} and Predict time is {predict_time}')
print(classification_report(y_test,y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))



fit time is 0.27548694610595703 and Predict time is 0.020874977111816406
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.90      0.94       149

    accuracy                           0.99      1114
   macro avg       0.99      0.95      0.97      1114
weighted avg       0.99      0.99      0.99      1114

0.9491462948151754


In [None]:
#--------------------------------------------------

In [33]:
#calculate best features for Gradient Boosting :

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect,y_train)

sel_g = SelectFromModel(GradientBoostingClassifier()) #n_estimators = 100 is default
sel_g.fit(X_resampled, y_resampled)

selected_feat_gb = X_resampled.columns[sel_g.get_support()]

print(sel_g.threshold_) #pd.Series(gb.feature_importances_).mean()
print(len(selected_feat_gb))


0.00013726835964310226
84


In [29]:
%%time

##verify result of gridsearch cv, use best params and features to calculate performance metrics

#oversample 
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_vect[selected_feat_gb],y_train) #87 features

#apply model
gb = GradientBoostingClassifier(n_estimators=150,max_depth=3,learning_rate=.10) #best params
start = time.time()
gb.fit(X_resampled,y_resampled)
end = time.time()
fit_time = end-start

start = time.time()
y_pred = gb.predict(X_test_vect[selected_feat_gb]) #87 features
end = time.time()
predict_time = end-start

print(f'fit time is {fit_time} and Predict time is {predict_time}')
print(classification_report(y_test,y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))

fit time is 1.0869531631469727 and Predict time is 0.0027589797973632812
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.91      0.92      0.91       149

    accuracy                           0.98      1114
   macro avg       0.95      0.95      0.95      1114
weighted avg       0.98      0.98      0.98      1114

0.9524776576137984
CPU times: user 1.12 s, sys: 64.1 ms, total: 1.18 s
Wall time: 1.18 s


In [None]:
'''
Prediction time is more important than fit time.

Interpretation :
1) Accuracy is same for both.
2) GB takes longer to fit
3) RF takes longer to predict
4) RF has a slightly higher f1 score

'''