In [15]:
# Import libraries to be used
import pandas as pd
import numpy as np
import re
import string
from matplotlib import pyplot
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import precision_recall_fscore_support as score 
from sklearn.model_selection import train_test_split

In [16]:
# Initialise the stopwords from nltk.corpus
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [17]:
# Load data set and set preliminaries
df = pd.read_csv('SMSSpamCollection.tsv', sep='\t', header=None)
df.columns = ['label', 'text']

df.head()

Unnamed: 0,label,text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [18]:
# Function to get the percentage of punctuations in text, sum them, round up and multiply by 100
def count_punct(text):
    punct_count = sum([1 for char in text if char in string.punctuation])
    return round(punct_count/(len(text) - text.count(" ")), 3) * 100


# Create a feature that would check the len of text and the percentage of punctuations
df['len_text'] = df['text'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['text'].apply(lambda x: count_punct(x))


# Function to remove punctuations, tokenise and stem words
def clean_text(text):
    text = " ".join([word.lower() for word in text if word not in string.punctuation])
    token = re.split('\W+', text)
    text = [ps.stem(word) for word in token if word not in stopwords]
    return text

In [19]:
df.head()

Unnamed: 0,label,text,len_text,punct%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


In [20]:
# Instantiate the vectorise and use the function to stem as hyperparameter, thus gives an vectorised object. Fit and transforrm the text
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(df['text'])

print(tfidf_vect.get_feature_names_out())

# Convert the features from the vectorisation to an array and concatenate the two new features added
X_features = pd.concat([df['len_text'], df['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()




['' '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' 'b' 'c' 'e' 'f' 'g' 'h' 'j'
 'k' 'l' 'n' 'p' 'q' 'r' 'u' 'v' 'w' 'x' 'z' 'è' 'é' 'ì' 'ú' 'ü' '〨' '鈥']


Unnamed: 0,len_text,punct%,0,1,2,3,4,5,6,7,...,26,27,28,29,30,31,32,33,34,35
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.157083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.407846,0.404499,0.377804,0.0,0.08308,0.257814,0.0,...,0.071954,0.14608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.092022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.472263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Instantiate the classifier, perform the kfold spliting of data, and carry out cross evaluation
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=5)
X_features.columns = X_features.columns.astype(str)
cross_val_score(rf, X_features, df['label'], cv=k_fold, scoring='accuracy', n_jobs=-1)

array([0.97666068, 0.98025135, 0.97845601, 0.97394429, 0.97933513])

## Explore RandomForestClassifier through Holdout Set

In [22]:
# Split dataset into training and testing sets and set
X_train, X_test, y_train, y_test = train_test_split(X_features, df['label'], test_size=0.2)
X_train.columns = X_train.columns.astype(str)

# Fit the model, initialise the classifier and set hypeparameter
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)



In [23]:
# Getting the most important feature
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:10]

[(0.1749140756670773, '1'),
 (0.1489287149390899, '2'),
 (0.11018183872943553, '9'),
 (0.08564620515772503, '6'),
 (0.08179434234239791, '7'),
 (0.06752728360735318, '8'),
 (0.04311942903005069, 'len_text'),
 (0.027670144913576826, '3'),
 (0.02568544031154941, '4'),
 (0.022252568819389723, '5')]

In [24]:
# Carry out prediction with the X_test data, the y_test is not needed, score the predictions and save them respectively, calculate accuracy
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

# Precision when a model identify something as spam it was spam the 'x' number of time
print("Precision: {} / Recall: {} / Accuracy: {}".format(round(precision, 3), 
                                                         round(recall, 3),
                                                         round((y_pred==y_test).sum() / len(y_pred), 3)))

Precision: 0.944 / Recall: 0.873 / Accuracy: 0.978


## GRID SEARCH
So here we need to test different hyperparamter setting to know which gives better model performance, so we would wirte the inner and out loop for the grid settings
then call a function that takes the grid settings, test and predicts the various model performance.

In [25]:
# Function to instantiate the classifier, train and predict
def train_pred_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} / ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), round(recall, 3), round((y_pred==y_test).sum() / len(y_pred), 3)
    ))

    

In [26]:
# Perform Grid search by costructing this inner and outter loop
for n_est in [10, 50, 100]:
    for depth in [None, 10, 20, 30]:
        train_pred_RF(n_est, depth)

Est: 10 / Depth: None / ---- Precision: 0.942 / Recall: 0.843 / Accuracy: 0.975
Est: 10 / Depth: 10 / ---- Precision: 0.943 / Recall: 0.858 / Accuracy: 0.977
Est: 10 / Depth: 20 / ---- Precision: 0.967 / Recall: 0.873 / Accuracy: 0.981
Est: 10 / Depth: 30 / ---- Precision: 0.966 / Recall: 0.858 / Accuracy: 0.979
Est: 50 / Depth: None / ---- Precision: 0.967 / Recall: 0.873 / Accuracy: 0.981
Est: 50 / Depth: 10 / ---- Precision: 0.967 / Recall: 0.881 / Accuracy: 0.982
Est: 50 / Depth: 20 / ---- Precision: 0.959 / Recall: 0.873 / Accuracy: 0.98
Est: 50 / Depth: 30 / ---- Precision: 0.944 / Recall: 0.873 / Accuracy: 0.978
Est: 100 / Depth: None / ---- Precision: 0.952 / Recall: 0.881 / Accuracy: 0.98
Est: 100 / Depth: 10 / ---- Precision: 0.967 / Recall: 0.881 / Accuracy: 0.982
Est: 100 / Depth: 20 / ---- Precision: 0.975 / Recall: 0.866 / Accuracy: 0.981
Est: 100 / Depth: 30 / ---- Precision: 0.952 / Recall: 0.881 / Accuracy: 0.98
