<p style="color:#153462; 
          font-weight: bold; 
          font-size: 30px; 
          font-family: Gill Sans, sans-serif; 
          text-align: center;">
          Model Selection</p>

### objective
<p style="text-align: justify; text-justify: inter-word;">
   <font size=3>
       <ul>
           <li>Split the data into training and test sets</li>
           <li>Train vectorizer on training set and use it transform test set</li>
           <li>Fit best Random Forest and Gradient Boosting model on training set and predict on test set</li>
           <li>Thoroughly evaluate results of these two models to select best model</li>
       </ul>
   </font>
</p>

In [37]:
import pandas as pd
import string
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [25]:
nltk.data.path.append(r"D:\Artificial_Intelligence\nat_lang_proc\nltk_data")
stopwords = nltk.corpus.stopwords.words("english")
ps = nltk.PorterStemmer()

data_df = pd.read_csv(r"D:/Artificial_Intelligence/nat_lang_proc/data/SMSSpamCollection.tsv", 
                      delimiter="\t", 
                      header=None)
data_df.columns = ["labels", "body_text"]

# Adding two new features
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")), 3)*100

data_df["body_len"] = data_df["body_text"].apply(lambda x:len(x) - x.count(" "))
data_df["punct%"] = data_df["body_text"].apply(lambda x: count_punct(x))
data_df.head()

Unnamed: 0,labels,body_text,body_len,punct%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


In [26]:
# Cleaning and Vectorizing the data using TF-IDF vectorizer
def clean_data(text):
    without_punc = "".join([char.lower() for char in text if char not in string.punctuation])
    tokenzied_text = re.findall("\w+", without_punc)
    stemmed_tokens = [ps.stem(word) for word in tokenzied_text if word not in stopwords]
    return stemmed_tokens

### Vectorizing the Text

In [27]:
X_train, X_test, y_train, y_test = train_test_split(data_df[["body_text", "body_len", "punct%"]],
                                                    data_df["labels"],
                                                    test_size=0.2)

In [28]:
tfidf_vect = TfidfVectorizer(analyzer=clean_data)
tfidf_vect_fit = tfidf_vect.fit(X_train["body_text"])

In [29]:
tfidf_train = tfidf_vect_fit.transform(X_train["body_text"])
tfidf_test = tfidf_vect_fit.transform(X_test["body_text"])

In [30]:
tfidf_train

<4454x7123 sparse matrix of type '<class 'numpy.float64'>'
	with 39819 stored elements in Compressed Sparse Row format>

In [31]:
tfidf_test

<1114x7123 sparse matrix of type '<class 'numpy.float64'>'
	with 8875 stored elements in Compressed Sparse Row format>

<p style="text-align: justify; text-justify: inter-word;">
   <font size=3>
       <code>tfidf_train</code> and <code>tfidf_test</code> will have same number of columns. Because they
       both transformed using <code>tfidf_vect_fit</code> that was trained on the training set. So it only
       recongnizes word in the training set and can only create columns for words from the training set.
   </font>
</p>

In [44]:
X_train_vect_df = pd.DataFrame(tfidf_train.toarray())
X_train_vect_df.columns = tfidf_vect.get_feature_names_out()
                               
X_train_vect = pd.concat([X_train[["body_len", "punct%"]].reset_index(drop=True), X_train_vect_df], axis=1)
X_train_vect.head()

Unnamed: 0,body_len,punct%,0,008704050406,0089mi,01223585236,01223585334,0125698789,02,020603,...,zed,zero,zindgi,zoe,zogtoriu,zoom,zyada,é,ü,üll
0,133,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,19,5.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.379866,0.0
3,93,1.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,27,25.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
X_test_vect_df = pd.DataFrame(tfidf_test.toarray())
X_test_vect_df.columns = tfidf_vect.get_feature_names_out()

X_test_vect = pd.concat([X_test[["body_len", "punct%"]].reset_index(drop=True), X_test_vect_df], axis=1)
X_test_vect.head()

Unnamed: 0,body_len,punct%,0,008704050406,0089mi,01223585236,01223585334,0125698789,02,020603,...,zed,zero,zindgi,zoe,zogtoriu,zoom,zyada,é,ü,üll
0,39,5.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,23,4.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,32,12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,58,8.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,137,6.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Model Evaluation

In [46]:
# Evaluating Random Forest Classifier
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
rf_model_fit_time = end - start

start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
rf_model_pred_time = end - start

precision, recall, fscore, support =score(y_test, y_pred, pos_label="spam", average="binary")

In [51]:
print(f"""
Fit Time\t: {round(rf_model_fit_time, 3)}
Predict Time\t: {round(rf_model_pred_time, 3)}
Precision\t: {round(precision, 3)}
Recall \t\t: {round(recall, 3)}
Accuracy\t: {round((y_pred == y_test).sum()/len(y_pred), 3)}
""")


Fit Time	: 9.173
Predict Time	: 0.275
Precision	: 1.0
Recall 		: 0.794
Accuracy	: 0.971



In [53]:
# Validating Gradient Boosting Model

gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start = time.time()
gb_model = gb.fit(X_train_vect, y_train)
end = time.time()
gb_model_fit_time = end - start

start = time.time()
y_pred = gb_model.predict(X_test_vect)
end = time.time()
gb_model_pred_time = end - start

precision, recall, fscore, support = score(y_test, y_pred, pos_label="spam", average="binary")

In [54]:
print(f"""
Fit Time\t: {round(gb_model_fit_time, 3)}
Predict Time\t: {round(gb_model_pred_time, 3)}
Precision\t: {round(precision, 3)}
Recall \t\t: {round(recall, 3)}
Accuracy\t: {round((y_pred == y_test).sum()/len(y_pred), 3)}
""")


Fit Time	: 376.128
Predict Time	: 0.275
Precision	: 0.97
Recall 		: 0.839
Accuracy	: 0.974



In opinion consider Random Forest Model.