### Building Machine Learning Classifiers: Random Forest on a holdout test set

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Explore RandomForestClassifier through Holdout Set**

In [42]:
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.model_selection import train_test_split

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2) # remember: train_test_split outputs 4 datasets

In [46]:
# Convert feature names to string if they are not already strings
X_train.columns = X_train.columns.astype(str)

# Initialize the RandomForestClassifier with specified parameters
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
# A random forest classifier (RandomForestClassifier) is initialized with specified parameters such as the number of trees (n_estimators=50) and the maximum depth of each tree (max_depth=20). The n_jobs=-1 parameter allows the classifier to use all available CPU cores for parallel processing



# Fit the RandomForestClassifier to the training data
rf_model = rf.fit(X_train, y_train)
# The random forest classifier is trained (fit) using the training data (X_train, y_train). This step involves building multiple decision trees using the training data and combining their predictions to make more accurate predictions.

In [47]:
# looking at feature_importances_

sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:10]

# This line calculates the feature importances of the trained model and sorts them in descending order. Each feature importance score is paired with its corresponding feature name. It selects the top 10 most important features based on their importance scores.

[(0.05238952660486053, 'body_len'),
 (0.04933552418490005, '7350'),
 (0.03245851075841468, '1803'),
 (0.027196710003358448, '6746'),
 (0.025984247048935347, '5724'),
 (0.02387990552999657, '3134'),
 (0.02296065484562926, '2031'),
 (0.02289305286721421, '4796'),
 (0.02172753703126454, '7461'),
 (0.02105655663579477, '6285')]

In [59]:
# Convert feature names to string if they are not already strings
X_test.columns = X_test.columns.astype(str)

# Assuming rf_model is already trained and X_test, y_test are your test data
y_pred = rf_model.predict(X_test)
# Using the trained random forest model (rf_model), predictions are made on the test data (X_test). The predicted labels are stored in y_pred.




# Calculate precision, recall, F1-score, and support
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, pos_label='spam', average='binary')
# Evaluation metrics including precision, recall, F1-score, and support are calculated using the predicted labels (y_pred) and the true labels (y_test). These metrics are commonly used to assess the performance of a binary classification model.


# Print the results
print("Precision:", round(precision, 3))
print("Recall:", round(recall, 3))
print("F1-score:", round(fscore,3))
print("Support:", support)
print("Accuracy:",round((y_pred==y_test).sum() / len(y_pred), 3))


Precision: 1.0
Recall: 0.627
F1-score: 0.771
Support: None
Accuracy: 0.952
