<p style="color:#153462; 
          font-weight: bold; 
          font-size: 30px; 
          font-family: Gill Sans, sans-serif; 
          text-align: center;">
          Random Forest Implementation For NLP</p>

### Loading and Preparing the Data

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# Register the local corpus directory before any NLTK corpus is read.
nltk.data.path.append(r"D:\Artificial_Intelligence\nat_lang_proc\nltk_data")

# Porter stemmer instance shared by the text-cleaning code.
ps = nltk.PorterStemmer()
# English stop-word list from the NLTK corpus.
stopwords = nltk.corpus.stopwords.words("english")

In [2]:
# Read the tab-separated SMS file (it has no header row), then name its two columns.
data_df = pd.read_csv(
    r"D:/Artificial_Intelligence/nat_lang_proc/data/SMSSpamCollection.tsv",
    sep="\t",
    header=None,
)
data_df.columns = ["labels", "body_text"]

In [3]:
def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Args:
        text: The message string to inspect.

    Returns:
        A float percentage between 0 and 100, rounded to one decimal place.
        Returns 0.0 when ``text`` has no non-space characters.
    """
    non_space_len = len(text) - text.count(" ")
    if non_space_len == 0:
        # Empty or all-space input: there is nothing to measure, and dividing
        # would raise ZeroDivisionError.
        return 0.0
    punct_count = sum(1 for char in text if char in string.punctuation)
    # Round the ratio to three places as before, then round once more after
    # scaling so the multiplication cannot leave float noise such as
    # 7.1000000000000005.
    return round(round(punct_count / non_space_len, 3) * 100, 1)

In [4]:
# Message length with spaces excluded.
data_df["body_len"] = data_df["body_text"].apply(
    lambda message: len(message) - message.count(" ")
)
# Punctuation percentage per message, as computed by count_punct.
data_df["punct%"] = data_df["body_text"].apply(count_punct)
data_df.head()

Unnamed: 0,labels,body_text,body_len,punct%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


In [5]:
def clean_data(text):
    """Turn a raw message into a list of stemmed tokens.

    The text is lower-cased and stripped of punctuation characters, split into
    word tokens, filtered against the module-level ``stopwords`` list, and each
    remaining token is stemmed with the module-level stemmer ``ps``.

    Args:
        text: The raw message string.

    Returns:
        A list of stemmed token strings.
    """
    without_punc = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: "\w" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    tokenized_text = re.findall(r"\w+", without_punc)
    return [ps.stem(word) for word in tokenized_text if word not in stopwords]
# Use clean_data as the analyzer so each message is tokenised by that function.
tfidf_vect = TfidfVectorizer(analyzer=clean_data)
# Fit the vectoriser on the message bodies and transform them in one call.
X_tfidf = tfidf_vect.fit_transform(data_df["body_text"])

In [6]:
# Convert the TF-IDF matrix to a dense frame and label each column with its feature name.
X_dfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=tfidf_vect.get_feature_names_out(),
)
# Place the two hand-crafted features in front of the TF-IDF columns.
X_features = pd.concat([data_df[["body_len", "punct%"]], X_dfidf_df], axis=1)
X_features.head()

Unnamed: 0,body_len,punct%,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Instantiating Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
# n_jobs=-1 lets scikit-learn use all available CPU cores to build the trees in parallel.
# random_state fixes the forest's randomness so repeated runs give the same scores.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

### KFold and cross_val_score

In [9]:
from sklearn.model_selection import cross_val_score, KFold

In [10]:
# Five folds taken in row order; shuffle=False is the default, spelled out for clarity.
k_fold = KFold(n_splits=5, shuffle=False)

In [11]:
# Accuracy of the classifier on each fold produced by k_fold.
cross_val_score(
    estimator=rf,
    X=X_features,
    y=data_df["labels"],
    cv=k_fold,
    scoring="accuracy",
    n_jobs=-1,
)

array([0.97666068, 0.98025135, 0.97486535, 0.96495957, 0.97394429])

### Random Forest Through a Holdout Set

In [12]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [14]:
# Splitting data: hold out 20% of the messages for testing.
# random_state makes the split reproducible; stratify keeps the label
# proportions the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    data_df["labels"],
    test_size=0.2,
    random_state=42,
    stratify=data_df["labels"],
)

In [17]:
# Instantiating the model used for the hold-out evaluation.
rf = RandomForestClassifier(
    n_estimators=50,  # number of decision trees in the forest
    max_depth=20,     # maximum depth of each decision tree
    n_jobs=-1,        # use all available CPU cores
    random_state=42,  # fixed seed so repeated runs build the same forest
)
# fit() returns the fitted estimator, so rf_model refers to the trained forest.
rf_model = rf.fit(X_train, y_train)

In [21]:
# Checking the most important features: pair each importance with its column
# name, sort in descending order and keep the first ten.
sorted(
    zip(rf_model.feature_importances_, X_train.columns),
    reverse=True,
)[:10]

[(0.060629490205550855, 'body_len'),
 (0.045668736978189954, 'txt'),
 (0.03950366232278892, 'free'),
 (0.03392137889383775, 'call'),
 (0.02351650306398356, 'prize'),
 (0.018965004260149004, 'claim'),
 (0.017612146194033985, 'servic'),
 (0.0164931086462909, 'cash'),
 (0.015334899848996665, 'award'),
 (0.01461329012029837, 'text')]

In [22]:
# Label every message in the held-out test set with the fitted forest.
y_predicted = rf_model.predict(X=X_test)

In [23]:
# Binary metrics with "spam" treated as the positive class.
precision, recall, fscore, support = score(
    y_true=y_test,
    y_pred=y_predicted,
    pos_label="spam",
    average="binary",
)

In [25]:
# Report precision, recall and accuracy; accuracy is the fraction of test
# predictions that equal the true label.
print(
    f"precision: {round(precision, 3)}\n"
    f"recall:{round(recall, 2)}\n"
    f"accuracy: {round((y_predicted == y_test).sum()/len(y_predicted), 3)}"
)

precision: 1.0
recall:0.64
accuracy: 0.956
