<p style="color:#153462; 
          font-weight: bold; 
          font-size: 30px; 
          font-family: Gill Sans, sans-serif; 
          text-align: center;">
          Random Forest Implementation For NLP</p>

### Loading and Preparing the Data

In [31]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

# Register the local NLTK data directory so the stopwords corpus can be found.
# NOTE(review): machine-specific absolute path; adjust it (or drop the line if
# nltk_data lives in a default location) before running on another machine.
nltk.data.path.append(r"D:\Artificial_Intelligence\nat_lang_proc\nltk_data")

# English stopwords, kept as a set so the per-token membership test in
# clean_data is a constant-time lookup instead of a scan over a list.
stopwords = set(nltk.corpus.stopwords.words("english"))

# Porter stemmer used to reduce tokens to their stems.
ps = nltk.PorterStemmer()

In [2]:
# Read the tab-separated SMS corpus. The file has no header row, so the two
# columns are named while loading.
sms_path = r"D:/Artificial_Intelligence/nat_lang_proc/data/SMSSpamCollection.tsv"
data_df = pd.read_csv(sms_path, sep="\t", header=None, names=["labels", "body_text"])

In [3]:
def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Parameters
    ----------
    text : str
        Message body to inspect.

    Returns
    -------
    float
        ``100 * punctuation_chars / non_space_chars``, with the ratio rounded
        to three decimals before scaling. ``0.0`` is returned when ``text``
        has no non-space characters (empty or spaces only) instead of raising
        ``ZeroDivisionError``.
    """
    non_space_len = len(text) - text.count(" ")
    if non_space_len == 0:
        # Nothing to measure against; avoid dividing by zero.
        return 0.0
    punct_count = sum(1 for char in text if char in string.punctuation)
    return round(punct_count / non_space_len, 3) * 100

In [4]:
messages = data_df["body_text"]
# Message length with spaces excluded.
data_df["body_len"] = messages.map(len) - messages.map(lambda msg: msg.count(" "))
# Punctuation share of the non-space characters, as computed by count_punct.
data_df["punct%"] = messages.apply(count_punct)
data_df.head()

Unnamed: 0,labels,body_text,body_len,punct%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


In [5]:
def clean_data(text):
    """Turn one message into a list of lowercase, stemmed, stopword-free tokens.

    Steps: drop every character found in ``string.punctuation``, lowercase the
    remaining characters, split the result into runs of word characters,
    discard tokens present in the module-level ``stopwords`` collection and
    stem the rest with the module-level stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw message body.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    without_punc = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: "\w" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    tokenized_text = re.findall(r"\w+", without_punc)
    return [ps.stem(word) for word in tokenized_text if word not in stopwords]
# Build the TF-IDF document-term matrix for the message bodies, with clean_data
# passed as the vectorizer's analyzer.
tfidf_vect = TfidfVectorizer(analyzer=clean_data)
X_tfidf = tfidf_vect.fit_transform(data_df["body_text"])

In [6]:
# Densify the TF-IDF matrix into a DataFrame whose columns are the vocabulary terms.
X_dfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vect.get_feature_names_out())
# Put the two hand-built features in front of the TF-IDF columns.
X_features = pd.concat([data_df[["body_len", "punct%"]], X_dfidf_df], axis=1)
X_features.head()

Unnamed: 0,body_len,punct%,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Instantiating Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
# n_jobs=-1 builds the individual decision trees in parallel on all available
# CPU cores. random_state pins the forest's randomness so the cross-validation
# scores can be reproduced after a kernel restart.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

### KFold and cross_val_score

In [9]:
from sklearn.model_selection import cross_val_score, KFold

In [10]:
# 5-fold splitter, handed to cross_val_score as its `cv` argument.
k_fold = KFold(n_splits=5)

In [11]:
# Accuracy of `rf` on each fold produced by `k_fold`; n_jobs=-1 lets the folds
# be evaluated in parallel.
cross_val_score(rf, X_features, data_df["labels"], cv=k_fold, scoring="accuracy", n_jobs=-1)

array([0.97755835, 0.97666068, 0.97307002, 0.96495957, 0.97304582])

### Random Forest Through a Holdout Set

In [12]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows as a test set. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    data_df["labels"],
    test_size=0.2,
    random_state=42,
)

In [14]:
# Instantiate the forest and fit it on the training split.
rf = RandomForestClassifier(n_estimators=50,  # number of decision trees in the forest
                            max_depth=20,     # maximum depth of each tree
                            n_jobs=-1,        # build trees in parallel on all cores
                            random_state=42)  # fixed seed so results are reproducible
rf_model = rf.fit(X_train, y_train)

In [15]:
# Ten most important features of the fitted model, as (importance, feature name)
# pairs sorted from most to least important.
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[:10]

[(0.07098722902468964, 'body_len'),
 (0.041519313507758795, 'txt'),
 (0.036523777796449136, 'mobil'),
 (0.035859293596444325, 'call'),
 (0.030344473592360654, 'free'),
 (0.025696618963121625, 'claim'),
 (0.02110967621467747, 'servic'),
 (0.01931735514429946, '500'),
 (0.01866539945869072, 'stop'),
 (0.013500151604101058, 'contact')]

In [16]:
# Predict the labels of the held-out test rows with the fitted forest.
y_predicted = rf_model.predict(X_test)

In [17]:
# Precision, recall and F-score on the test set, treating "spam" as the positive class.
precision, recall, fscore, support = score(y_test, y_predicted, pos_label="spam", average="binary")

In [18]:
# Report all three metrics in the same format: "label: value" at 3 decimals.
accuracy = (y_predicted == y_test).sum() / len(y_predicted)
print(f"precision: {round(precision, 3)}\nrecall: {round(recall, 3)}\naccuracy: {round(accuracy, 3)}")

precision: 1.0
recall:0.61
accuracy: 0.943


### Grid Search
Grid search is a tuning technique that attempts to find the optimum values of a model's hyperparameters. It is an exhaustive search performed over a specified set of parameter values for the model. The model is also known as an estimator.

In [24]:
def train_RF(n_est, depth, random_state=42):
    """Fit a random forest on the training split and print its test-set metrics.

    Fits on the module-level ``X_train``/``y_train`` and evaluates on the
    module-level ``X_test``/``y_test``. Precision and recall are computed with
    "spam" as the positive class.

    Parameters
    ----------
    n_est : int
        Number of trees, passed as ``n_estimators``.
    depth : int or None
        Maximum tree depth, passed as ``max_depth``.
    random_state : int or None, default 42
        Seed forwarded to the forest so that repeated calls with the same
        settings print the same numbers, which keeps the comparison between
        grid points free of run-to-run noise. Pass ``None`` to get the
        previous unseeded behaviour.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    forest = RandomForestClassifier(n_estimators=n_est,
                                    max_depth=depth,
                                    n_jobs=-1,
                                    random_state=random_state)
    y_pred = forest.fit(X_train, y_train).predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label="spam", average="binary")
    accuracy = (y_pred == y_test).sum() / len(y_pred)
    # Adjacent f-strings are concatenated; the leading spaces reproduce the
    # column spacing of the original single-string report exactly.
    print(f"EST: {n_est}, "
          f"    Depth: {depth},"
          f"    Precision: {round(precision, 3)}, "
          f"    Recall: {round(recall, 3)}, "
          f"    Accuracy: {round(accuracy, 3)}")

In [26]:
# Manual grid search: train and report every (n_estimators, max_depth) pair,
# closing each n_estimators group with a row of asterisks.
separator = "*" * 50
for n_est in (10, 50, 100):
    for depth in (10, 20, 30, None):
        train_RF(n_est, depth)
    print(separator)

EST: 10,     Depth: 10,    Precision: 1.0,     Recall: 0.185,     Accuracy: 0.882
EST: 10,     Depth: 20,    Precision: 0.989,     Recall: 0.574,     Accuracy: 0.937
EST: 10,     Depth: 30,    Precision: 0.982,     Recall: 0.691,     Accuracy: 0.953
EST: 10,     Depth: None,    Precision: 0.984,     Recall: 0.778,     Accuracy: 0.966
**************************************************
EST: 50,     Depth: 10,    Precision: 1.0,     Recall: 0.278,     Accuracy: 0.895
EST: 50,     Depth: 20,    Precision: 1.0,     Recall: 0.586,     Accuracy: 0.94
EST: 50,     Depth: 30,    Precision: 1.0,     Recall: 0.704,     Accuracy: 0.957
EST: 50,     Depth: None,    Precision: 0.977,     Recall: 0.784,     Accuracy: 0.966
**************************************************
EST: 100,     Depth: 10,    Precision: 1.0,     Recall: 0.21,     Accuracy: 0.885
EST: 100,     Depth: 20,    Precision: 1.0,     Recall: 0.562,     Accuracy: 0.936
EST: 100,     Depth: 30,    Precision: 1.0,     Recall: 0.673,    

### GridSearchCV

`GridSearchCV` lets you define a grid of parameters that you want to explore; for each parameter setting in the grid, it runs cross-validation.

In [34]:
from sklearn.model_selection import GridSearchCV

In [33]:
# Compare TfidfVectorizer features with CountVectorizer features.
# The TF-IDF feature matrix built earlier is kept under an explicit name.
X_tfidf_feat = X_features

# Same analyzer as the TF-IDF run, but with raw term counts as values.
count_vect = CountVectorizer(analyzer=clean_data)
X_count_vect = count_vect.fit_transform(data_df["body_text"])
X_count_vect_df = pd.DataFrame(X_count_vect.toarray(), columns=count_vect.get_feature_names_out())
# Put the two hand-built features in front of the count columns.
X_count_vect_features = pd.concat([data_df[["body_len", "punct%"]], X_count_vect_df], axis=1)
X_count_vect_features.head()

Unnamed: 0,body_len,punct%,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,160,2.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,62,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,28,7.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Grid search over forest size and depth on the TF-IDF features, scored with
# 5-fold cross-validation. random_state fixes the forest's randomness so that
# reruns, and the comparison with the CountVectorizer search, are reproducible.
rf = RandomForestClassifier(random_state=42)
params = {"n_estimators": [10, 150, 300],
          "max_depth": [30, 60, 90, None]}
gs = GridSearchCV(rf, params, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf_feat, data_df["labels"])
# Best-scoring parameter combinations first.
pd.DataFrame(gs_fit.cv_results_).sort_values("mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,34.470396,0.904333,0.506253,0.087621,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.979354,0.978456,0.975763,0.969452,0.971249,0.974855,0.003905,1
8,67.741712,1.363963,0.681813,0.010927,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.978456,0.978456,0.974865,0.968553,0.973046,0.974675,0.003708,2
11,69.911508,8.206558,0.716933,0.211969,,300,"{'max_depth': None, 'n_estimators': 300}",0.977558,0.977558,0.974865,0.969452,0.973046,0.974496,0.003048,3
10,38.518934,0.510064,0.509887,0.021077,,150,"{'max_depth': None, 'n_estimators': 150}",0.976661,0.977558,0.972172,0.969452,0.97035,0.973239,0.003292,4
4,28.468587,0.131534,0.446072,0.008538,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.977558,0.976661,0.973968,0.966757,0.969452,0.972879,0.004162,5
5,55.185389,0.066707,0.622617,0.023496,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.974865,0.972172,0.973968,0.966757,0.97035,0.971622,0.002885,6
6,3.546135,0.064686,0.292764,0.013764,90.0,10,"{'max_depth': 90, 'n_estimators': 10}",0.973968,0.966786,0.974865,0.96496,0.973046,0.970725,0.004045,7
3,3.075616,0.081649,0.365383,0.102473,60.0,10,"{'max_depth': 60, 'n_estimators': 10}",0.97307,0.978456,0.971275,0.96496,0.96496,0.970544,0.005136,8
9,3.82307,0.096718,0.295417,0.014827,,10,"{'max_depth': None, 'n_estimators': 10}",0.975763,0.967684,0.966786,0.96496,0.967655,0.96857,0.00373,9
0,4.122379,1.340971,0.249655,0.092334,30.0,10,"{'max_depth': 30, 'n_estimators': 10}",0.962298,0.956912,0.965889,0.954178,0.960467,0.959949,0.004089,10


In [39]:
# Grid search over forest size and depth on the CountVectorizer features,
# scored with 5-fold cross-validation. random_state fixes the forest's
# randomness so that reruns, and the comparison with the TF-IDF search, are
# reproducible.
rf = RandomForestClassifier(random_state=42)
params = {"n_estimators": [10, 150, 300],
          "max_depth": [30, 60, 90, None]}
gs = GridSearchCV(rf, params, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_vect_features, data_df["labels"])
# Best-scoring parameter combinations first.
results = pd.DataFrame(gs_fit.cv_results_).sort_values("mean_test_score", ascending=False)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,47.255805,0.847613,0.760613,0.227499,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.979354,0.974865,0.974865,0.967655,0.97035,0.973418,0.004051,1
8,89.931547,0.627204,1.159445,0.240929,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.978456,0.973968,0.973968,0.968553,0.97035,0.973059,0.003418,2
11,88.788417,14.694663,1.014585,0.306717,,300,"{'max_depth': None, 'n_estimators': 300}",0.977558,0.974865,0.973968,0.967655,0.969452,0.9727,0.00363,3
10,50.142477,0.887726,0.704861,0.235281,,150,"{'max_depth': None, 'n_estimators': 150}",0.976661,0.970377,0.97307,0.969452,0.969452,0.971802,0.002767,4
6,5.490256,0.855882,0.441548,0.153373,90.0,10,"{'max_depth': 90, 'n_estimators': 10}",0.968582,0.980251,0.969479,0.968553,0.97035,0.971443,0.004454,5
5,73.410212,1.136992,0.990469,0.234574,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.975763,0.970377,0.97307,0.96496,0.968553,0.970545,0.003709,6
4,37.665565,0.692362,0.724433,0.211044,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.975763,0.97307,0.968582,0.962264,0.97035,0.970006,0.004574,7
3,4.173922,0.733998,0.49218,0.147537,60.0,10,"{'max_depth': 60, 'n_estimators': 10}",0.974865,0.974865,0.969479,0.961366,0.963163,0.968748,0.005676,8
9,5.523036,1.354415,0.589305,0.038643,,10,"{'max_depth': None, 'n_estimators': 10}",0.971275,0.964991,0.968582,0.959569,0.972147,0.967313,0.004605,9
1,23.254112,0.819941,0.494655,0.197629,30.0,150,"{'max_depth': 30, 'n_estimators': 150}",0.966786,0.960503,0.962298,0.954178,0.954178,0.959589,0.004869,10


In [38]:
# NOTE(review): stray scratch arithmetic; nothing in the visible notebook uses
# this value. Consider deleting the cell.
24* 8

192

### Important Resources
<a href="https://medium.com/fintechexplained/what-is-grid-search-c01fe886ef0a">Grid Search</a>