In [2]:
#Program 12 - Building Random Forest Classifier

In [3]:
#cleaning data for the model

In [4]:
#importing all required libraries
import nltk
import re
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
#default colwidth is 50 characters, set_option allows to define custom no. of characters to be displayed within each column of a dataframe
pd.set_option('max_colwidth',100)

#importing the data into a dataframe
data = pd.read_csv("E:/Training/Certifications - LinkedIn/NLP with Python for Machine Learning Essential Training/Dataset/SMSSpamCollection.tsv", sep='\t', names=['Label','Text'])

#displaying first 5 rows of the dataframe
data.head()

Unnamed: 0,Label,Text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [6]:
#creating a function to calculate the percent of punctuation in a text message
def punct_count(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")),3)*100

#applying the method
data["Text_Punct%"] = data["Text"].apply(lambda x: punct_count(x))

#displaying first 10 entries of the dataframe
data.head(10)

Unnamed: 0,Label,Text,Text_Punct%
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,4.7
2,ham,"Nah I don't think he goes to usf, he lives around here though",4.1
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,7.1
5,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,4.4
6,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...,4.5
7,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...,1.6
8,ham,"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried ...",6.7
9,spam,"SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, ...",7.2


In [7]:
#calculating total number of characters (excluding white spaces) in a text message
data["Text_Length"] = data["Text"].apply(lambda x: len(x) - x.count(" "))

#displaying first 10 entries of the dataframe
data.head(10)

Unnamed: 0,Label,Text,Text_Punct%,Text_Length
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,2.5,160
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,4.7,128
2,ham,"Nah I don't think he goes to usf, he lives around here though",4.1,49
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,3.2,62
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,7.1,28
5,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,4.4,135
6,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...,4.5,132
7,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...,1.6,126
8,ham,"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried ...",6.7,89
9,spam,"SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, ...",7.2,111


In [8]:
#extracting built-in punctuations and stopwords
punctuation = string.punctuation
stopwords = nltk.corpus.stopwords.words('english')

#creating a custom function to remove punctuation, tokenize and remove stopwords from the text
def clean_data(text):
    text = "".join([char for char in text if char not in punctuation])
    tokens = re.split('\W+',text)
    text = [word for word in tokens if word not in stopwords]
    return text

In [9]:
#importing TfidfVectorizer class from scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

#creating an object of TfidfVectorizer class and passing method for pre-processing of data
Tfidf_Weight_Vector = TfidfVectorizer(analyzer=clean_data)

#applying fit_tranform() method to train and transform (pre-processed) data on vector object
X_counts = Tfidf_Weight_Vector.fit_transform(data["Text"])

#creating final dataframe
X_features = pd.concat([data['Text_Length'],data['Text_Punct%'],pd.DataFrame(X_counts.toarray())],axis=1)
X_features.head()

Unnamed: 0,Text_Length,Text_Punct%,0,1,2,3,4,5,6,7,...,11509,11510,11511,11512,11513,11514,11515,11516,11517,11518
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
#preparing the model

In [11]:
#importing the model
from sklearn.ensemble import RandomForestClassifier

In [12]:
#exploring the model
print(dir(RandomForestClassifier))
print("\n")
print(RandomForestClassifier())

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_estimator_type', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_required_parameters', '_set_oob_score', '_validate_X_predict', '_validate_estimator', '_validate_y_class_weight', 'apply', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params']


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0

In [13]:
#importing modules to perform K-fold cross validation for the classifier
from sklearn.model_selection import KFold, cross_val_score

In [14]:
#creating model object
#paramter n_jobs=-1 will help to create and run decision trees in parallel and faster
rfc = RandomForestClassifier(n_jobs=-1)
#paramter n_splits will help to define the number of subsets into which the entire dataset is to be divided
k_folds = KFold(n_splits=5)
#generating score
cross_val_score(rfc, X_features,data['Label'],cv=k_folds,scoring='accuracy',n_jobs=-1)

array([0.96588869, 0.96947935, 0.97037702, 0.96046721, 0.97214735])