In [1]:
#Program 13 - Testing Random Forest Classifier using Holdout Test Set

In [2]:
#cleaning data for the model

In [3]:
#importing all required libraries
import nltk
import re
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
#default colwidth is 50 characters, set_option allows to define custom no. of characters to be displayed within each column of a dataframe
pd.set_option('max_colwidth',100)

#importing the data into a dataframe
data = pd.read_csv("E:/Training/Certifications - LinkedIn/NLP with Python for Machine Learning Essential Training/Dataset/SMSSpamCollection.tsv", sep='\t', names=['Label','Text'])

#displaying first 5 rows of the dataframe
data.head()

Unnamed: 0,Label,Text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [5]:
#creating a function to calculate the percent of punctuation in a text message
def punct_count(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")),3)*100

#applying the method
data["Text_Punct%"] = data["Text"].apply(lambda x: punct_count(x))

#displaying first 10 entries of the dataframe
data.head(10)

Unnamed: 0,Label,Text,Text_Punct%
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,4.7
2,ham,"Nah I don't think he goes to usf, he lives around here though",4.1
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,7.1
5,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,4.4
6,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...,4.5
7,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...,1.6
8,ham,"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried ...",6.7
9,spam,"SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, ...",7.2


In [6]:
#calculating total number of characters (excluding white spaces) in a text message
data["Text_Length"] = data["Text"].apply(lambda x: len(x) - x.count(" "))

#displaying first 10 entries of the dataframe
data.head(10)

Unnamed: 0,Label,Text,Text_Punct%,Text_Length
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,2.5,160
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,4.7,128
2,ham,"Nah I don't think he goes to usf, he lives around here though",4.1,49
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,3.2,62
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,7.1,28
5,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,4.4,135
6,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...,4.5,132
7,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...,1.6,126
8,ham,"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried ...",6.7,89
9,spam,"SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, ...",7.2,111


In [7]:
#extracting built-in punctuations and stopwords
punctuation = string.punctuation
stopwords = nltk.corpus.stopwords.words('english')

#creating a custom function to remove punctuation, tokenize and remove stopwords from the text
def clean_data(text):
    text = "".join([char for char in text if char not in punctuation])
    tokens = re.split('\W+',text)
    text = [word for word in tokens if word not in stopwords]
    return text

In [8]:
#importing TfidfVectorizer class from scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

#creating an object of TfidfVectorizer class and passing method for pre-processing of data
Tfidf_Weight_Vector = TfidfVectorizer(analyzer=clean_data)

#applying fit_tranform() method to train and transform (pre-processed) data on vector object
X_counts = Tfidf_Weight_Vector.fit_transform(data["Text"])

#creating final dataframe
X_features = pd.concat([data['Text_Length'],data['Text_Punct%'],pd.DataFrame(X_counts.toarray())],axis=1)
X_features.head()

Unnamed: 0,Text_Length,Text_Punct%,0,1,2,3,4,5,6,7,...,11509,11510,11511,11512,11513,11514,11515,11516,11517,11518
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
#importing required modules
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [18]:
#creating training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X_features,data['Label'],test_size=0.2)

In [19]:
#training the classifier

#importing the classifier 
from sklearn.ensemble import RandomForestClassifier

#instantiating the object for the classifier
#n-estimators define the number of decision trees to be created
#max_depth defines the max depth for each tree to be created
#n_jobs defines to run all process in paraller
rfc = RandomForestClassifier(n_estimators=50, max_depth=20,n_jobs=-1)

#creating the model
rfc_model = rfc.fit(X_train,y_train)

In [20]:
#evaluating feature importance
#sorted is used to sort the features according to their importance, reverse=True is used to sort in descending order
#zip is used to combine the two input arrays i.e feature importances and feature names (columns)
#[0:10] is used to display first 10 important features
sorted(zip(rfc_model.feature_importances_,X_train.columns),reverse=True)[0:10]

[(0.0668870806107654, 'Text_Length'),
 (0.03693547647170546, 5716),
 (0.026374915213099304, 1488),
 (0.021767190275447917, 3697),
 (0.02174596489787937, 1943),
 (0.01746138696307708, 8388),
 (0.017197534645536625, 9228),
 (0.015075177184141341, 716),
 (0.014701074924426245, 5475),
 (0.013363494525082347, 4177)]

In [21]:
#testing the classifier

y_pred = rfc_model.predict(X_test)
#pos_label defines the class label on which all scores should be generated
#average=binary outputs score only for the class (label) defined by pos_label
precision, recall, fscore, support = score(y_test,y_pred,pos_label='spam',average='binary')

In [23]:
print("Precision: {}, Recall: {}, Accuracy: {}".format(round(precision,3),round(recall,3),round((y_pred==y_test).sum()/len(y_test),3)))

Precision: 1.0, Recall: 0.394, Accuracy: 0.91
