In [2]:
#Program 15 - Testing Random Forest Classifier with GridSearchCV 

In [3]:
#cleaning data for the model

In [4]:
#importing all required libraries
import nltk
import re
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
#default colwidth is 50 characters, set_option allows to define custom no. of characters to be displayed within each column of a dataframe
pd.set_option('max_colwidth',100)

#importing the data into a dataframe
data = pd.read_csv("E:/Training/Certifications - LinkedIn/NLP with Python for Machine Learning Essential Training/Dataset/SMSSpamCollection.tsv", sep='\t', names=['Label','Text'])

#displaying first 5 rows of the dataframe
data.head()

Unnamed: 0,Label,Text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [6]:
#creating a function to calculate the percent of punctuation in a text message
def punct_count(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")),3)*100

#applying the method
data["Text_Punct%"] = data["Text"].apply(lambda x: punct_count(x))

#displaying first 10 entries of the dataframe
data.head(10)

Unnamed: 0,Label,Text,Text_Punct%
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,4.7
2,ham,"Nah I don't think he goes to usf, he lives around here though",4.1
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,7.1
5,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,4.4
6,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...,4.5
7,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...,1.6
8,ham,"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried ...",6.7
9,spam,"SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, ...",7.2


In [7]:
#calculating total number of characters (excluding white spaces) in a text message
data["Text_Length"] = data["Text"].apply(lambda x: len(x) - x.count(" "))

#displaying first 10 entries of the dataframe
data.head(10)

Unnamed: 0,Label,Text,Text_Punct%,Text_Length
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,2.5,160
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,4.7,128
2,ham,"Nah I don't think he goes to usf, he lives around here though",4.1,49
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,3.2,62
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,7.1,28
5,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,4.4,135
6,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...,4.5,132
7,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...,1.6,126
8,ham,"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried ...",6.7,89
9,spam,"SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, ...",7.2,111


In [8]:
#extracting built-in punctuations and stopwords
punctuation = string.punctuation
stopwords = nltk.corpus.stopwords.words('english')

#creating a custom function to remove punctuation, tokenize and remove stopwords from the text
def clean_data(text):
    text = "".join([char for char in text if char not in punctuation])
    tokens = re.split('\W+',text)
    text = [word for word in tokens if word not in stopwords]
    return text

In [9]:
#importing TfidfVectorizer class from scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

#creating an object of TfidfVectorizer class and passing method for pre-processing of data
Tfidf_Weight_Vector = TfidfVectorizer(analyzer=clean_data)

#applying fit_tranform() method to train and transform (pre-processed) data on vector object
X_counts = Tfidf_Weight_Vector.fit_transform(data["Text"])

#creating final dataframe
X_Tfidf_features = pd.concat([data['Text_Length'],data['Text_Punct%'],pd.DataFrame(X_counts.toarray())],axis=1)
X_Tfidf_features.head()

Unnamed: 0,Text_Length,Text_Punct%,0,1,2,3,4,5,6,7,...,11509,11510,11511,11512,11513,11514,11515,11516,11517,11518
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
#importing CountVectorizer class from scikit-learn library
from sklearn.feature_extraction.text import CountVectorizer

#creating an object of CountVectorizer class and passing range for n-grams as (2,2) to generate bigrams (For e.g. if given (1,3) method will generate unigrams, bigrams and trigrams)
NGV_Count_Vector = CountVectorizer(analyzer=clean_data)

#applying fit_tranform() method to train and transform (pre-processed) data
X_counts = NGV_Count_Vector.fit_transform(data['Text'])

#creating final dataframe
X_Cv_features = pd.concat([data['Text_Length'],data['Text_Punct%'],pd.DataFrame(X_counts.toarray())],axis=1)
X_Cv_features.head()


Unnamed: 0,Text_Length,Text_Punct%,0,1,2,3,4,5,6,7,...,11509,11510,11511,11512,11513,11514,11515,11516,11517,11518
0,160,2.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,62,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,28,7.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
#importing required modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [16]:
#implementing model for X_Tfidf_features

#instantiating the classifier object
rfc = RandomForestClassifier()

#creating the grid
param = {'n_estimators':[10,150,300],'max_depth':[30,60,90,None]}

#instantiating grid search cross validation object
gsc = GridSearchCV(rfc,param,cv=5,n_jobs=1)

#fitting the model 
gsc_fit = gsc.fit(X_Tfidf_features,data['Label'])

#displaying the results in a dataframe
pd.DataFrame(gsc_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,15.619778,0.055637,0.160845,0.00579,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.974888,0.975763,0.969452,0.967655,0.979335,0.97342,0.00428,1
8,31.04503,0.265741,0.259407,0.012517,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.977578,0.973968,0.968553,0.967655,0.978437,0.97324,0.004461,2
10,18.814512,0.300703,0.177899,0.006951,,150,"{'max_depth': None, 'n_estimators': 150}",0.973094,0.977558,0.97035,0.967655,0.97664,0.97306,0.003731,3
11,38.259531,2.705244,0.305807,0.034892,,300,"{'max_depth': None, 'n_estimators': 300}",0.977578,0.97307,0.971249,0.966757,0.97664,0.97306,0.003905,3
5,23.21532,0.121462,0.213114,0.008281,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.973991,0.967684,0.965858,0.964061,0.973046,0.96893,0.003931,5


In [18]:
#implementing model for X_Cv_features

#instantiating the classifier object
rfc = RandomForestClassifier()

#creating the grid
param = {'n_estimators':[10,150,300],'max_depth':[30,60,90,None]}

#instantiating grid search cross validation object
gsc = GridSearchCV(rfc,param,cv=5,n_jobs=1)

#fitting the model 
gsc_fit = gsc.fit(X_Cv_features,data['Label'])

#displaying the results in a dataframe
pd.DataFrame(gsc_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,30.78233,0.283038,0.252805,0.016946,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.975785,0.972172,0.97035,0.965858,0.97664,0.972162,0.003903,1
7,15.703611,0.099223,0.178185,0.004972,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.975785,0.970377,0.969452,0.966757,0.974843,0.971444,0.00339,2
10,18.681794,0.444842,0.190061,0.002782,,150,"{'max_depth': None, 'n_estimators': 150}",0.975785,0.968582,0.97035,0.96496,0.97664,0.971264,0.004407,3
11,37.078489,0.657566,0.274763,0.006329,,300,"{'max_depth': None, 'n_estimators': 300}",0.973094,0.971275,0.969452,0.964061,0.977538,0.971085,0.00442,4
6,1.678176,0.034056,0.108389,0.006335,90.0,10,"{'max_depth': 90, 'n_estimators': 10}",0.970404,0.968582,0.969452,0.968553,0.97664,0.970726,0.003033,5


In [19]:
#mean_fit_time: average time to fit each model
#mean_score_time: average time taken by each model to predict a test set
#mean_test_score: average accuracy on the testing set
#mean_train_score: average accuracy on the training set