# Random Forest Algo

### importing pandas


In [1]:
import pandas as pd

### getting data

In [2]:
df = pd.read_excel (r'C:\Users\HP\Downloads\Dataset.xlsx')

In [3]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,INTENT,ID,Unnamed: 2
0,can you remove formatting from this paragraph,18,
1,remove bold for the selected text,3,
2,last two words underline,6,
3,Display help,25,
4,can you centre align the last word of the para...,14,


### removing column 2 and null rows

In [4]:
df = pd.DataFrame(df) 
df.drop(['Unnamed: 2'], axis=1)
df.dropna(subset=['INTENT'], inplace=True)
df.head()

Unnamed: 0,INTENT,ID,Unnamed: 2
0,can you remove formatting from this paragraph,18,
1,remove bold for the selected text,3,
2,last two words underline,6,
3,Display help,25,
4,can you centre align the last word of the para...,14,


### Tokenized

In [5]:
import re

def tokenize(text):
    tokens=re.split('\W+',text)
    return tokens
df['intent_tokenized']=df['INTENT'].apply(lambda x:tokenize(x.lower()))
df.head()

Unnamed: 0,INTENT,ID,Unnamed: 2,intent_tokenized
0,can you remove formatting from this paragraph,18,,"[can, you, remove, formatting, from, this, par..."
1,remove bold for the selected text,3,,"[remove, bold, for, the, selected, text]"
2,last two words underline,6,,"[last, two, words, underline]"
3,Display help,25,,"[display, help]"
4,can you centre align the last word of the para...,14,,"[can, you, centre, align, the, last, word, of,..."


### remove stopwords

In [6]:
import nltk
#stopword=nltk.download('stopwords')
stopword=nltk.corpus.stopwords.words('english')
stopword.append("please")
stopword.append("could")
stopword.append("need")
stopword.append(" ")
stopword.append("kindly")
def remove_stopword(tokenised_list):
    text=[word for word in tokenised_list if word not in stopword]
    return text
df['intent_nostop']=df['intent_tokenized'].apply(lambda x:remove_stopword(x))
df.head()

Unnamed: 0,INTENT,ID,Unnamed: 2,intent_tokenized,intent_nostop
0,can you remove formatting from this paragraph,18,,"[can, you, remove, formatting, from, this, par...","[remove, formatting, paragraph]"
1,remove bold for the selected text,3,,"[remove, bold, for, the, selected, text]","[remove, bold, selected, text]"
2,last two words underline,6,,"[last, two, words, underline]","[last, two, words, underline]"
3,Display help,25,,"[display, help]","[display, help]"
4,can you centre align the last word of the para...,14,,"[can, you, centre, align, the, last, word, of,...","[centre, align, last, word, paragraph]"


### stemming

In [7]:
ps=nltk.PorterStemmer()
def stemming(tokenised_list):
    text=[ps.stem(word) for word in tokenised_list ]
    return text
df['intent_nostop_stem']=df['intent_nostop'].apply(lambda x:stemming(x))
df.head()

Unnamed: 0,INTENT,ID,Unnamed: 2,intent_tokenized,intent_nostop,intent_nostop_stem
0,can you remove formatting from this paragraph,18,,"[can, you, remove, formatting, from, this, par...","[remove, formatting, paragraph]","[remov, format, paragraph]"
1,remove bold for the selected text,3,,"[remove, bold, for, the, selected, text]","[remove, bold, selected, text]","[remov, bold, select, text]"
2,last two words underline,6,,"[last, two, words, underline]","[last, two, words, underline]","[last, two, word, underlin]"
3,Display help,25,,"[display, help]","[display, help]","[display, help]"
4,can you centre align the last word of the para...,14,,"[can, you, centre, align, the, last, word, of,...","[centre, align, last, word, paragraph]","[centr, align, last, word, paragraph]"


### lemmatizing

In [8]:
#nltk.download('wordnet')
lm=nltk.WordNetLemmatizer()
def lemmatizing(tokenised_list):
    text=[lm.lemmatize(word) for word in tokenised_list ]
    return text
df['intent_nostop_lem']=df['intent_nostop'].apply(lambda x:lemmatizing(x))
df.head()

Unnamed: 0,INTENT,ID,Unnamed: 2,intent_tokenized,intent_nostop,intent_nostop_stem,intent_nostop_lem
0,can you remove formatting from this paragraph,18,,"[can, you, remove, formatting, from, this, par...","[remove, formatting, paragraph]","[remov, format, paragraph]","[remove, formatting, paragraph]"
1,remove bold for the selected text,3,,"[remove, bold, for, the, selected, text]","[remove, bold, selected, text]","[remov, bold, select, text]","[remove, bold, selected, text]"
2,last two words underline,6,,"[last, two, words, underline]","[last, two, words, underline]","[last, two, word, underlin]","[last, two, word, underline]"
3,Display help,25,,"[display, help]","[display, help]","[display, help]","[display, help]"
4,can you centre align the last word of the para...,14,,"[can, you, centre, align, the, last, word, of,...","[centre, align, last, word, paragraph]","[centr, align, last, word, paragraph]","[centre, align, last, word, paragraph]"


### count vectorizing / feature extraction

In [9]:
def clean_text_stem(text):
    tokens=re.split('\W+',text)
    text=[ps.stem(word) for word in tokens if word not in stopword]
    text=[word for word in text if word not in stopword]
    return text

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect_stem=CountVectorizer(analyzer=clean_text_stem)
X_counts_stem=count_vect_stem.fit_transform(df['INTENT'])
print(X_counts_stem.shape)
print(count_vect_stem.get_feature_names())

(409, 86)
['', 'Go', 'add', 'align', 'bold', 'break', 'bullet', 'capit', 'centr', 'chang', 'charact', 'clear', 'color', 'command', 'comment', 'content', 'delet', 'dictat', 'display', 'end', 'eras', 'first', 'five', 'format', 'four', 'full', 'go', 'halt', 'help', 'icon', 'imag', 'insert', 'ital', 'italic', 'italicis', 'kindli', 'last', 'left', 'let', 'letter', 'line', 'list', 'make', 'middl', 'move', 'near', 'new', 'next', 'open', 'page', 'paragraph', 'paus', 'pictur', 'place', 'pleas', 'posit', 'present', 'previou', 'put', 'red', 'remov', 'requir', 'reveal', 'right', 'select', 'sentenc', 'shift', 'show', 'start', 'stop', 'strike', 'strikethrough', 'subscript', 'superscript', 'symbol', 'tabl', 'take', 'text', 'textoutlin', 'three', 'two', 'unbold', 'underlin', 'undo', 'unitalic', 'word']


### changing sparse matrix to df

In [11]:
X_counts_stem_df=pd.DataFrame(X_counts_stem.toarray())
X_counts_stem_df.columns=count_vect_stem.get_feature_names()
X_counts_stem_df.head()

Unnamed: 0,Unnamed: 1,Go,add,align,bold,break,bullet,capit,centr,chang,...,take,text,textoutlin,three,two,unbold,underlin,undo,unitalic,word
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


### getting x features

In [12]:
X_features=X_counts_stem_df
X_features.head()

Unnamed: 0,Unnamed: 1,Go,add,align,bold,break,bullet,capit,centr,chang,...,take,text,textoutlin,three,two,unbold,underlin,undo,unitalic,word
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


## Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

In [22]:
rf=RandomForestClassifier(n_jobs=-1)
k_fold=KFold(n_splits=10)
cross_val_score(rf,X_features,df['ID'],cv=k_fold,scoring='accuracy',n_jobs=-1)

array([0.90243902, 0.97560976, 0.97560976, 0.97560976, 0.95121951,
       0.95121951, 1.        , 0.92682927, 0.97560976, 0.975     ])

In [15]:
rf.fit(X_features, df['ID'])
from sklearn.externals import joblib 
import os
# Save the model as a pickle in a file 
joblib.dump(rf, 'rf')
os.path.getsize('rf')



455641