# random forest classifier model

- assume training data in filepath
    + `data/processed/train.csv`
- assume testing data in filepath
    + `data/processed/test.csv`
- assume both have same structure:
    + first column is query 
    + second column is category
- assume input strings are 'clean':
    + in lower case
    + punctuation removed (stop words included)
    + words separated by spaces (no padding)
- output:
    + `models/randomforest.pckl`

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import make_column_transformer
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

split_training_data_filepath = '../data/processed/train.csv'
split_testing_data_filepath  = '../data/processed/test.csv'
model_filepath = '../models/randomforest.pckl'

In [2]:
# fetch training data only:
df_train = pd.read_csv(split_training_data_filepath, index_col=0)
# do not want the class label to be numerical nor ordinal. 
df_train['category'] = pd.Categorical(df_train['category'])
display(df_train.sample(3))

Unnamed: 0,query,category
397564,can i get help with divorce costs,607
292274,recycle computer batteries,235
419959,will proforma,1277


In [3]:
# transform the query to a scaled frequency vector of vocabulary terms
tfidf = TfidfVectorizer(
#     stop_words=stop_words,
    stop_words = {'english'},
    strip_accents= 'ascii',
    ngram_range=(1,1), # consider unigrams/bigrams/trigrams?
    min_df = 10,
    max_df = 0.80,
    binary=True, # count term occurance in each query only once
)
# column transformer
all_transforms = make_column_transformer(
    (tfidf, ['query'])
)

# classify using random forest (resistant to overfitting):
clf_rf = RandomForestClassifier(
    n_estimators = 256,
    max_depth = 8,
    n_jobs = -1,
#     warm_start = True,
    warm_start = False,
    max_samples = None,
#     verbose = 3,
)
pipe = make_pipeline(all_transforms, clf_rf)

In [4]:
%%time
# scores = cross_val_score(pipe, df_train['query'], df_train['category'], cv=5, scoring='accuracy')
# scores = cross_val_score(clf_rf, X, y, cv=5, scoring='f1_macro')
# scores = cross_val_score(clf_rf, X, df_train['category'], cv=2, scoring='accuracy')
# cross_val_score(pipe, df_train['query'], df_train['category'], cv=5, scoring='accuracy')
# print("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.25 µs


In [5]:
%%time
X = tfidf.fit_transform(df_train['query'])
display(X.shape)
X

(485458, 12650)

CPU times: user 2.58 s, sys: 28.6 ms, total: 2.61 s
Wall time: 2.61 s


<485458x12650 sparse matrix of type '<class 'numpy.float64'>'
	with 1384726 stored elements in Compressed Sparse Row format>

In [6]:
%%time
# train model
clf_rf.fit(X, df_train['category'])
with open(model_filepath, 'wb') as filepath:
    pickle.dump(clf_rf, filepath)
print('trained randomforest model and exported to', model_filepath)

trained randomforest model and exported to ../models/randomforest.pckl
CPU times: user 1min 56s, sys: 4.21 s, total: 2min 1s
Wall time: 9.35 s


In [7]:
%%time
# check model against test data
df_test = pd.read_csv(split_testing_data_filepath, index_col=0)
X_test = tfidf.transform(df_test['query'])
print('read in test data:', X_test.shape, 'now running model predictions')
y_predicted = clf_rf.predict(X_test)
print('computed', len(y_predicted), 'predictions of test data')

read in test data: (121365, 12650) now running model predictions
computed 121365 predictions of test data
CPU times: user 5min 49s, sys: 16min 8s, total: 21min 58s
Wall time: 30min 23s


In [9]:
print('number of   correct predictions:', sum(y_predicted == df_test['category']))
print('number of incorrect predictions:', sum(y_predicted != df_test['category']))
print('ratio of   correct  predictions:', round(sum(y_predicted == df_test['category'])/len(df_test),3))
print('')

number of   correct predictions: 19504
number of incorrect predictions: 101861
ratio of   correct  predictions: 0.161

