# 3.3 Random Forest

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer

np.random.seed(42)

In [2]:
df = pd.read_csv('./cleaned_data')
# dropping new rows with null selftext values
df.drop(df.loc[df['selftext'].isna()].index, inplace=True)
df.reset_index(drop=True, inplace=True)

X = df['selftext']
y = df['solotravel']

tvec = TfidfVectorizer()
tvec.fit(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y, random_state=42)
X_train_transformed = tvec.transform(X_train)
X_test_transformed = tvec.transform(X_test)

In [3]:
def scores(model):
    print(f'train score: {model.score(X_train_transformed, y_train)}')
    print(f'test score: {model.score(X_test_transformed, y_test)}')
    print()
    try:
        print(f'best score: {model.best_score_}')
        print(f'best params: {model.best_params_}')
    except:
        pass

In [4]:
rf = RandomForestClassifier()

In [5]:
%%time
rf.fit(X_train_transformed, y_train)

CPU times: user 2min 12s, sys: 437 ms, total: 2min 12s
Wall time: 2min 13s


RandomForestClassifier()

In [6]:
scores(rf)

train score: 0.9969992498124531
test score: 0.7458114528632158



This model is massively overfit.

In [10]:
%%time
rf_params = {
    'n_estimators': [100, 150, 200], 
    'max_depth'   : [None, 1, 2, 3, 4, 5]
}

gs = GridSearchCV(rf, param_grid=rf_params, cv=5, n_jobs=-1, verbose=1)
gs.fit(X_train_transformed, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
CPU times: user 4min 27s, sys: 1.29 s, total: 4min 29s
Wall time: 25min 57s


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [None, 1, 2, 3, 4, 5],
                         'n_estimators': [100, 150, 200]},
             verbose=1)

In [11]:
scores(gs)

train score: 0.9969992498124531
test score: 0.753563390847712

best score: 0.7493121654040875
best params: {'max_depth': None, 'n_estimators': 200}


The model remains overfit. So random forests are not a good model for this problem.