
## TF-IDF with Random Forest, tuned with GridSearchCV

In [1]:
# do the imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from nltk.corpus import stopwords
nltk_stops = stopwords.words('english')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


import warnings

# Suppress every warning for the rest of the session (blanket filter).
warnings.filterwarnings('ignore')

# Widen pandas' display limits so long / wide frames are not truncated.
pd.options.display.max_rows = 5000
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [2]:
# Load the dataset from a CSV located relative to this notebook's directory.
# NOTE(review): the relative path only resolves when the kernel's working
# directory is the notebook's own folder.
df = pd.read_csv('../../datasets/df_onion_not_onion.csv')

In [3]:
# Model input is the raw 'title' column; the target is the 'source' column.
# NOTE(review): presumably 'title' is headline text and 'source' labels where
# it came from -- confirm against the dataset.
X = df['title']
y = df['source']

In [4]:
# Hold out a test set. stratify=y keeps the class proportions of y the same in
# both parts, and random_state=42 makes the shuffle repeatable. No test_size
# is passed, so scikit-learn's default split size applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify= y, random_state= 42)

In [5]:
# Pipeline: TF-IDF features built from the raw text feed a random forest.
# The forest is seeded so that re-running the search reproduces the same
# scores and the same best_params_. The train/test split is already seeded
# with 42; an unseeded forest would still make every run differ.
pipe_tvec_r_forest = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('r_forest', RandomForestClassifier(random_state= 42))
])

# Hyper-parameter grid. Keys use the '<step name>__<parameter>' convention so
# GridSearchCV can route each value to the right pipeline step.
# Grid size: 2 * 2 * 2 * 4 * 5 * 2 * 2 = 640 candidate settings.
pipe_tvec_r_forest_params = {
    # vectorizer: vocabulary cap, stop-word handling, n-gram span
    'tvec__max_features': [25_000, 60_000],
    'tvec__stop_words': [None, nltk_stops],
    'tvec__ngram_range': [(1, 1), (1, 4)],
    # forest: number of trees, tree depth, leaf / split size limits
    'r_forest__n_estimators': [5, 10, 20, 100],
    'r_forest__max_depth': [3, 4, 5, 7, 100],
    'r_forest__min_samples_leaf': [1, 2],
    'r_forest__min_samples_split': [2, 5]
}

# Exhaustive search over the grid with 3-fold cross-validation
# (640 candidates * 3 folds = 1920 fits). n_jobs=-1 uses all available
# cores; verbose=1 prints progress while fitting.
gs_tvec_r_forest = GridSearchCV(estimator= pipe_tvec_r_forest,
                                param_grid= pipe_tvec_r_forest_params,
                                cv= 3,
                                verbose= 1,
                                n_jobs= -1
)

In [6]:
# Run the whole grid search on the training split only (the test split is not
# touched here). As the last expression in the cell, the fitted GridSearchCV
# object's repr is displayed.
gs_tvec_r_forest.fit(X_train, y_train)

Fitting 3 folds for each of 640 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   48.4s
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed:  2.4min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [7]:
# Score of the fitted search on the same data it was trained on. This is an
# optimistic figure; compare it with the held-out score to gauge overfitting.
gs_tvec_r_forest.score(X_train, y_train)

0.9673333333333334

In [8]:
# Score of the fitted search on the held-out test split, which was not used
# during fitting or hyper-parameter selection.
gs_tvec_r_forest.score(X_test, y_test)

0.8064

In [9]:
# Nested cross-validation: the GridSearchCV object itself is the estimator, so
# each of the 3 outer folds re-runs the entire grid search (3 more full
# searches on top of the earlier fit) before scoring on its outer fold.
# The result is the mean of the 3 outer-fold scores.
# NOTE(review): if only the inner CV score of the chosen parameters is wanted,
# gs_tvec_r_forest.best_score_ is already available after fit() at no extra
# cost -- confirm whether the nested estimate is intended.
cross_val_score(gs_tvec_r_forest, X_train, y_train, cv= 3).mean()

Fitting 3 folds for each of 640 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 225 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 475 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done 825 tasks      | elapsed:   36.6s
[Parallel(n_jobs=-1)]: Done 1275 tasks      | elapsed:   57.3s
[Parallel(n_jobs=-1)]: Done 1825 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed:  1.7min finished


Fitting 3 folds for each of 640 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 224 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 474 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:   37.4s
[Parallel(n_jobs=-1)]: Done 1274 tasks      | elapsed:   58.9s
[Parallel(n_jobs=-1)]: Done 1824 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed:  1.7min finished


Fitting 3 folds for each of 640 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 223 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 473 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 823 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 1273 tasks      | elapsed:   59.5s
[Parallel(n_jobs=-1)]: Done 1823 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed:  1.7min finished


0.7938666666666666

In [10]:
# Parameter combination that achieved the best mean cross-validated score
# during the search.
# NOTE(review): if a selected value sits at the edge of its searched range
# (e.g. the largest max_depth or n_estimators), consider widening the grid.
gs_tvec_r_forest.best_params_

{'r_forest__max_depth': 100,
 'r_forest__min_samples_leaf': 1,
 'r_forest__min_samples_split': 5,
 'r_forest__n_estimators': 100,
 'tvec__max_features': 25000,
 'tvec__ngram_range': (1, 4),
 'tvec__stop_words': None}