## TF-IDF + Multinomial Naive Bayes, tuned with GridSearchCV

In [1]:
# do the imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from nltk.corpus import stopwords
nltk_stops = stopwords.words('english')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings emitted by scikit-learn
# while fitting (e.g. about invalid hyperparameter values) are hidden as well.
warnings.filterwarnings('ignore')

# Raise pandas' display limits so wide/long frames print without truncation.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
# Only the `title` and `source` columns are used further down.
df = pd.read_csv(filepath_or_buffer='../../datasets/df_onion_not_onion.csv')

In [3]:
# Features: the `title` column (fed to the text vectorizer below).
# Target: the `source` column.
X, y = df['title'], df['source']

In [4]:
# Hold out a test set. Stratifying on y keeps the class proportions the same
# in both partitions; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [5]:
# Two-step pipeline: TF-IDF features from the text, then multinomial Naive Bayes.
pipe_tvec_nb = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('multi_nb', MultinomialNB())
])

# Hyperparameter grid; keys follow the `<step name>__<parameter>` convention.
# 7 * 2 * 4 * 3 = 168 candidates, i.e. 504 fits with cv=3 (plus the final refit).
params_tvec_nb = {
    'tvec__max_features': [1000, 2500, 5000, 10_000, 25_000, 40_000, 60_000],
    'tvec__stop_words': [None, nltk_stops],
    'tvec__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
    # alpha is the additive (Laplace/Lidstone) smoothing strength and must be > 0.
    # The grid previously contained 0.0: depending on the scikit-learn version that
    # value is either silently clipped to 1e-10 (with a warning this notebook
    # suppresses) or applied literally, giving zero probabilities for unseen terms.
    # A small positive value is searched instead.
    'multi_nb__alpha': [0.1, 1.0, 2.0]
}

# Exhaustive search over the grid with 3-fold cross-validation on the training data.
gs_tvec_nb = GridSearchCV(estimator=pipe_tvec_nb,
                          param_grid=params_tvec_nb,
                          cv=3)

In [6]:
# Run the grid search on the training split only: every parameter combination
# is cross-validated, so this is the expensive cell of the notebook.
gs_tvec_nb.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [7]:
# Parameter combination that achieved the best mean cross-validated score.
gs_tvec_nb.best_params_

{'multi_nb__alpha': 1.0,
 'tvec__max_features': 60000,
 'tvec__ngram_range': (1, 4),
 'tvec__stop_words': None}

In [8]:
# Score of the refit best pipeline on the data it was trained on
# (no `scoring` was passed, so this is the classifier's default score, i.e. accuracy).
gs_tvec_nb.score(X_train, y_train)

0.9688

In [9]:
# Same metric on the held-out test split; compare with the training score
# to gauge how much the tuned pipeline overfits.
gs_tvec_nb.score(X_test, y_test)

0.8592

In [10]:
# Nested cross-validation: the GridSearchCV object itself is the estimator, so
# cross_val_score clones it and re-runs the ENTIRE grid search inside each of the
# 3 outer folds (roughly three times the cost of the fit cell above).
# NOTE(review): this is not the same number as gs_tvec_nb.best_score_, which is
# the inner-CV mean for the selected parameters and is available without refitting.
cross_val_score(gs_tvec_nb, X_train, y_train, cv= 3).mean()

0.8412000000000001