
## CountVectorizer with Multinomial Naive Bayes (with GridSearch tuning)


In [1]:
# do the imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from nltk.corpus import stopwords
# English stop-word list from NLTK; offered to the grid search below as one
# candidate value for the vectorizer's `stop_words` setting.
nltk_stops = stopwords.words('english')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# Load the prepared dataset; the cells below read its 'title' and 'source'
# columns. The path is relative to the notebook's working directory.
df = pd.read_csv('../../datasets/df_onion_not_onion.csv')

In [3]:
# Feature: the raw `title` text.  Target: the `source` label.
X, y = df['title'], df['source']

In [4]:
# Hold out a test set (library-default split size). Stratifying on `y` keeps
# the class proportions the same in both parts; the seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
)

In [5]:
# Two-step pipeline: token counts feed a multinomial Naive Bayes classifier.
pipe_cvec_nb = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('multi_nb', MultinomialNB()),
])

# Search space for the vectorizer only (3 x 2 x 3 = 18 combinations):
# vocabulary size cap, with/without NLTK stop words, and n-gram span.
params_cvec_nb = dict(
    cvec__max_features=[10000, 25000, 60000],
    cvec__stop_words=[None, nltk_stops],
    cvec__ngram_range=[(1, 1), (1, 2), (1, 4)],
)

# Exhaustive search over the grid, scored with 3-fold cross-validation.
gs_cvec_nb = GridSearchCV(
    estimator=pipe_cvec_nb,
    param_grid=params_cvec_nb,
    cv=3,
)

In [6]:
# Run the grid search on the training split only; the test split is not
# seen during tuning.
gs_cvec_nb.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [7]:
# Parameter combination selected by the grid search.
gs_cvec_nb.best_params_

{'cvec__max_features': 25000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [8]:
# Score of the fitted search on the same data it was trained on; an
# optimistic figure, shown for comparison with the test score.
gs_cvec_nb.score(X_train, y_train)

0.944

In [9]:
# Score on the held-out test split, which was not used for fitting or tuning.
gs_cvec_nb.score(X_test, y_test)

0.8552

In [10]:
# Nested cross-validation: each outer fold re-runs the whole grid search on
# its training part and scores the tuned model on the held-out part; the mean
# of the fold scores is displayed.
# `cv` is pinned to 3 to match the inner search and to keep the result
# independent of the library default, which changed from 3 to 5 folds in
# scikit-learn 0.22.
cross_val_score(gs_cvec_nb, X_train, y_train, cv=3).mean()

0.8386666666666667