
## Use TFIDF with Gaussian-NaiveBayes

In [1]:
# do the imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
df = pd.read_csv('../../datasets/df_onion_not_onion.csv')

In [3]:
X = df['title']
y = df['source']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify= y, random_state= 42)

In [5]:
# use as TFIDC parameters the params that worked best in all other TFIDC or CountVectorizor cases in this project
tvec = TfidfVectorizer(max_features= 25000,
                       stop_words= None,
                       ngram_range= (1, 2))

tvec.fit(X_train, y_train)
X_train_tvec = tvec.transform(X_train).todense()
X_test_tvec = tvec.transform(X_test).todense()

In [6]:
gnb = GaussianNB()

gnb.fit(X_train_tvec, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [7]:
gnb.score(X_train_tvec, y_train)

0.9818666666666667

In [8]:
gnb.score(X_test_tvec, y_test)

0.802

In [9]:
cross_val_score(gnb, X_train_tvec, y_train).mean()

0.7994666666666667