## TF-IDF features with AdaBoost (decision-tree base learners) and Gradient Boosting combined in a VotingClassifier, tuned with GridSearchCV

In [1]:
# do the imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from nltk.corpus import stopwords
# English stop-word list from NLTK.
# NOTE(review): not referenced by any cell visible in this notebook (the grid
# below only tries stop_words=None) — confirm whether it is still needed.
nltk_stops = stopwords.words('english')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


import warnings
# Suppress all warnings for the remainder of the kernel session.
warnings.filterwarnings(action='ignore')

# Widen pandas' display limits so large frames are not truncated when shown.
pd.options.display.max_rows = 5000
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [2]:
# Load the dataset CSV into a DataFrame; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer='../../datasets/df_onion_not_onion.csv')

In [3]:
# Model input is the `title` column (vectorized by TF-IDF downstream);
# the prediction target is the `source` column.
X, y = df['title'], df['source']

In [4]:
# Hold-out split using sklearn's default test size. Stratifying on `y` keeps
# the class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state= 42,
    stratify= y,
)

In [5]:
# Pipeline: TF-IDF text features feeding a VotingClassifier (default voting
# mode) made of two boosted ensembles:
#   * 'ada' — AdaBoost over decision-tree base learners
#   * 'gb'  — gradient boosting
#
# Both boosted estimators are seeded with the same value used for the
# train/test split so that the grid search, the selected parameters and the
# reported scores are reproducible across kernel restarts.
#
# NOTE: despite the "_ss_" in the variable names, this pipeline contains no
# scaler step; the names are kept because later cells reference them.
# NOTE: `base_estimator` is the parameter name in the scikit-learn version this
# notebook was run with. Newer releases (1.2+) rename it to `estimator`, in
# which case the grid key below becomes 'vote__ada__estimator__max_depth'.
pipe_tvec_ss_vote = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('vote', VotingClassifier([('ada', AdaBoostClassifier(base_estimator= DecisionTreeClassifier(),
                                                          random_state= 42)),
                               ('gb', GradientBoostingClassifier(random_state= 42))
                              ]))
])

# Search space; keys follow sklearn's `<step>__<sub-estimator>__<param>` routing.
pipe_params = {'tvec__max_features': [25_000, 60_000],
               'tvec__stop_words': [None],
               'tvec__ngram_range': [(1, 1), (1, 4)],
               'vote__ada__n_estimators': [50, 75],
               'vote__ada__base_estimator__max_depth': [1, 2],
               'vote__gb__n_estimators': [100, 150]
}

# Exhaustive search over every combination above, evaluated with 3-fold CV.
gs_tvec_ss_vote = GridSearchCV(estimator= pipe_tvec_ss_vote,
                               param_grid= pipe_params,
                               cv= 3)


In [6]:
# Run the grid search on the training split only: every combination in
# `pipe_params` is fit and evaluated with the 3-fold CV configured above.
# Left as a bare expression so the fitted search object's repr is displayed.
gs_tvec_ss_vote.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [7]:
# Score on the data the search was fit on. No `scoring` argument was given to
# GridSearchCV, so this uses the estimator's default score (accuracy for
# sklearn classifiers). Compare with the held-out score to gauge overfitting.
gs_tvec_ss_vote.score(X_train, y_train)


0.7952

In [8]:
# Score on the held-out split, which was not used during the grid search.
# Same default metric as the training-score cell (no custom `scoring` passed).
gs_tvec_ss_vote.score(X_test, y_test)

0.7516

In [9]:
# Parameter combination from `pipe_params` that the grid search selected as
# best according to its cross-validated score.
gs_tvec_ss_vote.best_params_

{'tvec__max_features': 25000,
 'tvec__ngram_range': (1, 4),
 'tvec__stop_words': None,
 'vote__ada__base_estimator__max_depth': 2,
 'vote__ada__n_estimators': 75,
 'vote__gb__n_estimators': 150}