## Use TFIDF with Logistic Regression with GridSearch tuning

#### I tried various `max_features` values with both TfidfVectorizer and CountVectorizer. At around 25,000 features the test-data performance stagnated even though the training-data performance kept improving; in other words, adding more features only widened the gap between training and test scores (more overfitting) without improving the test score. CONCLUSION: I stopped using any max_features > 25,000 and decided to cap max_features at 25,000 to avoid needless overfitting and to conserve memory and time.

In [1]:
# do the imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

import warnings
# Silence every warning so the cell outputs below stay readable.
# NOTE(review): a blanket 'ignore' also hides any warnings raised while the
# models are being fit — confirm that is intended.
warnings.filterwarnings('ignore')

# Raise pandas' display limits so wide/long frames are printed without truncation.
for option_name, option_value in (
    ('display.max_rows', 5000),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# NLTK's English stop-word list; offered to the vectorizer as one grid option.
nltk_stops = stopwords.words('english')

In [2]:
# Load the dataset CSV (the path is relative to the current working directory).
df = pd.read_csv('../../datasets/df_onion_not_onion.csv')

In [3]:
# Model input is the 'title' column; the label to predict is the 'source' column.
X, y = df['title'], df['source']

In [4]:
# Hold out a test split. Stratifying on y keeps the class proportions the same
# in both splits, and the fixed random_state makes the split reproducible.
# The split size is not passed, so the library default is used.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify= y, random_state= 42)

In [5]:
# Two-step pipeline: TF-IDF text features feeding a logistic regression classifier.
pipe_tvec_logreg = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('logreg', LogisticRegression()),
])

# Search grid. Each key is '<step name>__<parameter>', so the value lists are
# applied to the matching pipeline step.
params_tvec_logreg = dict(
    tvec__max_features=[1000, 2500, 5000, 25000],
    tvec__stop_words=[None, nltk_stops],
    tvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    logreg__C=[1.0, 1e9],  # default strength vs. effectively no regularization
    logreg__solver=['lbfgs'],
)

# Exhaustive search over the grid, scored with 3-fold cross-validation.
gs_tvec_logreg = GridSearchCV(
    estimator=pipe_tvec_logreg,
    param_grid=params_tvec_logreg,
    cv=3,
)

In [6]:
# Run the grid search using the training split only; the test split stays unseen.
gs_tvec_logreg.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [7]:
# Show the parameter combination the grid search selected as best.
gs_tvec_logreg.best_params_

{'logreg__C': 1000000000.0,
 'logreg__solver': 'lbfgs',
 'tvec__max_features': 25000,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': None}

In [8]:
# Score of the selected model on the training split (compare with the test
# score below to gauge overfitting).
gs_tvec_logreg.score(X_train, y_train)

0.9986666666666667

In [9]:
# Score of the selected model on the held-out test split.
gs_tvec_logreg.score(X_test, y_test)

0.8472

In [10]:
# Cross-validate the grid search object itself on the training split: the
# search is re-run inside each of the 3 outer folds, then the fold scores are
# averaged.
cv_fold_scores = cross_val_score(gs_tvec_logreg, X_train, y_train, cv=3)
cv_fold_scores.mean()

0.8333333333333334

In [11]:
# Show the full pipeline configured with the selected parameters.
gs_tvec_logreg.best_estimator_

Pipeline(memory=None,
         steps=[('tvec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=25000,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('logreg',
                 LogisticRegression(C=1000000000.0, class_weight=None,
                                    dual=False, fit_intercept=True,
     

### Logistic Regression with effectively no regularization (C = 1e9) nearly always wins out as the best Logistic Regression model on the training data because of overfitting; here I try using regularization by setting C close to its default value of 1.0 to see what happens


In [12]:
# Same TF-IDF -> logistic regression pipeline, rebuilt for a second search.
pipe_tvec_logreg = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('logreg', LogisticRegression()),
])

# Search grid with max_features pinned to 25000 and C limited to values around
# 1.0, so every candidate model is regularized.
params_tvec_logreg = dict(
    tvec__max_features=[25000],
    tvec__stop_words=[None, nltk_stops],
    tvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    logreg__C=[0.5, 1.0, 2.0],
    logreg__solver=['lbfgs'],
)

# Exhaustive search over the grid, scored with 3-fold cross-validation.
gs_tvec_logreg = GridSearchCV(
    estimator=pipe_tvec_logreg,
    param_grid=params_tvec_logreg,
    cv=3,
)

In [14]:
# Run the regularized grid search using the training split only.
gs_tvec_logreg.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [15]:
# Score of the selected regularized model on the training split.
gs_tvec_logreg.score(X_train, y_train)

0.9693333333333334

In [16]:
# Score of the selected regularized model on the held-out test split.
gs_tvec_logreg.score(X_test, y_test)

0.8532

In [17]:
# Cross-validate the regularized grid search on the training split: the search
# is re-run inside each of the 3 outer folds, then the fold scores are averaged.
regularized_fold_scores = cross_val_score(gs_tvec_logreg, X_train, y_train, cv=3)
regularized_fold_scores.mean()

0.8336