# TF-IDF Models
The intuition behind TF-IDF is that if a word appears multiple times in a document it should be more important; if it appears in fewer documents it might be important; and if it appears multiple times in a small subset of documents then it might be _very_ important. In this notebook, we will reapply the analysis from the previous notebook, but using TF-IDF to vectorize our titles. Since what we are doing is going to be virtually identical, we will only comment on things that differ.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle

In [2]:
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB, BernoulliNB
# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer,\
                                            TfidfVectorizer

In [3]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

In [4]:
def save_obj(obj, filename):
    with open(filename + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(filename):
    with open(filename + '.pkl', 'rb') as f:
        return pickle.load(f)

In [5]:
DIR = "C:\\Users\\AzNsAnTaGiN\\DSI\\Projects\\project_3\\data\\"
FILE1 = "theonion"
FILE2 = "nottheonion"
FILE3 = "onionheadlines"

# Data Import

In [6]:
X_theonion = load_obj(DIR+FILE1+"_df_clean")
X_nottheonion = load_obj(DIR+FILE2+"_df_clean")

In [7]:
X_theonion["is_onion"] = 1
X_nottheonion["is_onion"] = -1

## Generating our samples and holdout

In [8]:
N=4000
X_theonion_shuffled = X_theonion.sample(len(X_theonion))
theonion_sample = X_theonion_shuffled.head(N)
theonion_holdout = X_theonion_shuffled.tail(len(X_theonion_shuffled) - N)

X_nottheonion_shuffled = X_nottheonion.sample(len(X_nottheonion))
nottheonion_sample = X_nottheonion_shuffled.head(N)
nottheonion_holdout = X_nottheonion_shuffled.tail(len(X_nottheonion_shuffled)-N)
X_sample = pd.concat([theonion_sample, nottheonion_sample])
X = pd.concat([X_theonion, X_nottheonion])

In [9]:
cvec = CountVectorizer(ngram_range=(1,1))
cvec.fit(theonion_sample["title"])
cvec.transform(theonion_sample["title"])
cvec_df = pd.DataFrame(cvec.transform(theonion_sample["title"]).toarray(),
                      columns=cvec.get_feature_names())

In [10]:
cvec2 = CountVectorizer(ngram_range=(1,1))
cvec2.fit(nottheonion_sample["title"])
cvec2.transform(nottheonion_sample["title"])
cvec2_df = pd.DataFrame(cvec2.transform(nottheonion_sample["title"]).toarray(),
                      columns=cvec2.get_feature_names())

## Preparing our stop-words list

In [11]:
N = 250

In [12]:
theonion_top_words = cvec_df.sum().sort_values(ascending=False).index[:N]

In [13]:
nottheonion_top_words = cvec2_df.sum().sort_values(ascending=False).index[:N]

# TF-IDF + Logistic Regression

## No custom tokenizing, no lemmatizing, no stop words.

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_sample["title"], X_sample["is_onion"])
pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("logreg", LogisticRegressionCV(Cs=np.logspace(-2,2,100), max_iter=1000))])
     
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
       5.21400829e+01, 5.72236766e+01, 6.28029144e+01, 6.89261210e+01,
       7.56463328e+01, 8.30217568e+01, 9.11162756e+01, 1.00000000e+02]),
                                      class_weight=None, cv=None, dual=False,
                                      fit_intercept=True, intercept_scaling=1.0,

In [15]:
display(pipe.score(X_train, y_train))
display(pipe.score(X_test, y_test))
display(pipe.score(theonion_holdout["title"], theonion_holdout["is_onion"]))
display(pipe.score(nottheonion_holdout["title"], nottheonion_holdout["is_onion"]))

0.976

0.7985

0.8097210113339146

0.7949465804761033

## Hyperparameter searching ft. ngrams, finite features, stop words

In [16]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(strip_accents="unicode")),
#     ("norm", Normalizer()),
    ("logreg", LogisticRegression(max_iter=1000))
])

rand_search = RandomizedSearchCV(pipe,
                                 n_jobs=-2,
                                 n_iter=50,
                                 param_distributions={
    "logreg__C": np.logspace(-1,1,100),
    "tfidf__max_df": [.99, 1],
    "tfidf__min_df": [0, 0.01],
    "tfidf__stop_words": [None, [i for i in theonion_top_words if i in nottheonion_top_words]],
    "tfidf__ngram_range": [(1,1), (1,2)],
    "tfidf__max_features": [None, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 20000, 50000]
})
rand_search.fit(X_train, y_train);

In [17]:
rand_search.best_estimator_

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.99, max_features=None,
                                 min_df=0, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents='unicode',
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('logreg',
                 LogisticRegression(C=0.739072203352578, class_weight=None,
                                    dual=False, fit_intercept=

In [18]:
rand_search.score(X_train, y_train)

0.9133333333333333

In [19]:
rand_search.score(X_test, y_test)

0.7825

In [20]:
rand_search.score(theonion_holdout["title"], theonion_holdout["is_onion"])

0.7968613775065388

In [21]:
rand_search.score(nottheonion_holdout["title"], nottheonion_holdout["is_onion"])

0.7829018988256375

This is a marginal improvement over our naive model, but it's worth noting that we finish training this model in half the time.

# CountVectorizer + Naive Bayes

In the following section, we apply a variety of Naive Bayes models to our CountVectorized titles.

## CVec + MultinomialNB
### Naive

In [22]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("mnb", MultinomialNB())
])
     
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('mnb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [23]:
display(pipe.score(X_train, y_train))
display(pipe.score(X_test, y_test))
display(pipe.score(theonion_holdout["title"], theonion_holdout["is_onion"]))
display(pipe.score(nottheonion_holdout["title"], nottheonion_holdout["is_onion"]))

0.9546666666666667

0.8045

0.8256320836965998

0.7918902515390025

### RandomizedSearchCV'd

In [24]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(strip_accents="unicode")),
    ("mnb", MultinomialNB())
])

rand_search = RandomizedSearchCV(pipe,
                                 n_jobs=-2,
                                 n_iter=50,
                                 param_distributions={
    "mnb__alpha": np.linspace(0,1,11),
    "tfidf__max_df": [.99, 1],
    "tfidf__min_df": [0, 0.01],
    "tfidf__stop_words": [None, [i for i in theonion_top_words if i in nottheonion_top_words]],
    "tfidf__ngram_range": [(1,1), (1,2)],
    "tfidf__max_features": [None, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 20000, 50000]
})
rand_search.fit(X_train, y_train);

In [25]:
rand_search.best_estimator_

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.99,
                                 max_features=50000, min_df=0,
                                 ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents='unicode',
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('mnb',
                 MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True))],
         verbose=False)

In [26]:
display(rand_search.score(X_train, y_train))
display(rand_search.score(X_test, y_test))
display(rand_search.score(theonion_holdout["title"], theonion_holdout["is_onion"]))
display(rand_search.score(nottheonion_holdout["title"], nottheonion_holdout["is_onion"]))

0.9993333333333333

0.8185

0.8236704446381866

0.7888242887833451

## CVec + ComplementNB

In [27]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("cnb", ComplementNB())
])
     
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('cnb',
                 ComplementNB(alpha=1.0, class_prior=None, fit_prior=True,
                              norm=False))],
         verbose=False)

In [28]:
display(pipe.score(X_train, y_train))
display(pipe.score(X_test, y_test))
display(pipe.score(theonion_holdout["title"], theonion_holdout["is_onion"]))
display(pipe.score(nottheonion_holdout["title"], nottheonion_holdout["is_onion"]))

0.955

0.807

0.8227986050566696

0.7970901051049605

In [29]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(strip_accents="unicode")),
    ("cnb", ComplementNB())
])

rand_search = RandomizedSearchCV(pipe,
                                 n_jobs=-2,
                                 n_iter=50,
                                 param_distributions={
    "cnb__alpha": np.linspace(0,1,11),
    "tfidf__max_df": [.99, 1],
    "tfidf__min_df": [0, 0.01],
    "tfidf__stop_words": [None, [i for i in theonion_top_words if i in nottheonion_top_words]],
    "tfidf__ngram_range": [(1,1), (1,2)],
    "tfidf__max_features": [None, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 20000, 50000]
})
rand_search.fit(X_train, y_train);

In [30]:
display(rand_search.score(X_train, y_train))
display(rand_search.score(X_test, y_test))
display(rand_search.score(theonion_holdout["title"], theonion_holdout["is_onion"]))
display(rand_search.score(nottheonion_holdout["title"], nottheonion_holdout["is_onion"]))

0.9395

0.815

0.8121185701830863

0.798788065625572

## CVec + BernoulliNB

In [31]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("bnb", BernoulliNB())
])
     
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('bnb',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=True))],
         verbose=False)

In [32]:
display(pipe.score(X_train, y_train))
display(pipe.score(X_test, y_test))
display(pipe.score(theonion_holdout["title"], theonion_holdout["is_onion"]))
display(pipe.score(nottheonion_holdout["title"], nottheonion_holdout["is_onion"]))

0.9558333333333333

0.807

0.8129904097646034

0.8115191568482

In [33]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(strip_accents="unicode")),
    ("bnb", BernoulliNB())
])

rand_search = RandomizedSearchCV(pipe,
                                 n_jobs=-2,
                                 n_iter=50,
                                 param_distributions={
    "bnb__alpha": np.linspace(0,1,11),
    "tfidf__max_df": [.99, 1],
    "tfidf__min_df": [0, 0.01],
    "tfidf__stop_words": [None, [i for i in theonion_top_words if i in nottheonion_top_words]],
    "tfidf__ngram_range": [(1,1), (1,2)],
    "tfidf__max_features": [None, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 20000, 50000]
})
rand_search.fit(X_train, y_train);

In [34]:
display(rand_search.score(X_train, y_train))
display(rand_search.score(X_test, y_test))
display(rand_search.score(theonion_holdout["title"], theonion_holdout["is_onion"]))
display(rand_search.score(nottheonion_holdout["title"], nottheonion_holdout["is_onion"]))

0.9046666666666666

0.8055

0.7813862249346121

0.8149512047090105

# Conclusions

As with our CountVectorized models, our BernoulliNB models were slighty better than the others, but it's hard to tell if its a statistically significant difference. 