# About this notebook:

This is where the meat of the project is. Various models and their parameters are explored and evaluated for each of our three comparisons.

The majority of the work is demonstrated in Jokes vs DadJokes. This is where all models, count vectorized, tfidf vectorized, logistic regression, gaussian naive bayes, multinomial naive bayes, linear support vector machine, lemmatizing and stemming are all explored. Some subset of these models and processing techniques were explored in Jokes vs Momma Jokes, and also in CleanJokes vs Dirty Jokes, but not all were included in the final notebook.

### Table of Contents:
- [Imports and Useful Functions](#Imports-and-Useful-Functions)
- [Jokes vs DadJokes](#Jokes-vs-DadJokes)
- [Jokes vs MommaJokes](#Jokes-vs-MommaJokes)
- [CleanJokes vs DirtyJokes](#Clean-vs-Dirty-Jokes)

# Imports and Useful Functions

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import regex as re
from bs4 import BeautifulSoup 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



import pandas as pd
import numpy as np
# import matplotlib as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.pipeline import Pipeline

from scipy import sparse


In [None]:
# DOING THE TOKENIZING, LEMMATIZING

def nlp_preprocess_lem(df):

    # Instantiate Tokenizer
    # this tokenizer splits tokens up by spaces or by periods that are not attached to a digit.
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')

    # Instantiate lemmatizer. (Same as above.)
    lemmatizer = WordNetLemmatizer()

    # english stopwords
    stops = set(stopwords.words('english'))

    clean = []

    for document in df:
        # breaking up the document into words
        words = tokenizer.tokenize(document)

        #removing stopwords
        #meaningful_words = [w for w in words if w not in stops]

        # lemmatizing
        lem_words = [lemmatizer.lemmatize(w) for w in words]

        # recombining the cleaned words
        new_document = " ".join(lem_words)

        # appending to cleaned list
        clean.append(new_document)
        
    return clean

In [None]:
# DOING THE TOKENIZING, STEMMING

def nlp_preprocess_stem(df):

    # Instantiate Tokenizer
    # this tokenizer splits tokens up by spaces or by periods that are not attached to a digit.
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')

    # Instantiate lemmatizer. (Same as above.)
    p_stemmer = PorterStemmer()

    clean = []

    for document in df:
        # breaking up the document into words
        words = tokenizer.tokenize(document)

        # lemmatizing
        stem_words = [p_stemmer.stem(w) for w in words]

        # recombining the cleaned words
        new_document = " ".join(stem_words)

        # appending to cleaned list
        clean.append(new_document)
        
    return clean

In [4]:
# replace the nulls with the mean for the numeric columns
# and replace the nans in object columns with the empty string


def replace_nans(df):
    # replacing nans in numeric columns with the mean
    numeric = df.select_dtypes(exclude=['object','bool'])
    for col in numeric.columns:
        m = df[col].mean()
        df[col].fillna(m, inplace = True)
    
    #replacing nans in object columns with the empty string
    non_numeric = df.select_dtypes('object')
    for col in non_numeric.columns:
        df[col].fillna('', inplace = True)

# Jokes vs DadJokes

### Read in data, define train and test sets

In [5]:
# import the data
df = pd.read_csv('../data/jokes_v_dadjokes')

# check the shape
df.shape

(9126, 29)

In [6]:
# check nans
df.isnull().sum().sum()

0

In [7]:
# baseline accuracy
df['is_dadjoke'].value_counts()

1    4641
0    4485
Name: is_dadjoke, dtype: int64

In [8]:
# input variables should be everything other than the subreddit label
X = df.drop(columns = ['is_dadjoke'])

# target is whether or not submission is a dad joke
y = df['is_dadjoke']

In [10]:
X.shape

(9126, 28)

In [11]:
# baseline accuracy
y.value_counts(normalize = True)

1    0.508547
0    0.491453
Name: is_dadjoke, dtype: float64

In [9]:
# need to join these columns so that our gridsearch likes this dataset
X_title_sf = X['title']+X['selftext']

In [13]:
X_title_sf.shape

(9126,)

In [14]:
# Defining train and test sets

X_train, X_test, y_train, y_test = train_test_split(X_title_sf,
                                                    y,
                                                    stratify=y,     # keeps balance in the output variable
                                                    random_state=5)

### Support Vector Machine

In [81]:
# Instantiate support vector machine.
svc = LinearSVC()

In [82]:
# pipline to experimenct with count vectorizer parameters
pipe_params_cvec = {
    'cvec__max_features': [100, 500],
    'cvec__stop_words': ['english', None],
    'cvec__ngram_range': [(1,1),(1,2)]
}

pipe_cvec_svc = Pipeline([
    ('cvec', CountVectorizer()),
    ('svc', svc)
])

# gridsearch with 5 folds

# count vectorizer logistic regression
gs_cvec_svc = GridSearchCV(pipe_cvec_svc,     # what object are we optimizing?
                          pipe_params_cvec, # what parameters values are we searching?
                          cv = 5)           # 5-fold cross-validation.


In [83]:

# Fit support vector machine to training data.
gs_cvec_svc.fit(X_train, y_train)





GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [1]:
gs_cvec_svc.best_score_

NameError: name 'gs_cvec_svc' is not defined

In [91]:
gs_cvec_svc_model = gs_cvec_svc.best_estimator_

In [92]:
gs_cvec_svc_model.score(X_train, y_train)

0.5856224430157803

In [93]:
gs_cvec_svc_model.score(X_test, y_test)

0.7598597721297108

In [94]:
gs_cvec_svc.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None}

In [24]:
y_train.value_counts()

1    3480
0    3364
Name: is_dadjoke, dtype: int64

In [25]:
X_train.columns

AttributeError: 'Series' object has no attribute 'columns'

### CountVectorizer and Linear Regression

In [15]:
pipe_params_cvec = {
    'cvec__max_features': [100, 500, 1000],
    'cvec__stop_words': ['english', None],
    'cvec__ngram_range': [(1,1),(1,2)]
}

pipe_cvec_lr = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(solver = 'lbfgs'))
])

# gridsearch with 5 folds

# count vectorizer logistic regression
gs_cvec_lr = GridSearchCV(pipe_cvec_lr,     # what object are we optimizing?
                          pipe_params_cvec, # what parameters values are we searching?
                          cv = 5)           # 5-fold cross-validation.


In [16]:
gs_cvec_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [17]:
gs_cvec_lr.best_score_

0.6217135698999142

In [18]:
gs_cvec_lr.coef_

AttributeError: 'GridSearchCV' object has no attribute 'coef_'

In [17]:
gs_cvec_lr.best_params_

{'cvec__max_features': 1000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None}

In [20]:
gs_cvec_lr_model = gs_cvec_lr.best_estimator_

In [21]:
gs_cvec_lr_model.score(X_train, y_train)

0.7286674459380479

In [22]:
gs_cvec_lr_model.score(X_test, y_test)

0.6336546888694128

In [23]:
gs_cvec_lr_model.coef_

AttributeError: 'Pipeline' object has no attribute 'coef_'

### Multinomial Naive Bayes Model

In [42]:
pipe_params_cvec = {
    'cvec__max_features': [100, 500],
    'cvec__stop_words': ['english', None],
    'cvec__ngram_range': [(1,1),(1,2)]
}

pipe_cvec_multiNB = Pipeline([
    ('cvec', CountVectorizer()),
    ('multiNB', MultinomialNB())
])

# gridsearch with 5 folds

# count vectorizer logistic regression
gs_cvec_multiNB = GridSearchCV(pipe_cvec_multiNB,     # what object are we optimizing?
                          pipe_params_cvec, # what parameters values are we searching?
                          cv = 5)           # 5-fold cross-validation.


In [43]:
gs_cvec_multiNB.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [44]:
gs_cvec_multiNB.best_score_

0.604472573569302

In [45]:
gs_cvec_multiNB_model = gs_cvec_multiNB.best_estimator_

In [46]:
gs_cvec_multiNB_model.score(X_train,y_train)

0.6274108708357685

In [47]:
gs_cvec_multiNB_model.score(X_test,y_test)

0.6148115687992989

### Tfidf and Logistic Regression

In [48]:
# tfidi parameters
pipe_params_tfidf = {
    'tfidf__max_features': [100, 500],
    'tfidf__stop_words': ['english', None],
    'tfidf__ngram_range': [(1,1),(1,2)]
}

In [49]:
# tfidi with logistic regression pipe
pipe_tfidf_lr = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(solver = 'lbfgs'))
])

In [50]:
# tfidf logistic regression
gs_tfidf_lr = GridSearchCV(pipe_tfidf_lr,     # what object are we optimizing?
                          pipe_params_tfidf, # what parameters values are we searching?
                          cv = 5)           # 5-fold cross-validation.

In [51]:
#tfidf
gs_tfidf_lr.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [52]:
gs_tfidf_lr.best_score_

0.6237616350347504

### Tfidf and Gaussian Naive Bayes

In [61]:
# TRYING TO DEAL WITH THE SPARSITY ISSUE
class DenseTransformer():
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X, y=None, **fit_params):
        return X.todense()
# https://stackoverflow.com/a/28384887
# James Dargan found this   

In [62]:
# tfidi with gaussian NB pipe
pipe_tfidf_gNB = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('to_dense', DenseTransformer()),
    ('multiNB', GaussianNB())
])


# tfidf with gaussian NB
gs_tfidf_gNB = GridSearchCV(pipe_tfidf_gNB,     # what object are we optimizing?
                          pipe_params_tfidf, # what parameters values are we searching?
                          cv = 5)           # 5-fold cross-validation.



In [63]:
# tfidf with multinomial NB just for fun
gs_tfidf_gNB = GridSearchCV(pipe_tfidf_gNB,     # what object are we optimizing?
                          pipe_params_tfidf, # what parameters values are we searching?
                          cv = 5)           # 5-fold cross-validation.

In [64]:
#tfidf
gs_tfidf_gNB.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [66]:
gs_tfidf_gNB.best_score_

0.5964367639332078

### Logistic Regression with Lemmatizing

In [None]:
clean_train = nlp_preprocess_lem(X_train)
clean_test = nlp_preprocess_lem(X_test)

In [13]:
gs_cvec_lr.fit(clean_train, y_train)

NameError: name 'gs_cvec_lr' is not defined

In [None]:
gs_cvec_lr.best_score_

In [None]:
gs_cvec_lr.best_params_

In [None]:
gs_cvec_lr_model = gs_cvec_lr.best_estimator_

In [None]:
gs_cvec_lr_model.score(clean_train, y_train)

In [None]:
gs_cvec_lr_model.score(clean_test, y_test)

### Logistic Regression with Stemming

In [None]:
clean_train = nlp_preprocess_stem(X_train)
clean_test = nlp_preprocess_stem(X_test)

gs_cvec_lr.fit(clean_train, y_train)

gs_cvec_lr.best_score_

gs_cvec_lr.best_params_

gs_cvec_lr_model = gs_cvec_lr.best_estimator_

print('Train score:  ', gs_cvec_lr_model.score(clean_train, y_train))

print('Test score:  ', gs_cvec_lr_model.score(clean_test, y_test))

# Jokes vs MommaJokes

### Read in data, define train and test sets

In [29]:
df = pd.read_csv('../data/jokes_v_mommajokes')

In [30]:
df.isnull().sum().sum()

184

In [31]:
replace_nans(df)

In [32]:
X = df.drop(columns = ['is_mommajoke'])
y = df['is_mommajoke']

In [33]:
X_title_sf = X['title'] + X['selftext']

In [34]:
X_title_sf.shape

(2744,)

In [35]:
# Defining train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_title_sf,
                                                    y,
                                                    stratify=y,
                                                    random_state=5)

### CounVectorizer and Logistic Regression:

In [45]:
pipe_params_cvec = {
    'cvec__max_features': [100, 500],
    'cvec__stop_words': ['english', None],
    'cvec__ngram_range': [(1,1),(1,2)]
}

pipe_cvec_lr = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(solver = 'lbfgs'))
])

# gridsearch with 5 folds

# count vectorizer logistic regression
gs_cvec_lr = GridSearchCV(pipe_cvec_lr,     # what object are we optimizing?
                          pipe_params_cvec, # what parameters values are we searching?
                          cv = 5)           # 5-fold cross-validation.


In [46]:
gs_cvec_lr.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [47]:
gs_cvec_lr.best_score_

0.934398696052725

In [48]:
gs_cvec_lr.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [18]:
# baseline accuracy
y.value_counts(normalize = True)

1    0.5
0    0.5
Name: is_mommajoke, dtype: float64

In [19]:
gs_cvec_lr_model = gs_cvec_lr.best_estimator_

In [20]:
gs_cvec_lr_model.score(X_train, y_train)

0.966958211856171

In [21]:
gs_cvec_lr_model.score(X_test, y_test)

0.9504373177842566

In [None]:
# way better than dadjokes! as expected

### Count Vectorizer and Multinomial Naive Bayes

In [36]:
pipe_params_cvec = {
    'cvec__max_features': [100, 500],
    'cvec__stop_words': ['english', None],
    'cvec__ngram_range': [(1,1),(1,2)]
}

pipe_cvec_multiNB = Pipeline([
    ('cvec', CountVectorizer()),
    ('multiNB', MultinomialNB())
])

# gridsearch with 5 folds

# count vectorizer logistic regression
gs_cvec_multiNB = GridSearchCV(pipe_cvec_multiNB,     # what object are we optimizing?
                          pipe_params_cvec, # what parameters values are we searching?
                          cv = 5)           # 5-fold cross-validation.


In [37]:
gs_cvec_multiNB.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [40]:
gs_cvec_multiNB.best_score_

0.9207970141497178

In [41]:
gs_cvec_multiNB.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [42]:
gs_cvec_multiNB_model = gs_cvec_multiNB.best_estimator_

In [43]:
gs_cvec_multiNB_model.score(X_train, y_train)

0.9222546161321672

In [44]:
gs_cvec_multiNB_model.score(X_test, y_test)

0.9256559766763849

# Clean vs Dirty Jokes

### Read in data and define train and test sets

In [49]:
df = pd.read_csv('../data/clean_v_dirty.csv')

In [50]:
df.isnull().sum().sum()

378

In [51]:
replace_nans(df)

In [52]:
X = df.drop(columns = ['is_dirtyjoke'])
y = df['is_dirtyjoke']

In [53]:
X_title_sf = X['title'] + X['selftext']
X_title_sf.shape

(9276,)

In [54]:
# Defining train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_title_sf,
                                                    y,
                                                    stratify=y,
                                                    random_state=5)

### CountVectorizer and Logistic Regression

In [55]:
pipe_params_cvec = {
    'cvec__max_features': [100, 500],
    'cvec__stop_words': ['english', None],
    'cvec__ngram_range': [(1,1),(1,2)]
}

pipe_cvec_lr = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(solver = 'lbfgs'))
])

# gridsearch with 5 folds

# count vectorizer logistic regression
gs_cvec_lr = GridSearchCV(pipe_cvec_lr,     # what object are we optimizing?
                          pipe_params_cvec, # what parameters values are we searching?
                          cv = 5)           # 5-fold cross-validation.


In [56]:
gs_cvec_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [57]:
gs_cvec_lr.best_score_

0.8036515530875828

In [58]:
gs_cvec_lr.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english'}

In [45]:
gs_cvec_lr_model = gs_cvec_lr.best_estimator_

In [46]:
gs_cvec_lr_model.score(X_train, y_train)

0.8434670116429496

In [66]:
gs_cvec_lr_model.score(X_test, y_test)

NameError: name 'gs_cvec_lr_model' is not defined

In [48]:
# baseline accuracy
y.value_counts(normalize = True)

0    0.539025
1    0.460975
Name: is_dirtyjoke, dtype: float64

In [49]:
0.8111254851228978/0.460975

1.7595867132119916

### Count Vectorizer and Multinomial Naive Bayes

In [59]:
# NAIVE BAYES

pipe_params_cvec = {
    'cvec__max_features': [100, 500],
    'cvec__stop_words': ['english', None],
    'cvec__ngram_range': [(1,1),(1,2)]
}

pipe_cvec_multiNB = Pipeline([
    ('cvec', CountVectorizer()),
    ('multiNB', MultinomialNB())
])

# gridsearch with 5 folds

# count vectorizer logistic regression
gs_cvec_multiNB = GridSearchCV(pipe_cvec_multiNB,     # what object are we optimizing?
                          pipe_params_cvec, # what parameters values are we searching?
                          cv = 5)           # 5-fold cross-validation.


In [60]:
gs_cvec_multiNB.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [62]:
gs_cvec_multiNB.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english'}

In [61]:
gs_cvec_multiNB.best_score_

0.7741852384375749

In [63]:
gs_cvec_multiNB.score(X_train, y_train)

0.7918643093287336

In [65]:
gs_cvec_multiNB.score(X_test, y_test)

0.7783527382492453