# Vectorizer Tuning

In [1]:
import pandas as pd


data = pd.read_csv("reviews.csv")

data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lower case the text.

In [2]:
data['reviews'] = data['reviews'].str.lower()
data['reviews'] = data['reviews'].str.replace(r'[^\w\s]','')

In [3]:
print(data['reviews'])

0       plot  two teen couples go to a church party  d...
1       the happy bastards quick movie review \ndamn t...
2       it is movies like these that make a jaded movi...
3         quest for camelot  is warner bros   first fe...
4       synopsis  a mentally unstable man undergoing p...
                              ...                        
1995    wow  what a movie  \nits everything a movie ca...
1996    richard gere can be a commanding actor  but he...
1997    glorystarring matthew broderick  denzel washin...
1998    steven spielbergs second epic film on world wa...
1999    truman   trueman   burbank is the perfect name...
Name: reviews, Length: 2000, dtype: object


## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Create Pipeline
pipe= Pipeline([('count_vectorizer',CountVectorizer()),('mnb',MultinomialNB())])
print(data['target'])

# Set parameters to search (model and vectorizer)
pipe.fit(data['reviews'],data['target'])
print(data['target'])
cross_validate(pipe,data['reviews'],data['target'])
# Perform grid search on pipeline




0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1995    pos
1996    pos
1997    pos
1998    pos
1999    pos
Name: target, Length: 2000, dtype: object
0       neg
1       neg
2       neg
3       neg
4       neg
       ... 
1995    pos
1996    pos
1997    pos
1998    pos
1999    pos
Name: target, Length: 2000, dtype: object




{'fit_time': array([0.76203394, 0.73504114, 0.78103375]),
 'score_time': array([0.33796358, 0.36295938, 0.35796404]),
 'test_score': array([0.7994012 , 0.8048048 , 0.81681682])}

In [5]:
from sklearn.model_selection import train_test_split

param_grid = [
    {
        "count_vectorizer__min_df": [0,0.01,0.1],
        "mnb__alpha": [0, 0.1, 0.5, 1],
    }
]




grid_search = GridSearchCV(pipe, param_grid)
grid_search.fit(data['reviews'], data['target'])
print(grid_search.best_estimator_)
print(grid_search.best_score_)


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Pipeline(memory=None,
         steps=[('count_vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=0.01,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('mnb',
                 MultinomialNB(alpha=0, class_prior=None, fit_prior=True))],
         verbose=False)
0.822


  'setting alpha = %.1e' % _ALPHA_MIN)


⚠️ Please push the exercise once you are done 🙃

## 🏁 