# Bag of words model using sklearn

**Text preprocessing**
* No punctuation
* No stopwords
* Leads to smaller vocabularies
* Reducing number of dimensions helps improve performance

In [2]:
import pandas as pd
# Three short example documents used to demonstrate the bag-of-words model
sentences = [
    "The lion is the king of the jungle",
    "Lions have lifespans of a decade",
    "The lion is an endangered species",
]
corpus = pd.Series(sentences)

In [13]:
# Bring in the bag-of-words vectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Tokens are runs of one or more word characters, so single-letter words such as "a" are kept
vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
# Learn the vocabulary from the corpus and count every term per document
bow_matrix = vectorizer.fit_transform(corpus)
# Show the counts as a table: one row per document, one column per vocabulary term
pd.DataFrame(data=bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,a,an,decade,endangered,have,is,jungle,king,lifespans,lion,lions,of,species,the
0,0,0,0,0,0,1,1,1,0,1,0,1,0,3
1,1,0,1,0,1,0,0,0,1,0,1,1,0,0
2,0,1,0,1,0,1,0,0,0,1,0,0,1,1


## BoW model for movie taglines

In [29]:
# Load the movie data set that contains the 'tagline' column
movies = pd.read_csv('movie_overviews.csv')
# Missing taglines become empty strings so the vectorizer only ever receives text
taglines = movies['tagline'].fillna('')
vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
bow_matrix = vectorizer.fit_transform(taglines)
# One row per movie, one column per vocabulary term
pd.DataFrame(data=bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,000,007,05,06,08,09,1,10,100,1000,...,zombies,zone,zones,zoo,zorba,zwei,ē,ə,ˈfil,ˌrän
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9094,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9095,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9096,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9097,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## BoW model for (NLTK) lemmatized movie taglines

In [36]:
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer:
    """Callable that splits a document into word tokens and lemmatizes each one.

    An instance can be passed wherever a ``tokenizer`` callable taking a single
    string is expected.
    """

    def __init__(self):
        """Create the regex tokenizer and the WordNet lemmatizer used by ``__call__``."""
        self.tokenizer = RegexpTokenizer(pattern=r'(?u)\b\w+\b')
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens found in ``doc``.

        Tokens are matched by the regex tokenizer and each is passed through
        ``WordNetLemmatizer.lemmatize`` with its default arguments.
        """
        lemmas = []
        for token in self.tokenizer.tokenize(doc):
            lemmas.append(self.lemmatizer.lemmatize(token))
        return lemmas

# A custom tokenizer replaces the regex-based tokenization, so token_pattern is explicitly set to None
lemma_tokenizer = LemmaTokenizer()
vectorizer = CountVectorizer(tokenizer=lemma_tokenizer, token_pattern=None)
# Missing taglines become empty strings before vectorizing
bow_matrix = vectorizer.fit_transform(movies['tagline'].fillna(''))
# One row per movie, one column per lemmatized vocabulary term
pd.DataFrame(data=bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,000,007,05,06,08,09,1,10,100,1000,...,zip,zombie,zone,zoo,zorba,zwei,ē,ə,ˈfil,ˌrän
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9094,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9095,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9096,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9097,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Building a BoW Naive Bayes classifier

## Steps
1. Text preprocessing
2. Building a bag-of-words model (or representation)
3. Machine learning

## Text preprocessing using CountVectorizer
CountVectorizer arguments
* lowercase : False , True
* strip_accents : 'unicode' , 'ascii' , None
* stop_words : 'english' , list , None
* token_pattern : regex
* tokenizer : function

## Building the BoW model

In [47]:
import pandas as pd
# Read the cleaned movie-review data set
df = pd.read_csv(filepath_or_buffer='movie_reviews_clean.csv')
# Report its (rows, columns) dimensions
df.shape

(1000, 2)

In [45]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Import train_test_split
from sklearn.model_selection import train_test_split
# Create CountVectorizer object
# NOTE(review): lowercase=False leaves capitalised tokens untouched; the built-in 'english'
# stop-word list is presumably all lowercase, so words such as "The" would not be removed —
# confirm the reviews in this file are already lowercased.
vectorizer = CountVectorizer(strip_accents='ascii', stop_words='english', lowercase=False)
# Split into training and test sets. A fixed random_state makes the split, and therefore
# every table and accuracy figure derived from it, identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.25, random_state=42)
# Fit the vocabulary on the training reviews only and generate the training BoW vectors
X_train_bow = vectorizer.fit_transform(X_train)
# One row per training review, one column per vocabulary term
pd.DataFrame(X_train_bow.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,00,000,02,06,08,10,100,1000,104,105,...,zippy,zither,zmeu,zoey,zombie,zombied,zombies,zone,zoom,zucker
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
747,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Generate test BoW vectors with the already-fitted vectorizer (transform only, no refit),
# so the test matrix has exactly the same columns as the training matrix
X_test_bow = vectorizer.transform(X_test)
# One row per test review, one column per vocabulary term
pd.DataFrame(X_test_bow.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,00,000,02,06,08,10,100,1000,104,105,...,zippy,zither,zmeu,zoey,zombie,zombied,zombies,zone,zoom,zucker
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
247,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
248,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Training the Naive Bayes classifier

In [48]:
# Bring in the multinomial Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
# Instantiate it with its default settings
clf = MultinomialNB()
# Fit on the training BoW counts and their sentiment labels
clf.fit(X=X_train_bow, y=y_train)

In [53]:
from sklearn.metrics import accuracy_score
# Training-set accuracy computed two ways: the classifier's own score and
# accuracy_score applied to explicit predictions
train_predictions = clf.predict(X_train_bow)
clf.score(X_train_bow, y_train), accuracy_score(y_train, train_predictions)

In [60]:
# Compute accuracy on the test set once and keep it in `accuracy`
accuracy = clf.score(X_test_bow, y_test)
# Display the stored score next to accuracy_score on explicit predictions; the two should agree
accuracy, accuracy_score(y_test, clf.predict(X_test_bow))

(0.832, 0.832)

# Building n-gram models

A larger n-gram range captures more context.

## BoW shortcomings
* Texts with different meanings but the same words get exactly the same BoW representation.
* Context of the words is lost.
* Sentiment can depend on the position of 'not', which BoW ignores.

## n-gram shortcomings

* Curse of dimensionality
* Higher order n-grams are rare
* Keep n small

## Comparing performance of n-gram models

In [63]:
# Load the cleaned movie reviews for the n-gram comparison.
# Note: this rebinds `movies`, which earlier in the notebook held the frame read from 'movie_overviews.csv'.
movies = pd.read_csv('movie_reviews_clean.csv')

In [66]:
import time
# Start the clock first so the reported time covers the split, vectorizing and fitting
start_time = time.time()

# Stratified 50/50 split with a fixed seed
train_X, test_X, train_y, test_y = train_test_split(
    movies['review'],
    movies['sentiment'],
    test_size=0.5,
    random_state=42,
    stratify=movies['sentiment'],
)

# Default CountVectorizer settings (no ngram_range given)
vectorizer = CountVectorizer()
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

# Train a multinomial Naive Bayes classifier on the training counts
clf = MultinomialNB().fit(train_X, train_y)

# Report elapsed time, test accuracy and vocabulary size
print(
    "The program took %.3f seconds to complete. The accuracy on the test set is %.2f. The ngram representation had %i features."
    % (
        time.time() - start_time,
        clf.score(test_X, test_y),
        train_X.shape[1],
    )
)

The program took 0.078 seconds to complete. The accuracy on the test set is 0.75. The ngram representation had 12347 features.


In [67]:
# Start the clock first so the reported time covers the split, vectorizing and fitting
start_time = time.time()

# Stratified 50/50 split with a fixed seed
train_X, test_X, train_y, test_y = train_test_split(
    movies['review'],
    movies['sentiment'],
    test_size=0.5,
    random_state=42,
    stratify=movies['sentiment'],
)

# ngram_range=(1,3): features are n-grams of length 1 up to 3
vectorizer = CountVectorizer(ngram_range=(1,3))
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

# Train a multinomial Naive Bayes classifier on the training counts
clf = MultinomialNB().fit(train_X, train_y)

# Report elapsed time, test accuracy and vocabulary size
print(
    "The program took %.3f seconds to complete. The accuracy on the test set is %.2f. The ngram representation had %i features."
    % (
        time.time() - start_time,
        clf.score(test_X, test_y),
        train_X.shape[1],
    )
)

The program took 0.385 seconds to complete. The accuracy on the test set is 0.77. The ngram representation had 178240 features.
