In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [48]:
import re
import time

import nltk
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [4]:
# Load both datasets from CSV files addressed relative to the current working
# directory. `overviews` feeds the tagline bag-of-words / n-gram sections and
# `reviews` feeds the sentiment classifiers further down.
overviews = pd.read_csv('./movie_overviews.csv')
reviews = pd.read_csv('./movie_reviews_clean.csv')

# Building a bag-of-words model

In [8]:
overviews.head(3)

Unnamed: 0,id,title,overview,tagline
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...


In [9]:
overviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9099 entries, 0 to 9098
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        9099 non-null   int64 
 1   title     9099 non-null   object
 2   overview  9087 non-null   object
 3   tagline   7033 non-null   object
dtypes: int64(1), object(3)
memory usage: 284.5+ KB


## BoW model for movie taglines

In [18]:
# Keep only the movies that actually have a tagline. A bare dropna() would
# also discard rows because of a missing overview, which is irrelevant for a
# tagline corpus. dropna() and assign() each return a new frame, so the
# original `overviews` is left untouched and this cell is safe to re-run.
overviews_ = (
    overviews
    .dropna(subset=['tagline'])
    .assign(tagline=lambda df: df['tagline'].str.lower())
)

# lowercased taglines: the text corpus used by the BoW and n-gram models
corpus = overviews_['tagline']

In [20]:
corpus.head()

1            roll the dice and unleash the excitement!
2    still yelling. still fighting. still ready for...
3    friends are the people who let you be yourself...
4    just when his world is back to normal... he's ...
5                             a los angeles crime saga
Name: tagline, dtype: object

In [22]:
# create CountVectorizer object
vectorizer = CountVectorizer()

# generate matrix of word vectors
bow_matrix = vectorizer.fit_transform(corpus)

print(bow_matrix.shape)

(7033, 6614)


## Analyzing dimensionality and preprocessing

In [49]:
# load spaCy's small English pipeline; `preprocess` below calls it to lemmatize text
nlp = spacy.load('en_core_web_sm')
# spaCy's English stop-word collection; `preprocess` uses it to filter lemmas.
# NOTE(review): this dotted path presumably resolves only once the
# `spacy.lang.en` submodule has been imported (likely as a side effect of the
# load above) — confirm, and keep this line after `spacy.load`.
stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [25]:
def preprocess(text):
    """Lemmatize ``text`` and keep only alphabetic, non-stopword lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. A token's
    lemma survives when it consists solely of alphabetic characters and is not
    a member of the module-level ``stopwords`` collection.

    Args:
        text: The string to process.

    Returns:
        The surviving lemmas, in document order, joined by single spaces
        (an empty string when nothing survives).
    """
    kept = []
    for token in nlp(text):
        lemma = token.lemma_
        # skip punctuation/numbers/mixed tokens and stop words
        if not lemma.isalpha() or lemma in stopwords:
            continue
        kept.append(lemma)
    return " ".join(kept)

In [26]:
# lemmatize and filter every tagline; `preprocess` runs the spaCy pipeline
# once per row
lem_corpus = corpus.apply(preprocess)

In [28]:
lem_corpus.head()

1    roll dice unleash excitement
2           yell fight ready love
3    friend people let let forget
4      world normal surprise life
5          los angeles crime saga
Name: tagline, dtype: object

In [43]:
lem_corpus.shape

(7033,)

In [44]:
# create another vectorizer
vectorizer = CountVectorizer()

# generate a matrix of word vectors
bow_lem_matrix = vectorizer.fit_transform(lem_corpus)

print(bow_lem_matrix.shape)

(7033, 4964)


Notice how the number of features has dropped significantly, from around 6,600 to **4,964**, for the pre-processed movie taglines. The reduction in dimensionality brought about by text preprocessing usually leads to better performance when conducting machine learning, so it is a good idea to consider it. However, as mentioned in a previous lesson, the final decision always depends on the nature of the application.

## Mapping feature indices with feature names

In [45]:
# toy three-sentence corpus for inspecting how columns map to vocabulary terms
# NOTE: this rebinds `corpus`, which previously held the movie taglines; the
# n-gram section reassigns it from `overviews_['tagline']` before using it
corpus = ['The lion is the king of the jungle',
          'Lions have lifespans of a decade', 
          'The lion is an endangered species']

In [47]:
# create CountVectorizer object
vectorizer = CountVectorizer()

# generate matrix of word vectors
bow_matrix = vectorizer.fit_transform(corpus)

# convert bow_matrix into a DataFrame
bow_df = pd.DataFrame(bow_matrix.toarray())

# map the column names to vocabulary 
bow_df.columns = vectorizer.get_feature_names()

bow_df

Unnamed: 0,an,decade,endangered,have,is,jungle,king,lifespans,lion,lions,of,species,the
0,0,0,0,0,1,1,1,0,1,0,1,0,3
1,0,1,0,1,0,0,0,1,0,1,1,0,0
2,1,0,1,0,1,0,0,0,1,0,0,1,1


# Building a BoW Naive Bayes classifier

## BoW vectors for movie reviews

In [51]:
print(reviews.shape)
reviews.head()

(1000, 2)


Unnamed: 0,review,sentiment
0,this anime series starts out great interesting...,0
1,some may go for a film like this but i most as...,0
2,i ve seen this piece of perfection during the ...,1
3,this movie is likely the worst movie i ve ever...,0
4,it ll soon be 10 yrs since this movie was rele...,1


In [95]:
# features are the raw review texts, labels are the sentiment column
X = reviews['review']
y = reviews['sentiment']

# NOTE(review): test_size=0.75 holds out 75% of the rows, so the model below
# is trained on only a quarter of the data — confirm this is intentional and
# not a swapped train/test fraction. This split is also not stratified, unlike
# the comparison cells at the end of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.75,
                                                    random_state=1111)

In [96]:
# create CountVectorizer object
vectorizer = CountVectorizer(lowercase=True,
                             stop_words='english')

# transform data
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

print(f'train shape: {X_train_bow.shape}')
print(f'test shape: {X_test_bow.shape}')

train shape: (250, 7317)
test shape: (750, 7317)


## Predicting the sentiment of a movie review

In [97]:
# create a MultinomialNB object
nb = MultinomialNB()

# fit and eval
nb.fit(X_train_bow, y_train)

acc = nb.score(X_test_bow, y_test)
print(f'test acc: {acc: .1%}')

test acc:  80.1%


In [100]:
# predict the sentiment of a negative review
review = "The movie was terrible. The music was underwhelming and the acting mediocre."
pred = nb.predict(vectorizer.transform([review]))[0]
print(f'sentiment: {pred}')

sentiment: 0


# Building n-gram models

## n-gram models for movie tag lines

In [101]:
# point `corpus` back at the lowercased taglines; it was rebound to a toy
# list of sentences in the feature-name mapping section
corpus = overviews_['tagline']

In [102]:
# generate n-grams upto n=1
vectorizer_ng1 = CountVectorizer(ngram_range=(1, 1))
ng1 = vectorizer_ng1.fit_transform(corpus)

# generate n-grams upto n=2
vectorizer_ng2 = CountVectorizer(ngram_range=(1, 2))
ng2 = vectorizer_ng2.fit_transform(corpus)

# generate n-grams upto n=3
vectorizer_ng3 = CountVectorizer(ngram_range=(1, 3))
ng3 = vectorizer_ng3.fit_transform(corpus)

# Print the number of features for each model
print(f'ng1, ng2 and ng3 have {ng1.shape[1]}, {ng2.shape[1]}, and {ng3.shape[1]} features, respectively')

ng1, ng2 and ng3 have 6614, 37100, and 76881 features, respectively


Notice that `ng2` has over 37,000 features whereas `ng3` has over 76,000 features. Both are much greater than the roughly 6,600 dimensions obtained for `ng1`. As the n-gram range increases, so does the number of features, leading to increased computational costs and a problem known as the curse of dimensionality.

## Higher order n-grams for sentiment analysis

In [118]:
# re-split the reviews 50/50 for the n-gram classifier; this rebinds the
# train/test names from the bag-of-words section, uses a different seed, and
# is not stratified
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.50,
                                                    random_state=422)

In [119]:
# create CountVectorizer object
vectorizer = CountVectorizer(lowercase=True,
                             stop_words='english',
                             ngram_range=(1, 2))

# transform data
X_train_ng = vectorizer.fit_transform(X_train)
X_test_ng = vectorizer.transform(X_test)

print(f'train shape: {X_train_ng.shape}')
print(f'test shape: {X_test_ng.shape}')

train shape: (500, 57625)
test shape: (500, 57625)


In [120]:
# define an instance of MultinomialNB
nb_ng = MultinomialNB()

# fit and eval
nb_ng.fit(X_train_ng, y_train)

acc = nb_ng.score(X_test_ng, y_test)
print(f'test acc: {acc: .1%}')

test acc:  81.0%


In [121]:
# predict the sentiment of a negative review
review = "The movie was not good. The plot had several holes and the acting lacked panache."
prediction = nb_ng.predict(vectorizer.transform([review]))[0]
print(f'The sentiment predicted by the classifier is {prediction}')

The sentiment predicted by the classifier is 0


## Comparing performance of n-gram models

In [125]:
# Benchmark the unigram-only model: time the split + vectorize + fit steps,
# then report test accuracy and vocabulary size.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Splitting the data into training and test sets (stratified on the label)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.50,
                                                    stratify=y,
                                                    random_state=422)

# Generating ngrams
vectorizer = CountVectorizer(lowercase=True,
                             stop_words='english',
                             ngram_range=(1, 1))
# NOTE: X_train / X_test are rebound here from raw text to count matrices;
# the split above recreates the text versions, so the cell stays re-runnable
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Fit classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Stop the clock before scoring, so the reported time covers
# split + vectorize + fit only
elapsed = time.perf_counter() - start_time
accuracy = nb.score(X_test, y_test)

# Print accuracy, time and number of dimensions (format specs carry no space
# flag, which previously printed doubled blanks before each number)
print(f'The program took {elapsed:.2f} seconds to complete. The accuracy on the test set is {accuracy:.1%}. The ngram representation had {X_train.shape[1]} features.')

The program took  0.31 seconds to complete. The accuracy on the test set is  78.8%. The ngram representation had 11781 features.


In [126]:
# Benchmark the model with n-grams up to trigrams: time the split +
# vectorize + fit steps, then report test accuracy and vocabulary size.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Splitting the data into training and test sets (stratified on the label)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.50,
                                                    stratify=y,
                                                    random_state=422)

# Generating ngrams
vectorizer = CountVectorizer(lowercase=True,
                             stop_words='english',
                             ngram_range=(1, 3))
# NOTE: X_train / X_test are rebound here from raw text to count matrices;
# the split above recreates the text versions, so the cell stays re-runnable
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Fit classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Stop the clock before scoring, so the reported time covers
# split + vectorize + fit only
elapsed = time.perf_counter() - start_time
accuracy = nb.score(X_test, y_test)

# Print accuracy, time and number of dimensions (format specs carry no space
# flag, which previously printed doubled blanks before each number)
print(f'The program took {elapsed:.2f} seconds to complete. The accuracy on the test set is {accuracy:.1%}. The ngram representation had {X_train.shape[1]} features.')

The program took  0.92 seconds to complete. The accuracy on the test set is  79.2%. The ngram representation had 109901 features.
