In [187]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# through the local filesystem (the dataset is loaded from a path under it).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [188]:
import nltk
# Download the NLTK data packages this notebook relies on.
# The final call is the cell's last expression, so its return value is echoed
# as the cell output.
nltk.download('punkt')      # NOTE(review): the visible cells tokenize with RegexpTokenizer, not a punkt-based tokenizer — confirm this is still needed
nltk.download('stopwords')  # word list read later via stopwords.words('english')
nltk.download('wordnet')    # presumably the corpus behind WordNetLemmatizer — confirm
nltk.download('omw-1.4')    # presumably a companion resource for wordnet — confirm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [189]:
from collections import defaultdict
import math

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import pandas as pd

# Location of the reviews CSV on the mounted Drive.
REVIEWS_CSV_PATH = '/content/drive/MyDrive/imdb_reviews.csv'

# Load the dataset, then print pandas' text summary of the frame.
df = pd.read_csv(REVIEWS_CSV_PATH)
print(df)

                                                 review sentiment
0     One of the other reviewers has mentioned that ...  positive
1     A wonderful little production. <br /><br />The...  positive
2     I thought this was a wonderful way to spend ti...  positive
3     Basically there's a family where a little boy ...  negative
4     Petter Mattei's "Love in the Time of Money" is...  positive
...                                                 ...       ...
4993  This is a painfully slow story about the last ...  negative
4994  Joel schumacher Made a heck of a choice when h...  positive
4995  An interesting slasher film with multiple susp...  negative
4996  i watched this series when it first came out i...  positive
4997  Once again Jet Li brings his charismatic prese...  positive

[4998 rows x 2 columns]


In [190]:
# --- Text-processing helpers, each followed by a one-line smoke test ---

# Reduces a word to its dictionary form.
lemmatizer = WordNetLemmatizer()
print('Test lemmatizer:', lemmatizer.lemmatize('geese'))

# English stop words, filtered out during training and prediction.
STOP_WORDS = stopwords.words('english')
print('Test stopwords:', STOP_WORDS[:6])

# Splits text into runs of word characters, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
print('Test tokenizer:', tokenizer.tokenize('Hello, my name is Minh'))

# --- Training state, filled in by the training loop ---

# word_counts[word] == [occurrences in negative reviews,
#                       occurrences in positive reviews]
# A missing key yields a fresh [0, 0] pair (and is inserted on access).
word_counts = defaultdict(lambda: [0, 0])

# Labels as a plain list, positionally aligned with df['review'].
sentiment = df['sentiment'].tolist()

done = 0

# Number of non-stop-word tokens seen per class.
total_positive_words = total_negative_words = 0

# Number of reviews per class; these give the prior probabilities.
total_positive_reviews = total_negative_reviews = 0

Test lemmatizer: goose
Test stopwords: ['i', 'me', 'my', 'myself', 'we', 'our']
Test tokenizer: ['Hello', 'my', 'name', 'is', 'Minh']


In [191]:
# Print the first three (position, review text) pairs as a quick sanity check.
# Only the first three rows are taken, instead of enumerating the whole column
# and discarding everything after position 2.
for ele in enumerate(df['review'].head(3)):
    print(ele)

(0, "One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to th

In [192]:
# Build the Naive Bayes statistics: per-class review counts (priors) and
# per-class word counts (likelihoods).

# Reset every accumulator in this cell so that re-running it rebuilds the
# statistics from scratch instead of adding a second pass of counts on top of
# the previous run.
word_counts = defaultdict(lambda: [0, 0])  # word -> [negative count, positive count]
total_positive_words = 0
total_negative_words = 0
total_positive_reviews = 0
total_negative_reviews = 0

# STOP_WORDS is a list; a set gives constant-time membership tests in the
# inner loop without changing which tokens are filtered.
stop_word_set = set(STOP_WORDS)

for review, label in zip(df['review'], df['sentiment']):
    # Any label other than 'positive' is counted as negative.
    if label == 'positive':
        total_positive_reviews += 1
        class_index = 1
    else:
        total_negative_reviews += 1
        class_index = 0

    kept_tokens = 0
    for token in tokenizer.tokenize(review):
        # NOTE(review): the stop-word check runs on the lemmatized token, so a
        # stop word whose lemma is not itself a stop word is kept (the
        # vocabulary contains 'ha', presumably from 'has'). The order is left
        # as is because calculate_review_probability filters the same way.
        token = lemmatizer.lemmatize(token.lower())
        if token in stop_word_set:
            continue
        word_counts[token][class_index] += 1
        kept_tokens += 1

    if class_index == 1:
        total_positive_words += kept_tokens
    else:
        total_negative_words += kept_tokens

In [201]:
# Materialise the (word, [negative count, positive count]) pairs so that a
# handful can be sliced off and eyeballed.
word_counts_list = list(word_counts.items())

print('Test word_counts:')
print(' ', word_counts_list[:5])

# Number of distinct tokens currently held in word_counts.
print('Vocabulary:', len(word_counts), 'words')

Test word_counts:
  [('one', [2702, 2796]), ('reviewer', [54, 44]), ('ha', [1556, 1712]), ('mentioned', [56, 67]), ('watching', [540, 409])]
Vocabulary: 34767 words


In [194]:
def calculate_word_probability(word, sentiment):
    """Return the add-one-smoothed log-likelihood of ``word`` for a class.

    The value is ``log((count + 1) / (class_total + vocabulary_size))`` where
    ``count`` is the word's count in the chosen class, ``class_total`` is the
    total number of counted words in that class, and ``vocabulary_size`` is
    ``len(word_counts)``.

    Args:
        word: Token to look up in the module-level ``word_counts``.
        sentiment: ``'positive'`` selects the positive-class statistics; any
            other value selects the negative-class statistics.

    Returns:
        float: Natural logarithm of the smoothed probability.
    """
    # Look the word up with .get() rather than indexing: word_counts is a
    # defaultdict, so word_counts[word] would insert every unseen word. That
    # silently grows the vocabulary while predicting and changes the smoothing
    # denominator during and between calls.
    negative_count, positive_count = word_counts.get(word, (0, 0))
    vocabulary_size = len(word_counts)

    if sentiment == 'positive':
        return math.log((positive_count + 1) / (total_positive_words + vocabulary_size))
    return math.log((negative_count + 1) / (total_negative_words + vocabulary_size))


def calculate_review_probability(review, sentiment):
    """Return the unnormalised log-probability of ``review`` under a class.

    The score starts from the log prior (the class's share of the reviews in
    ``df``) and adds the log-likelihood of every token that survives
    lower-casing, lemmatization and stop-word filtering.

    Args:
        review: Raw review text.
        sentiment: ``'positive'`` scores the positive class; any other value
            scores the negative class.

    Returns:
        float: Log prior plus the summed per-token log-likelihoods.
    """
    class_review_count = (
        total_positive_reviews if sentiment == 'positive' else total_negative_reviews
    )
    score = math.log(class_review_count / len(df))

    for raw_token in tokenizer.tokenize(review):
        lemma = lemmatizer.lemmatize(raw_token.lower())
        # The stop-word check is applied to the lemmatized, lower-cased token.
        if lemma in STOP_WORDS:
            continue
        score += calculate_word_probability(lemma, sentiment)

    return score


def predict(review):
    """Classify ``review`` as ``'positive'`` or ``'negative'``.

    Both classes are scored with ``calculate_review_probability`` (positive
    first, then negative). The positive label is returned only when its score
    is strictly greater, so a tie yields ``'negative'``.

    Args:
        review: Raw review text.

    Returns:
        str: ``'positive'`` or ``'negative'``.
    """
    positive_score = calculate_review_probability(review, 'positive')
    negative_score = calculate_review_probability(review, 'negative')
    return 'positive' if positive_score > negative_score else 'negative'

In [195]:
# Smoke-test the classifier on two short hand-written reviews.
for sample in ('This movie was awesome', 'Not so good. I found it boring'):
    print('The review is:', predict(sample))

The review is: positive
The review is: negative


In [196]:
# Score the classifier against the labelled reviews in df.
# NOTE(review): these are the same rows the word counts were built from, so
# the figure is training accuracy, not an estimate on held-out data.
sentiments = list(df['sentiment'])

# One boolean per review, in row order: True when the prediction matches.
outcomes = [predict(text) == expected for text, expected in zip(df['review'], sentiments)]
correct = sum(outcomes)
incorrect = len(outcomes) - correct

print('Accuracy of the classifier:', round(correct / (correct + incorrect) * 100, 2), '%')

Accuracy of the classifier: 95.04 %
