# Sentiment analysis of movie reviews

### Runze Xiang

In [1]:
# Load the movie reviews from the NLTK movie_reviews corpus
# Collect the word list of each review, grouped by category (pos or neg)
import nltk
from nltk.corpus import movie_reviews 

# One word sequence per review file, kept in a separate list per category.
pos_reviews = [
    movie_reviews.words(fileid) for fileid in movie_reviews.fileids('pos')
]

neg_reviews = [
    movie_reviews.words(fileid) for fileid in movie_reviews.fileids('neg')
]

In [2]:
# Clean the word list and define the feature extractor
from nltk import ngrams
from nltk.corpus import stopwords 
import string 

# Stored as a set: clean_words() tests every token of every review for
# membership, and a set lookup is O(1) where a list is scanned linearly.
stopwords_english = set(stopwords.words('english'))

# clean words, i.e. remove stopwords and punctuation
def clean_words(words, stopwords_english):
    """Lower-case *words* and drop stopwords and punctuation-only tokens.

    Args:
        words: Iterable of string tokens.
        stopwords_english: Collection of stopwords; each token is compared
            against it after being lower-cased.

    Returns:
        A list of the lower-cased tokens that were kept, in input order.
    """
    # Build both lookup structures once per call so every membership test
    # below is O(1) instead of a scan of a list / string.
    stopword_set = frozenset(stopwords_english)
    punctuation_chars = frozenset(string.punctuation)

    words_clean = []
    for word in words:
        word = word.lower()
        if word in stopword_set:
            continue
        # A token is punctuation when *every* character is a punctuation
        # mark. `word in string.punctuation` was a substring test, so tokens
        # such as "..." or "--" slipped through. all() over an empty token is
        # True, so empty strings are still dropped exactly as before.
        if all(char in punctuation_chars for char in word):
            continue
        words_clean.append(word)
    return words_clean

# feature extractor function for unigram
def bag_of_words(words):
    """Return the unigram feature dict: each distinct word mapped to True."""
    return dict.fromkeys(words, True)

# feature extractor function for ngrams (bigram)
def bag_of_ngrams(words, n=2):
    """Return the n-gram feature dict for *words*.

    Every distinct n-gram produced by ``ngrams(words, n)`` becomes a key
    mapped to True; *n* defaults to 2 (bigrams).
    """
    return {gram: True for gram in ngrams(words, n)}

# Stopword list used for the bigram pass: the regular stopwords minus the
# words below, so that those words can still appear inside bigram features.
important_words = [
    'above', 'below', 'off', 'over', 'under', 'more', 'most', 'such', 'no',
    'nor', 'not', 'only', 'so', 'than', 'too', 'very', 'just', 'but',
]
stopwords_english_for_bigrams = set(stopwords_english).difference(important_words)

# let's define a new function that extracts all features
# i.e. that extracts both unigram and bigrams features
def bag_of_all_words(words, n=2):
    """Build the combined unigram + n-gram feature dict for one review.

    Unigrams are taken from the tokens cleaned with ``stopwords_english``;
    n-grams are taken from the tokens cleaned with
    ``stopwords_english_for_bigrams``, which keeps the words listed in
    ``important_words``.

    Args:
        words: Tokens of one review. The sequence is iterated twice, so it
            must be re-iterable (not a one-shot generator).
        n: Size of the n-grams added alongside the unigrams (default 2).

    Returns:
        A dict mapping every unigram and every n-gram to True.
    """
    words_clean = clean_words(words, stopwords_english)
    words_clean_for_bigrams = clean_words(words, stopwords_english_for_bigrams)

    unigram_features = bag_of_words(words_clean)
    # Forward n: it was previously accepted but ignored, so callers always got
    # bigrams regardless of the value they passed.
    ngram_features = bag_of_ngrams(words_clean_for_bigrams, n)

    all_features = unigram_features.copy()
    all_features.update(ngram_features)

    return all_features

In [3]:
# Create Feature Set

# positive reviews feature set
pos_reviews_set = [
    (bag_of_all_words(review_words), 'pos') for review_words in pos_reviews
]

# negative reviews feature set
neg_reviews_set = [
    (bag_of_all_words(review_words), 'neg') for review_words in neg_reviews
]

In [4]:
# Create train and test set
from random import shuffle

# Number of reviews of each class held out for evaluation. Named once so the
# test and train slices below cannot drift apart.
TEST_SIZE_PER_CLASS = 200

# Shuffle each class in place so the held-out reviews are a random sample.
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)

test_set = (pos_reviews_set[:TEST_SIZE_PER_CLASS]
            + neg_reviews_set[:TEST_SIZE_PER_CLASS])
train_set = (pos_reviews_set[TEST_SIZE_PER_CLASS:]
             + neg_reviews_set[TEST_SIZE_PER_CLASS:])

In [5]:
# Training classifier
from nltk import classify
from nltk import NaiveBayesClassifier

# Fit Naive Bayes on the training features, then score it on the held-out set.
classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)
print(accuracy)

0.7725


In [6]:
# Scrape the reviews of the movie "Spider-Man: Across the Spider-Verse" from Rotten Tomatoes
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
import time

from selenium.common.exceptions import TimeoutException, WebDriverException

# NOTE(review): driver_path is not used anywhere below -- the driver binary is
# located by ChromeDriverManager().install(). Kept in case another cell reads it.
driver_path = '/Users/xiangrunze/Downloads/chromedriver'

REVIEWS_URL = "https://www.rottentomatoes.com/m/spider_man_across_the_spider_verse/reviews?intcmp=rt-scorecard_tomatometer-reviews"
# Locator of the button that brings up the next batch of reviews.
LOAD_MORE_XPATH = "//*[@id='reviews']/div[3]/rt-button[2]"

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
reviewText = []

# try/finally guarantees the browser is closed even when scraping fails.
try:
    driver.get(REVIEWS_URL)
    wait = WebDriverWait(driver, 10)

    # Wait for the button before the first parse, as the original did. A
    # timeout here is not fatal: the page may simply have a single batch of
    # reviews, which the loop below still collects.
    try:
        wait.until(EC.element_to_be_clickable((By.XPATH, LOAD_MORE_XPATH)))
    except TimeoutException:
        pass

    # NOTE(review): the whole page is re-parsed on every pass. If the button
    # appends reviews instead of replacing them, earlier reviews would be
    # collected again -- confirm against the live page.
    while True:
        soup = bs(driver.page_source, 'html.parser')
        reviewTable = soup.find("div", {"class": "review_table"})
        if reviewTable is None:
            # Previously this surfaced as an AttributeError hidden by a bare
            # `except:`; stop explicitly and say why.
            print("Review table not found on the page; stopping.")
            break
        reviewText += [t.get_text() for t in reviewTable.select(".review-row .review-text")]

        try:
            load_more_button = wait.until(
                EC.element_to_be_clickable((By.XPATH, LOAD_MORE_XPATH))
            )
            load_more_button.click()
        except TimeoutException:
            # No clickable button within the timeout: last batch reached.
            break
        except WebDriverException as exc:
            # Any other browser-side failure: keep what was collected so far,
            # but report it instead of swallowing it silently.
            print(f"Stopped paging early: {exc}")
            break

        # Give the page time to render the next batch before re-parsing.
        time.sleep(5)
finally:
    # close the browser
    driver.quit()

In [7]:
# Get the sentiment analysis result
# Classify every scraped review and print it next to its predicted label.
for review in reviewText:
    tokens = nltk.word_tokenize(review.lower())
    prediction = classifier.classify(bag_of_all_words(tokens))
    print(f"Review: {review}\nPredicted Sentiment: {prediction}\n")

Review: It's super fun, exciting, and surprising. [Full review in Spanish]
Predicted Sentiment: pos

Review: …a hard-sell for a 60 year old product you already bought, used, grew out of and got bored of some time ago…
Predicted Sentiment: neg

Review: Spider-Man: Across the Spider-Verse is AMAZING and a REVOLUTIONARY achievement in animation. It’s a wild and darker journey that is FULL of JAW-DROPPING surprises. Spider-Man has never looked so good.
Predicted Sentiment: pos

Review: Everything audiences have seen thus far from the first frame of "Into" to the last of "Across" means something to the larger story, and it’s been in front of us the whole time.
Predicted Sentiment: pos

Review: A sequel that’s just as good as its predecessor.
Predicted Sentiment: pos

Review: For a 140 min film, you’ll feel like you’re in an art gallery in the sense that you know there’s not enough time in the day to see everything, so you go ahead and make an internal commitment to come back. 
Predicted Sen

Review: How many Spider-Men does it take to make a successful multiverse sequel? I’m not certain, but it might be the countless number of Spideys that appear in the delightful “Spider-Man: Across the Spider-Verse.”
Predicted Sentiment: pos

Review: Spider-Man: Across the Spider-Verse is an exceptional sequel that skyrockets its way into the conversation of the best comic book movie of all time.
Predicted Sentiment: pos

Review: Spider-Man: Across the Spider-Verse is the rare sequel that dazzles as much as the original did. It’s something to behold. Colors drip, invert and splatter in a shimmering pop-art swirl.
Predicted Sentiment: pos

Review: Sony has managed to compile tomes of information into visually articulate snapshots, loading them with dialog, emotion, action, and purpose. With humor, drama, and tragedy into one film, less ambitious studious may have broken out into 4 or 5.
Predicted Sentiment: pos

Review: Across the Spider-Verse keeps up the momentum, goes further with the 

In [8]:
# Counting the number of positive and negative reviews
# Predict a label for every scraped review, then tally the two classes.
predictions = [
    classifier.classify(bag_of_all_words(nltk.word_tokenize(review.lower())))
    for review in reviewText
]
pos_count = predictions.count('pos')
neg_count = predictions.count('neg')

print(f"Number of positive reviews: {pos_count}")
print(f"Number of negative reviews: {neg_count}")

Number of positive reviews: 329
Number of negative reviews: 49
