# Week 12 - Sentiment Analysis



## Sentiment Analysis Using Sentiment Lexicons

### Preparing the NLTK Movie Review Dataset

https://www.nltk.org/book/ch02.html

In [1]:
import nltk
nltk.download('movie_reviews')

from nltk.corpus import movie_reviews

# Fetch the corpus file IDs once; the summary lines and the data lists below reuse them.
fileids = movie_reviews.fileids()

# Corpus-level summary: size, a sample of IDs, the label set and per-label counts.
print('#review count:', len(fileids))
print('#samples of file ids:', fileids[:10])
print('#categories of reviews:', movie_reviews.categories())
print('#num of "neg" reviews:', len(movie_reviews.fileids(categories='neg')))
print('#num of "pos" reviews:', len(movie_reviews.fileids(categories='pos')))

# Peek at the first document: its ID, the first 500 characters and its label.
fileid = fileids[0]
print('#id of the first review:', fileid)
print('#part of the first review:', movie_reviews.raw(fileid)[:500])
print('#sentiment of the first review:', movie_reviews.categories(fileid))

# Parallel lists used by the rest of the notebook: raw text and the first
# category reported for each review.
reviews = [movie_reviews.raw(fid) for fid in fileids]
categories = [movie_reviews.categories(fid)[0] for fid in fileids]

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


#review count: 2000
#samples of file ids: ['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']
#categories of reviews: ['neg', 'pos']
#num of "neg" reviews: 1000
#num of "pos" reviews: 1000
#id of the first review: neg/cv000_29416.txt
#part of the first review: plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
what's the deal ? 
watch the movie and " sorta " find out . . . 
critique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . 
which is what makes this review an even harder one to write , since i generally applaud films which attempt
#sentiment of the first review: ['neg']


### Sentiment Analysis Using TextBlob

https://textblob.readthedocs.io/en/dev/

https://textblob.readthedocs.io/en/dev/quickstart.html

In [2]:
!pip install -U textblob
!python -m textblob.download_corpora

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.3/626.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: textblob
  Attempting uninstall: textblob
    Found existing installation: textblob 0.17.1
    Uninstalling textblob-0.17.1:
      Successfully uninstalled textblob-0.17.1
Successfully installed textblob-0.18.0.post0
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 

In [3]:
from textblob import TextBlob

# Analyse the first review and show its sentiment (polarity and subjectivity).
result = TextBlob(reviews[0])
first_review_sentiment = result.sentiment
print(first_review_sentiment)

Sentiment(polarity=0.06479782948532947, subjectivity=0.5188408350908352)


In [4]:
def sentiment_TextBlob(docs):
    """Label each document 'pos' or 'neg' from its TextBlob polarity.

    A document is labelled 'pos' only when its polarity is strictly greater
    than zero; any other polarity (including exactly zero) yields 'neg'.

    Args:
        docs: Iterable of text documents.

    Returns:
        List of 'pos'/'neg' labels, one per document, in input order.
    """
    return [
        'pos' if TextBlob(text).sentiment.polarity > 0 else 'neg'
        for text in docs
    ]

In [5]:
from sklearn.metrics import accuracy_score

# Compare the labels predicted with TextBlob against the corpus labels.
textblob_predictions = sentiment_TextBlob(reviews)
print('#Accuracy of sentiment analysis using TextBlob:', accuracy_score(categories, textblob_predictions))

#Accuracy of sentiment analysis using TextBlob: 0.6


### Sentiment Analysis Using AFINN

https://github.com/fnielsen/afinn

http://corpustext.com/reference/sentiment_afinn.html

In [7]:
!pip install afinn



In [8]:
from afinn import Afinn

def sentiment_Afinn(docs):
    """Label each document 'pos' or 'neg' from its AFINN score.

    A single Afinn scorer (created with emoticons=True) is shared across all
    documents. A document is labelled 'pos' only when its score is strictly
    greater than zero; otherwise it is labelled 'neg'.

    Args:
        docs: Iterable of text documents.

    Returns:
        List of 'pos'/'neg' labels, one per document, in input order.
    """
    scorer = Afinn(emoticons=True)
    return ['pos' if scorer.score(text) > 0 else 'neg' for text in docs]

# Compare the labels predicted with AFINN against the corpus labels.
afinn_predictions = sentiment_Afinn(reviews)
print('#Accuracy of sentiment analysis using Affin:', accuracy_score(categories, afinn_predictions))

#Accuracy of sentiment analysis using Affin: 0.664


### Sentiment Analysis Using VADER

https://github.com/cjhutto/vaderSentiment

In [9]:
import nltk
# Fetch the VADER lexicon; presumably required by NLTK's
# SentimentIntensityAnalyzer used in the next cell — TODO confirm.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [10]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def sentiment_vader(docs):
    """Label each document 'pos' or 'neg' from its VADER compound score.

    A single SentimentIntensityAnalyzer is shared across all documents. A
    document is labelled 'pos' only when its 'compound' score is strictly
    greater than zero; otherwise it is labelled 'neg'.

    Args:
        docs: Iterable of text documents.

    Returns:
        List of 'pos'/'neg' labels, one per document, in input order.
    """
    sia = SentimentIntensityAnalyzer()
    compound_scores = (sia.polarity_scores(text)['compound'] for text in docs)
    return ['pos' if compound > 0 else 'neg' for compound in compound_scores]

# Compare the labels predicted with VADER against the corpus labels.
vader_predictions = sentiment_vader(reviews)
print('#Accuracy of sentiment analysis using VADER:', accuracy_score(categories, vader_predictions))

#Accuracy of sentiment analysis using VADER: 0.635


### Korean Sentiment Lexicon

1: https://github.com/park1200656/KnuSentiLex   
2: https://github.com/mrlee23/KoreanSentimentAnalyzer

## Machine Learning-Based Sentiment Analysis (Supervised Training)

### Machine Learning-Based Sentiment Analysis on NLTK Movie Reviews


In [11]:
from sklearn.model_selection import train_test_split  # Using the split function provided by sklearn

# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    categories,
    test_size=0.2,
    random_state=7,
)

n_train, n_test = len(X_train), len(X_test)
print('Train set count: ', n_train)
print('Test set count: ', n_test)

Train set count:  1600
Test set count:  400


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB  # Using the MultinomialNB provided by sklearn

# Fit the TF-IDF vectorizer on the training reviews only, then reuse the
# fitted vectorizer to transform the test reviews.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
print('#Train set dimension:', X_train_tfidf.shape)
X_test_tfidf = tfidf.transform(X_test)
print('#Test set dimension:', X_test_tfidf.shape)

# Train a multinomial Naive Bayes classifier and report accuracy on both splits.
NB_clf = MultinomialNB(alpha=0.01).fit(X_train_tfidf, y_train)
train_score = NB_clf.score(X_train_tfidf, y_train)
test_score = NB_clf.score(X_test_tfidf, y_test)
print('#Train set score: {:.3f}'.format(train_score))
print('#Test set score: {:.3f}'.format(test_score))

#Train set dimension: (1600, 36189)
#Test set dimension: (400, 36189)
#Train set score: 0.998
#Test set score: 0.797



| | Predicted Positive Reviews (PP) | Predicted Negative Reviews (PN) |
|---|---|---|
| Actual Positive Reviews (P) | True Positive (TP) | False Negative (FN) |
| Actual Negative Reviews (N) | False Positive (FP) | True Negative (TN) |



In [2]:
import string  # Add this import to resolve the error
import numpy as np
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# 'punkt_tab' tokenizer data; presumably needed by nltk.word_tokenize on
# newer NLTK releases — TODO confirm.
nltk.download('punkt_tab')

# Download necessary resources for nltk
nltk.download('stopwords')  # English stopword list read by text_preprocessor
nltk.download('punkt')  # tokenizer models; presumably used by nltk.word_tokenize — TODO confirm

# Load IMDB dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=1000)  # Keep only the top 1,000 most frequent words

# Word -> integer index mapping shipped with the Keras IMDB dataset.
word_index = imdb.get_word_index()

# Invert the mapping once. Building it inside decode_review would repeat the
# same full-vocabulary dictionary construction for every single review.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(encoded_review):
    """Turn a sequence of IMDB word indices back into a space-joined string.

    The ``- 3`` undoes the index offset applied by ``imdb.load_data``
    (``index_from=3`` by default). Indices that do not map to a word after
    the shift are rendered as '?'.

    Args:
        encoded_review: Iterable of integer word indices.

    Returns:
        The decoded review as a single string of space-separated tokens.
    """
    return ' '.join(reverse_word_index.get(i - 3, '?') for i in encoded_review)

# Rebuild readable text for every review in both splits.
X_train_text = list(map(decode_review, X_train))
X_test_text = list(map(decode_review, X_test))

# Built once up front: the tokenizer is called once per document, so
# rebuilding the stopword set or filtering punctuation character by character
# on every call is wasted work.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))

def text_preprocessor(text):
    """Lowercase, strip punctuation, tokenize and drop English stopwords.

    Args:
        text: The raw document string.

    Returns:
        List of tokens with every character in ``string.punctuation`` removed
        and NLTK English stopwords filtered out.
    """
    # Lowercase first, then delete every punctuation character in one pass.
    text = text.lower().translate(_PUNCTUATION_TABLE)
    return [token for token in nltk.word_tokenize(text) if token not in _STOP_WORDS]

# TF-IDF features built with the custom tokenizer. The vocabulary is capped at
# 2000 terms; min_df / max_df prune terms by document frequency.
tfidf = TfidfVectorizer(
    tokenizer=text_preprocessor,
    max_features=2000,
    min_df=5,
    max_df=0.5,
)

# Fit on the training text only, then apply the fitted vectorizer to the test text.
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# Train a Logistic Regression classifier on the TF-IDF features.
LR_clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predictions for both splits, used for the metrics reported below.
y_train_predict, y_test_predict = (
    LR_clf.predict(features) for features in (X_train_tfidf, X_test_tfidf)
)

def print_classification_metrics(split_name, y_true, y_pred):
    """Print accuracy, precision, recall and F1 for one data split.

    Each metric is printed on its own line in the form
    ``#<Metric> for <split_name> set: <value>`` with three decimal places.

    Args:
        split_name: Label of the split inserted into each line (e.g. 'train').
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    metrics = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    for metric_name, metric_fn in metrics:
        print('#{} for {} set: {:.3f}'.format(metric_name, split_name, metric_fn(y_true, y_pred)))


# Training set performance
print_classification_metrics('train', y_train, y_train_predict)

# Test set performance
print_classification_metrics('test', y_test, y_test_predict)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#Accuracy for train set: 0.899
#Precision for train set: 0.893
#Recall for train set: 0.907
#F1 for train set: 0.900
#Accuracy for test set: 0.877
#Precision for test set: 0.872
#Recall for test set: 0.884
#F1 for test set: 0.878
