# Trying Sentiment Analysis on a different dataset


In [30]:
# libraries imports
import nltk
import random
import pandas as pd
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


Data Preprocessing

In [3]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [4]:
documents[:5]

[(['plot',
   ':',
   'two',
   'teen',
   'couples',
   'go',
   'to',
   'a',
   'church',
   'party',
   ',',
   'drink',
   'and',
   'then',
   'drive',
   '.',
   'they',
   'get',
   'into',
   'an',
   'accident',
   '.',
   'one',
   'of',
   'the',
   'guys',
   'dies',
   ',',
   'but',
   'his',
   'girlfriend',
   'continues',
   'to',
   'see',
   'him',
   'in',
   'her',
   'life',
   ',',
   'and',
   'has',
   'nightmares',
   '.',
   'what',
   "'",
   's',
   'the',
   'deal',
   '?',
   'watch',
   'the',
   'movie',
   'and',
   '"',
   'sorta',
   '"',
   'find',
   'out',
   '.',
   '.',
   '.',
   'critique',
   ':',
   'a',
   'mind',
   '-',
   'fuck',
   'movie',
   'for',
   'the',
   'teen',
   'generation',
   'that',
   'touches',
   'on',
   'a',
   'very',
   'cool',
   'idea',
   ',',
   'but',
   'presents',
   'it',
   'in',
   'a',
   'very',
   'bad',
   'package',
   '.',
   'which',
   'is',
   'what',
   'makes',
   'this',
   'review',
   'an'

In [5]:
random.shuffle(documents)
df = pd.DataFrame(documents, columns=['review', 'sentiment'])


In [6]:
df.head()

Unnamed: 0,review,sentiment
0,"[i, was, anxious, to, see, this, for, a, long,...",pos
1,"[i, have, to, say, it, ., tim, burton, ', s, r...",pos
2,"[these, days, ,, we, are, witnessing, the, del...",neg
3,"[for, those, interested, in, the, true, spirit...",neg
4,"[i, must, say, from, the, outset, that, i, hav...",pos


Exploratory Data Analysis

In [7]:
print(df['sentiment'].value_counts())


pos    1000
neg    1000
Name: sentiment, dtype: int64


Text Preprocessing


In [9]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [11]:
def preprocess_text(text):
    # Remove HTML tags, punctuation, and special characters
    text = ' '.join(word_tokenize(' '.join(text).lower()))
    text = ''.join([c for c in text if c.isalnum() or c.isspace()])
    
    # Remove stop words
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text


df['processed_review'] = df['review'].apply(preprocess_text)

In [12]:
df.head()

Unnamed: 0,review,sentiment,processed_review
0,"[i, was, anxious, to, see, this, for, a, long,...",pos,anxious see long time friend mine recommended ...
1,"[i, have, to, say, it, ., tim, burton, ', s, r...",pos,say tim burton retelling planet apes fun barre...
2,"[these, days, ,, we, are, witnessing, the, del...",neg,days witnessing deluge films based old cult tv...
3,"[for, those, interested, in, the, true, spirit...",neg,interested true spirit moviemaking left mainst...
4,"[i, must, say, from, the, outset, that, i, hav...",pos,must say outset never much kurt russell fan se...


Feature Extraction

In [13]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_review'])
y = df['sentiment']


Model Training and Evaluation

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='weighted'))
print('Recall:', recall_score(y_test, y_pred, average='weighted'))
print('F1-Score:', f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.8425
Precision: 0.8463469776297
Recall: 0.8425
F1-Score: 0.8425187032418953


Good metrics, though this is just to have a taste of an end to end sentiment analysis using multinomial naive baye model to predict whether a sentiment is positive or negative.
Next, I try to use my own sentences to determine the performance of the model as shown below.

In [18]:
# Example test sentences
test_sentences = [
    "This movie is fantastic and highly recommended!",
    "I didn't enjoy the plot and the acting was subpar.",
    "The cinematography and soundtrack were amazing.",
    "The film was a complete disappointment.",
    "Worst movie ever.",
    "I cant believe I wasted my time on this shit."
]

# Preprocess the test sentences
preprocessed_sentences = [preprocess_text(sentence) for sentence in test_sentences]

# Vectorize the preprocessed test sentences
X_test = vectorizer.transform(preprocessed_sentences)

# Predict sentiment labels for the test sentences
y_pred = model.predict(X_test)

# Print the predicted sentiment labels
for sentence, sentiment in zip(test_sentences, y_pred):
    print(f"Sentence: {sentence}")
    print(f"Sentiment: {sentiment}")
    print()


Sentence: This movie is fantastic and highly recommended!
Sentiment: pos

Sentence: I didn't enjoy the plot and the acting was subpar.
Sentiment: pos

Sentence: The cinematography and soundtrack were amazing.
Sentiment: pos

Sentence: The film was a complete disappointment.
Sentiment: pos

Sentence: Worst movie ever.
Sentiment: pos

Sentence: I cant believe I wasted my time on this shit.
Sentiment: pos



Obviously, the model was a dud and it got away with predicting a positive sentiments for all the sentences

Now using nltk

In [19]:

# Download the VADER lexicon
#nltk.download('vader_lexicon')

# Create an instance of the SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Example sentences
sentences = [
    "This movie is fantastic and highly recommended!",
    "I didn't enjoy the plot and the acting was subpar.",
    "The cinematography and soundtrack were amazing.",
    "The film was a complete disappointment.",
    "Worst movie ever.",
    "I cant believe I wasted my time on this shit."
]

# Perform sentiment analysis on the sentences
for sentence in sentences:
    sentiment_scores = sid.polarity_scores(sentence)
    
    # Extract the compound score, which represents the overall sentiment
    compound_score = sentiment_scores['compound']
    
    if compound_score >= 0.05:
        sentiment = "Positive"
    elif compound_score <= -0.05:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    
    print(f"Sentence: {sentence}")
    print(f"Sentiment: {sentiment}")
    print()


Sentence: This movie is fantastic and highly recommended!
Sentiment: Positive

Sentence: I didn't enjoy the plot and the acting was subpar.
Sentiment: Negative

Sentence: The cinematography and soundtrack were amazing.
Sentiment: Positive

Sentence: The film was a complete disappointment.
Sentiment: Negative

Sentence: Worst movie ever.
Sentiment: Negative

Sentence: I cant believe I wasted my time on this shit.
Sentiment: Negative



Now this is more like it. It is beginning to make sense. Let's do same for movie review dataset

In [25]:
# Create an instance of the SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Load the IMDb Movie Reviews dataset
from nltk.corpus import movie_reviews
reviews = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        review = movie_reviews.raw(fileid)
        sentiment = 'positive' if category == 'pos' else 'negative'
        reviews.append((review, sentiment))

# Perform sentiment analysis on the movie reviews
sentiments = []
for review, sentiment in reviews:
    sentiment_scores = sid.polarity_scores(review)
    
    # Extract the compound score, which represents the overall sentiment
    compound_score = sentiment_scores['compound']
    
    if compound_score >= 0.05:
        predicted_sentiment = "positive"
    elif compound_score <= -0.05:
        predicted_sentiment = "negative"
    else:
        predicted_sentiment = "neutral"
    
    sentiments.append((review, sentiment, predicted_sentiment))

# Convert sentiments to a DataFrame for easier analysis
df = pd.DataFrame(sentiments, columns=['Review', 'Actual Sentiment', 'Predicted Sentiment'])

# Print a sample of the results
df.head()


Unnamed: 0,Review,Actual Sentiment,Predicted Sentiment
0,"plot : two teen couples go to a church party ,...",negative,positive
1,the happy bastard's quick movie review \ndamn ...,negative,positive
2,it is movies like these that make a jaded movi...,negative,positive
3,""" quest for camelot "" is warner bros . ' firs...",negative,negative
4,synopsis : a mentally unstable man undergoing ...,negative,positive


Seeing how well it did

In [29]:
# Filter out the neutral class from the data
filtered_df = df[df['Predicted Sentiment'] != 'neutral']

# Convert the actual and predicted sentiments to lists
actual_sentiments = filtered_df['Actual Sentiment'].tolist()
predicted_sentiments = filtered_df['Predicted Sentiment'].tolist()

# Generate the classification report
report = classification_report(actual_sentiments, predicted_sentiments)

# Print the classification report
print(report)


              precision    recall  f1-score   support

    negative       0.72      0.44      0.55      1000
    positive       0.60      0.83      0.69       999

    accuracy                           0.64      1999
   macro avg       0.66      0.64      0.62      1999
weighted avg       0.66      0.64      0.62      1999



Not so good. The metrics are just mid. 