In [1]:
import nltk, re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet, treebank
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.util import ngrams
from collections import defaultdict
import math

In [2]:
import pandas as pd
data = pd.read_csv('IMDB Dataset.csv')
reviews = data['review']
sentiment = data['sentiment']
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# This script performs text preprocessing on movie reviews.
# It includes steps for lowercasing, cleaning text (removing HTML tags and special characters), 
# tokenization, and lemmatization. Additionally, it maintains a mapping between lemmatized words 
# and their original forms for future reference.

lemmatizer = WordNetLemmatizer()
word_mapping = {}

def preprocess_text_with_mapping(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'(<br\s*/?>)|[^\w\s-]', '', text)  # Remove HTML tags and non-alphanumeric characters
    tokens = word_tokenize(text)  # Tokenize the text into individual words
    
    lemmatized_tokens = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token)  # Lemmatize each token
        lemmatized_tokens.append(lemma)
        word_mapping[lemma] = token  # Map the lemmatized word to its original form

    return lemmatized_tokens

# Apply preprocessing to the 'review' column of the dataset
data['tokens'] = data['review'].apply(preprocess_text_with_mapping)

In [4]:
data['tokens']

0        [one, of, the, other, reviewer, ha, mentioned,...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, this, wa, a, wonderful, way, to, ...
3        [basically, there, a, family, where, a, little...
4        [petter, matteis, love, in, the, time, of, mon...
                               ...                        
49995    [i, thought, this, movie, did, a, down, right,...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, am, a, catholic, taught, in, parochial, el...
49998    [im, going, to, have, to, disagree, with, the,...
49999    [no, one, expects, the, star, trek, movie, to,...
Name: tokens, Length: 50000, dtype: object

In [5]:
unigram_model_freq = FreqDist([word for tokens in data['tokens'] for word in tokens])
bigram_model_freq = FreqDist([bigram for tokens in data['tokens'] for bigram in ngrams(tokens, 2)])
trigram_model_freq = FreqDist([trigram for tokens in data['tokens'] for trigram in ngrams(tokens, 3)])

In [6]:
unigram_model_freq

FreqDist({'the': 650834, 'a': 409471, 'and': 319819, 'of': 288073, 'to': 266317, 'is': 210100, 'it': 199366, 'in': 183211, 'i': 145581, 'this': 145536, ...})

In [7]:
bigram_model_freq

FreqDist({('of', 'the'): 76983, ('in', 'the'): 49633, ('this', 'movie'): 29796, ('is', 'a'): 27107, ('and', 'the'): 26279, ('the', 'film'): 25454, ('to', 'the'): 23619, ('to', 'be'): 23154, ('the', 'movie'): 22981, ('this', 'film'): 20637, ...})

In [8]:
trigram_model_freq

FreqDist({('one', 'of', 'the'): 9621, ('of', 'the', 'film'): 5078, ('this', 'movie', 'is'): 4852, ('a', 'lot', 'of'): 4650, ('this', 'is', 'a'): 4370, ('of', 'the', 'movie'): 4249, ('some', 'of', 'the'): 3676, ('is', 'one', 'of'): 3539, ('the', 'film', 'is'): 3337, ('this', 'film', 'is'): 3228, ...})

In [15]:
# This function predicts the next word in a sequence using unigram, bigram, and trigram models.
# It lemmatizes the previous words for consistency with the model and then uses the n-grams to predict the most probable next word.
# If no suitable n-gram is found, it falls back on the unigram model. The final predicted word is mapped back to its original form using a word mapping dictionary.

def predict_next_word_ngram(prev_words):
    # Lemmatize previous words for consistency with the n-gram models
    prev_words = [lemmatizer.lemmatize(word) for word in prev_words]
    
    # If no previous words are provided, predict the most frequent word from the unigram model
    if len(prev_words) == 0:
        predicted_word = unigram_model_freq.max()
        
    # If two previous words are provided, search for trigrams that match the first two words
    elif len(prev_words) == 2:
        ngrams_with_prev = {key: value for key, value in trigram_model_freq.items() if key[:2] == tuple(prev_words)}
        if ngrams_with_prev:
            predicted_word = max(ngrams_with_prev, key=ngrams_with_prev.get)[-1]  # Predict based on the most frequent trigram
        else:
            predicted_word = unigram_model_freq.max()  # Fall back to the unigram model if no trigram is found
            
    # If only one previous word is provided, search for bigrams that match the last word
    elif len(prev_words) == 1:
        ngrams_with_prev = {key: value for key, value in bigram_model_freq.items() if key[0] == prev_words[-1]}
        if ngrams_with_prev:
            predicted_word = max(ngrams_with_prev, key=ngrams_with_prev.get)[-1]  # Predict based on the most frequent bigram
        else:
            predicted_word = unigram_model_freq.max()  # Fall back to the unigram model if no bigram is found
    else:
        predicted_word = unigram_model_freq.max()  # General fallback to unigram model if something unexpected happens
    
    # Map the lemmatized predicted word back to its original form, if it exists in the word mapping
    return word_mapping.get(predicted_word, predicted_word)



predicted_word = predict_next_word_ngram([''])
print(predicted_word)

predicted_word = predict_next_word_ngram(['movies'])
print(predicted_word)

predicted_word = predict_next_word_ngram(['please', 'can'])
print(predicted_word)


the
is
anyone


In [17]:
stop_words = set(stopwords.words('english'))

def preprocess_text_for_classifier(text):
    text = text.lower()
    text = re.sub(r'(<br\s*/?>)|[^\w\s]', '', text)
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    meaningful_words = [word for word in lemmatized_tokens if word not in stop_words]
    return meaningful_words

data['tokens'] = data['review'].apply(preprocess_text_for_classifier)



In [18]:
# Calculating Prior for postive and negative

positive_reviews = data[data['sentiment'] == 'positive']
negative_reviews = data[data['sentiment'] == 'negative']

prior_positive = len(positive_reviews) / len(data)
prior_negative = len(negative_reviews) / len(data)

In [19]:
# Unigram, Bigram, and Trigram Models for Positive Reviews
positive_unigram_model = FreqDist([word for tokens in data[data['sentiment'] == 'positive']['tokens'] for word in tokens])
positive_bigram_model = FreqDist(ngrams([word for tokens in data[data['sentiment'] == 'positive']['tokens'] for word in tokens], 2))
positive_trigram_model = FreqDist(ngrams([word for tokens in data[data['sentiment'] == 'positive']['tokens'] for word in tokens], 3))

# Unigram, Bigram, and Trigram Models for Negative Reviews
negative_unigram_model = FreqDist([word for tokens in data[data['sentiment'] == 'negative']['tokens'] for word in tokens])
negative_bigram_model = FreqDist(ngrams([word for tokens in data[data['sentiment'] == 'negative']['tokens'] for word in tokens], 2))
negative_trigram_model = FreqDist(ngrams([word for tokens in data[data['sentiment'] == 'negative']['tokens'] for word in tokens], 3))

In [20]:
def classify_review(review, n):
    tokens = preprocess_text_for_classifier(review)
    positive_log_prob = math.log(prior_positive)
    negative_log_prob = math.log(prior_negative)

    if n == 1:
        # Unigram Likelihood for Positive and Negative Reviews
        for token in tokens:
            positive_log_prob += math.log((positive_unigram_model[token] + 1) / (sum(positive_unigram_model.values()) + len(positive_unigram_model)))
            negative_log_prob += math.log((negative_unigram_model[token] + 1) / (sum(negative_unigram_model.values()) + len(negative_unigram_model)))
    
    if n == 2:
        bigrams = list(ngrams(tokens, 2))
        for bigram in bigrams:
            positive_log_prob += math.log((positive_bigram_model[bigram] + 1) / (sum(positive_bigram_model.values()) + len(positive_bigram_model)))
            negative_log_prob += math.log((negative_bigram_model[bigram] + 1) / (sum(negative_bigram_model.values()) + len(negative_bigram_model)))


    if n == 3:
        trigrams = list(ngrams(tokens, 3))
        for trigram in trigrams:
            positive_log_prob += math.log((positive_trigram_model[trigram] + 1) / (sum(positive_trigram_model.values()) + len(positive_trigram_model)))
            negative_log_prob += math.log((negative_trigram_model[trigram] + 1) / (sum(negative_trigram_model.values()) + len(negative_trigram_model)))

    return 'positive' if positive_log_prob > negative_log_prob else 'negative'


In [21]:
# This script processes a list of movie reviews, classifies each review as either "positive" or "negative",
# and displays the review with a corresponding colored label using ANSI escape codes.
# It also stores the predicted labels and actual labels for further analysis or evaluation purposes.

# ANSI escape codes for colors
class Colors:
    GREEN = '\033[92m'  # Green
    RED = '\033[91m'    # Red
    ENDC = '\033[0m'    # Reset color
    
# Test dataset containing movie reviews and their actual sentiment labels
TestData = [
    {"review": "The movie was fantastic and well-directed.", "label": "positive"},
    {"review": "The movie was terrible and boring.", "label": "negative"},
    {"review": "I loved the acting but hated the script.", "label": "negative"},
    {"review": "This was a waste of time.", "label": "negative"},
    {"review": "I loved this movie.", "label": "positive"},
    {"review": "This is my favourite movie.", "label": "positive"},
    {"review": "The performances were outstanding, but the plot was confusing.", "label": "negative"},
    {"review": "Great visuals and an even better storyline. I highly recommend it!", "label": "positive"},
    {"review": "It was slow and tedious, definitely not my kind of movie.", "label": "negative"},
    {"review": "A beautifully crafted story with excellent acting.", "label": "positive"},
    {"review": "The dialogue felt forced and unnatural, making it hard to enjoy.", "label": "negative"},
    {"review": "Absolutely loved the character development. I could watch it again!", "label": "positive"},
    {"review": "The soundtrack was amazing, but everything else was mediocre at best.", "label": "negative"},
    {"review": "One of the best films I've seen in a while, thoroughly enjoyed it.", "label": "positive"},
    {"review": "Way too long and filled with unnecessary subplots.", "label": "negative"},
    {"review": "A heartwarming story that resonates deeply.", "label": "positive"},
    {"review": "Disappointing from start to finish, couldn't wait for it to end.", "label": "negative"},
    {"review": "The special effects were top-notch, truly breathtaking.", "label": "positive"},
    {"review": "Not worth the hype. I found it pretty dull and predictable.", "label": "negative"},
    {"review": "A thrilling ride with unexpected twists and turns!", "label": "positive"},
    {"review": "The acting felt wooden, and the direction was uninspired.", "label": "negative"},
    {"review": "It was an emotional rollercoaster that kept me engaged.", "label": "positive"},
    {"review": "This movie had no redeeming qualities, a total flop.", "label": "negative"},
    {"review": "An incredible performance by the lead actor, so powerful and moving.", "label": "positive"},
    {"review": "The pacing was all over the place, which ruined the experience for me.", "label": "negative"},
    {"review": "It had a strong message and delivered it with grace and style.", "label": "positive"},
    {"review": "I was bored throughout, nothing exciting ever happened.", "label": "negative"},
    {"review": "Visually stunning with a gripping narrative. Highly recommend!", "label": "positive"},
    {"review": "A cliché story with no originality, just a waste of time.", "label": "negative"},
    {"review": "I felt connected to the characters, and the storyline was heartfelt.", "label": "positive"},
    {"review": "Horrible editing and choppy transitions made it hard to follow.", "label": "negative"},
    {"review": "A must-watch for anyone who enjoys thought-provoking films.", "label": "positive"},
    {"review": "Predictable plot and weak acting, not impressive.", "label": "negative"},
    {"review": "An excellent blend of action, drama, and humor.", "label": "positive"},
    {"review": "The movie tried too hard to be funny, but it fell flat.", "label": "negative"},
    {"review": "An absolute masterpiece that left me speechless.", "label": "positive"},
    {"review": "Overhyped and disappointing, I expected so much more.", "label": "negative"},
    {"review": "A touching story with relatable characters and a powerful message.", "label": "positive"},
    {"review": "It dragged on forever with no real purpose.", "label": "negative"},
    {"review": "I couldn't stop smiling throughout the whole film.", "label": "positive"},
    {"review": "Completely uninspired and forgettable. I don't recommend it.", "label": "negative"},
    {"review": "The humor was spot on, and the dialogue felt natural.", "label": "positive"},
    {"review": "This is one of the worst movies I’ve ever seen. Don’t waste your time.", "label": "negative"},
    {"review": "The plot was intriguing and kept me guessing until the end.", "label": "positive"},
    {"review": "I didn’t understand the hype; it was pretty boring.", "label": "negative"},
    {"review": "The romance was beautifully portrayed and felt genuine.", "label": "positive"},
    {"review": "The action sequences were a mess and hard to follow.", "label": "negative"},
    {"review": "A visually captivating film with deep emotional layers.", "label": "positive"},
    {"review": "I didn't care for the characters or the story at all.", "label": "negative"},
    {"review": "A delightful movie with an uplifting ending.", "label": "positive"},
    {"review": "I struggled to stay awake through the entire thing.", "label": "negative"},
    {"review": "Such a heartwarming and inspiring film. Highly recommended!", "label": "positive"},
]


# Lists to store predicted and actual labels for evaluation
predicted_labels = []
actual_labels = []

# Iterating through the test data, classifying the reviews, and displaying them with colored labels
for item in TestData:
    actual_labels.append(item['label'])  # Store the actual label
    review = item['review']
    result = classify_review(review.lower(), 2)  # Classify the review using your classification function
    
    # Determine color based on the predicted label (positive/negative)
    if result == "positive":
        predicted_labels.append('positive')
        color = Colors.GREEN
    else:
        predicted_labels.append('negative')
        color = Colors.RED
    
    # Print the review with the predicted label in the corresponding color
    print(f"Review: {review} {color}{result.upper()}{Colors.ENDC}")


Review: The movie was fantastic and well-directed. [92mPOSITIVE[0m
Review: The movie was terrible and boring. [91mNEGATIVE[0m
Review: I loved the acting but hated the script. [91mNEGATIVE[0m
Review: This was a waste of time. [91mNEGATIVE[0m
Review: I loved this movie. [92mPOSITIVE[0m
Review: This is my favourite movie. [92mPOSITIVE[0m
Review: The performances were outstanding, but the plot was confusing. [91mNEGATIVE[0m
Review: Great visuals and an even better storyline. I highly recommend it! [92mPOSITIVE[0m
Review: It was slow and tedious, definitely not my kind of movie. [91mNEGATIVE[0m
Review: A beautifully crafted story with excellent acting. [92mPOSITIVE[0m
Review: The dialogue felt forced and unnatural, making it hard to enjoy. [91mNEGATIVE[0m
Review: Absolutely loved the character development. I could watch it again! [92mPOSITIVE[0m
Review: The soundtrack was amazing, but everything else was mediocre at best. [91mNEGATIVE[0m
Review: One of the best fil

In [22]:
from nltk import ConfusionMatrix

# Function to calculate confusion matrix values: True Positives (TP), True Negatives (TN),
# False Positives (FP), and False Negatives (FN)
def calculate_confusion_matrix(actual_labels, predicted_labels, positive_label='positive'):
    # Initialize counts for confusion matrix
    TP = TN = FP = FN = 0
    
    # Loop over actual and predicted labels simultaneously
    for actual, predicted in zip(actual_labels, predicted_labels):
        if actual == positive_label and predicted == positive_label:
            TP += 1  # True Positive: both actual and predicted labels are positive
        elif actual == positive_label and predicted != positive_label:
            FN += 1  # False Negative: actual is positive but predicted is not
        elif actual != positive_label and predicted == positive_label:
            FP += 1  # False Positive: actual is not positive but predicted is positive
        else:
            TN += 1  # True Negative: both actual and predicted labels are not positive
    
    return TP, TN, FP, FN  # Return the counts of TP, TN, FP, FN

# Function to calculate accuracy score
# Formula: Accuracy = (TP + TN) / (TP + TN + FP + FN)
def accuracy_score_nltk(TP, TN, FP, FN):
    return (TP + TN) / (TP + TN + FP + FN)

# Function to calculate precision
# Formula: Precision = TP / (TP + FP)
def precision_score_nltk(TP, FP):
    return TP / (TP + FP) if (TP + FP) > 0 else 0

# Function to calculate recall
# Formula: Recall = TP / (TP + FN)
def recall_score_nltk(TP, FN):
    return TP / (TP + FN) if (TP + FN) > 0 else 0

# Function to calculate F1 Score
# Formula: F1 = 2 * (Precision * Recall) / (Precision + Recall)
def f1_score_nltk(precision, recall):
    return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# Calculate confusion matrix counts (True Positives, True Negatives, False Positives, False Negatives)
TP, TN, FP, FN = calculate_confusion_matrix(actual_labels, predicted_labels)

accuracy = accuracy_score_nltk(TP, TN, FP, FN)
precision = precision_score_nltk(TP, FP)
recall = recall_score_nltk(TP, FN)
f1 = f1_score_nltk(precision, recall)

# Print the results of the evaluation metrics
print(f'Accuracy: {accuracy*100}%')  # Display accuracy as a percentage
print(f'Precision: {precision}')     # Display precision
print(f'Recall: {recall}')           # Display recall
print(f'F1 Score: {f1}')             # Display F1 score

Accuracy: 96.15384615384616%
Precision: 1.0
Recall: 0.9230769230769231
F1 Score: 0.9600000000000001


In [156]:
import random

def generate_sentence(model, start_words, n, sentence_length):
    # Determine the type of model by the number of words in each key (unigram, bigram, trigram)
    
    current_words = list(start_words)  # Ensure current_words is a list of words
    sentence = list(current_words)

    for _ in range(sentence_length - len(current_words)):
        if n == 1:  # Unigram model
            next_words = list(model.keys())
            next_word = random.choice(next_words)  # Choose a random word from the unigram model
            sentence.append(next_word)
            current_words = [next_word]
        
        elif n == 2:  # Bigram model
            possible_bigrams = [bigram for bigram in model if bigram[0] == current_words[-1]]
            if possible_bigrams:
                next_bigram = random.choice(possible_bigrams)
                next_word = next_bigram[1]
                sentence.append(next_word)
                current_words = [next_word]  # Update to the last word
            else:
                break
        
        elif n == 3:  # Trigram model
            possible_trigrams = [trigram for trigram in model if trigram[:2] == tuple(current_words[-2:])]
            if possible_trigrams:
                next_trigram = random.choice(possible_trigrams)
                next_word = next_trigram[2]
                sentence.append(next_word)
                current_words = [sentence[-2], sentence[-1]]  # Update to the last two words
            else:
                break

    return ' '.join(sentence)  # Join the sentence as words with spaces


In [158]:
    start_words = list(random.choice(list(bigram_model_freq.keys())))
start_words

['stale', 'ideology']

In [162]:
import random

for i in range(10):
    start_words = ['this']  # Only pick clean words
    generated_sentence = generate_sentence(unigram_model_freq, start_words, 1, sentence_length=10)
    result = classify_review(generated_sentence.lower(), 1)  # Assuming classify_review is defined elsewhere
        
    if result == "positive":
        color = Colors.GREEN
    else:
        color = Colors.RED
    
    print(f"Generated sentenc: {generated_sentence} {color}{result.upper()}{Colors.ENDC}")

Generated sentenc: this worthwhileand cleanup nickson-soul brummie streethawk talentswell stanley cornetto rightof [91mNEGATIVE[0m
Generated sentenc: this flinching brundruge excusefor analysing orenji budapest-vienna garbagefirst career-oriented female-dominated [91mNEGATIVE[0m
Generated sentenc: this tvron abi sonsmichael actorit smithdale biznow tacitly 60s70s actingstrangely [92mPOSITIVE[0m
Generated sentenc: this 5272001 didfinally yunfei crematory usefuleven amann iffr drawnthis predicamant [92mPOSITIVE[0m
Generated sentenc: this hamletcan confessionthere smootherstill 387 spoilerone orsini 1i yabba aggressiveness [92mPOSITIVE[0m
Generated sentenc: this aboooot transvestism miller perused kinnepolis cop-out boloneyavoid age-wise rationalist [91mNEGATIVE[0m
Generated sentenc: this matkondar wall-slamming vocalized subware belannas -lostflix marita purdey repositioning [92mPOSITIVE[0m
Generated sentenc: this thrillif inu tearjerking h3ll pisana dror funpowerful furone

In [163]:
import random

for i in range(10):
    start_words = list(random.choice(list(bigram_model_freq.keys())))
    generated_sentence = generate_sentence(bigram_model_freq, start_words, 2, sentence_length=10)
    result = classify_review(generated_sentence.lower(), 1)  # Classify the review using your classification function
        
    if result == "positive":
        color = Colors.GREEN
    else:
        color = Colors.RED
    
    print(f"Generated sentenc: {generated_sentence} {color}{result.upper()}{Colors.ENDC}")

Generated sentenc: this terrifyingly realistic set especially charityn and catharsis besides jiggling [91mNEGATIVE[0m
Generated sentenc: plastic pancake at heavy commercializing of raj babbar- hilarious until [91mNEGATIVE[0m
Generated sentenc: critique labelled a gorgeous barbara vanity who abuse domestic horse [92mPOSITIVE[0m
Generated sentenc: creative highlight part kader is phoniness incarnate a nitwit bollywood [91mNEGATIVE[0m
Generated sentenc: 2006 following chord the just-this-side-of broad variety either abby julia [92mPOSITIVE[0m
Generated sentenc: high price recently seen didnt actually focusing exclusively in venezuelan [92mPOSITIVE[0m
Generated sentenc: belaboring a dappled forest all identifiable in amer - rising [91mNEGATIVE[0m
Generated sentenc: conscience this vivid always hard whoever slew the necessity - [92mPOSITIVE[0m
Generated sentenc: ramtha isnt provided incredible location from 4th largest science he [91mNEGATIVE[0m
Generated sentenc: or repub

In [None]:
import random

for i in range(10):
    start_words = list(random.choice(list(trigram_model_freq.keys())))
    generated_sentence = generate_sentence(trigram_model_freq, start_words, 3, sentence_length=10)
    result = classify_review(generated_sentence.lower(), 1)  # Classify the review using your classification function
        
    if result == "positive":
        color = Colors.GREEN
    else:
        color = Colors.RED
    
    print(f"Generated sentenc: {generated_sentence} {color}{result.upper()}{Colors.ENDC}")

Generated sentenc: can hint there no saving moment in ice-ts acting ha [91mNEGATIVE[0m
Generated sentenc: street there are avenue to be uncovered here come an [92mPOSITIVE[0m
Generated sentenc: not used up political insight from it david duchovny showed [92mPOSITIVE[0m
Generated sentenc: program had escaped the local is familiar with set costume [92mPOSITIVE[0m
Generated sentenc: released in 1982 by impregnating the local los angeles seasoned [92mPOSITIVE[0m
Generated sentenc: privileged adolescence that would become regent if her mom question [92mPOSITIVE[0m
Generated sentenc: again term used very long ride he ride off alone [92mPOSITIVE[0m
Generated sentenc: horror movie scare you should also check her worst movie [91mNEGATIVE[0m
