In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.probability import FreqDist

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from IPython.display import clear_output

import pandas as pd
import pickle
import string
import random
import spacy

In [3]:
def enter_to_continue():
    """Pause the interactive flow until the user presses Enter."""
    prompt = "Press Enter to continue..."
    # The typed text is irrelevant; the call is used only to block.
    input(prompt)

In [4]:
# Shared state for the interactive menu.
# `classifier` holds the sentiment model once main() has loaded or trained it.
classifier = None
# Last accepted review text and its predicted label; "" means "not set yet".
review = category = ""

def load_dataset(path='movie-review-cleaned.csv'):
    """Read the movie-review CSV into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to the file the notebook
            has always used, so existing zero-argument calls are unaffected.

    Returns:
        pandas.DataFrame with whatever columns the CSV contains. Other
        functions in this notebook read the 'review', 'title' and
        'sentimentScore' columns from it.
    """
    return pd.read_csv(path)

def get_wordnet_pos(tag):
    """Translate a POS tag into the one-letter code the lemmatizer expects.

    Only the first character of ``tag`` matters: 'J' -> 'a', 'V' -> 'v',
    'N' -> 'n', 'R' -> 'r'. Anything else (including an empty tag) falls
    back to 'n'.
    """
    first_letter_to_pos = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    return first_letter_to_pos.get(tag[:1], 'n')

def preprocess_words(words):
    """Normalise a list of tokens for the sentiment classifier.

    Steps, in order:
      1. drop English stop words (case-insensitive) and every token that is
         not purely alphabetic (this also removes punctuation and numbers);
      2. POS-tag the remaining tokens;
      3. lemmatize each token using its POS tag;
      4. stem the lemma with the Porter stemmer.

    Args:
        words: list of string tokens, e.g. the output of word_tokenize().

    Returns:
        A new list of processed tokens, in the original order.
    """
    # Fetch the stop-word list once. The previous version evaluated
    # stopwords.words('english') again for every single token and then did a
    # linear search through the resulting list.
    stop_words = set(stopwords.words('english'))

    # isalpha() is False for punctuation tokens and for the empty string, so
    # a separate string.punctuation check would never remove anything extra.
    kept = [
        word for word in words
        if word.isalpha() and word.lower() not in stop_words
    ]

    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    return [
        stemmer.stem(lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)))
        for word, tag in pos_tag(kept)
    ]

def train_model(sample_size=3000, train_ratio=0.8, model_path='model.pickle', seed=None):
    """Train a Naive Bayes sentiment classifier and save it to disk.

    A random sample of the dataset is turned into boolean bag-of-words
    features ("is vocabulary word X present in this review?"), split into
    train and test parts, used to fit an NLTK NaiveBayesClassifier, and the
    resulting model is pickled.

    Args:
        sample_size: Maximum number of reviews to sample. Capped at the
            number of rows available so small datasets do not raise.
        train_ratio: Fraction of the sampled reviews used for training; the
            remainder is used to report accuracy.
        model_path: File the trained classifier is pickled to.
        seed: Optional seed for both the row sampling and the shuffle. With
            the default ``None`` the run is unseeded, as before.

    Returns:
        The trained classifier.

    Side effects:
        Prints the test accuracy and writes ``model_path``.
    """
    df = load_dataset()
    # DataFrame.sample raises ValueError when asked for more rows than exist.
    df = df.sample(n=min(sample_size, len(df)), random_state=seed)

    reviews = [str(text) for text in df['review'].to_list()]
    sentiments = [str(label) for label in df['sentimentScore'].to_list()]

    all_tokens = []
    for text in reviews:
        all_tokens.extend(word_tokenize(text))

    # De-duplicate while keeping first-seen order. The feature dict below is
    # keyed by word, so repeated tokens only cost time (previously every
    # review looped over every token occurrence in the whole sample).
    vocabulary = list(dict.fromkeys(preprocess_words(all_tokens)))

    feature_sets = []
    for text, label in zip(reviews, sentiments):
        # A set gives constant-time membership tests; the booleans are the
        # same as with the original list lookup.
        present = set(preprocess_words(word_tokenize(text)))
        features = {word: word in present for word in vocabulary}
        feature_sets.append((features, label))

    # Use the module-level RNG unless a seed was requested.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(feature_sets)

    train_count = int(len(feature_sets) * train_ratio)
    train_dataset = feature_sets[:train_count]
    test_dataset = feature_sets[train_count:]

    classifier = NaiveBayesClassifier.train(train_dataset)
    print(f"Accuracy: {accuracy(classifier, test_dataset) * 100 : ,.2f}%")

    # The context manager closes the file even if pickling fails.
    with open(model_path, 'wb') as file:
        pickle.dump(classifier, file)

    return classifier


In [6]:
def print_menu():
    """Show the main menu and return the user's raw choice.

    Reads the module-level ``review`` and ``category`` (never modifies them)
    and substitutes placeholder text while they are still empty strings.

    Returns:
        The string typed at the ">> " prompt, unvalidated.
    """
    shown_review = review if review != "" else "No review"
    shown_category = category if category != "" else "None"

    menu_lines = (
        "Movie recommendation based on reviews",
        f"Your review: {shown_review}",
        f"Category: {shown_category}",
        "1. Enter review",
        "2. View movie recommendation",
        "3. View NER",
        "4. Exit",
    )
    for line in menu_lines:
        print(line)

    return input(">> ")

def write_review():
    """Prompt for a review, classify its sentiment and remember both.

    The review must contain at least 20 whitespace-separated words;
    otherwise a message is shown and nothing is stored. On success the
    module-level ``review`` and ``category`` are updated and the predicted
    label is printed.

    Relies on the module-level ``classifier`` having been set (main() does
    this before showing the menu).
    """
    clear_output()

    global review
    global category
    global classifier
    print("Enter your review: ")
    reviewInput = input(">> ")

    # split() with no argument collapses runs of whitespace. split(' ')
    # yields an empty string for every extra space, so a short review padded
    # with spaces used to pass the length check.
    if len(reviewInput.split()) < 20:
        print("Review must be at least 20 words")
        enter_to_continue()
        return

    words = preprocess_words(word_tokenize(reviewInput))

    # train_model() fits the classifier on boolean "word is present"
    # features. Word counts (FreqDist) would present values such as 2 or 3
    # that never occurred during training, so use presence flags here too.
    feature = {word: True for word in words}
    predicted = classifier.classify(feature)

    # Store the review only once classification has succeeded, so the saved
    # review and category always belong together.
    review = reviewInput
    category = predicted

    print(f"Review classified as: {category}")
    enter_to_continue()

def view_movie_recommendation():
    """Print up to three movie titles whose reviews best match the user's review.

    Every review in the dataset is vectorised with TF-IDF and the stored
    module-level ``review`` is compared against them by cosine similarity.
    Titles are listed from most to least similar, each title at most once.

    If no review has been entered yet, a hint is printed instead.
    """
    clear_output()

    global review

    # An empty query has zero similarity with every document, so the
    # "top three" would be arbitrary rows rather than recommendations.
    if not review.strip():
        print("Please enter a review first")
        enter_to_continue()
        return

    df = load_dataset()

    reviews = [str(text) for text in df['review'].to_list()]
    titles = [str(title) for title in df['title'].to_list()]

    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(reviews)

    query_matrix = vectorizer.transform([review])
    cosine_similarities = cosine_similarity(query_matrix, matrix).flatten()

    # Walk the reviews from most to least similar and keep the first three
    # distinct titles; several top-ranked reviews may be about the same movie.
    recommended = []
    for idx in cosine_similarities.argsort()[::-1]:
        title = titles[idx]
        if title in recommended:
            continue
        recommended.append(title)
        if len(recommended) == 3:
            break

    for rank, title in enumerate(recommended, start=1):
        print(f"{rank}. {title}")
    enter_to_continue()

def view_ner():
    """Print the LANGUAGE and GPE entities found in a random sample of reviews.

    Up to 3000 reviews are sampled from the dataset and run through the
    spaCy 'en_core_web_sm' pipeline. Entity texts are grouped by label and
    printed one label per line; "No entities found" is printed when neither
    label occurs.
    """
    clear_output()

    df = load_dataset()
    # DataFrame.sample raises ValueError when asked for more rows than exist.
    df = df.sample(n=min(3000, len(df)))

    # Feed spaCy the review texts themselves. Series.to_string() renders a
    # printable table (index labels plus padding) as one huge string, which
    # pollutes the text and can exceed spaCy's per-document length limit.
    texts = [str(text) for text in df['review'].to_list()]

    spacy_nlp = spacy.load('en_core_web_sm')

    wanted_labels = ('LANGUAGE', 'GPE')
    categories = {}

    # nlp.pipe processes the reviews as a stream of separate documents.
    for doc in spacy_nlp.pipe(texts):
        for ent in doc.ents:
            label = ent.label_
            if label not in wanted_labels:
                continue
            categories.setdefault(label, []).append(ent.text)

    for label, entities in categories.items():
        print(f"{label}: {', '.join(entities)}")

    if len(categories) == 0:
        print("No entities found")

    enter_to_continue()

def main():
    """Load (or train) the sentiment model, then run the interactive menu.

    The classifier is read from 'model.pickle' when that file exists and can
    be unpickled; otherwise train_model() is called to build a new one. The
    menu then loops until the user chooses option 4. Unrecognised choices
    simply redraw the menu.
    """
    global classifier
    try:
        # NOTE(security): pickle.load can execute arbitrary code. Only use a
        # model.pickle that was produced locally by train_model().
        # The context manager closes the file even if unpickling fails.
        with open('model.pickle', 'rb') as file:
            classifier = pickle.load(file)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # Missing, truncated or corrupt model file: rebuild it.
        classifier = train_model()

    while True:
        clear_output()
        choice = print_menu()
        if choice == '1':
            write_review()
        elif choice == '2':
            view_movie_recommendation()
        elif choice == '3':
            view_ner()
        elif choice == '4':
            break

    print("Goodbye!")
    
# Start the interactive menu; this call blocks until the user picks option 4.
main()

Movie recommendation based on reviews
Your review: nice one
Category: POSITIVE
1. Enter review
2. View movie recommendation
3. View NER
4. Exit
Goodbye!
