In [6]:
import nltk
#nltk.download('vader_lexicon')
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
#nltk.download('wordnet') 
#nltk.download('punkt')
from stop_words import get_stop_words
import pandas as pd
import os
from statistics import mean
from random import shuffle

df_test = pd.read_csv('./data/imdb/Test.csv').head(1_000)
df_train = pd.read_csv('./data/imdb/Train.csv').head(1_000)
df_valid = pd.read_csv('./data/imdb/Valid.csv').head(1_000)

stopwords = []
stopwords = get_stop_words('en')

top_100_positive = {}
top_100_negative = {}

def preprocess_text(text):
    tokens = [w for w in nltk.word_tokenize(text) if w.lower() not in stopwords]

    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

# 3 features:
# - count of words in text matching top_100_positive
# - The average compound score (NLTK VADER model)
# - The average positive score (NLTK VADER model)
def extract_features(text): # function based on https://realpython.com/python-nltk-sentiment-analysis/#training-and-using-a-classifier
    features = dict()
    wordcount = 0
    compound_scores = list()
    positive_scores = list()

    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word.lower() in top_100_positive:
                wordcount += 1
        compound_scores.append(sia.polarity_scores(sentence)["compound"])
        positive_scores.append(sia.polarity_scores(sentence)["pos"])

    # Adding 1 to the final compound score to always have positive numbers
    # since some classifiers you'll use later don't work with negative numbers.
    features["mean_compound"] = mean(compound_scores) + 1
    features["mean_positive"] = mean(positive_scores)
    features["wordcount"] = wordcount

    return features

def preprocess_set_to_features(df, is_train_set):
    df['text'] = df['text'].apply(preprocess_text)

    positive_texts = df.loc[df['label'] == 1, 'text']
    negative_texts = df.loc[df['label'] == 0, 'text']
    
    positive_words = [word.lower() for text in positive_texts for word in nltk.word_tokenize(text)]
    negative_words = [word.lower() for text in negative_texts for word in nltk.word_tokenize(text)]
    if (is_train_set):
        positive_fd = nltk.FreqDist(positive_words)
        negative_fd = nltk.FreqDist(negative_words)
        
        common_set = set(positive_fd).intersection(negative_fd)
        
        for word in common_set:
            del positive_fd[word]
            del negative_fd[word]
        
        top_100_positive = {word for word, count in positive_fd.most_common(100)}
        top_100_negative = {word for word, count in negative_fd.most_common(100)}

    # for each review, add a tuple to the 'features' list
    # each tuple should have a feature dict from the 'extract_features' function
    # and the label for ground truth sentiment
    features = [
        (extract_features(review), "pos") for review in positive_texts
    ]
    features.extend([
        (extract_features(review), "neg") for review in negative_texts
    ])
    return features

train_features = preprocess_set_to_features(df_train, 1)
#valid_features = preprocess_set_to_features(df_valid, 0)
test_features = preprocess_set_to_features(df_test, 0)



In [7]:
from nltk.metrics.scores import (accuracy, precision, recall, f_measure)
import collections

shuffle(train_features)
classifier = nltk.NaiveBayesClassifier.train(train_features)
classifier.show_most_informative_features(10)

shuffle(test_features)
accuracy = nltk.classify.accuracy(classifier, test_features)

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_features):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)
    
print('pos accuracy:', round(accuracy, 2))
print('pos precision:', round(precision(refsets['pos'], testsets['pos']), 2))
print('pos recall:', round(recall(refsets['pos'], testsets['pos']), 2))
print('pos F-measure:', round(f_measure(refsets['pos'], testsets['pos']), 2))
print('neg precision:', round(precision(refsets['neg'], testsets['neg']), 2))
print('neg recall:', round(recall(refsets['neg'], testsets['neg']), 2))
print('neg F-measure:', round(f_measure(refsets['neg'], testsets['neg']), 2))

from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}

for name, sklearn_classifier in classifiers.items():
    classifier = nltk.classify.SklearnClassifier(sklearn_classifier)
    classifier.train(train_features)
    accuracy = nltk.classify.accuracy(classifier, test_features)
    print(F"{accuracy:.2%} - {name}")

Most Informative Features
           mean_positive = 0.134             pos : neg    =      1.7 : 1.0
           mean_positive = 0.132             neg : pos    =      1.6 : 1.0
           mean_positive = 0.087             pos : neg    =      1.0 : 1.0
           mean_positive = 0.108             pos : neg    =      1.0 : 1.0
           mean_positive = 0.14500000000000002    pos : neg    =      1.0 : 1.0
           mean_positive = 0.158             pos : neg    =      1.0 : 1.0
           mean_positive = 0.1637142857142857    pos : neg    =      1.0 : 1.0
           mean_positive = 0.167             pos : neg    =      1.0 : 1.0
           mean_positive = 0.1812            pos : neg    =      1.0 : 1.0
           mean_positive = 0.207             pos : neg    =      1.0 : 1.0
pos accuracy: 0.47
pos precision: 0.47
pos recall: 0.96
pos F-measure: 0.63
neg precision: 0.5
neg recall: 0.03
neg F-measure: 0.06
53.00% - BernoulliNB
60.60% - ComplementNB
53.00% - MultinomialNB
69.50% - KNeighbo

In [33]:
import fasttext
import csv
# fasttext processing modified based on https://towardsdatascience.com/fasttext-sentiment-analysis-for-tweets-a-straightforward-guide-9a8c070449a2
# format data to fasttext usable data
def transform_instance(row):
    cur_row = []
    label = "__label__" + ('POSITIVE' if row['label'] else 'NEGATIVE')
    cur_row.append(label)
    cur_row.extend( nltk.word_tokenize(row["text"]))
    return cur_row
def preprocess(input_df, output_file, keep=1):
    with open(output_file, 'w', encoding='utf-8') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        for _, row in input_df.iterrows():
            row_output = transform_instance(row)
            csv_writer.writerow(row_output)

preprocess(df_train, './data/fasttext/reviews.train')
preprocess(df_valid, './data/fasttext/reviews.valid')

# fasttext model training
hyper_params = {"lr": 0.01,
    "epoch": 20,
    "wordNgrams": 2,
    "dim": 20}
        
# Train the model.
model = fasttext.train_supervised(input='./data/fasttext/reviews.train', **hyper_params)
print("Model trained with the hyperparameter \n {}".format(hyper_params))

# CHECK PERFORMANCE
result = model.test('./data/fasttext/reviews.train')
validation = model.test('./data/fasttext/reviews.valid')
        
# DISPLAY ACCURACY OF TRAINED MODEL
print("precision on training data: "+str(result[1]))
print("recall on training data: "+str(result[2]))
print("precision on validation data: "+str(validation[1]))
print("recall on validation data: "+str(validation[2]))


Model trained with the hyperparameter 
 {'lr': 0.01, 'epoch': 20, 'wordNgrams': 2, 'dim': 20}
precision on training data: 0.528
recall on training data: 0.528
precision on validation data: 0.501
recall on validation data: 0.501
