In [2]:
# AFINN Lexicon
# Instantiate the lexicon once at module level. The resulting `afinn` object is
# read as a global by `doc_features_with_afinn_sentiment_scores` below, which
# calls `afinn.score(word)` for every vocabulary word.
afinn = Afinn()

# Adapt the doc_features function to include features containing AFINN sentiment scores
def doc_features_with_afinn_sentiment_scores(document, word_features): 
    """Build a feature dict for one document, with AFINN scores per vocabulary word.

    For every word in `word_features` two entries are produced, in this order:

    * ``contains(<word>)``        -- True if the word occurs in `document`.
    * ``sentiment_score(<word>)`` -- ``afinn.score(<word>)`` from the module-level
      `afinn` lexicon object.

    NOTE: the sentiment score is computed from the vocabulary word alone, not
    from `document`, so for a given `word_features` list it has the same value
    in every document's feature dict.

    Args:
        document: iterable of tokens (one line of poetry).
        word_features: iterable of vocabulary words to build features for.

    Returns:
        dict mapping feature names to their values.
    """
    # Membership tests below only need the distinct tokens of the document
    tokens_present = set(document)

    features = {}
    for vocab_word in word_features:
        features.update({
            f"contains({vocab_word})": vocab_word in tokens_present,
            f"sentiment_score({vocab_word})": afinn.score(vocab_word),
        })
    return features

def afinn_calculate_metrics_for_different_vocab_size_features(
    lowest_num_words_limit, # lower end of range for how many words to use
    highest_num_words_limit, # higher end of range for how many words to use (exclusive, as in np.arange)
    all_words, # the freq dist of words ordered by most common to least common
    train_tuples, # training data tuples of form (sample, label)
    val_tuples, # validation data tuples of form (sample, label)
    step_size=50, # interval size between numbers of words to test
):
    """Sweep over vocabulary sizes and score a Naive Bayes classifier for each one.

    For every vocabulary size in
    ``np.arange(lowest_num_words_limit, highest_num_words_limit, step_size)``
    the first `vocab_size` entries of `all_words` are used as word features, an
    NLTK Naive Bayes classifier is trained on `train_tuples`, and its accuracy
    and macro-average F1 score on `val_tuples` are recorded.

    In contrast to `calculate_metrics_for_different_vocab_size_features`, this
    function uses `doc_features_with_afinn_sentiment_scores` instead of the
    `doc_features` function which does NOT contain sentiment scores for each
    word feature.

    The true validation labels are taken from `val_tuples` themselves, so the
    reported F1 score always refers to the validation set that was passed in.

    Args:
        lowest_num_words_limit: smallest vocabulary size to try.
        highest_num_words_limit: upper bound of the sweep (not included).
        all_words: iterable of vocabulary words; expected to iterate from most
            to least common.
        train_tuples: iterable of (document, label) pairs used for training.
        val_tuples: iterable of (document, label) pairs used for evaluation.
        step_size: gap between successive vocabulary sizes.

    Returns:
        Tuple ``(top_word_counts, accuracies, avg_f1_scores)``: the array of
        vocabulary sizes tried, and one accuracy / macro F1 value per size.
    """
    top_word_counts = np.arange(lowest_num_words_limit, highest_num_words_limit, step_size)
    accuracies = [] # store accuracies for each nr of top words used in here
    avg_f1_scores = [] # store macro f1 scores for each nr of top words used in here

    # Materialise the inputs once: they are re-used for every vocabulary size
    ranked_words = list(all_words)
    train_tuples = list(train_tuples)
    val_tuples = list(val_tuples)
    # Gold labels for the validation split, in the same order as the predictions
    validation_labels = [label for (_, label) in val_tuples]

    # iterate over the array of top-word counts to use (i.e. vocab subset to use in features)
    for vocab_size in top_word_counts:
        print(vocab_size)
        # store the list of top "vocab_size" words to use
        word_features = ranked_words[:vocab_size]
        # get the featuresets based on the top N word features for training and validation splits
        train_data_featuresets = [
            (doc_features_with_afinn_sentiment_scores(doc, word_features), label)
            for (doc, label) in train_tuples
        ]
        validation_featuresets = [
            doc_features_with_afinn_sentiment_scores(doc, word_features)
            for (doc, _) in val_tuples
        ]

        # train a NB classifier and predict every validation sample exactly once
        NBclassifier = nltk.NaiveBayesClassifier.train(train_data_featuresets)
        validation_predictions = [
            NBclassifier.classify(features_dict) for features_dict in validation_featuresets
        ]

        # accuracy = share of predictions matching the gold label (0 for an empty validation set)
        num_correct = sum(
            predicted == gold
            for predicted, gold in zip(validation_predictions, validation_labels)
        )
        accuracy = num_correct / len(validation_labels) if validation_labels else 0
        accuracies.append(accuracy)

        # Retrieve macro-average F1 score from classification report and store it in avg_f1_scores
        class_report = classification_report(
            validation_labels,
            validation_predictions,
            output_dict=True,  # Return report as a dictionary (easier to access metrics)
            # Set the score to 0 if "UndefinedMetricWarning" appears because either recall or precision for a class are 0.0
            zero_division=0
        )
        avg_f1_scores.append(class_report['macro avg']['f1-score'])
    return top_word_counts, accuracies, avg_f1_scores

## RUN THE EXPERIMENT AGAIN USING THE WORDNET NEGATION FUNCTION BUT WITH ADDED AFINN LEXICON SENTIMENT SCORES

# Build the vocabulary from the wordnet-negated *training* tokens
vocabulary_list = flatten_list_of_lists(wordnet_negated_train_tokens)
vocabulary_set = set(vocabulary_list)
# Frequency distribution over the vocabulary tokens (tokens are counted as-is; no case conversion happens here)
all_words = nltk.FreqDist(vocabulary_list)

# Sweep vocabulary sizes 400, 450, ... (upper bound 1301 is exclusive) and collect accuracy and
# macro F1 for each, to choose the best nr of word features for the negated sets
top_word_counts, accuracies, avg_f1_scores = afinn_calculate_metrics_for_different_vocab_size_features(
    lowest_num_words_limit=400,
    highest_num_words_limit=1301,
    all_words=all_words,
    train_tuples=wordnet_negated_train_data_tuples,
    val_tuples=wordnet_negated_validation_data_tuples,
)

# Plot the results
plot_word_feature_counts_against_scores(
    top_word_counts,
    accuracies,
    avg_f1_scores,
    "Accuracy and Macro Avg F1 Score vs. Vocabulary Size (WordNet Negation with AFINN Sentiment Scores)",
)

In [None]:
# SentiWordNet


########## CODE ADAPTED FROM: https://medium.com/@nidhikayadav/sentiment-analysis-with-python-sentiwordnet-fd07ffc557 ##############################
def get_sentiwordnet_scores_from_tokens(tokens):
    """Sum SentiWordNet positive and negative scores over a tokenized document.

    Tuple tokens (bigrams) are dropped, the remaining tokens are POS-tagged
    with `nltk.pos_tag`, and for every adjective, adverb or noun the first
    WordNet synset returned for that word/POS is looked up in SentiWordNet.
    Tokens with any other POS tag, or with no synset, contribute nothing.

    Args:
        tokens: iterable of tokens (a tokenized line of poetry); may contain
            tuples representing bigrams.

    Returns:
        Tuple ``(positive_sentiment, negative_sentiment)`` of summed float scores.
    """
    # First letter of the nltk POS tag -> name of the matching WordNet POS constant
    wordnet_pos_attr_by_prefix = {'J': 'ADJ', 'R': 'ADV', 'N': 'NOUN'}

    # Bigrams (tuples) cannot be POS-tagged as words, so keep unigrams only
    unigrams = [tok for tok in tokens if not isinstance(tok, tuple)]

    positive_sentiment = 0.0
    negative_sentiment = 0.0

    for word, tag in nltk.pos_tag(unigrams):
        attr_name = wordnet_pos_attr_by_prefix.get(tag[:1])
        if attr_name is None:
            # Not an adjective, adverb or noun: skip
            continue

        synsets = wordnet.synsets(word, pos=getattr(wordnet, attr_name))
        if not synsets:
            continue

        # Score only the first synset returned for this word/POS
        senti_synset = swn.senti_synset(synsets[0].name())
        positive_sentiment += senti_synset.pos_score()
        negative_sentiment += senti_synset.neg_score()

    return positive_sentiment, negative_sentiment

####################################################################################################################################################


# Adapt the doc_features function to include features containing SentiWordNet sentiment scores
def doc_features_with_swn_sentiment_scores(document, word_features): 
    """Build a feature dict for one document, with document-level SentiWordNet scores.

    The dict contains one ``contains(<word>)`` boolean per word in
    `word_features` (True if the word occurs in `document`), followed by two
    extra entries, ``positive_sentiment`` and ``negative_sentiment``, holding
    the summed scores that `get_sentiwordnet_scores_from_tokens` returns for
    the whole document.

    Args:
        document: list of tokens (one line of poetry).
        word_features: iterable of vocabulary words to build features for.

    Returns:
        dict mapping feature names to their values.
    """
    # Summed SentiWordNet scores for the whole document
    summed_positive, summed_negative = get_sentiwordnet_scores_from_tokens(document)

    # Membership tests only need the distinct tokens of the document
    tokens_present = set(document)

    # One presence flag per vocabulary word
    features = {
        f"contains({vocab_word})": vocab_word in tokens_present
        for vocab_word in word_features
    }

    # Document-level sentiment features
    features["positive_sentiment"] = summed_positive
    features["negative_sentiment"] = summed_negative
    return features

def swn_calculate_metrics_for_different_vocab_size_features(
    lowest_num_words_limit, # lower end of range for how many words to use
    highest_num_words_limit, # higher end of range for how many words to use (exclusive, as in np.arange)
    all_words, # the freq dist of words ordered by most common to least common
    train_tuples, # training data tuples of form (sample, label)
    val_tuples, # validation data tuples of form (sample, label)
    step_size=50, # interval size between numbers of words to test
):
    """Sweep over vocabulary sizes and score a Naive Bayes classifier for each one.

    For every vocabulary size in
    ``np.arange(lowest_num_words_limit, highest_num_words_limit, step_size)``
    the first `vocab_size` entries of `all_words` are used as word features, an
    NLTK Naive Bayes classifier is trained on `train_tuples`, and its accuracy
    and macro-average F1 score on `val_tuples` are recorded.

    In contrast to `afinn_calculate_metrics_for_different_vocab_size_features`,
    this function uses `doc_features_with_swn_sentiment_scores` instead of
    `doc_features_with_afinn_sentiment_scores`, i.e. SentiWordNet summed
    positive and negative scores instead of AFINN sentiment scores.

    The true validation labels are taken from `val_tuples` themselves, so the
    reported F1 score always refers to the validation set that was passed in.

    Args:
        lowest_num_words_limit: smallest vocabulary size to try.
        highest_num_words_limit: upper bound of the sweep (not included).
        all_words: iterable of vocabulary words; expected to iterate from most
            to least common.
        train_tuples: iterable of (document, label) pairs used for training.
        val_tuples: iterable of (document, label) pairs used for evaluation.
        step_size: gap between successive vocabulary sizes.

    Returns:
        Tuple ``(top_word_counts, accuracies, avg_f1_scores)``: the array of
        vocabulary sizes tried, and one accuracy / macro F1 value per size.
    """
    top_word_counts = np.arange(lowest_num_words_limit, highest_num_words_limit, step_size)
    accuracies = [] # store accuracies for each nr of top words used in here
    avg_f1_scores = [] # store macro f1 scores for each nr of top words used in here

    # Materialise the inputs once: they are re-used for every vocabulary size
    ranked_words = list(all_words)
    train_tuples = list(train_tuples)
    val_tuples = list(val_tuples)
    # Gold labels for the validation split, in the same order as the predictions
    validation_labels = [label for (_, label) in val_tuples]

    # iterate over the array of top-word counts to use (i.e. vocab subset to use in features)
    for vocab_size in top_word_counts:
        print(vocab_size)
        # store the list of top "vocab_size" words to use
        word_features = ranked_words[:vocab_size]
        # get the featuresets based on the top N word features for training and validation splits
        train_data_featuresets = [
            (doc_features_with_swn_sentiment_scores(doc, word_features), label)
            for (doc, label) in train_tuples
        ]
        validation_featuresets = [
            doc_features_with_swn_sentiment_scores(doc, word_features)
            for (doc, _) in val_tuples
        ]

        # train a NB classifier and predict every validation sample exactly once
        NBclassifier = nltk.NaiveBayesClassifier.train(train_data_featuresets)
        validation_predictions = [
            NBclassifier.classify(features_dict) for features_dict in validation_featuresets
        ]

        # accuracy = share of predictions matching the gold label (0 for an empty validation set)
        num_correct = sum(
            predicted == gold
            for predicted, gold in zip(validation_predictions, validation_labels)
        )
        accuracy = num_correct / len(validation_labels) if validation_labels else 0
        accuracies.append(accuracy)

        # Retrieve macro-average F1 score from classification report and store it in avg_f1_scores
        class_report = classification_report(
            validation_labels,
            validation_predictions,
            output_dict=True,  # Return report as a dictionary (easier to access metrics)
            # Set the score to 0 if "UndefinedMetricWarning" appears because either recall or precision for a class are 0.0
            zero_division=0
        )
        avg_f1_scores.append(class_report['macro avg']['f1-score'])
    return top_word_counts, accuracies, avg_f1_scores





## RUN THE EXPERIMENT AGAIN USING THE WORDNET NEGATION FUNCTION BUT WITH ADDED SENTIWORDNET LEXICON SENTIMENT SCORES

# Build the vocabulary from the wordnet-negated *training* tokens
vocabulary_list = flatten_list_of_lists(wordnet_negated_train_tokens)
vocabulary_set = set(vocabulary_list)
# Frequency distribution over the vocabulary tokens (tokens are counted as-is; no case conversion happens here)
all_words = nltk.FreqDist(vocabulary_list)

# Sweep vocabulary sizes 400, 450, ... (upper bound 1301 is exclusive) and collect accuracy and
# macro F1 for each, to choose the best nr of word features for the negated sets
top_word_counts, accuracies, avg_f1_scores = swn_calculate_metrics_for_different_vocab_size_features(
    lowest_num_words_limit=400,
    highest_num_words_limit=1301,
    all_words=all_words,
    train_tuples=wordnet_negated_train_data_tuples,
    val_tuples=wordnet_negated_validation_data_tuples,
)

# Plot the results
plot_word_feature_counts_against_scores(
    top_word_counts,
    accuracies,
    avg_f1_scores,
    "Accuracy and Macro Avg F1 Score vs. Vocabulary Size (WordNet Negation with SentiWordNet Sentiment Scores)",
)