In [1]:
import nltk
from nltk.corpus import sentence_polarity
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
import random

# Download necessary NLTK resources
nltk.download('sentence_polarity')
nltk.download('averaged_perceptron_tagger')

# Load and shuffle the data
documents = [(sent, cat) for cat in sentence_polarity.categories()
             for sent in sentence_polarity.sents(categories=cat)]
random.shuffle(documents)

# Extract the top 1500 words as unigram features
all_words_list = [word for (sent, cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
word_items = all_words.most_common(1500)
word_features = [word for (word, count) in word_items]

# Define a unigram feature extraction function
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

# Create unigram feature sets
featuresets = [(document_features(d, word_features), c) for (d, c) in documents]


[nltk_data] Downloading package sentence_polarity to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package sentence_polarity is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Split data into training and test sets
train_set, test_set = featuresets[1000:], featuresets[:1000]

# Train the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Evaluate accuracy
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Baseline Unigram Accuracy:", accuracy)



Baseline Unigram Accuracy: 0.716


In [3]:
# Generate bigram features
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(all_words_list)
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)  # Top 500 bigrams

# Define a bigram feature extraction function
def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)
    return features

# Create bigram feature sets
bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in documents]

# Train and evaluate with bigram features
train_set, test_set = bigram_featuresets[1000:], bigram_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Baseline Bigram Accuracy:", accuracy)



Baseline Bigram Accuracy: 0.716


In [8]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')





[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [9]:
def POS_features(document, word_features):
    # Ensure the document is tokenized correctly
    document_words = set(document)
    tagged_words = nltk.pos_tag(document, tagset='universal')  # Use Universal Tagset
    
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    
    # Initialize counts for POS tags
    numNoun = numVerb = numAdj = numAdverb = 0
    for (word, tag) in tagged_words:
        if tag == 'NOUN': numNoun += 1
        if tag == 'VERB': numVerb += 1
        if tag == 'ADJ': numAdj += 1
        if tag == 'ADV': numAdverb += 1
    
    # Add counts to features
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features


In [12]:
# Define custom POS feature extraction
def custom_POS_features(document, word_features):
    document_words = set(document)
    features = {'contains({})'.format(word): (word in document_words) for word in word_features}

    # Custom suffix-based POS tagging
    suffix_tags = Counter()
    for word in document:
        if word.endswith('ing'):  # Assume gerund/verb
            suffix_tags['verbs'] += 1
        elif word.endswith('ly'):  # Assume adverb
            suffix_tags['adverbs'] += 1
        elif word.endswith('ed'):  # Assume past-tense verb
            suffix_tags['verbs'] += 1
        elif word.endswith('ous') or word.endswith('able') or word.endswith('ive'):  # Assume adjective
            suffix_tags['adjectives'] += 1
        elif word.isalpha() and len(word) > 5:  # Assume longer words as nouns
            suffix_tags['nouns'] += 1

    # Add POS counts to features
    features['nouns'] = suffix_tags['nouns']
    features['verbs'] = suffix_tags['verbs']
    features['adjectives'] = suffix_tags['adjectives']
    features['adverbs'] = suffix_tags['adverbs']
    return features

# Generate feature sets
unigram_featuresets = [(unigram_features(d, word_features), c) for (d, c) in documents]
bigram_featuresets = [(bigram_features_func(d, word_features, bigram_features), c) for (d, c) in documents]
POS_featuresets = [(custom_POS_features(d, word_features), c) for (d, c) in documents]

NameError: name 'unigram_features' is not defined

In [None]:
# Define cross-validation function
def cross_validation_accuracy(num_folds, featuresets):
    subset_size = int(len(featuresets) / num_folds)
    accuracy_list = []
    for i in range(num_folds):
        test_this_round = featuresets[i * subset_size:][:subset_size]
        train_this_round = featuresets[:i * subset_size] + featuresets[(i + 1) * subset_size:]
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print(f"Fold {i + 1} Accuracy:", accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    print("Mean Accuracy:", sum(accuracy_list) / num_folds)

# Cross-validation for unigrams
print("\nUnigram Cross-Validation (5-fold):")
cross_validation_accuracy(5, featuresets)

# Cross-validation for bigrams
print("\nBigram Cross-Validation (5-fold):")
cross_validation_accuracy(5, bigram_featuresets)

# Cross-validation for POS features
print("\nPOS Cross-Validation (5-fold):")
cross_validation_accuracy(5, POS_featuresets)


In [None]:
# Define evaluation function
def eval_measures(gold, predicted):
    labels = list(set(gold))
    print('\tPrecision\tRecall\t\tF1')
    for lab in labels:
        TP = FP = FN = 0
        for i in range(len(gold)):
            if gold[i] == lab and predicted[i] == lab: TP += 1
            if gold[i] != lab and predicted[i] == lab: FP += 1
            if gold[i] == lab and predicted[i] != lab: FN += 1
        precision = TP / (TP + FP) if TP + FP > 0 else 0
        recall = TP / (TP + FN) if TP + FN > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
        print(f"{lab}\t{precision:.3f}\t\t{recall:.3f}\t\t{f1:.3f}")

# Evaluate using test set
goldlist = [label for (_, label) in test_set]
predictedlist = [classifier.classify(features) for (features, _) in test_set]
eval_measures(goldlist, predictedlist)
