In [4]:
import nltk
from nltk.corpus import sentence_polarity
import random

# Download necessary NLTK data
nltk.download('sentence_polarity')
nltk.download('stopwords')

# Load the movie review sentences from the NLTK corpus
documents = [(sent, category) for category in sentence_polarity.categories() 
             for sent in sentence_polarity.sents(categories=category)]
random.shuffle(documents)  # Shuffle the documents

# Check the total number of sentences and categories
print(f"Total number of sentences: {len(documents)}")
print(f"Categories: {sentence_polarity.categories()}")


Total number of sentences: 10662
Categories: ['neg', 'pos']


[nltk_data] Downloading package sentence_polarity to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package sentence_polarity is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Extract all words and get the 2000 most common words for BOW features
all_words_list = [word.lower() for (sent, _) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
word_features = [word for (word, _) in all_words.most_common(2000)]

# Define a function for BOW features
def document_features(document, word_features):
    document_words = set(document)
    features = {f'V_{word}': (word in document_words) for word in word_features}
    return features

# Create feature sets using BOW
featuresets = [(document_features(d, word_features), c) for (d, c) in documents]

# Split into training and testing sets (90/10 split)
train_set, test_set = featuresets[1000:], featuresets[:1000]

# Train a Naive Bayes classifier and evaluate it
classifier = nltk.NaiveBayesClassifier.train(train_set)
baseline_accuracy = nltk.classify.accuracy(classifier, test_set)
print(f'Baseline Accuracy with BOW features: {baseline_accuracy * 100:.2f}%')
classifier.show_most_informative_features(10)


Baseline Accuracy with BOW features: 76.20%
Most Informative Features
            V_engrossing = True              pos : neg    =     19.7 : 1.0
                 V_flaws = True              pos : neg    =     15.7 : 1.0
               V_generic = True              neg : pos    =     15.6 : 1.0
              V_mediocre = True              neg : pos    =     15.6 : 1.0
               V_routine = True              neg : pos    =     14.3 : 1.0
                V_flawed = True              pos : neg    =     13.7 : 1.0
            V_refreshing = True              pos : neg    =     13.7 : 1.0
                  V_flat = True              neg : pos    =     13.4 : 1.0
                V_boring = True              neg : pos    =     13.3 : 1.0
             V_wonderful = True              pos : neg    =     12.6 : 1.0


In [6]:
# Import the readSubjectivity function from Subjectivity.py (ensure the file is in the same directory)
from subjectivity import readSubjectivity  # Adjust the path if necessary

# Load the Subjectivity Lexicon
SLpath = "subjclueslen1-HLTEMNLP05.tff"  # Path to the subjectivity lexicon file
SL = readSubjectivity(SLpath)
print(f"Loaded {len(SL)} words from the subjectivity lexicon.")

# Define features using the Subjectivity Lexicon
def SL_features(document, word_features, SL):
    document_words = set(document)
    features = {f'V_{word}': (word in document_words) for word in word_features}
    weakPos, strongPos, weakNeg, strongNeg = 0, 0, 0, 0
    for word in document_words:
        if word in SL:
            strength, _, _, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            elif strength == 'strongsubj' and polarity == 'positive':
                strongPos += 1
            elif strength == 'weaksubj' and polarity == 'negative':
                weakNeg += 1
            elif strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
    features['positivecount'] = weakPos + (2 * strongPos)
    features['negativecount'] = weakNeg + (2 * strongNeg)
    return features

# Create feature sets using the Subjectivity Lexicon features
SL_featuresets = [(SL_features(d, word_features, SL), c) for (d, c) in documents]
train_set, test_set = SL_featuresets[1000:], SL_featuresets[:1000]

# Train and evaluate the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)
improved_accuracy = nltk.classify.accuracy(classifier, test_set)
print(f'Improved Accuracy with Subjectivity Lexicon features: {improved_accuracy * 100:.2f}%')


Loaded 6885 words from the subjectivity lexicon.
Improved Accuracy with Subjectivity Lexicon features: 77.40%


In [7]:
negationwords = ['no', 'not', 'never', 'none', 'rather', 'hardly', 'scarcely', 
                 'rarely', 'seldom', 'neither', 'nor', "n't"]
# Define the features with negation handling
def NOT_features(document, word_features, negationwords):
    features = {f'V_{word}': False for word in word_features}
    features.update({f'V_NOT{word}': False for word in word_features})
    
    negated = False
    for word in document:
        if word in negationwords or word.endswith("n't"):
            negated = True
            continue
        if word in word_features:
            if negated:
                features[f'V_NOT{word}'] = True
                negated = False
            else:
                features[f'V_{word}'] = True
    return features



In [8]:
# Function to generate features in smaller batches
def generate_featuresets_in_batches(documents, word_features, negationwords, batch_size=100):
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        yield [(NOT_features(d, word_features, negationwords), c) for (d, c) in batch]





In [9]:

# Process the data in smaller batches to create the training and testing sets
NOT_featuresets = []
for batch in generate_featuresets_in_batches(documents, word_features, negationwords):
    NOT_featuresets.extend(batch)

train_set, test_set = NOT_featuresets[1000:], NOT_featuresets[:1000]



In [10]:
# Train and evaluate the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.accuracy(classifier, test_set)
print(f'Accuracy with Negation features: {accuracy * 100:.2f}%')
classifier.show_most_informative_features(10)

Accuracy with Negation features: 75.50%
Most Informative Features
                V_boring = True              neg : pos    =     30.3 : 1.0
            V_engrossing = True              pos : neg    =     19.7 : 1.0
                  V_warm = True              pos : neg    =     19.0 : 1.0
                  V_dull = True              neg : pos    =     17.8 : 1.0
             V_NOTenough = True              neg : pos    =     16.3 : 1.0
                 V_flaws = True              pos : neg    =     15.7 : 1.0
               V_generic = True              neg : pos    =     15.6 : 1.0
              V_mediocre = True              neg : pos    =     15.6 : 1.0
            V_unexpected = True              pos : neg    =     15.0 : 1.0
                 V_fails = True              neg : pos    =     15.0 : 1.0
