In [7]:
!pip install nltk


Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.66.6-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading regex-2024.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (792 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m792.8/792.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached click-8.1.7-py3-none-any.whl (97 kB)
Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading tqdm-4.66.6-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, joblib, click, nltk
Successfully installe

In [8]:


# Import necessary libraries
import nltk
from nltk.corpus import sentence_polarity
import random

# Download the necessary NLTK data if you haven't already
nltk.download('sentence_polarity')
nltk.download('stopwords')

# Load the movie review sentences from the NLTK corpus.
# `sentences` is the unlabelled view; `documents` pairs each tokenized
# sentence with its category label.
sentences = sentence_polarity.sents()
documents = [(sent, category) for category in sentence_polarity.categories()
             for sent in sentence_polarity.sents(categories=category)]

# Shuffle with a fixed seed so the train/test slices taken from `documents`
# further down (and therefore the reported accuracies) are identical on every
# Restart & Run All. A dedicated Random instance leaves the global RNG untouched.
SEED = 42
random.Random(SEED).shuffle(documents)

# Check the number of sentences and categories
print(f"Total number of sentences: {len(sentences)}")
print(f"Categories: {sentence_polarity.categories()}")
print("First four sentences:")
for sent in sentences[:4]:
    print(sent)


[nltk_data] Downloading package sentence_polarity to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package sentence_polarity is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Total number of sentences: 10662
Categories: ['neg', 'pos']
First four sentences:
['simplistic', ',', 'silly', 'and', 'tedious', '.']
["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.']
['exploitative', 'and', 'largely', 'devoid', 'of', 'the', 'depth', 'or', 'sophistication', 'that', 'would', 'make', 'watching', 'such', 'a', 'graphic', 'treatment', 'of', 'the', 'crimes', 'bearable', '.']
['[garbus]', 'discards', 'the', 'potential', 'for', 'pathological', 'study', ',', 'exhuming', 'instead', ',', 'the', 'skewed', 'melodrama', 'of', 'the', 'circumstantial', 'situation', '.']


In [9]:
# Build the bag-of-words vocabulary: lower-case every token of every labelled
# sentence, count the tokens, and keep the 2000 most frequent ones.
all_words_list = [token.lower() for pair in documents for token in pair[0]]
all_words = nltk.FreqDist(all_words_list)
word_features = [entry[0] for entry in all_words.most_common(2000)]

# Define a function for BOW features
def document_features(document, word_features):
    """Build bag-of-words presence features for a single tokenized document.

    Args:
        document: iterable of tokens making up the document.
        word_features: iterable of vocabulary words to test for.

    Returns:
        dict mapping 'V_<word>' to True/False for each vocabulary word,
        True meaning the word occurs at least once in `document`.
    """
    present = set(document)  # set membership is all we need; counts are ignored
    features = {}
    for vocab_word in word_features:
        features[f'V_{vocab_word}'] = vocab_word in present
    return features

# Turn every labelled sentence into a (BOW feature dict, label) pair
featuresets = [(document_features(tokens, word_features), label)
               for tokens, label in documents]

# Hold out the first 1000 examples for testing and train on the remainder
test_set = featuresets[:1000]
train_set = featuresets[1000:]

# Fit a Naive Bayes model on the training slice and score it on the held-out slice
classifier = nltk.NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.accuracy(classifier, test_set)
print(f'Accuracy with BOW features: {accuracy * 100:.2f}%')
classifier.show_most_informative_features(10)


Accuracy with BOW features: 75.80%
Most Informative Features
            V_engrossing = True              pos : neg    =     19.8 : 1.0
              V_mediocre = True              neg : pos    =     16.9 : 1.0
               V_generic = True              neg : pos    =     16.2 : 1.0
                  V_flat = True              neg : pos    =     14.9 : 1.0
               V_routine = True              neg : pos    =     14.9 : 1.0
             V_inventive = True              pos : neg    =     14.4 : 1.0
                V_boring = True              neg : pos    =     12.9 : 1.0
             V_wonderful = True              pos : neg    =     12.7 : 1.0
                  V_dull = True              neg : pos    =     12.5 : 1.0
             V_affecting = True              pos : neg    =     12.4 : 1.0


In [10]:
# Import readSubjectivity from the local `subjectivity` module (the module name is
# lower-case in the import below, so the file must be importable under that name,
# e.g. sitting in the notebook's working directory).
from subjectivity import readSubjectivity  # Adjust the module location if necessary

# Load the Subjectivity Lexicon.
# NOTE(review): readSubjectivity is defined outside this notebook; the code further down
# uses its result as a mapping (`word in SL`) whose values unpack into four fields, of
# which the first is read as strength and the last as polarity - confirm against the module.
SLpath = "subjclueslen1-HLTEMNLP05.tff"  # Path to the subjectivity lexicon file
SL = readSubjectivity(SLpath)
print(f"Loaded {len(SL)} words from the subjectivity lexicon.")

# Define features using the Subjectivity Lexicon
def SL_features(document, word_features, SL):
    """Build BOW presence features plus two subjectivity-lexicon scores.

    Args:
        document: iterable of tokens making up the document.
        word_features: iterable of vocabulary words for the 'V_<word>' features.
        SL: mapping from word to a 4-field entry; the first field is read as
            the strength ('weaksubj' / 'strongsubj') and the last as the
            polarity ('positive' / 'negative'). Other values are ignored.

    Returns:
        dict with one boolean 'V_<word>' entry per vocabulary word, plus
        'positivecount' and 'negativecount', where each distinct document
        word found in the lexicon adds 1 (weak) or 2 (strong) to the score
        of its polarity.
    """
    unique_tokens = set(document)

    features = {}
    for vocab_word in word_features:
        features[f'V_{vocab_word}'] = vocab_word in unique_tokens

    positive_score = 0
    negative_score = 0
    for token in unique_tokens:
        if token not in SL:
            continue
        strength, _, _, polarity = SL[token]
        # Weak clues weigh 1, strong clues weigh 2; anything else weighs 0.
        if strength == 'weaksubj':
            weight = 1
        elif strength == 'strongsubj':
            weight = 2
        else:
            weight = 0
        if polarity == 'positive':
            positive_score += weight
        elif polarity == 'negative':
            negative_score += weight

    features['positivecount'] = positive_score
    features['negativecount'] = negative_score
    return features

# Turn every labelled sentence into a (BOW + lexicon-score feature dict, label) pair
SL_featuresets = [(SL_features(tokens, word_features, SL), label)
                  for tokens, label in documents]

# Hold out the first 1000 examples for testing and train on the remainder
test_set = SL_featuresets[:1000]
train_set = SL_featuresets[1000:]

# Fit a Naive Bayes model on the training slice and score it on the held-out slice
classifier = nltk.NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.accuracy(classifier, test_set)
print(f'Accuracy with Subjectivity Lexicon features: {accuracy * 100:.2f}%')


Loaded 6885 words from the subjectivity lexicon.
Accuracy with Subjectivity Lexicon features: 75.80%


In [11]:
# Tokens treated as negation cues by NOT_features. That function additionally
# treats any token ending in "n't" as a negation, so contractions are not listed here.
negationwords = ['no', 'not', 'never', 'none', 'rather', 'hardly', 'scarcely', 
                 'rarely', 'seldom', 'neither', 'nor']

# Define features with negation handling
def NOT_features(document, word_features, negationwords):
    """Build bag-of-words features that distinguish negated words.

    A token is a negation cue when it is in `negationwords` or ends with
    "n't". The token immediately following a cue is recorded as
    'V_NOT<word>' instead of 'V_<word>', and is consumed together with the
    cue so it is not also recorded as a plain occurrence.

    Args:
        document: sequence (indexable) of tokens making up the document.
        word_features: iterable of vocabulary words; only these get features.
        negationwords: iterable of tokens treated as negation cues.

    Returns:
        dict with boolean 'V_<word>' and 'V_NOT<word>' entries for every
        vocabulary word.
    """
    features = {f'V_{word}': False for word in word_features}
    features.update({f'V_NOT{word}': False for word in word_features})

    # Sets give O(1) membership tests instead of scanning the lists per token.
    vocabulary = set(word_features)
    negations = set(negationwords)

    n_tokens = len(document)
    i = 0
    # An explicit while loop is required: reassigning the index inside
    # `for i in range(...)` does not skip the next iteration, which previously
    # caused the negated word to be flagged as both V_NOT<word> and V_<word>.
    while i < n_tokens:
        word = document[i]
        is_negation = (word in negations) or word.endswith("n't")
        if is_negation and (i + 1) < n_tokens:
            negated = document[i + 1]
            if negated in vocabulary:
                features[f'V_NOT{negated}'] = True
            i += 2  # consume both the cue and the word it negates
            continue
        # Ordinary token, or a negation cue with nothing after it.
        if word in vocabulary:
            features[f'V_{word}'] = True
        i += 1
    return features

# Turn every labelled sentence into a (negation-aware feature dict, label) pair
NOT_featuresets = [(NOT_features(tokens, word_features, negationwords), label)
                   for tokens, label in documents]

# Hold out the first 1000 examples for testing and train on the remainder
test_set = NOT_featuresets[:1000]
train_set = NOT_featuresets[1000:]

# Fit a Naive Bayes model on the training slice and score it on the held-out slice
classifier = nltk.NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.accuracy(classifier, test_set)
print(f'Accuracy with Negation features: {accuracy * 100:.2f}%')
classifier.show_most_informative_features(10)


: 