In [8]:
import string

def preprocess_nlp_basic(sentence):
    # Step 1: Tokenize by splitting on spaces
    tokens = sentence.split()
    print("Tokens:", tokens)
    
    # Step 2: Remove common English stopwords (basic list)
    stop_words = {
        "a", "an", "the", "and", "or", "in", "on", "at", "to", "for", "of", "are", "is", "was", "were", "be", "been",
        "has", "have", "had", "do", "does", "did", "with", "like", "as", "by", "from", "that", "this", "it", "you", "i"
    }
    tokens_no_stopwords = [word for word in tokens if word.lower() not in stop_words]
    print("After Stopword Removal:", tokens_no_stopwords)
    
    # Step 3: Simple stemming - just remove common suffixes (very basic)
    suffixes = ['ing', 'ly', 'ed', 's', 'es']
    def simple_stem(word):
        for suffix in suffixes:
            if word.lower().endswith(suffix) and len(word) > len(suffix) + 2:
                return word[:-len(suffix)]
        return word
    
    stemmed_tokens = [simple_stem(word) for word in tokens_no_stopwords]
    print("After Stemming:", stemmed_tokens)

# Example sentence
sentence = "NLP techniques are used in virtual assistants like Alexa and Siri."
preprocess_nlp_basic(sentence)


Tokens: ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri.']
After Stopword Removal: ['NLP', 'techniques', 'used', 'virtual', 'assistants', 'Alexa', 'Siri.']
After Stemming: ['NLP', 'technique', 'used', 'virtual', 'assistant', 'Alexa', 'Siri.']


In [7]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
import os
import pathlib

# Path to local NLTK data folder
nltk_data_path = os.path.join(str(pathlib.Path.home()), "nltk_data")

# Check and download only if missing (no output shown)
def silent_nltk_download(package):
    try:
        nltk.data.find(package)
    except LookupError:
        nltk.download(package.split("/")[-1], quiet=True)

# Download only if not present
silent_nltk_download('tokenizers/punkt')
silent_nltk_download('taggers/averaged_perceptron_tagger')
silent_nltk_download('chunkers/maxent_ne_chunker')
silent_nltk_download('corpora/words')

# Input sentence
sentence = "Barack Obama served as the 44th President of the United States and won the Nobel Peace Prize in 2009."

# Tokenize, POS tag, and perform Named Entity Recognition
tokens = word_tokenize(sentence)
pos_tags = pos_tag(tokens)
named_entities = ne_chunk(pos_tags)

# Output named entities
print("Named Entities Found:\n")
for chunk in named_entities:
    if hasattr(chunk, 'label'):
        entity = " ".join(c[0] for c in chunk)
        label = chunk.label()
        print(f"Entity: {entity}, Label: {label}")


Named Entities Found:

Entity: Barack, Label: PERSON
Entity: Obama, Label: PERSON
Entity: United States, Label: GPE
Entity: Nobel Peace Prize, Label: ORGANIZATION


In [9]:
import string

def simple_tokenize(text):
    # Replace punctuation with spaces, then split
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    clean_text = text.translate(translator)
    tokens = clean_text.split()
    return tokens

def simple_stopword_removal(tokens):
    stopwords = {
        "a", "an", "the", "and", "or", "in", "on", "at", "to", "for", "of",
        "are", "is", "was", "were", "be", "been", "has", "have", "had",
        "do", "does", "did", "with", "like", "as", "by", "from", "that",
        "this", "it", "you", "i"
    }
    return [t for t in tokens if t.lower() not in stopwords]

def simple_stem(tokens):
    suffixes = ['ing', 'ly', 'ed', 's', 'es']
    stemmed = []
    for word in tokens:
        for suffix in suffixes:
            if word.lower().endswith(suffix) and len(word) > len(suffix) + 2:
                word = word[:-len(suffix)]
                break
        stemmed.append(word)
    return stemmed

def preprocess_nlp(text):
    tokens = simple_tokenize(text)
    print("Tokens:", tokens)
    tokens_no_stopwords = simple_stopword_removal(tokens)
    print("After Stopword Removal:", tokens_no_stopwords)
    stemmed_tokens = simple_stem(tokens_no_stopwords)
    print("After Stemming:", stemmed_tokens)

# Example
sentence = "NLP techniques are used in virtual assistants like Alexa and Siri."
preprocess_nlp(sentence)


Tokens: ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri']
After Stopword Removal: ['NLP', 'techniques', 'used', 'virtual', 'assistants', 'Alexa', 'Siri']
After Stemming: ['NLP', 'technique', 'used', 'virtual', 'assistant', 'Alexa', 'Siri']


In [10]:
# Simple rule-based sentiment analysis without NLTK

positive_words = {"good", "great", "excellent", "outstanding", "happy", "love", "awesome", "best", "fantastic", "amazing"}
negative_words = {"bad", "terrible", "poor", "hate", "worst", "awful", "disappointing", "high", "expensive"}

def simple_sentiment_analysis(text):
    words = text.lower().split()
    pos_count = sum(word in positive_words for word in words)
    neg_count = sum(word in negative_words for word in words)

    if pos_count > neg_count:
        sentiment = "POSITIVE"
        confidence = pos_count / len(words)
    elif neg_count > pos_count:
        sentiment = "NEGATIVE"
        confidence = neg_count / len(words)
    else:
        sentiment = "NEUTRAL"
        confidence = 0.0

    print(f"Sentiment: {sentiment}")
    print(f"Confidence Score: {confidence:.4f}")

# Example
sentence = "Despite the high price, the performance of the new MacBook is outstanding."
simple_sentiment_analysis(sentence)


Sentiment: NEGATIVE
Confidence Score: 0.0833
