<a href="https://colab.research.google.com/github/mdjabedmollah/ml-learning/blob/main/npl_NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
"""
NLP fundamentals with Python & NLTK (fixed version)
- Tokenization
- Stopword removal
- Stemming
- POS tagging
- Named Entity Recognition (NER)
"""

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag, ne_chunk

def ensure_resources():
    """
    Download the NLTK resources used by this script, on a best-effort basis.

    Both the legacy resource names and the newer names used in NLTK ≥ 3.9
    are requested, so on any given NLTK version some of them may not exist.
    A failed download is therefore reported through ``logging`` instead of
    being raised: the function never aborts the caller because one resource
    is missing.

    Returns:
        None. Problems are logged, not returned.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    import logging

    logger = logging.getLogger(__name__)

    resources = (
        "punkt",
        "punkt_tab",
        "stopwords",
        "averaged_perceptron_tagger",
        "averaged_perceptron_tagger_eng",
        "maxent_ne_chunker",
        "maxent_ne_chunker_tab",
        "words",
    )
    for res in resources:
        try:
            downloaded = nltk.download(res, quiet=True)
        except Exception as exc:
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate and the user can cancel a download.
            logger.warning("Could not download NLTK resource %r: %s", res, exc)
            continue
        if not downloaded:
            # nltk.download signals failure with a falsy return value; with
            # quiet=True that is otherwise silent. Logged at debug level
            # because unknown names are expected across NLTK versions.
            logger.debug("NLTK resource %r was not downloaded", res)

def main():
    """Run the NLTK demo on a sample sentence and print the result of each stage."""
    ensure_resources()

    sample = "Natural Language Processing with Python and NLTK is fun! OpenAI's GPT models are powerful for NLP tasks."

    def report(heading, payload):
        # Every stage is displayed as: heading, value, blank separator line.
        print(heading)
        print(payload)
        print()

    # Stage 1: tokenize the raw sentence.
    words = word_tokenize(sample)
    report("Tokens:", words)

    # Stage 2: discard tokens whose lowercase form is an English stopword.
    english_stops = set(stopwords.words("english"))
    kept = []
    for word in words:
        if word.lower() in english_stops:
            continue
        kept.append(word)
    report("Filtered (stopwords removed):", kept)

    # Stage 3: Porter-stem the tokens that survived the stopword filter.
    stemmer = PorterStemmer()
    stemmed = list(map(stemmer.stem, kept))
    report("Stems:", stemmed)

    # Stage 4: part-of-speech tagging runs on the full token list, not the
    # filtered one.
    tagged = pos_tag(words)
    report("POS tags:", tagged)

    # Stage 5: named-entity chunking over the tagged tokens; the result is
    # shown both as a drawn tree and in its printed form.
    entity_tree = ne_chunk(tagged)
    print("Named Entities (tree structure):")
    entity_tree.pretty_print()
    print(entity_tree)
    print()

# Run the demo only when this code is executed directly (as a script or a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Tokens:
['Natural', 'Language', 'Processing', 'with', 'Python', 'and', 'NLTK', 'is', 'fun', '!', 'OpenAI', "'s", 'GPT', 'models', 'are', 'powerful', 'for', 'NLP', 'tasks', '.']

Filtered (stopwords removed):
['Natural', 'Language', 'Processing', 'Python', 'NLTK', 'fun', '!', 'OpenAI', "'s", 'GPT', 'models', 'powerful', 'NLP', 'tasks', '.']

Stems:
['natur', 'languag', 'process', 'python', 'nltk', 'fun', '!', 'openai', "'s", 'gpt', 'model', 'power', 'nlp', 'task', '.']

POS tags:
[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('with', 'IN'), ('Python', 'NNP'), ('and', 'CC'), ('NLTK', 'NNP'), ('is', 'VBZ'), ('fun', 'JJ'), ('!', '.'), ('OpenAI', 'NNP'), ("'s", 'POS'), ('GPT', 'NNP'), ('models', 'NNS'), ('are', 'VBP'), ('powerful', 'JJ'), ('for', 'IN'), ('NLP', 'NNP'), ('tasks', 'NNS'), ('.', '.')]

Named Entities (tree structure):
                                                                                   S                                                           