In [None]:
import nltk
from nltk.corpus import state_union
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource this cell relies on, in order: the state_union
# corpus, the punkt tokenizer, the perceptron POS tagger and WordNet.
for resource in ('state_union', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

# Get a sample text from the state_union corpus: the first 1000 characters of
# the raw contents of "2006-GWBush.txt". This is a character slice, so the
# sample can end in the middle of a word.
text = state_union.raw("2006-GWBush.txt")[:1000]

# Tokenize and tag the text.
# `tokenized` holds the tokens; `tagged` pairs each token with its POS tag,
# e.g. ('PRESIDENT', 'NNP') in the recorded output of this cell.
tokenized = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokenized)

# Define a function to lemmatize words
def lemmatize_words(word, pos):
    """Return the WordNet lemma of ``word`` for the POS tag ``pos``.

    The leading letter of ``pos`` (a tag such as those produced by
    ``nltk.pos_tag``) selects the part of speech handed to the lemmatizer:
    ``N`` -> ``'n'``, ``V`` -> ``'v'``, ``J`` -> ``'a'``, ``R`` -> ``'r'``.
    A word whose tag starts with any other character is returned unchanged.
    """
    # Tag prefix -> WordNet POS code, tested in this order.
    prefix_to_wordnet = (('N', 'n'), ('V', 'v'), ('J', 'a'), ('R', 'r'))
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos.startswith(prefix):
            return WordNetLemmatizer().lemmatize(word, pos=wordnet_pos)
    # No mapping for this tag (e.g. 'DT', 'IN', punctuation): pass through.
    return word

# Swap each token for its lemma while keeping the original POS tag next to it
lemmatized = [(lemmatize_words(*pair), pair[1]) for pair in tagged]

# Show the (lemma, tag) pairs under a heading, one item per line
print("Lemmatized Text:", lemmatized, sep="\n")

[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Lemmatized Text:
[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.'), ('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('member', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('member', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corp', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guest', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizen', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PR

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import ne_chunk

# Fetch every NLTK resource this cell relies on, in order: the punkt
# tokenizer, the perceptron POS tagger, the named entity chunker and the
# words corpus.
for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

# Define a function to extract named entities
def extract_entities(text, labels=('PERSON', 'ORGANIZATION', 'LOCATION')):
    """Extract named entities of the requested types from ``text``.

    The text is tokenized, POS-tagged and run through NLTK's named entity
    chunker; every labelled chunk whose label is in ``labels`` is reported.

    Args:
        text: The raw text to analyse.
        labels: Chunk labels to keep. The default is the set this function
            has always used. Pass other chunker labels to widen or narrow
            the result, e.g. ``('PERSON', 'GPE')`` to also collect
            geo-political entities. A single label may be given as a plain
            string.

    Returns:
        A list of ``(entity, label)`` tuples in order of appearance, where
        ``entity`` is the chunk's tokens joined by single spaces.
    """
    # A bare string would otherwise be matched by substring
    # ('PER' in 'PERSON'), so treat it as a one-element collection.
    wanted = frozenset([labels] if isinstance(labels, str) else labels)

    # Tokenize -> tag each token with its part of speech -> chunk entities.
    chunked = ne_chunk(pos_tag(word_tokenize(text)))

    # Only chunks exposing label() are entity candidates; each item inside
    # such a chunk is indexed with [0] to get the token text.
    return [
        (' '.join(leaf[0] for leaf in chunk), chunk.label())
        for chunk in chunked
        if hasattr(chunk, 'label') and chunk.label() in wanted
    ]

# Test the function with a sample text.
# Per the recorded output of this cell, "Barack" and "Obama" come back as two
# separate PERSON chunks and "Democratic Party" as one ORGANIZATION chunk.
# NOTE(review): "Hawaii" and "United States" are absent from that output -
# presumably the chunker labels them GPE, which is not among the labels
# extract_entities keeps - confirm.
text = "Barack Obama was born in Hawaii and became the president of the United States. He was the leader of the Democratic Party."
entities = extract_entities(text)
print(entities)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


[('Barack', 'PERSON'), ('Obama', 'PERSON'), ('Democratic Party', 'ORGANIZATION')]
