# Setup

## Installing NLTK using your computer terminal

`pip` is Python's package installer. The cell below runs it from inside the notebook; in a terminal you would run `pip install nltk`.

In [None]:


!pip install nltk


## Importing the package

In [None]:
import nltk

## Downloading NLTK data

In [None]:

# Resources used later in this notebook:
#   - Punkt models for sentence/word tokenization
#   - the averaged perceptron model for part-of-speech tagging
# NLTK 3.9+ looks these up under the 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' names, while older releases use the
# original names. Fetching both keeps the notebook working on either.
nltk_resources = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
for resource in nltk_resources:
    nltk.download(resource)

# Download all NLTK corpora (can take a significant amount of time and disk space)
# nltk.download('all')

print("NLTK corpora downloaded.")


### You can also generate your own data

In [None]:
# Hand-written sample corpus: ten standalone English sentences used as the
# input for every preprocessing step that follows.
corpus = [
    "In 2022, Dr. Alice Morgan moved to New York City to join Columbia University as a professor of computational linguistics.",
    "The Amazon rainforest is home to millions of species and is often called the lungs of the Earth.",
    "On July 4th, the fireworks in Washington, D.C. lit up the sky as thousands gathered to celebrate Independence Day.",
    "Tesla announced its new electric truck model in California, focusing on sustainability and autonomous driving.",
    "Researchers at MIT developed a novel algorithm that improves machine translation accuracy in low-resource languages.",
    "The Grand Canyon offers breathtaking views and attracts millions of visitors each year for hiking, rafting, and photography.",
    "In a surprising turn of events, the local bakery in Seattle won the national Best Croissant competition in 2023.",
    "Mount Everest, the tallest mountain in the world, challenges even the most experienced climbers with its altitude and severe weather conditions.",
    "Shakespeare’s plays continue to influence literature and drama courses around the globe, centuries after their creation.",
    "The Nobel Peace Prize was awarded to a climate activist who spearheaded global initiatives on carbon reduction.",
]


# Text Preprocessing

## Normalization

In [None]:
import string
from nltk.tokenize import word_tokenize

# Lowercasing
lower_corpus = [text.lower() for text in corpus]
print("Lowercased Corpus:")
print(lower_corpus)
print("-" * 30)

# The punctuation and number filters below operate on word tokens, so the
# corpus is tokenized here instead of relying on a variable that is only
# created by a later cell (which fails with NameError on a fresh kernel).
# Tokens are built from the original-case text, as in the Tokenization section.
tokenized_corpus = [word_tokenize(text) for text in corpus]

# Removing punctuation
# A token is dropped when every character in it is a punctuation mark.
# Checking per character (rather than `word in string.punctuation`, which is
# a substring test) also removes multi-character tokens such as "..." or "--".
punctuation_chars = set(string.punctuation)
tokenized_no_punct_corpus = [
    [word for word in tokens if not all(char in punctuation_chars for char in word)]
    for tokens in tokenized_corpus
]
print("Tokenized Corpus (no punctuation):")
print(tokenized_no_punct_corpus)
print("-" * 30)

# Removing numbers
# Only tokens made up entirely of digits are dropped; mixed tokens stay.
tokenized_no_nums_corpus = [[word for word in tokens if not word.isdigit()] for tokens in tokenized_no_punct_corpus]
print("Tokenized Corpus (no numbers):")
print(tokenized_no_nums_corpus)
print("-" * 30)



## Tokenization

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

# #### Sentence Tokenization
# One list of sentences per corpus entry.
sentence_tokens = list(map(sent_tokenize, corpus))
print("Sentence Tokenization:", sentence_tokens, "-" * 30, sep="\n")

# #### Word Tokenization
# One list of word/punctuation tokens per corpus entry.
word_tokens = list(map(word_tokenize, corpus))
print("Word Tokenization:", word_tokens, "-" * 30, sep="\n")

# Keep the word-level tokens under the name used by the other preprocessing steps
tokenized_corpus = word_tokens

## Remove stop words

In [None]:
from nltk.corpus import stopwords

# Download the stopwords corpus
nltk.download('stopwords')

# Get the English stop words (a set gives fast membership tests)
stop_words = set(stopwords.words('english'))

# Remove stop words from the tokenized corpus.
# The stop word list is lowercase while the tokens keep their original
# capitalisation, so compare the lowercased token; otherwise capitalised
# stop words such as "The", "In" or "On" are never removed.
tokenized_no_stopwords_corpus = [
    [word for word in tokens if word.lower() not in stop_words]
    for tokens in tokenized_no_nums_corpus
]

print("Tokenized Corpus (no stop words):")
print(tokenized_no_stopwords_corpus)
print("-" * 30)

## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# The lemmatizer needs the WordNet data
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Lemmatize each token without passing a part of speech
lemmatized_corpus = [
    list(map(lemmatizer.lemmatize, tokens))
    for tokens in tokenized_no_stopwords_corpus
]
print("Lemmatized Corpus:", lemmatized_corpus, "-" * 30, sep="\n")

# Example with POS tag consideration for better lemmatization
from nltk.corpus import wordnet

def get_wordnet_pos(word):
  """Return the WordNet part-of-speech constant for a single word.

  The word is tagged on its own with ``nltk.pos_tag`` and the first letter of
  the resulting tag picks the WordNet category: J -> adjective, V -> verb,
  R -> adverb. Every other tag, including N, maps to noun.
  """
  tag = nltk.pos_tag([word])[0][1]
  initial = tag[0].upper()
  if initial == "J":
    return wordnet.ADJ
  if initial == "V":
    return wordnet.VERB
  if initial == "R":
    return wordnet.ADV
  # "N" and anything unrecognised fall back to noun
  return wordnet.NOUN

# Lemmatize again, this time passing each token's WordNet part of speech
lemmatized_corpus_pos = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in doc_tokens]
    for doc_tokens in tokenized_no_stopwords_corpus
]
print("Lemmatized Corpus (with POS):", lemmatized_corpus_pos, "-" * 30, sep="\n")

## Stemming

In [None]:
from nltk.stem import PorterStemmer

# One stemmer instance serves both the corpus and the standalone examples
stemmer = PorterStemmer()

# Stem every token of the stop-word-free corpus
stemmed_corpus = [
    [stemmer.stem(token) for token in doc_tokens]
    for doc_tokens in tokenized_no_stopwords_corpus
]
print("Stemmed Corpus:", stemmed_corpus, "-" * 30, sep="\n")

# A few isolated words to show what the stemmer does to different endings
example_words = ["running", "jumps", "played", "flies", "better", "beautifully"]

print("More Stemming Examples:")
for word in example_words:
  print(f"'{word}' stemmed to '{stemmer.stem(word)}'")

print("-" * 30)

## Dependency parsing


Dependency parsing: Understanding grammatical relationships between words in a sentence.
It reveals how words modify or relate to each other, forming the sentence structure.

In [None]:
# Download a dependency-annotated treebank and the universal POS tagset.
# These are data resources, not a parser, and the code below does not read
# them; they are fetched for potential later experiments.
for resource in ('dependency_treebank', 'universal_tagset'):
  nltk.download(resource)

# Example sentence
sentence = "The quick brown fox jumps over the lazy dog."

# POS-tag the sentence (a dependency parser would take this as input)
tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))

# NLTK has no built-in high-performance dependency parser: you would have to
# train a statistical parser on a dependency-annotated treebank, or load a
# pre-trained one. To keep things simple, the relationships below are written
# out by hand to show what a parser's output looks like; they are NOT produced
# by NLTK. Each entry is (head, modifier, relation).
dependency_structure = [
    ("fox", "The", "det"),
    ("fox", "quick", "amod"),
    ("fox", "brown", "amod"),
    ("jumps", "fox", "nsubj"),
    ("jumps", "over", "prep"),
    ("over", "dog", "pobj"),
    ("dog", "the", "det"),
    ("dog", "lazy", "amod"),
]

print("\nExample sentence:", sentence, sep="\n")

print("\nExample Dependency Relationships (Manual Illustration):")
for governor, dependent, label in dependency_structure:
  print(f"  - '{dependent}' is related to '{governor}' with relationship '{label}'")

# For real dependency trees use a library such as spaCy, or an external parser
# such as Stanford CoreNLP. With a trained NLTK parser (not available without
# training) the code would look like:
# parser = <trained_nltk_parser>
# tree = parser.parse(tagged_sentence)
# print("\nExample Dependency Tree (Conceptual, requires trained parser):")
# tree.pretty_print()

print("-" * 30)

### With spaCy

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Example sentence
sentence = "The quick brown fox jumps over the lazy dog."

# Process the sentence with spaCy
doc = nlp(sentence)

print("\nExample Dependency Parsing with spaCy:")
# Iterate over the tokens and print their dependency relations
for token in doc:
  print(f"  Token: {token.text}, Lemma: {token.lemma_}, POS: {token.pos_}, Tag: {token.tag_}, Dep: {token.dep_}, Head: {token.head.text}, Head_POS: {token.head.pos_}")

# Visualize the dependency tree (requires installing displacy if you want a graphical representation)
# from spacy import displacy
# displacy.render(doc, style="dep", jupyter=True)

print("-" * 30)

# Named Entity Recognition (NER)

In [None]:
# Download the resources needed for NE chunking.
# NLTK 3.9+ loads the chunker model under the name 'maxent_ne_chunker_tab',
# older releases use 'maxent_ne_chunker'; fetching both works on either.
for resource in ('maxent_ne_chunker', 'maxent_ne_chunker_tab', 'words'):
  nltk.download(resource)

# Example sentence for NER
ner_sentence = "Apple is looking at buying U.K. startup for $1 billion."

# Tokenize the sentence
tokens = nltk.word_tokenize(ner_sentence)

# Part-of-speech tag the tokens (the chunker takes tagged tokens as input)
tagged = nltk.pos_tag(tokens)

# Perform Named Entity Recognition
ne_tree = nltk.ne_chunk(tagged)

print("Named Entity Recognition Example:")
print(ne_tree)
print("-" * 30)

# Walk the top level of the tree: recognised entities are subtrees carrying a
# label, everything else is a plain (word, tag) tuple without one.
print("Recognized Entities:")
for chunk in ne_tree:
  if hasattr(chunk, 'label'):
    print(f"  Entity: {' '.join(c[0] for c in chunk)}, Type: {chunk.label()}")

print("-" * 30)