In [None]:
# ---------- INSTALLATION ----------
# Install required NLP libraries
!pip install nltk spacy gensim --quiet

# ---------- IMPORTS ----------
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec

# ---------- DOWNLOAD NLTK RESOURCES ----------
# Recent NLTK releases (3.9 and later) load the tokenizer and the POS tagger
# from renamed data packages ('punkt_tab', 'averaged_perceptron_tagger_eng'),
# while older releases still use the legacy names. Both sets are requested so
# word_tokenize() and nltk.pos_tag() below work regardless of the installed
# NLTK version.
NLTK_RESOURCES = (
    'punkt',                           # Tokenizer data (legacy name)
    'stopwords',                       # English stopword list
    'averaged_perceptron_tagger',      # POS tagger (legacy name)
    'punkt_tab',                       # Tokenizer data used by word_tokenize on recent NLTK
    'averaged_perceptron_tagger_eng',  # POS tagger data used by pos_tag on recent NLTK
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

# ---------- SAMPLE TEXT ----------
# A single short sentence; it is analysed here with NLTK and again later
# with spaCy.
text = "Apple is looking at buying a U.K. startup for $1 billion"

# -------------------- 1️⃣ NLTK SECTION --------------------
print("\n====== NLTK Basic Functions ======")

# Step 1 - Tokenization: break the sentence into individual tokens.
tokens = word_tokenize(text)
print("\n1. Tokenized words:", tokens)

# Step 2 - Stopword removal: keep only the tokens whose lowercase form is
# not in NLTK's English stopword list (drops 'is', 'at', 'a', 'for', ...).
stop_words = {*stopwords.words('english')}
filtered_tokens = list(
    filter(lambda tok: tok.lower() not in stop_words, tokens)
)
print("\n2. After Stop Word Removal:", filtered_tokens)

# Step 3 - Stemming: apply the Porter stemmer to each remaining token
# (e.g., 'buying' → 'buy').
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_tokens))
print("\n3. After Stemming:", stemmed)

# Step 4 - POS tagging: tag the full, unfiltered token list.
pos_tags = nltk.pos_tag(tokens)
print("\n4. POS Tags:", pos_tags)

# -------------------- 2️⃣ spaCy SECTION --------------------
print("\n====== spaCy Basic Functions ======")

# Obtain the English pipeline. If loading raises OSError, download the
# model package and load it again.
model_name = "en_core_web_sm"
try:
    nlp = spacy.load(model_name)
except OSError:
    spacy.cli.download(model_name)
    nlp = spacy.load(model_name)

# Run the pipeline once; all three listings below read from this result.
doc = nlp(text)

# Listing 1 - each token next to its lemma (base form).
print("\n1. Tokens and Lemmas:")
for token in doc:
    print(f"{token.text} → {token.lemma_}")

# Listing 2 - each token with its part-of-speech label.
print("\n2. POS Tags:")
for token in doc:
    print(f"{token.text} ({token.pos_})")

# Listing 3 - named entities found in the text, with their entity labels
# (companies, places, monetary amounts, ...).
print("\n3. Named Entities:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")

# -------------------- 3️⃣ WORD2VEC SECTION --------------------
print("\n====== Word2Vec Basic Example ======")

# Example corpus for training Word2Vec (each list = one tokenized sentence)
sentences = [
    ['cat', 'sits', 'on', 'the', 'mat'],
    ['dog', 'plays', 'with', 'ball'],
    ['man', 'reads', 'a', 'book'],
    ['book', 'contains', 'knowledge'],
]

# Train Word2Vec model
# vector_size = number of dimensions of each word vector
# window      = context window size
# min_count   = 1 keeps every word, including those that occur only once
# sg          = 1 uses skip-gram; sg = 0 uses CBOW
# seed        = fixed seed for the random number generator
# workers     = 1 trains on a single thread; together with the fixed seed
#               this makes the printed numbers repeatable between runs
#               (gensim's documentation additionally mentions setting
#               PYTHONHASHSEED for reproducibility across interpreter launches)
model = Word2Vec(
    sentences,
    vector_size=50,
    window=2,
    min_count=1,
    sg=1,
    seed=42,
    workers=1,
)

# Get vector representation of word 'book'
print("\n1. Vector for 'book':")
print(model.wv['book'])

# Find most similar words to 'book'
# NOTE: with a corpus this small the similarity scores only illustrate the
# API; they are not semantically meaningful.
print("\n2. Words similar to 'book':")
print(model.wv.most_similar('book'))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!




1. Tokenized words: ['Apple', 'is', 'looking', 'at', 'buying', 'a', 'U.K.', 'startup', 'for', '$', '1', 'billion']

2. After Stop Word Removal: ['Apple', 'looking', 'buying', 'U.K.', 'startup', '$', '1', 'billion']

3. After Stemming: ['appl', 'look', 'buy', 'u.k.', 'startup', '$', '1', 'billion']

4. POS Tags: [('Apple', 'NNP'), ('is', 'VBZ'), ('looking', 'VBG'), ('at', 'IN'), ('buying', 'VBG'), ('a', 'DT'), ('U.K.', 'NNP'), ('startup', 'NN'), ('for', 'IN'), ('$', '$'), ('1', 'CD'), ('billion', 'CD')]


1. Tokens and Lemmas:
Apple → Apple
is → be
looking → look
at → at
buying → buy
a → a
U.K. → U.K.
startup → startup
for → for
$ → $
1 → 1
billion → billion

2. POS Tags:
Apple (PROPN)
is (AUX)
looking (VERB)
at (ADP)
buying (VERB)
a (DET)
U.K. (PROPN)
startup (NOUN)
for (ADP)
$ (SYM)
1 (NUM)
billion (NUM)

3. Named Entities:
Apple (ORG)
U.K. (GPE)
$1 billion (MONEY)


1. Vector for 'book':
[-1.0724545e-03  4.7286271e-04  1.0206699e-02  1.8018546e-02
 -1.8605899e-02 -1.4233618e-02  1.