# 3. Feature Engineering

**i. One-Hot Encoding**

In [1]:
text_feat1 = "the movie was enjoyable"

In [2]:
# Split the sentence into whitespace-delimited tokens
tokens = text_feat1.split()

# Vocabulary: every distinct token in alphabetical order, mapped to its position
vocab = sorted(set(tokens))
word_to_index = dict(zip(vocab, range(len(vocab))))

# One vector per token: zeros everywhere except a 1 at the token's vocabulary index
one_hot_vectors = [
    [1 if position == word_to_index[token] else 0 for position in range(len(vocab))]
    for token in tokens
]

# Show the mapping, then each token next to its encoded vector
print("Vocabulary:", word_to_index)
print("\nOne-Hot Encoded Vectors:")
for token, encoded in zip(tokens, one_hot_vectors):
    print(f"{token}: {encoded}")


Vocabulary: {'enjoyable': 0, 'movie': 1, 'the': 2, 'was': 3}

One-Hot Encoded Vectors:
the: [0, 0, 1, 0]
movie: [0, 1, 0, 0]
was: [0, 0, 0, 1]
enjoyable: [1, 0, 0, 0]


In [None]:
# Limitations of One-Hot Encoding:
# - High-dimensional and sparse (for large vocab, vectors become huge)
# - No information about word meaning (semantic similarity between 'good' and 'great' = 0)
# - Vocabulary-specific: unseen words in test set can't be encoded
# - Not ideal for downstream deep learning (use embeddings instead)

**ii. BOW (BAG OF WORDS)**

In [28]:
import pandas as pd

# Toy corpus: one short review per row plus a binary label
# (labels look like 1 = positive, 0 = negative sentiment)
reviews = [
    'great acting and storyline great',
    'poor dialogue and boring scenes poor boring poor',
    'fantastic direction and cast',
    'bad script with no emotion',
]
labels = [1, 0, 1, 0]

data = pd.DataFrame({'text': reviews, 'output': labels})

print(data)

                                               text  output
0                  great acting and storyline great       1
1  poor dialogue and boring scenes poor boring poor       0
2                      fantastic direction and cast       1
3                        bad script with no emotion       0


In [40]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(min_n, max_n) controls which n-gram sizes are extracted as features:
#   (1, 1) -> unigrams only
#   (2, 2) -> bigrams only
#   (1, 2) -> unigrams and bigrams (used here)
cv=CountVectorizer(ngram_range=(1,2))

In [41]:
bow= cv.fit_transform(data['text'])

In [35]:
#vocabulary
print(cv.vocabulary_)

{'great': 19, 'acting': 0, 'and': 2, 'storyline': 30, 'great acting': 20, 'acting and': 1, 'and storyline': 5, 'storyline great': 31, 'poor': 23, 'dialogue': 12, 'boring': 8, 'scenes': 26, 'poor dialogue': 25, 'dialogue and': 13, 'and boring': 3, 'boring scenes': 10, 'scenes poor': 27, 'poor boring': 24, 'boring poor': 9, 'fantastic': 17, 'direction': 14, 'cast': 11, 'fantastic direction': 18, 'direction and': 15, 'and cast': 4, 'bad': 6, 'script': 28, 'with': 32, 'no': 21, 'emotion': 16, 'bad script': 7, 'script with': 29, 'with no': 33, 'no emotion': 22}


In [36]:
print(bow[0].toarray())

[[1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 0 0 0 0 0 0 0 0 1 1 0 0]]


In [37]:
print(bow[1].toarray()) #try more

[[0 0 1 1 0 0 0 0 2 1 1 0 1 1 0 0 0 0 0 0 0 0 0 3 1 1 1 1 0 0 0 0 0 0]]


In [26]:
# Tokens that were not seen during fit ('but', 'how') are simply dropped by transform(),
# so an unseen sentence still maps onto the fitted vocabulary without raising an error.
# NOTE(review): the stored output has 16 columns, but the vocabulary fitted with
# ngram_range=(1,2) has 34 entries -- the output looks stale; re-run to refresh it.
cv.transform(["great acting and storyline but how"]).toarray()

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]])

**iii. TF-IDF (Term Frequency–Inverse Document Frequency)**

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Default TfidfVectorizer: unigram features, smoothed idf = ln((1 + n_docs) / (1 + df)) + 1,
# and every document row L2-normalised.
tfidf=TfidfVectorizer()
# Fit on the toy corpus and show the dense (n_documents x n_features) TF-IDF matrix
tfidf.fit_transform(data['text']).toarray()

array([[0.39505606, 0.25215917, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.79011212,
        0.        , 0.        , 0.        , 0.        , 0.39505606,
        0.        ],
       [0.        , 0.16261148, 0.        , 0.50952462, 0.        ,
        0.25476231, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.76428692, 0.25476231, 0.        , 0.        ,
        0.        ],
       [0.        , 0.34578314, 0.        , 0.        , 0.5417361 ,
        0.        , 0.5417361 , 0.        , 0.5417361 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.4472136 , 0.        , 0.        ,
        0.        , 0.        , 0.4472136 , 0.        , 0.        ,
        0.4472136 , 0.        , 0.        , 0.4472136 , 0.        ,
        0.4472136 ]])

In [44]:

# idf_ holds one weight per feature, in the same order as get_feature_names_out().
# 'and' occurs in three of the four documents, so it receives the smallest idf.
print(tfidf.idf_)
print(tfidf.get_feature_names_out())

[1.91629073 1.22314355 1.91629073 1.91629073 1.91629073 1.91629073
 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073
 1.91629073 1.91629073 1.91629073 1.91629073]
['acting' 'and' 'bad' 'boring' 'cast' 'dialogue' 'direction' 'emotion'
 'fantastic' 'great' 'no' 'poor' 'scenes' 'script' 'storyline' 'with']


**iv. N-Grams Model**

In [30]:
text_ngram1 = "GeeksForGeeks provides great NLP content"

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

# Single-sentence corpus used for all n-gram examples below
corpus = ["GeeksForGeeks provides great NLP content"]

# Unigram features only; the vocabulary comes out lower-cased (e.g. 'geeksforgeeks', 'nlp')
vectorizer = CountVectorizer(ngram_range=(1, 1))  # unigrams
X = vectorizer.fit_transform(corpus)

In [35]:
# Show results
print("Vocabulary:\n", vectorizer.vocabulary_)
print("\nN-gram Features:")
print(X.toarray())

Vocabulary:
 {'geeksforgeeks': 1, 'provides': 4, 'great': 2, 'nlp': 3, 'content': 0}

N-gram Features:
[[1 1 1 1 1]]


In [36]:
# Refit on the same corpus, now extracting both unigrams and bigrams
vectorizer = CountVectorizer(ngram_range=(1, 2))  # unigrams, bigrams
X = vectorizer.fit_transform(corpus)

In [37]:
# Show results
print("Vocabulary:\n", vectorizer.vocabulary_)
print("\nN-gram Features:")
print(X.toarray())

Vocabulary:
 {'geeksforgeeks': 1, 'provides': 7, 'great': 3, 'nlp': 5, 'content': 0, 'geeksforgeeks provides': 2, 'provides great': 8, 'great nlp': 4, 'nlp content': 6}

N-gram Features:
[[1 1 1 1 1 1 1 1 1]]


In [40]:
# Refit on the same corpus with bigrams only; other ranges such as (3, 3) can be tried the same way
vectorizer = CountVectorizer(ngram_range=(2, 2))  # bigrams only
X = vectorizer.fit_transform(corpus)

In [41]:
# Show results
print("Vocabulary:\n", vectorizer.vocabulary_)
print("\nN-gram Features:")
print(X.toarray())

Vocabulary:
 {'geeksforgeeks provides': 0, 'provides great': 3, 'great nlp': 1, 'nlp content': 2}

N-gram Features:
[[1 1 1 1]]


**v. Word Embeddings**

1. Word2Vec

In [42]:
!pip install --upgrade pip
!pip uninstall -y gensim numpy scipy
!pip install gensim==4.3.2 numpy==1.24.3 scipy==1.10.1

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
[0mFound existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Found existing installation: scipy 1.16.0
Uninstalling scipy-1.16.0:
  Successfully uninstalled scipy-1.16.0
Collecting gensim==4.3.2
  Downloading gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Collecting numpy==1.24.3
  Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting scipy==1.10.1
  Downloading scipy-1.10.1-cp311-cp311-manyli

In [1]:
import gensim

In [2]:
from gensim.models import Word2Vec, KeyedVectors

In [3]:
# Pretrained word2vec model trained on the Google News corpus:
# 300-dimensional vectors for about 3 million words and phrases.
import gensim.downloader as api
wv=api.load('word2vec-google-news-300')



In [4]:
wv.most_similar('cricket')

[('cricketing', 0.8372225761413574),
 ('cricketers', 0.8165745735168457),
 ('Test_cricket', 0.8094819188117981),
 ('Twenty##_cricket', 0.8068488240242004),
 ('Twenty##', 0.7624265551567078),
 ('Cricket', 0.75413978099823),
 ('cricketer', 0.7372578382492065),
 ('twenty##', 0.7316356897354126),
 ('T##_cricket', 0.7304614186286926),
 ('West_Indies_cricket', 0.6987985968589783)]

In [5]:
wv.similarity("hockey", "sports")

0.53541523

In [9]:
# Classic analogy arithmetic: king - man + woman should land close to 'queen'.
# The result is not identical to the 'queen' vector: when this raw vector is queried,
# 'king' still ranks first and 'queen' second, because the input words are not excluded.
vec=wv["king"] - wv['man'] + wv['woman']

In [10]:
vec.shape

(300,)

In [11]:
wv.most_similar([vec])

[('king', 0.8449392318725586),
 ('queen', 0.7300517559051514),
 ('monarch', 0.645466148853302),
 ('princess', 0.6156251430511475),
 ('crown_prince', 0.5818676352500916),
 ('prince', 0.5777117609977722),
 ('kings', 0.5613663792610168),
 ('sultan', 0.5376775860786438),
 ('Queen_Consort', 0.5344247817993164),
 ('queens', 0.5289887189865112)]

**2. GloVe**

In [12]:
import numpy as np
import pandas as pd
import requests, zipfile, io
import os

# Download the 100-dimensional GloVe vectors unless they are already on disk.
GLOVE_FILE = "glove.6B.100d.txt"
GLOVE_URL = "https://nlp.stanford.edu/data/glove.6B.zip"

if not os.path.exists(GLOVE_FILE):
    print("Downloading GloVe...")
    # Explicit timeout: without one, requests can wait indefinitely on a stalled connection
    r = requests.get(GLOVE_URL, timeout=60)
    # Fail here on an HTTP error instead of handing an error page to ZipFile
    r.raise_for_status()
    # The context manager closes the archive once the single member is extracted
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extract(GLOVE_FILE)


Downloading GloVe...


In [13]:
# Parse the GloVe text file into a {word: float32 vector} lookup table
print("Loading GloVe vectors...")
with open("glove.6B.100d.txt", 'r', encoding='utf8') as glove_file:
    # Each row is a word followed by its whitespace-separated coefficients
    glove_embeddings = {
        fields[0]: np.array(fields[1:], dtype='float32')
        for fields in map(str.split, glove_file)
    }
print("Loaded GloVe.")

Loading GloVe vectors...
Loaded GloVe.


In [14]:
# Sample text
text = "GeeksForGeeks provides great nlp content"

# Average GloVe embedding
def sentence_embedding(sentence, embedding_dict, dim=100):
    """Return the mean of the embeddings of the known words in ``sentence``.

    The sentence is lower-cased and split on whitespace; words missing from
    ``embedding_dict`` are ignored.

    Args:
        sentence: Text to embed.
        embedding_dict: Mapping from word to its embedding vector.
        dim: Length of the zero vector returned when no word is found.

    Returns:
        The element-wise mean of the matched vectors, or a float32 zero
        vector of length ``dim`` when no word of the sentence is known.
    """
    vectors = [embedding_dict[word] for word in sentence.lower().split() if word in embedding_dict]
    if not vectors:
        # float32 keeps the fallback consistent with the float32 GloVe vectors
        # (np.zeros would otherwise default to float64)
        return np.zeros(dim, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Embed the sample sentence and print the averaged vector
embedding_vector = sentence_embedding(text, glove_embeddings)
print("\nSentence:", text)
print("GloVe Embedding Vector (100-d):")
print(embedding_vector)


Sentence: GeeksForGeeks provides great nlp content
GloVe Embedding Vector (100-d):
[-2.55629003e-01  5.10675013e-01  1.14457503e-01 -7.91724995e-02
  3.21207523e-01 -2.47885004e-01  5.01724966e-02 -2.46722504e-01
  1.56765580e-02 -3.46251756e-01  4.79699969e-02 -4.49602485e-01
 -2.03704998e-01 -2.04105005e-01  2.68669993e-01 -1.09435499e-01
  2.83295006e-01 -1.01142496e-01 -5.21988515e-03  5.25447488e-01
 -4.81329739e-01 -4.92179990e-02  1.30396634e-01 -1.61736012e-01
 -1.58252507e-01 -6.54775053e-02  1.36207491e-02  3.54979992e-01
 -4.22390014e-01 -2.93450058e-02 -3.79514992e-01  5.23430705e-01
 -1.35989994e-01 -2.31747493e-01  2.57324994e-01  1.99144363e-01
 -1.03391252e-01  2.80031502e-01 -3.81422520e-01 -1.33595005e-01
 -8.55274871e-03 -1.48388416e-01 -4.95285094e-02 -4.30911988e-01
 -1.45562500e-01 -2.05320001e-01 -1.25419796e-02 -3.22815001e-01
  2.35306740e-01 -2.05952004e-01  2.91929990e-02 -2.00350061e-02
 -1.86074972e-02  4.11282480e-01  5.82964858e-03 -1.18292499e+00
  8.92

**6. Dependency Parsing**

In [1]:
text_adv2 = "The quick brown fox jumps over the lazy dog"

In [3]:
# Install spaCy and download model
!pip install spacy
!pip uninstall -y numpy scipy
!pip install numpy==1.24.3 scipy==1.10.1
!python -m spacy download en_core_web_sm

Found existing installation: numpy 2.3.2
Uninstalling numpy-2.3.2:
  Successfully uninstalled numpy-2.3.2
Found existing installation: scipy 1.10.1
Uninstalling scipy-1.10.1:
  Successfully uninstalled scipy-1.10.1
Collecting numpy==1.24.3
  Using cached numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting scipy==1.10.1
  Using cached scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
Using cached numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Using cached scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.1 MB)
Installing collected packages: numpy, scipy
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [scipy]
[1A[2K[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires nump

^C


In [1]:
import spacy

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Function for dependency parsing
def parse_sentence(text):
    """Print a table with one row per token of ``text``.

    The text is run through the module-level ``nlp`` pipeline; each row shows
    the token text, its dependency label (``dep_``), the text of its head
    token, and its POS tag (``pos_``), left-aligned in fixed-width columns.
    """
    row_template = "{:<12} {:<10} {:<12} {:<8}"
    print(row_template.format('Token', 'Dep', 'Head', 'POS'))
    print("-" * 45)
    for tok in nlp(text):
        print(row_template.format(tok.text, tok.dep_, tok.head.text, tok.pos_))

# Example usage
text_adv2 = "The quick brown fox jumps over the lazy dog"
print("Dependency Parsing Result:\n")
parse_sentence(text_adv2)

Dependency Parsing Result:

Token        Dep        Head         POS     
---------------------------------------------
The          det        fox          DET     
quick        amod       fox          ADJ     
brown        amod       fox          ADJ     
fox          nsubj      jumps        NOUN    
jumps        ROOT       jumps        VERB    
over         prep       jumps        ADP     
the          det        dog          DET     
lazy         amod       dog          ADJ     
dog          pobj       over         NOUN    


In [3]:
# Example
text_adv2 = "The quick brown fox jumps over the lazy dog"
print("Dependency Parsing Result:\n")
parse_sentence(text_adv2)

Dependency Parsing Result:

Token        Dep        Head         POS     
---------------------------------------------
The          det        fox          DET     
quick        amod       fox          ADJ     
brown        amod       fox          ADJ     
fox          nsubj      jumps        NOUN    
jumps        ROOT       jumps        VERB    
over         prep       jumps        ADP     
the          det        dog          DET     
lazy         amod       dog          ADJ     
dog          pobj       over         NOUN    
