In [1]:
import numpy as np
import pandas as pd

# NLP Feature Extraction Techniques

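This notebook covers the major feature extraction methods in NLP, in this order:
1. **Bag of Words (CountVectorizer)**
2. **TF-IDF Vectorizer**
3. **N-grams (Unigram, Bigram, Trigram)**
4. **Binary Bag of Words**
5. **Hashing Vectorizer**
6. **Word2Vec (CBOW)**
7. **Word2Vec (Skip-gram)**
8. **One-Hot Encoding**
9. **Character-level N-grams**
10. **Additional vectorizer parameters** (`max_features`, `min_df`/`max_df`, stop words, custom tokenizer)
11. **Summary comparison table**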

In [3]:
# Toy corpus: four short documents, each paired with a binary label.
texts = [
    'people watch campusx',
    'campusx watch campusx',
    'people write comment',
    'campusx write comment',
]
labels = [1, 1, 0, 0]
df = pd.DataFrame({'text': texts, 'output': labels})

In [4]:
# Preview the corpus; it has only four rows, so head() shows all of it.
df.head()

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings; its vocabulary is learned
# later, when it is fitted on the corpus.
cv = CountVectorizer()

In [7]:
# Learn the vocabulary from the corpus and encode each document as a row of
# word counts (one column per vocabulary word).
bou = cv.fit_transform(df['text'])

In [8]:
# Print the learned vocabulary: a dict mapping each word to the index of its
# column in the bag-of-words matrix.

print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}


In [9]:
# Show the dense count vectors of the first two documents, one per line.
for doc_index in range(2):
    print(bou[doc_index].toarray())

[[1 0 1 1 0]]
[[2 0 0 1 0]]


In [10]:
# Encode a new sentence with the already-fitted vocabulary. Words that were not
# seen during fit ('and', 'of') have no column, so they are ignored.
cv.transform(['campusx watch and write comment of campusx']).toarray()

array([[2, 1, 0, 1, 1]])

## 1. Bag of Words (CountVectorizer) - Already Demonstrated Above

## 2. TF-IDF Vectorizer (Term Frequency - Inverse Document Frequency)
TF-IDF reflects how important a word is to a document in a collection. It penalizes common words and rewards rare but important words.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer, then learn the vocabulary and IDF weights from
# the corpus while producing the weighted document-term matrix.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['text'])

# Report the word -> column mapping, followed by the column names in order.
print("TF-IDF Vocabulary:", tfidf.vocabulary_, sep="\n")
feature_names = tfidf.get_feature_names_out()
print("\nFeature Names:", feature_names)

In [None]:
# Wrap the dense TF-IDF scores in a DataFrame with one column per vocabulary term
feature_columns = tfidf.get_feature_names_out()
dense_scores = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_scores, columns=feature_columns)
print("TF-IDF Matrix:")
tfidf_df

In [None]:
# Score an unseen sentence against the fitted TF-IDF vocabulary
new_text = ['campusx watch and write comment of campusx']
tfidf_new = tfidf.transform(new_text)
print("TF-IDF for new text:", tfidf_new.toarray(), sep="\n")

## 3. N-grams (Unigram, Bigram, Trigram)
N-grams capture word sequences. Unigram (n=1), Bigram (n=2), Trigram (n=3).

In [None]:
# Unigrams: every single word is its own feature
unigram_range = (1, 1)
cv_unigram = CountVectorizer(ngram_range=unigram_range)
unigram_matrix = cv_unigram.fit_transform(df['text'])
unigram_features = cv_unigram.get_feature_names_out()
unigram_counts = unigram_matrix.toarray()
print("Unigram Features:", unigram_features)
print("Unigram Matrix:\n", unigram_counts)

In [None]:
# Bigrams: every pair of adjacent words is a feature
bigram_range = (2, 2)
cv_bigram = CountVectorizer(ngram_range=bigram_range)
bigram_matrix = cv_bigram.fit_transform(df['text'])
bigram_features = cv_bigram.get_feature_names_out()
bigram_counts = bigram_matrix.toarray()
print("Bigram Features:", bigram_features)
print("\nBigram Matrix:\n", bigram_counts)

In [None]:
# Trigrams: every run of three adjacent words is a feature
trigram_range = (3, 3)
cv_trigram = CountVectorizer(ngram_range=trigram_range)
trigram_matrix = cv_trigram.fit_transform(df['text'])
trigram_features = cv_trigram.get_feature_names_out()
trigram_counts = trigram_matrix.toarray()
print("Trigram Features:", trigram_features)
print("\nTrigram Matrix:\n", trigram_counts)

In [None]:
# All n-grams of length 1 to 3 in a single feature space
combined_range = (1, 3)
cv_combined = CountVectorizer(ngram_range=combined_range)
combined_matrix = cv_combined.fit_transform(df['text'])
combined_features = cv_combined.get_feature_names_out()
print("Combined N-gram Features:", combined_features)
print("\nCombined N-gram Matrix Shape:", combined_matrix.shape)

## 4. Binary Bag of Words
Instead of word counts, it records 1 if a word is present in the document and 0 if it is absent.

In [None]:
# Binary Bag of Words: record presence (1) or absence (0) instead of counts
cv_binary = CountVectorizer(binary=True)
binary_matrix = cv_binary.fit_transform(df['text'])
binary_features = cv_binary.get_feature_names_out()

print("Binary BoW Features:", binary_features)
print("\nBinary Matrix:")
binary_df = pd.DataFrame(binary_matrix.toarray(), columns=binary_features)
binary_df

In [None]:
# Contrast the count-based and binary encodings on a document that repeats a word
test_text = ['campusx watch campusx']  # 'campusx' occurs twice

# Encode the same document with both fitted vectorizers
regular_count = cv.transform(test_text)
binary_count = cv_binary.transform(test_text)

# The count encoding keeps the repetition; the binary one collapses it to 1
print("Regular BoW (count):", regular_count.toarray())
print("Binary BoW (0/1):", binary_count.toarray())

## 5. Hashing Vectorizer
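A memory-efficient alternative to CountVectorizer. It hashes each word directly to a column index instead of storing a vocabulary, so the number of features is fixed in advance and columns cannot be mapped back to words.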

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash every token into one of a fixed number of columns; alternate_sign=False
# keeps all values non-negative.
n_hash_features = 10
hv = HashingVectorizer(n_features=n_hash_features, alternate_sign=False)
hash_matrix = hv.fit_transform(df['text'])

print("Hashing Vectorizer Matrix Shape:", hash_matrix.shape)
print("\nHashing Matrix:", hash_matrix.toarray(), sep="\n")

## 6. Word2Vec (Word Embeddings)
Word2Vec creates dense vector representations that capture semantic meaning.

In [None]:
from gensim.models import Word2Vec

# Tokenize the text (plain whitespace split)
tokenized_text = [text.split() for text in df['text']]
print("Tokenized Text:", tokenized_text)

# Train Word2Vec model
# vector_size: dimension of word vectors
# window: context window size
# min_count: minimum word frequency
# sg: 0 for CBOW, 1 for Skip-gram
# seed + workers=1: fix the random initialisation and train on a single thread
#   so the vectors (and similarity rankings below) are repeatable; gensim also
#   requires PYTHONHASHSEED to be set for repeatability across interpreter launches
w2v_model = Word2Vec(
    sentences=tokenized_text,
    vector_size=5,
    window=2,
    min_count=1,
    sg=0,
    seed=42,
    workers=1,
)

print("\nWord2Vec Vocabulary:", list(w2v_model.wv.key_to_index.keys()))

In [None]:
# Look up the learned embedding of one word
word = 'campusx'
word_vector = w2v_model.wv[word]
print(f"Vector for '{word}':")
print(word_vector)

# Retrieve the three vocabulary words most similar to it
nearest_words = w2v_model.wv.most_similar(word, topn=3)
print(f"\nWords similar to '{word}':")
print(nearest_words)

In [None]:
# Create document vectors by averaging word vectors
def get_document_vector(text, model):
    """Return the mean word vector of ``text``.

    ``text`` is split on whitespace; tokens missing from ``model.wv`` are
    skipped. If no token is known, a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known_vectors = []
    for token in text.split():
        if token in model.wv:
            known_vectors.append(model.wv[token])
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every document, then stack the per-document vectors into one array
document_rows = []
for document in df['text']:
    document_rows.append(get_document_vector(document, w2v_model))
doc_vectors = np.array(document_rows)
print("Document Vectors Shape:", doc_vectors.shape)
print("\nDocument Vectors:", doc_vectors, sep="\n")

## 7. Word2Vec - Skip-gram Model
Skip-gram predicts context words from the target word (opposite of CBOW).

In [None]:
# Skip-gram model (sg=1)
# seed + workers=1 fix the random initialisation and use a single training
# thread so the printed vector is repeatable between runs.
w2v_skipgram = Word2Vec(
    sentences=tokenized_text,
    vector_size=5,
    window=2,
    min_count=1,
    sg=1,
    seed=42,
    workers=1,
)

print("Skip-gram Word2Vec Vocabulary:", list(w2v_skipgram.wv.key_to_index.keys()))
# Plain string: the original f-string had no placeholders
print("\nVector for 'campusx' (Skip-gram):")
print(w2v_skipgram.wv['campusx'])

## 8. One-Hot Encoding
Each word is represented as a binary vector with only one 1.

In [None]:
# One-Hot Encoding for words
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Get all unique words. They are sorted because iterating a set of strings has
# no stable order between interpreter sessions, which made the row order of the
# table below change from run to run.
all_words = ' '.join(df['text']).split()
unique_words = sorted(set(all_words))
print("Unique Words:", unique_words)

# Label encode first: map each word to an integer id, shaped as a single
# column because OneHotEncoder expects 2-D input
le = LabelEncoder()
integer_encoded = le.fit_transform(unique_words).reshape(-1, 1)

# One-hot encode: one row per word, with a single 1 in the column of its id
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

# Display as DataFrame
onehot_df = pd.DataFrame(onehot_encoded, index=unique_words, columns=[f'dim_{i}' for i in range(len(unique_words))])
print("\nOne-Hot Encoding:")
onehot_df

## 9. Character-level N-grams
Extract features at character level instead of word level.

In [None]:
# Character-level N-grams (useful for spelling variations, typos):
# features are character sequences of length 2 and 3 rather than whole words
cv_char = CountVectorizer(analyzer='char', ngram_range=(2, 3))
char_matrix = cv_char.fit_transform(df['text'])
char_features = cv_char.get_feature_names_out()

print("Character N-gram Features (first 20):", char_features[:20])
print("\nTotal Character Features:", len(char_features))
print("Matrix Shape:", char_matrix.shape)

## 10. Additional Parameters & Techniques

In [None]:
# max_features: keep only the N terms with the highest total count in the corpus
top_n_terms = 3
cv_max = CountVectorizer(max_features=top_n_terms)
max_matrix = cv_max.fit_transform(df['text'])
kept_terms = cv_max.get_feature_names_out()
print("Max Features (top 3 most frequent):", kept_terms)

In [None]:
# min_df and max_df: Control document frequency thresholds
# min_df=2: keep a word only if it appears in at least 2 documents
# max_df=0.9: keep a word only if it appears in at most 90% of documents
#             (only words found in strictly more than 90% are dropped)
# NOTE: in this four-document corpus every word occurs in 2 or 3 documents,
# so these particular thresholds do not remove any word here.
cv_df = CountVectorizer(min_df=2, max_df=0.9)
df_matrix = cv_df.fit_transform(df['text'])
print("With min_df=2, max_df=0.9:", cv_df.get_feature_names_out())

In [None]:
# Stop words removal with scikit-learn's built-in English list
cv_stop = CountVectorizer(stop_words='english')
stop_matrix = cv_stop.fit_transform(df['text'])
terms_after_english = cv_stop.get_feature_names_out()
print("Without English stop words:", terms_after_english)

# Custom stop words: drop exactly the words listed here
custom_stops = ['watch', 'write']
cv_custom = CountVectorizer(stop_words=custom_stops)
custom_matrix = cv_custom.fit_transform(df['text'])
terms_after_custom = cv_custom.get_feature_names_out()
print("Without custom stop words:", terms_after_custom)

In [None]:
# Custom tokenizer with preprocessing
import re

def custom_tokenizer(text):
    """Lowercase ``text`` and return its purely alphabetic words as a list.

    A word is a run of the letters a-z bounded by word boundaries on both
    sides, so tokens that touch digits or underscores are not returned.
    """
    word_pattern = re.compile(r'\b[a-z]+\b')
    return word_pattern.findall(text.lower())

# The custom tokenizer replaces the default token regex. Passing
# token_pattern=None states that explicitly and avoids scikit-learn's
# "token_pattern will not be used" UserWarning.
cv_custom_token = CountVectorizer(tokenizer=custom_tokenizer, token_pattern=None)
custom_token_matrix = cv_custom_token.fit_transform(df['text'])
print("Custom Tokenizer Features:", cv_custom_token.get_feature_names_out())

## 11. Summary Comparison Table

In [None]:
# Summary table comparing all feature extraction methods.
# Each row describes one method; the columns are listed in summary_columns.
summary_columns = ['Method', 'Type', 'Captures Semantics', 'Memory Efficient', 'Best Use Case']
summary_rows = [
    ('Bag of Words', 'Sparse', 'No', 'Medium', 'Simple text classification'),
    ('TF-IDF', 'Sparse', 'No', 'Medium', 'Document ranking, search'),
    ('Binary BoW', 'Sparse', 'No', 'Medium', 'Short texts, presence matters'),
    ('N-grams', 'Sparse', 'Partial', 'Low', 'Phrase detection'),
    ('Hashing Vectorizer', 'Sparse', 'No', 'High', 'Large-scale streaming data'),
    ('Word2Vec (CBOW)', 'Dense', 'Yes', 'Medium', 'Semantic similarity (frequent words)'),
    ('Word2Vec (Skip-gram)', 'Dense', 'Yes', 'Medium', 'Semantic similarity (rare words)'),
    ('One-Hot Encoding', 'Sparse', 'No', 'Low', 'Small vocabulary tasks'),
    ('Character N-grams', 'Sparse', 'No', 'Low', 'Spelling variations, typos'),
]

# Pivot the rows into the column-oriented dict that the DataFrame is built from
summary_data = {
    column: list(values)
    for column, values in zip(summary_columns, zip(*summary_rows))
}

summary_df = pd.DataFrame(summary_data)
print("=== Feature Extraction Methods Comparison ===\n")
summary_df