In [2]:
import pandas as pd

# Categorical labels to encode
categories = ['cat', 'dog', 'bird']

# Hold the labels in a single-column DataFrame
df = pd.DataFrame(categories, columns=['Category'])

# One indicator column per distinct label (columns come out sorted by label)
one_hot_encoded = pd.get_dummies(df.Category)

# Show the encoded frame as plain text
print(one_hot_encoded)


    bird    cat    dog
0  False   True  False
1  False  False   True
2   True  False  False


In [4]:
from nltk.util import ngrams

# Sentence to break into n-grams
sentence = "I love natural language processing"

# Whitespace tokenization (no punctuation handling)
tokens = sentence.split()

# Build the 1-, 2- and 3-gram lists in one pass over the sizes;
# ngrams() yields tuples lazily, so each result is materialized with list()
unigrams, bigrams, trigrams = [list(ngrams(tokens, size)) for size in (1, 2, 3)]

# Show each list of n-gram tuples
print("Unigrams:", unigrams)
print("Bigrams:", bigrams)
print("Trigrams:", trigrams)


Unigrams: [('I',), ('love',), ('natural',), ('language',), ('processing',)]
Bigrams: [('I', 'love'), ('love', 'natural'), ('natural', 'language'), ('language', 'processing')]
Trigrams: [('I', 'love', 'natural'), ('love', 'natural', 'language'), ('natural', 'language', 'processing')]


In [None]:
# Expected output of the n-gram cell above (kept for reference):
# Unigrams: [('I',), ('love',), ('natural',), ('language',), ('processing',)]
# Bigrams: [('I', 'love'), ('love', 'natural'), ('natural', 'language'), ('language', 'processing')]
# Trigrams: [('I', 'love', 'natural'), ('love', 'natural', 'language'), ('natural', 'language', 'processing')]


## Bag of Words (BoW) Vectorization from Scratch


In [5]:
from collections import Counter
import numpy as np

def bow_vectorizer(corpus):
    """
    Implements Bag of Words (BoW) vectorization from scratch.

    Each document is lower-cased and split on whitespace; no punctuation
    stripping or stop-word removal is performed.

    :param corpus: Iterable of sentences (documents) as strings
    :return: Tuple ``(vocabulary, bow_matrix)``. ``vocabulary`` is the sorted
        list of unique tokens; ``bow_matrix`` is an integer NumPy array of
        shape ``(number of documents, len(vocabulary))`` whose entry
        ``[i, j]`` is the count of ``vocabulary[j]`` in document ``i``.
    """
    # Tokenize once; the tokenized list also fixes the number of rows, so the
    # corpus may be any iterable of strings, not only a list.
    tokenized_corpus = [sentence.lower().split() for sentence in corpus]
    vocabulary = sorted({word for sentence in tokenized_corpus for word in sentence})

    # Map each word to its column so a document only touches the columns of
    # the words it actually contains (the matrix is already zero elsewhere).
    column_of = {word: j for j, word in enumerate(vocabulary)}

    bow_matrix = np.zeros((len(tokenized_corpus), len(vocabulary)), dtype=int)
    for i, sentence in enumerate(tokenized_corpus):
        for word, count in Counter(sentence).items():
            bow_matrix[i, column_of[word]] = count

    return vocabulary, bow_matrix

# Three short documents used to exercise the vectorizer
corpus = [
    "I love natural language processing",
    "Natural language processing is amazing",
    "I love programming and data science",
]

# Rows of the matrix follow corpus order; columns follow the sorted vocabulary
vocabulary, bow_matrix = bow_vectorizer(corpus)

# Show the vocabulary, then the count matrix on the line after its label
print("Vocabulary:", vocabulary)
print("BoW Matrix:", bow_matrix, sep="\n")


Vocabulary: ['amazing', 'and', 'data', 'i', 'is', 'language', 'love', 'natural', 'processing', 'programming', 'science']
BoW Matrix:
[[0 0 0 1 0 1 1 1 1 0 0]
 [1 0 0 0 1 1 0 1 1 0 0]
 [0 1 1 1 0 0 1 0 0 1 1]]


## Computing TF-IDF Matrix Using Scikit-learn


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same three documents as in the Bag of Words example
corpus = [
    "I love natural language processing",
    "Natural language processing is amazing",
    "I love programming and data science",
]

# Default settings: text is lower-cased and, per the default token pattern,
# only tokens of two or more characters are kept, which is why "i" does not
# appear among the feature names below.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary / IDF weights and transform the corpus in one step
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Column labels of the matrix (the learned vocabulary)
feature_names = tfidf_vectorizer.get_feature_names_out()

# fit_transform returns a sparse matrix; densify it for printing
tfidf_array = tfidf_matrix.toarray()

# Show the feature names, then the matrix on the line after its label
print("Feature Names:", feature_names)
print("TF-IDF Matrix:", tfidf_array, sep="\n")


Feature Names: ['amazing' 'and' 'data' 'is' 'language' 'love' 'natural' 'processing'
 'programming' 'science']
TF-IDF Matrix:
[[0.         0.         0.         0.         0.5        0.5
  0.5        0.5        0.         0.        ]
 [0.51741994 0.         0.         0.51741994 0.3935112  0.
  0.3935112  0.3935112  0.         0.        ]
 [0.         0.46735098 0.46735098 0.         0.         0.35543247
  0.         0.         0.46735098 0.46735098]]
