In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short documents used as the toy corpus for the vectorizer demos.
corpus = [
    "I love NLP",
    "NLP is amazing",
    "I love deep learning",
]

# Bag of Words with default CountVectorizer settings: learn the vocabulary,
# then count each vocabulary term per document. In the printed vocabulary the
# terms appear lowercased and the one-letter token "I" is absent.
vectorizer = CountVectorizer()
X = vectorizer.fit(corpus).transform(corpus)

# Print the vocabulary (column order of the matrix), then the counts
# densified from the sparse result so they display as a plain array.
print("Feature Names:", vectorizer.get_feature_names_out())
print("BoW Matrix:\n", X.toarray())


Feature Names: ['amazing' 'deep' 'is' 'learning' 'love' 'nlp']
BoW Matrix:
 [[0 0 0 0 1 1]
 [1 0 1 0 0 1]
 [0 1 0 1 1 0]]


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting of `corpus` (bound in an earlier cell): fit the vectorizer
# on the documents first, then transform those same documents.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(corpus)
X_tfidf = tfidf_vectorizer.transform(corpus)

# Show the learned vocabulary followed by the dense weight matrix
# (one row per document, one column per vocabulary term).
print("TF-IDF Feature Names:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", X_tfidf.toarray())


TF-IDF Feature Names: ['amazing' 'deep' 'is' 'learning' 'love' 'nlp']
TF-IDF Matrix:
 [[0.         0.         0.         0.         0.70710678 0.70710678]
 [0.62276601 0.         0.62276601 0.         0.         0.4736296 ]
 [0.         0.62276601 0.         0.62276601 0.4736296  0.        ]]


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# NOTE: this cell rebinds `corpus`, `vectorizer` and `X`, which the Bag of
# Words cell also assigns. Re-running the TF-IDF cell after this one (without
# re-running the first cell) would therefore use this one-sentence corpus.
corpus = [
    "I love NLP and deep learning",
]

# ngram_range=(1, 2) extracts unigrams and bigrams.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(raw_documents=corpus)

print("N-Grams:", vectorizer.get_feature_names_out())


N-Grams: ['and' 'and deep' 'deep' 'deep learning' 'learning' 'love' 'love nlp'
 'nlp' 'nlp and']


In [4]:
from nltk.util import skipgrams
from nltk.tokenize import word_tokenize

# Split the example sentence into word tokens.
text = "I love natural language processing"
tokens = word_tokenize(text)

# Skip-grams of size n=2 with skip distance k=2: as the printed pairs show,
# the two tokens of a pair may have up to two tokens skipped between them.
skip_grams = [*skipgrams(tokens, n=2, k=2)]
print(skip_grams)


[('I', 'love'), ('I', 'natural'), ('I', 'language'), ('love', 'natural'), ('love', 'language'), ('love', 'processing'), ('natural', 'language'), ('natural', 'processing'), ('language', 'processing')]


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tiny corpus the tokenizer is fitted on.
texts = [
    "I love NLP",
    "Deep learning is powerful",
]

# Build the tokenizer with num_words=10 and "<UNK>" as the out-of-vocabulary
# token, then fit it on the corpus.
tokenizer = Tokenizer(oov_token="<UNK>", num_words=10)
tokenizer.fit_on_texts(texts)

# Encode a sentence whose last word ("AI") does not occur in `texts`; in the
# recorded output it is encoded as index 1.
sequences = tokenizer.texts_to_sequences(["I love AI"])
print("Tokenized Output:", sequences)


Tokenized Output: [[2, 3, 1]]
