# Text Representations
## Bag Of Words

In [6]:
# Import libraries
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short sentences that share most of their vocabulary
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
]

# Learn the vocabulary and count how often each term occurs in each document.
# The result has one row per document and one column per vocabulary term.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Show the learned vocabulary, followed by the dense count matrix
print(f"Feature Names: {vectorizer.get_feature_names_out()}")
print(f"Document-Term Matrix:\n {X.toarray()}")

Feature Names: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
Document-Term Matrix:
 [[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]]


## N-Grams

In [7]:
# Import libraries
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short sentences that share most of their vocabulary
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
]

# Same count-based model as plain Bag-of-Words, but ngram_range=(1, 3) makes
# every run of one, two, or three consecutive words its own feature column.
vectorizer = CountVectorizer(ngram_range=(1, 3))
X = vectorizer.fit_transform(documents)

# Show the n-gram vocabulary, followed by the dense count matrix
print(f"Feature Names: {vectorizer.get_feature_names_out()}")
print(f"Document-Term Matrix:\n {X.toarray()}")


Feature Names: ['and' 'and this' 'and this is' 'document' 'document is' 'document is the'
 'first' 'first document' 'is' 'is the' 'is the first' 'is the second'
 'is the third' 'one' 'second' 'second document' 'the' 'the first'
 'the first document' 'the second' 'the second document' 'the third'
 'the third one' 'third' 'third one' 'this' 'this document'
 'this document is' 'this is' 'this is the']
Document-Term Matrix:
 [[0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 1 1]
 [0 0 0 2 1 1 0 0 1 1 0 1 0 0 1 1 1 0 0 1 1 0 0 0 0 1 1 1 0 0]
 [1 1 1 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 1 1 1 1 1 0 0 1 1]]


## TF-IDF

In [9]:
# Import libraries
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: three short sentences that share most of their vocabulary
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
]

# Fit the TF-IDF model and transform the corpus in a single step.
# The result has one row per document and one weighted column per term.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(documents)

# Show the learned vocabulary, followed by the dense TF-IDF weight matrix
print(f"Feature Names: {tfidf_vectorizer.get_feature_names_out()}")
print(f"TF-IDF Matrix:\n {X_tfidf.toarray()}")

Feature Names: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
TF-IDF Matrix:
 [[0.         0.46941728 0.61722732 0.3645444  0.         0.
  0.3645444  0.         0.3645444 ]
 [0.         0.7284449  0.         0.28285122 0.         0.47890875
  0.28285122 0.         0.28285122]
 [0.49711994 0.         0.         0.29360705 0.49711994 0.
  0.29360705 0.49711994 0.29360705]]


## Word Embedding: Word2Vec

In [24]:
# Install gensim if it is not already available in the kernel's environment.
# The `vector_size` keyword used below is the gensim 4.x spelling (3.x called
# it `size`), so the pinned version must be 4.0 or newer:
# %pip install "gensim>=4.0"
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Create sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
]

# Tokenize the lowercased documents into lists of word tokens.
# NOTE: word_tokenize relies on NLTK's Punkt tokenizer data. If this line
# raises a LookupError, run the nltk.download(...) call named in the error
# message once and re-run the cell.
tokenized_documents = [word_tokenize(doc.lower()) for doc in documents]

# Train the Word2Vec model on the tokenized corpus.
# - min_count=1 keeps every token; with only three sentences, a higher
#   threshold would discard most of the vocabulary.
# - seed=42 together with workers=1 makes training repeatable between runs;
#   multi-threaded training orders updates non-deterministically. (gensim's
#   docs note that exact reproducibility also needs a fixed PYTHONHASHSEED.)
model = Word2Vec(
    sentences=tokenized_documents,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

# Example: look up the learned 100-dimensional vector for the word 'document'
vector_representation = model.wv['document']
print("Vector Representation for 'document':", vector_representation)

ModuleNotFoundError: No module named 'gensim'