In [None]:
import pandas as pd
import numpy as np

# Sample data: four short sentences, each paired with a binary target label.
# (Note: the last sentence ends with a trailing space.)
data = dict(
    text=[
        'The cat sat on mat',
        'The cat sleeps on mat',
        'Hello world this is AI',
        'this is AI from world ',
    ],
    target=[1, 0, 1, 0],
)

# Build the DataFrame: one row per sentence, columns 'text' and 'target'
df = pd.DataFrame(data=data)

# Show the DataFrame as the cell output
df


Unnamed: 0,text,target
0,The cat sat on mat,1
1,The cat sleeps on mat,0
2,Hello world this is AI,1
3,this is AI from world,0


# One-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Tokenize the text data: split every sentence on whitespace into a list of words
tokenized_text = df['text'].map(str.split)
tokenized_text

0        [The, cat, sat, on, mat]
1     [The, cat, sleeps, on, mat]
2    [Hello, world, this, is, AI]
3     [this, is, AI, from, world]
Name: text, dtype: object

In [None]:
# Gather the distinct tokens across all sentences and sort them;
# this sorted list is the vocabulary used for one-hot encoding.
unique_words = sorted({token for tokens in tokenized_text for token in tokens})

# Vocabulary size on the first line, the vocabulary itself on the second
print(len(unique_words), unique_words, sep='\n')

['AI',
 'Hello',
 'The',
 'cat',
 'from',
 'is',
 'mat',
 'on',
 'sat',
 'sleeps',
 'this',
 'world']

In [None]:
# One-hot encode the words.
# NOTE: the `sparse` keyword of OneHotEncoder was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so `OneHotEncoder(sparse=False)` raises a
# TypeError on current releases. Keeping the default (sparse) output and
# densifying with `.toarray()` gives the same dense vectors on every version.
encoder = OneHotEncoder()
# The encoder expects 2-D input (one row per sample, one column per feature),
# so each word is wrapped in its own single-element list.
encoder.fit([[word] for word in unique_words])

# Encode each sentence as a dense matrix with one one-hot row per token
one_hot_encoded = tokenized_text.apply(
    lambda words: encoder.transform([[word] for word in words]).toarray()
)

# Display the results
for index, row in df.iterrows():
    print(f"Original sentence: {row['text']}")
    print(f"One-hot encoded vectors:\n{one_hot_encoded[index]}\n")

Original sentence: The cat sat on mat
One-hot encoded vectors:
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]

Original sentence: The cat sleeps on mat
One-hot encoded vectors:
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]

Original sentence: Hello world this is AI
One-hot encoded vectors:
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

Original sentence: this is AI from world 
One-hot encoded vectors:
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 



# Bag-of-words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the corpus and build the document-term count matrix
# in a single pass (the result is a sparse matrix, one row per sentence).
cv = CountVectorizer()
bow = cv.fit_transform(df['text'])

In [None]:
# Terms learned by the vectorizer; the position of each term is its column
# index in the bag-of-words matrix (compare with cv.vocabulary_).
print(cv.get_feature_names_out())

['ai' 'cat' 'from' 'hello' 'is' 'mat' 'on' 'sat' 'sleeps' 'the' 'this'
 'world']


In [None]:
# Vocabulary size, then the term -> column-index mapping, one per line
print(len(cv.vocabulary_), cv.vocabulary_, sep='\n')

12
{'the': 9, 'cat': 1, 'sat': 7, 'on': 6, 'mat': 5, 'sleeps': 8, 'hello': 3, 'world': 11, 'this': 10, 'is': 4, 'ai': 0, 'from': 2}


In [None]:
# Print every sentence together with its row of the bag-of-words matrix
for index, sentence in df['text'].items():
    print(f"Original sentence: {sentence}")
    print(f"bow encoded vectors:\n{bow[index].toarray()}\n")

Original sentence: The cat sat on mat
bow encoded vectors:
[[0 1 0 0 0 1 1 1 0 1 0 0]]

Original sentence: The cat sleeps on mat
bow encoded vectors:
[[0 1 0 0 0 1 1 0 1 1 0 0]]

Original sentence: Hello world this is AI
bow encoded vectors:
[[1 0 0 1 1 0 0 0 0 0 1 1]]

Original sentence: this is AI from world 
bow encoded vectors:
[[1 0 1 0 1 0 0 0 0 0 1 1]]



In [None]:
# Encode a new sentence with the already-fitted vocabulary. 'bench' is not in
# the vocabulary learned above, so it contributes nothing to the vector: only
# the four known words are counted.
cv.transform(['The cat sat on bench']).toarray()

array([[0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0]])

# N-grams
`ngram_range=(min_n, max_n)` selects which n-gram sizes are extracted:
 - (2,2) bigrams
 - (3,3) trigrams
 - (4,4) four-grams
 - (5,5) five-grams, etc.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bigram bag-of-words. Other ngram_range values to try:
# (3,3), (4,4), (1,2), (1,3), (1,4), (2,3), (2,4), (3,4).
# n can range from 1 to 5 here, since each row in the data set contains 5 words.
cv = CountVectorizer(ngram_range=(2, 2))
bow = cv.fit_transform(df['text'])

In [None]:
# Bigram terms, the vocabulary size, and the term -> column-index mapping,
# each on its own line
print(cv.get_feature_names_out(), len(cv.vocabulary_), cv.vocabulary_, sep='\n')

['ai from' 'cat sat' 'cat sleeps' 'from world' 'hello world' 'is ai'
 'on mat' 'sat on' 'sleeps on' 'the cat' 'this is' 'world this']
12
{'the cat': 9, 'cat sat': 1, 'sat on': 7, 'on mat': 6, 'cat sleeps': 2, 'sleeps on': 8, 'hello world': 4, 'world this': 11, 'this is': 10, 'is ai': 5, 'ai from': 0, 'from world': 3}


In [None]:
# Print every sentence together with its row of the bigram count matrix
for index, sentence in df['text'].items():
    print(f"Original sentence: {sentence}")
    print(f"bow encoded vectors:\n{bow[index].toarray()}\n")

Original sentence: The cat sat on mat
bow encoded vectors:
[[0 1 0 0 0 0 1 1 0 1 0 0]]

Original sentence: The cat sleeps on mat
bow encoded vectors:
[[0 0 1 0 0 0 1 0 1 1 0 0]]

Original sentence: Hello world this is AI
bow encoded vectors:
[[0 0 0 0 1 1 0 0 0 0 1 1]]

Original sentence: this is AI from world 
bow encoded vectors:
[[1 0 0 1 0 1 0 0 0 0 1 0]]



# TF-IDF vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary and IDF weights from the corpus, then build the
# TF-IDF document-term matrix (one row per sentence).
tfidf = TfidfVectorizer()
tf_vector = tfidf.fit_transform(df['text'])

In [None]:
# Term -> column-index mapping, per-term IDF weights, and the terms in column
# order, each on its own line
print(tfidf.vocabulary_, tfidf.idf_, tfidf.get_feature_names_out(), sep='\n')

{'the': 9, 'cat': 1, 'sat': 7, 'on': 6, 'mat': 5, 'sleeps': 8, 'hello': 3, 'world': 11, 'this': 10, 'is': 4, 'ai': 0, 'from': 2}
[1.51082562 1.51082562 1.91629073 1.91629073 1.51082562 1.51082562
 1.51082562 1.91629073 1.91629073 1.51082562 1.51082562 1.51082562]
['ai' 'cat' 'from' 'hello' 'is' 'mat' 'on' 'sat' 'sleeps' 'the' 'this'
 'world']


In [None]:
# Print every sentence together with its row of the TF-IDF matrix
for index, sentence in df['text'].items():
    print(f"Original sentence: {sentence}")
    print(f"tfidf encoded vectors:\n{tf_vector[index].toarray()}\n")

Original sentence: The cat sat on mat
tfidf encoded vectors:
[[0.         0.4222466  0.         0.         0.         0.4222466
  0.4222466  0.53556627 0.         0.4222466  0.         0.        ]]

Original sentence: The cat sleeps on mat
tfidf encoded vectors:
[[0.         0.4222466  0.         0.         0.         0.4222466
  0.4222466  0.         0.53556627 0.4222466  0.         0.        ]]

Original sentence: Hello world this is AI
tfidf encoded vectors:
[[0.4222466  0.         0.         0.53556627 0.4222466  0.
  0.         0.         0.         0.         0.4222466  0.4222466 ]]

Original sentence: this is AI from world 
tfidf encoded vectors:
[[0.4222466  0.         0.53556627 0.         0.4222466  0.
  0.         0.         0.         0.         0.4222466  0.4222466 ]]

