# Self Implementation

In [1]:
# Toy corpus shared by every encoding in this file: five short lower-case
# sentences.  Several words repeat within a sentence, so count-based and
# presence-based encodings produce different vectors.
document_corpus = [
    "this is a good phone phone",
    "this is a bad mobile phone",
    "she is a good good cat",
    "he has a bad temper temper",
    "this mobile phone phone is not good good",
]

In [2]:
# Vocabulary: every distinct whitespace-separated token found in the corpus.
# Sorting turns the set into a list with a fixed alphabetical order, so each
# word has a stable position that can serve as its index / column.
data_corpus = sorted({word for row in document_corpus for word in row.split()})

print(data_corpus)

['a', 'bad', 'cat', 'good', 'has', 'he', 'is', 'mobile', 'not', 'phone', 'she', 'temper', 'this']


### Index-Based Encoding

In [4]:
# Padded sequence length: the largest number of tokens in any document.
# Tokens are counted directly for every row, because the sentence with the
# most characters is not necessarily the one with the most words.
# `default=0` keeps an empty corpus from raising ValueError.
res = max((len(row.split()) for row in document_corpus), default=0)
res

8

In [7]:
index_based_encoding = []

for row in document_corpus:
    row_encoding = []
    split = row.split(" ")
    for i in range(res):
        if i <= len(split)-1:
            row_encoding.append(data_corpus.index(split[i])+1)
        else:
            row_encoding.append(0)
    index_based_encoding.append(row_encoding)

print(index_based_encoding)

[[13, 7, 1, 4, 10, 10, 0, 0], [13, 7, 1, 2, 8, 10, 0, 0], [11, 7, 1, 4, 4, 3, 0, 0], [6, 5, 1, 2, 12, 12, 0, 0], [13, 8, 10, 10, 7, 9, 4, 4]]


### Bag of Words

**Binary BOW**

In [8]:
# Binary bag of words: one vector per document with one slot per vocabulary
# word, holding 1 if the word occurs in the document and 0 otherwise.
one_hot_encoding = []

for row in document_corpus:
    # A set gives constant-time membership tests; split() (no argument)
    # matches the tokenisation used to build the vocabulary.
    words_present = set(row.split())
    row_encoding = [1 if word in words_present else 0 for word in data_corpus]
    one_hot_encoding.append(row_encoding)

print(one_hot_encoding)

[[1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1], [1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1], [1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0], [1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1]]


**BOW**

In [10]:
from collections import Counter

# Count-based bag of words: one vector per document with one slot per
# vocabulary word, holding how many times the word occurs in the document.
# NOTE: the result is stored under the name `one_hot_encoding` (kept as-is so
# existing references still work) even though the values are counts, not
# 0/1 flags.
one_hot_encoding = []

for row in document_corpus:
    # Count every token in a single pass; split() (no argument) matches the
    # tokenisation used to build the vocabulary.
    term_counts = Counter(row.split())
    # A Counter reports 0 for words it has never seen, so absent vocabulary
    # words need no special case.
    one_hot_encoding.append([term_counts[word] for word in data_corpus])

print(one_hot_encoding)

[[1, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 0, 1], [1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1], [1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0], [1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0], [0, 0, 0, 2, 0, 0, 1, 1, 1, 2, 0, 0, 1]]


### TF-IDF (Term Frequency - Inverse Document Frequency)

In [12]:
from collections import Counter

# Raw term counts per document: {document position: {word: occurrences}}.
# enumerate() supplies the document position, replacing a hand-maintained
# counter.  Counter tallies each row in one pass and keeps words in
# first-seen order; dict() converts it back to a plain dict so the mapping
# prints as an ordinary dictionary.
tf_dict = {
    doc_index: dict(Counter(row.split()))
    for doc_index, row in enumerate(document_corpus)
}

print(tf_dict)

{0: {'this': 1, 'is': 1, 'a': 1, 'good': 1, 'phone': 2}, 1: {'this': 1, 'is': 1, 'a': 1, 'bad': 1, 'mobile': 1, 'phone': 1}, 2: {'she': 1, 'is': 1, 'a': 1, 'good': 2, 'cat': 1}, 3: {'he': 1, 'has': 1, 'a': 1, 'bad': 1, 'temper': 2}, 4: {'this': 1, 'mobile': 1, 'phone': 2, 'is': 1, 'not': 1, 'good': 2}}


In [16]:
import math


def calculate_tf(word, sentence_num):
    """Return the term frequency of *word* in one document.

    The counts for the document are read from the module-level ``tf_dict``
    using ``int(sentence_num)`` as the key.  The result is the word's count
    divided by the sum of all counts stored for that document.

    Raises ``KeyError`` if the document or the word has no entry.
    """
    counts = tf_dict[int(sentence_num)]
    occurrences = counts[word]
    total_tokens = sum(counts.values())
    return occurrences / total_tokens


def calculate_idf(word):
    """Return the inverse-document-frequency weight of *word*.

    The document frequency is the number of entries in the module-level
    ``tf_dict`` whose term counts contain *word*.  The weight is the natural
    log of ``len(data_corpus) / document_frequency + 1``.

    A word that occurs in no document has no defined weight; ``0.0`` is
    returned for it instead of raising ``ZeroDivisionError``.

    NOTE(review): ``data_corpus`` is the vocabulary list built earlier in
    this file, so the numerator is the vocabulary size rather than the
    number of documents that the textbook IDF definition uses.  Confirm
    whether that is intended; the formula is left unchanged here so that
    previously recorded results stay reproducible.
    """
    doc_num = sum(1 for term_counts in tf_dict.values() if word in term_counts)
    if doc_num == 0:
        return 0.0
    return math.log(len(data_corpus) / doc_num + 1)


def tf_idf(word, sentence_num):
    """Return the TF-IDF score of *word* in document *sentence_num*.

    The score is ``calculate_tf(word, sentence_num)`` multiplied by
    ``calculate_idf(word)``, rounded to 5 decimal places.
    """
    term_frequency = calculate_tf(word, sentence_num)
    inverse_doc_frequency = calculate_idf(word)
    return round(term_frequency * inverse_doc_frequency, 5)

In [17]:
tf_idf_encoding = []
for i in range(len(document_corpus)):
    row = document_corpus[i]
    split = row.split(" ")
    row_encoding = []
    for word in data_corpus:
        if word in split:
            row_encoding.append(tf_idf(word, i))
        else:
            row_encoding.append(0)
    tf_idf_encoding.append(row_encoding)

print(tf_idf_encoding)

[[0.24115, 0, 0, 0.279, 0, 0, 0.24115, 0, 0, 0.55799, 0, 0, 0.279], [0.24115, 0.33582, 0, 0, 0, 0, 0.24115, 0.33582, 0, 0.279, 0, 0, 0.279], [0.24115, 0, 0.43984, 0.55799, 0, 0, 0.24115, 0, 0, 0, 0.43984, 0, 0], [0.24115, 0.33582, 0, 0, 0.43984, 0.43984, 0, 0, 0, 0, 0, 0.87969, 0], [0, 0, 0, 0.41849, 0, 0, 0.18086, 0.25186, 0.32988, 0.41849, 0, 0, 0.20925]]


# Python Library Implementation

### BOW Encoding

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Library bag of words: fit_transform learns the vocabulary from the corpus
# and returns the document-term count matrix in one call.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(document_corpus)
# The printed vocabulary has no 'a': per the scikit-learn docs, the default
# `token_pattern` only keeps tokens of two or more characters.
print(vectorizer.get_feature_names_out())

['bad' 'cat' 'good' 'has' 'he' 'is' 'mobile' 'not' 'phone' 'she' 'temper'
 'this']


In [21]:
# Show the count matrix as a dense array: one row per document, one column
# per feature name printed above.
print(X.toarray())

[[0 0 1 0 0 1 0 0 2 0 0 1]
 [1 0 0 0 0 1 1 0 1 0 0 1]
 [0 1 2 0 0 1 0 0 0 1 0 0]
 [1 0 0 1 1 0 0 0 0 0 2 0]
 [0 0 2 0 0 1 1 1 2 0 0 1]]


### TF-IDF Encoding

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Library TF-IDF: same fit_transform workflow as the count vectorizer, but
# the matrix holds TF-IDF weights instead of raw counts.
# NOTE: per the scikit-learn docs, the defaults use a smoothed IDF and
# L2-normalise each row, so these weights are not expected to match the
# hand-written formula earlier in this file.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(document_corpus)
print(vectorizer.get_feature_names_out())

['bad' 'cat' 'good' 'has' 'he' 'is' 'mobile' 'not' 'phone' 'she' 'temper'
 'this']


In [25]:
# Show the TF-IDF matrix as a dense array: one row per document, one column
# per feature name printed above.
print(X.toarray())

[[0.         0.         0.38611275 0.         0.         0.32481022
  0.         0.         0.7722255  0.         0.         0.38611275]
 [0.50860988 0.         0.         0.         0.         0.35516134
  0.50860988 0.         0.42219214 0.         0.         0.42219214]
 [0.         0.49317635 0.6605719  0.         0.         0.27784695
  0.         0.         0.         0.49317635 0.         0.        ]
 [0.31283963 0.         0.         0.38775666 0.38775666 0.
  0.         0.         0.         0.         0.77551332 0.        ]
 [0.         0.         0.54659234 0.         0.         0.22990535
  0.32923666 0.40808036 0.54659234 0.         0.         0.27329617]]
