**1. Build a Tf-Idf vectorizer and compare results with Sklearn**

In [18]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
from operator import itemgetter
from sklearn.preprocessing import normalize
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import os

### 1.1 Corpus

In [19]:
## sklearn collection of string documents
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

In [20]:
def IDF(dataset,unique_words ):
  idf_dict = {}
  N = len(dataset)
  for i in unique_words:
    cnt = 0
    for row in dataset:
      if i in row.split(" "):
        cnt += 1
    idf_dict[i] = 1+math.log((1+N)/(1+cnt))
  return idf_dict

In [21]:
def fit(dataset):
  unique_words = set() # at first will intialize an empty set
  # check if its list or not
  if isinstance(dataset, (list,)):
    for row in dataset:
      for word in row.split(" "):
        if len(word) <2:
          continue
        unique_words.add(word)
    unique_words = sorted(list(unique_words))
    vocab = {j:i for i,j in enumerate(unique_words)}
    idf_values = IDF(dataset, unique_words)
    return idf_values, vocab
  else:
    print("you need to pass list of sentence")

idf_values, vocab = fit(corpus)

In [22]:
print(vocab)

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


In [23]:
def transform(dataset, vocab):
  rows = []
  columns = []
  values = []
  if isinstance(dataset, (list,)):
    for idx, document in enumerate(tqdm(dataset)):
      # for each document in the dataset it will return a dict type object where key is the word and values is its frequency
      word_freq = dict(Counter(document.split()))
      #for each unique word in the document
      for word, freq in word_freq.items():
        if len(word) <2 :
          continue
        # we will check if there is any vocabulary that we build with the fit function
        # dict.get() function will return the values
        col_index = vocab.get(word, -1) # retieving the dimension number of the word
        # if the word exists
        if col_index != -1:
          # we will store the index of the document
          rows.append(idx)
          # we are stoing the dimensions of the word
          columns.append(col_index)
          # we are storing the frequency of the word
          tf = freq/len(document)
          idf_ = idf_values[word]
          tfidf = tf*idf_
          values.append(tfidf)
          sparse_matrix = csr_matrix((values, (rows,columns)), shape = (len(dataset),len(vocab) ))
    return normalize(sparse_matrix,norm = 'l2' )
  else:
    print('you need to pass list of strings')

print(transform(corpus, vocab))

100%|██████████| 4/4 [00:00<00:00, 438.04it/s]

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836938
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856763
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149





In [24]:
#Converting the sparse matrix to dense matrix with regards to 1 particular document
print(transform(corpus, vocab)[0].toarray() )

100%|██████████| 4/4 [00:00<00:00, 403.41it/s]

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]





## Sklearn Implementation

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [26]:
print(vectorizer.get_feature_names_out())

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [27]:
print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [28]:
skl_output.shape

(4, 9)

In [29]:
print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


## Task 2

In [30]:
# Below is the code to load the cleaned_strings loaded from a pickle file
# Here corpus is a list type

import pickle
with open('cleaned_strings','rb') as f:
  corpus_2 = pickle.load(f)

# printing the length of the corpus of loaded
print('Number of documents in corpus= ', len(corpus_2))

Number of documents in corpus=  746


In [35]:
def fit(corpus_2):
  #list of documents inside documents
  lst_0 = list()
  j = 0
  for i in corpus_2:
    lst_0.append([])
    lst_0[j].append(i)
    j += 1

  lst_1 = [] # separate each word
  for i in lst_0:
    for j in i:
      lst_1.append(j.split(" "))
    unique_words = set() #at first we will intialize an emply set
    if isinstance(corpus_2, (list,)):
      for row in corpus_2:
        # for each review in the dataset
        for word in row.split(" "):
          if len(word) <2:
            continue
          unique_words.add(word)
      unique_words = sorted(list(unique_words))

    # Calculate the Idf values

    i = 0
    idf_val = []
    N = len(lst_1)
    for sent in range(len(lst_1)):
      for word in range(len(unique_words)):
        cnt = 0
        for i in range(len(lst_1)):
          if lst_1[i].count(unique_words[word]) != 0:
            cnt += 1
        idf = 1+(math.log(1+N)/(1+cnt))
        idf_val.append(idf)
      break
    my_idf = np.array(idf)

    # create max feature
    vocab_0 = dict(zip(unique_words,idf_val))
    sorted(vocab_0.values())
    a = Counter(vocab_0)
    vocab = a.most_common(50)

    vocab = dict(vocab)
    max_feature = list(vocab.keys())
    return max_feature

In [36]:
max_feature = fit(corpus_2)
print(max_feature)

['aailiyah', 'abandoned', 'ability', 'abroad', 'absolutely', 'abstruse', 'abysmal', 'academy', 'accents', 'accessible', 'acclaimed', 'accolades', 'accurate', 'accurately', 'accused', 'achievement', 'achille', 'ackerman', 'act', 'acted', 'acting', 'action', 'actions', 'actor', 'actors', 'actress', 'actresses', 'actually', 'adams', 'adaptation', 'add', 'added', 'addition', 'admins', 'admiration', 'admitted', 'adorable', 'adrift', 'adventure', 'advise', 'aerial', 'aesthetically', 'affected', 'affleck', 'afraid', 'africa', 'afternoon', 'age', 'aged', 'ages']


In [37]:
def transform(corpus_2, max_factor):
  # Creating list of documents inside documents
  lst_0 = list()
  j = 0
  for i in corpus_2:
    lst_0.append([])
    lst_0[j].append(i)
    j += 1

  # Separating each word
  lst_1 = []
  for i in lst_0:
    for j in i:
      lst_1.append(j.split(" "))

  # Computing tf
  i = 0
  Values = []
  for sent in tqdm(range(len(lst_1))):
    Values.append([])
    for word in range(len(max_feature)):
      tf = lst_1[sent].count(max_feature[word])/len(lst_1[sent])
      # Calculate Idf
      N = len(lst_1)
      cnt = 0
      for i in range(len(lst_1)):
        if lst_1[i].count(max_feature[word]) != 0:
          cnt += 1
      idf = 1 + (math.log(1+N)/ (1+cnt))
      # Calulcate Tf Idf
      tf_idf = tf*idf
      Values[sent].append(tf_idf)
  # normalizing
  normalized_val = normalize(Values, norm = 'l2')

  # sparse matrix
  sparse_val = csr_matrix(normalized_val)
  return sparse_val

In [39]:
b = transform(corpus_2, max_feature)

100%|██████████| 746/746 [00:15<00:00, 48.35it/s]


In [40]:
print("Tfidf of sparse matrix =\n",b ,'\n')
print("Tfidf of dense matrix = \n", b.toarray())

Tfidf of sparse matrix =
   (2, 20)	1.0
  (10, 36)	1.0
  (15, 20)	1.0
  (17, 20)	1.0
  (19, 4)	0.4036161695726854
  (19, 23)	0.4036161695726854
  (19, 24)	0.3274908632526112
  (19, 27)	0.3890062622991358
  (19, 32)	0.6446796395862537
  (26, 19)	1.0
  (28, 27)	1.0
  (36, 24)	1.0
  (41, 20)	1.0
  (49, 20)	1.0
  (56, 14)	1.0
  (60, 44)	1.0
  (62, 39)	1.0
  (65, 23)	1.0
  (68, 43)	1.0
  (72, 24)	1.0
  (86, 20)	1.0
  (104, 24)	1.0
  (134, 4)	0.5260794797568754
  (134, 24)	0.4268565928552014
  (134, 47)	0.7355500187715204
  :	:
  (644, 17)	0.4216657118243333
  (644, 18)	0.3137367916106911
  (644, 20)	0.45968143855276994
  (644, 23)	0.16263630331159218
  (644, 24)	0.13196176809297808
  (644, 49)	0.4216657118243333
  (649, 20)	1.0
  (658, 20)	1.0
  (660, 4)	1.0
  (667, 41)	1.0
  (669, 20)	1.0
  (673, 23)	1.0
  (688, 27)	1.0
  (697, 12)	1.0
  (706, 2)	0.9145079097363547
  (706, 20)	0.4045680202755074
  (707, 24)	1.0
  (710, 47)	1.0
  (712, 27)	1.0
  (718, 18)	1.0
  (722, 20)	0.26294834741835266