In [1]:
import re
import pandas as pd
from math import log
import numpy as np

In [2]:
class Tokenizer():
    """Word-level tokenizer that maps lower-cased words to integer ids.

    Attributes:
        word_dict: mapping from word to integer id. It always starts with
            ``{'oov': 0}``; id 0 is what ``transform`` emits for words that
            are not in the mapping.
        fit_checker: True once ``fit`` has completed at least once.
    """

    def __init__(self):
        self.word_dict = {'oov': 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Split every text into lower-cased word tokens.

        Args:
            sequences: iterable of strings.

        Returns:
            A list with one list of tokens per input string. Tokens are the
            maximal runs of ``\\w`` characters, so punctuation and whitespace
            are dropped.
        """
        # Raw string: "\w" in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        # Lower-casing happens after matching, exactly as before.
        return [
            [word.lower() for word in re.findall(r"\w+", text)]
            for text in sequences
        ]

    def fit(self, sequences):
        """Add every unseen word in ``sequences`` to ``word_dict``.

        Words already present keep their id. New words receive consecutive
        ids that continue after the largest id currently in use, so calling
        ``fit`` more than once never assigns the same id to two words.

        Args:
            sequences: iterable of strings.
        """
        self.fit_checker = False
        # Continue numbering from the existing vocabulary instead of
        # restarting at 1, which would collide with ids from an earlier fit.
        next_index = max(self.word_dict.values(), default=0) + 1
        for words in self.preprocessing(sequences):
            for word in words:
                if word not in self.word_dict:
                    self.word_dict[word] = next_index
                    next_index += 1
        self.fit_checker = True

    def transform(self, sequences):
        """Convert texts into lists of word ids.

        Args:
            sequences: iterable of strings.

        Returns:
            A list with one list of integer ids per input string. Words that
            are missing from ``word_dict`` are mapped to the ``'oov'`` id.

        Raises:
            Exception: if ``fit`` has not been called yet.
        """
        # Check the fitted state first so no tokenization work is wasted.
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")
        oov_index = self.word_dict.get('oov')
        return [
            [self.word_dict.get(word, oov_index) for word in words]
            for words in self.preprocessing(sequences)
        ]

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their id representation."""
        self.fit(sequences)
        return self.transform(sequences)

In [3]:
class TfidfVectorizer:
    """Bag-of-words TF-IDF vectorizer built on top of a tokenizer object.

    The tokenizer must provide ``fit(sequences)``,
    ``preprocessing(sequences)`` (returning one list of tokens per text) and
    a ``word_dict`` mapping whose keys form the vocabulary.

    Attributes:
        tokenizer: the tokenizer passed to the constructor.
        fit_checker: True once ``fit`` has completed.
        vocab: list of vocabulary words, in ``word_dict`` key order (set by ``fit``).
        token: tokenized training corpus (set by ``fit``).
        idf: list of inverse document frequencies aligned with ``vocab``.
        tfidf_matrix: result of the most recent ``transform`` call.
    """

    def __init__(self,tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False

    def fit(self, sequences):
        """Fit the tokenizer and compute one IDF value per vocabulary word.

        IDF is ``log(N / (df + 1))`` where ``N`` is the number of documents
        and ``df`` the number of documents containing the word. Note that
        this is negative for a word that occurs in every document.

        Args:
            sequences: collection of strings (the training corpus).

        Raises:
            ValueError: if the corpus contains no documents.
        """
        # Only the vocabulary is needed here, so fit the tokenizer without
        # running an extra transform pass whose result would be discarded.
        self.tokenizer.fit(sequences)
        self.vocab = list(self.tokenizer.word_dict.keys())
        self.token = self.tokenizer.preprocessing(sequences)

        n_docs = len(self.token)
        if n_docs == 0:
            # log(0) would raise an opaque "math domain error" otherwise.
            raise ValueError("Cannot fit TfidfVectorizer on an empty corpus.")

        # Count each word once per document it appears in.
        doc_freq = {}
        for doc in self.token:
            for word in set(doc):
                doc_freq[word] = doc_freq.get(word, 0) + 1

        self.idf = [log(n_docs / (doc_freq.get(term, 0) + 1)) for term in self.vocab]
        self.fit_checker = True

    def transform(self, sequences):
        """Compute the TF-IDF matrix of ``sequences``.

        Term frequency is the raw count of each vocabulary word in the given
        document. Words outside the fitted vocabulary are ignored.

        Args:
            sequences: collection of strings to vectorize.

        Returns:
            A pandas DataFrame with one row per input text and one column per
            vocabulary word. It is also stored in ``self.tfidf_matrix``.

        Raises:
            Exception: if ``fit`` has not been called yet.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        # Tokenize the documents that were actually passed in, rather than
        # reusing the training corpus cached by fit().
        docs = self.tokenizer.preprocessing(sequences)
        column_of = {term: col for col, term in enumerate(self.vocab)}

        # Pre-sizing the matrix keeps the shape (n_docs, n_vocab) even when
        # there are no documents, so the broadcast below always works.
        tf_ = np.zeros((len(docs), len(self.vocab)), dtype=float)
        for row, doc in enumerate(docs):
            for word in doc:
                col = column_of.get(word)
                if col is not None:
                    tf_[row, col] += 1

        idf_ = np.array(self.idf)
        self.tfidf_matrix = pd.DataFrame(tf_ * idf_, columns=self.vocab)
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF matrix."""
        self.fit(sequences)
        return self.transform(sequences)

In [4]:
# Demo corpus: four short sentences with mixed casing and punctuation.
sequence=['I go to school.','I LIKE Pizza!','Get out','Watch out!!']
# Fit a fresh tokenizer on the corpus and print each sentence as a list of word ids.
T=Tokenizer()
print(T.fit_transform(sequence))
# Hand the same tokenizer instance to the vectorizer; its fit step fits the
# tokenizer on the corpus again before the TF-IDF table is printed.
C=TfidfVectorizer(T)
print(C.fit_transform(sequence))

[[1, 2, 3, 4], [1, 5, 6], [7, 8], [9, 8]]
   oov         i        go        to    school      like     pizza       get  \
0  0.0  0.287682  0.693147  0.693147  0.693147  0.000000  0.000000  0.000000   
1  0.0  0.287682  0.000000  0.000000  0.000000  0.693147  0.693147  0.000000   
2  0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.693147   
3  0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

        out     watch  
0  0.000000  0.000000  
1  0.000000  0.000000  
2  0.287682  0.000000  
3  0.287682  0.693147  
