In [1]:
import re
import pandas as pd
from math import log
import numpy as np

In [58]:
class Tokenizer():
    """Word-level tokenizer that maps lower-cased words to integer indices.

    Index 0 is reserved under the key ``'oov'`` and is returned by
    ``transform`` for any word that was not seen during ``fit``.  Because
    ``'oov'`` is itself a vocabulary key, a literal token "oov" in the input
    also maps to 0.

    Attributes:
        word_dict: dict mapping each known token (str) to its index (int).
        fit_checker: True once ``fit`` has completed successfully.
    """

    def __init__(self):
        self.word_dict = {'oov': 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Split each string into lower-cased word tokens.

        Args:
            sequences: list of strings (one document per element).

        Returns:
            A list with one list of tokens per input string.  Tokens are the
            maximal runs of ``\\w`` characters, lower-cased.
        """
        # Raw string: "\w" in a normal string literal is an invalid escape
        # sequence and warns on recent Python versions.
        return [
            [word.lower() for word in re.findall(r"\w+", text)]
            for text in sequences
        ]

    def fit(self, sequences):
        """Build the vocabulary from ``sequences``.

        The vocabulary is rebuilt from scratch on every call, so indices are
        always unique and contiguous (0 for 'oov', then 1..N in order of
        first appearance).  Previously a second ``fit`` restarted numbering
        at 1 while keeping the old entries, producing duplicate indices.

        Args:
            sequences: list of strings to learn the vocabulary from.
        """
        self.fit_checker = False
        self.word_dict = {'oov': 0}
        for tokens in self.preprocessing(sequences):
            for token in tokens:
                # len(word_dict) is the next free index; setdefault leaves
                # already-known tokens untouched.
                self.word_dict.setdefault(token, len(self.word_dict))
        self.fit_checker = True

    def transform(self, sequences):
        """Convert each string into a list of vocabulary indices.

        Args:
            sequences: list of strings to encode.

        Returns:
            A list with one list of int indices per input string; unknown
            words are encoded with the 'oov' index.

        Raises:
            Exception: if ``fit`` has not been called yet.
        """
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")
        oov_index = self.word_dict['oov']
        return [
            [self.word_dict.get(token, oov_index) for token in tokens]
            for tokens in self.preprocessing(sequences)
        ]

    def fit_transform(self, sequences):
        """Fit the vocabulary on ``sequences`` and return their encoding."""
        self.fit(sequences)
        return self.transform(sequences)

In [139]:
class TfidfVectorizer:
    """TF-IDF vectorizer built on top of a tokenizer object.

    The tokenizer must provide ``fit(sequences)``, ``preprocessing(sequences)``
    (returning one list of string tokens per document) and a ``word_dict``
    attribute whose keys are the vocabulary terms.

    Attributes:
        tokenizer: the tokenizer passed to the constructor.
        fit_checker: True once ``fit`` has completed successfully.
        vocab: list of vocabulary terms, in column order (set by ``fit``).
        idf: 1-D numpy array of IDF weights aligned with ``vocab``
            (set by ``fit``).
        tfidf_matrix: DataFrame returned by the most recent ``transform``.
    """

    def __init__(self,tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False

    def fit(self, sequences):
        """Fit the tokenizer and compute one IDF weight per vocabulary term.

        IDF is ``log(n_docs / (df + 1))`` where ``df`` is the number of
        documents containing the term.

        Args:
            sequences: list of strings (one document per element).

        Raises:
            ValueError: if ``sequences`` is empty (the IDF is undefined).
        """
        self.fit_checker = False
        n_docs = len(sequences)
        if n_docs == 0:
            raise ValueError("Cannot fit TfidfVectorizer on an empty corpus.")

        self.tokenizer.fit(sequences)
        vocab = list(self.tokenizer.word_dict.keys())
        # Token lists are computed here instead of being read from a
        # notebook-global name; sets make the per-term membership test O(1).
        doc_terms = [set(doc) for doc in self.tokenizer.preprocessing(sequences)]

        idf = []
        for term in vocab:
            df = sum(1 for terms in doc_terms if term in terms)
            idf.append(log(n_docs / (df + 1)))

        # Keep the vocabulary on the instance so transform() does not depend
        # on any variable outside the class.
        self.vocab = vocab
        self.idf = np.array(idf)
        self.fit_checker = True

    def transform(self, sequences):
        """Return the TF-IDF matrix of ``sequences``.

        Term frequency is the raw count of each vocabulary term in the
        document.  The 'oov' column counts only literal "oov" tokens, not
        unknown words.

        Args:
            sequences: list of strings (one document per element).

        Returns:
            A pandas DataFrame with one row per document and one column per
            vocabulary term; also stored in ``self.tfidf_matrix``.

        Raises:
            Exception: if ``fit`` has not been called yet.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        docs = self.tokenizer.preprocessing(sequences)
        rows = []
        for doc in docs:
            # Count every token once, then look each vocabulary term up.
            term_counts = {}
            for token in doc:
                term_counts[token] = term_counts.get(token, 0) + 1
            rows.append([term_counts.get(term, 0) for term in self.vocab])

        # Explicit reshape keeps the array 2-D even when there are no rows,
        # so broadcasting against idf works for empty input too.
        tf_ = np.array(rows, dtype=float).reshape(len(docs), len(self.vocab))
        self.tfidf_matrix = pd.DataFrame(tf_ * self.idf, columns=self.vocab)
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF matrix."""
        self.fit(sequences)
        return self.transform(sequences)

In [140]:
# Small demo corpus: three short documents with mixed case and punctuation.
sequence = [
    'I go to school.',
    'I LIKE Pizza!',
    'Get out',
]
T = Tokenizer()
C = TfidfVectorizer(tokenizer=T)
# Last expression of the cell, so the resulting TF-IDF frame is displayed.
C.fit_transform(sequence)

Unnamed: 0,oov,i,go,to,school,like,pizza,get,out
0,0.0,0.0,0.405465,0.405465,0.405465,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.405465,0.405465,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.405465,0.405465
