<a href="https://colab.research.google.com/github/namjunwoo223/wanted_pre_onboarding/blob/main/wanted_preonboarding_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
from math import log
from itertools import chain

class Tokenizer():
  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False
  
  def preprocessing(self, sequences):
    result = []
    temp = list(map(lambda x : re.sub("[^a-zA-Z0-9]", " ", x.lower()).split(" "), sequences))

    for i in temp:
      result.append([x for x in i if x != ''])

    return result
    
  def fit(self, sequences):
    self.fit_checker = False
    
    words_set = set(chain(*self.preprocessing(sequences)))
    
    for idx, i in enumerate(words_set):
        self.word_dict[i] = idx+1
    
    self.fit_checker = True
  
  def transform(self, sequences):
    result = []
    tokens = self.preprocessing(sequences)
    
    if self.fit_checker:
        for i in tokens:
            result.append(list(map(lambda x : self.word_dict[x], i)))
        return result
    else:
        raise Exception("Tokenizer instance is not fitted yet.")
      
  def fit_transform(self, sequences):
    self.fit(sequences)
    result = self.transform(sequences)
 
    return result

class TfidfVectorizer:
    def __init__(self, tokenizer):
      self.tokenizer = tokenizer
      self.fit_checker = False
      self.idf_mat = None
      self.tfidf_matrix = []
      
    def fit(self, sequences):
        tokenized = self.tokenizer.fit_transform(sequences)
        toknized_list = list(chain(*tokenized))
        n = len(tokenized)
        
        idf = [0 for x in range(len(set(toknized_list)))]
        
        for i in tokenized:
            for j in set(i):
                idf[j-1] += 1

        for i in range(len(idf)):
            idf[i] = log(n/(1+idf[i])) #idf[i] == df(d,t)
        print(idf)
        self.idf_mat = idf
        self.fit_checker = True
    
    def transform(self, sequences):
        if self.fit_checker:
            tokenized = self.tokenizer.transform(sequences)

            for i in tokenized:
                temp = []
                for j in i:
                    temp.append(i.count(j) * self.idf_mat[j-1]) #tf * idf
                
                self.tfidf_matrix.append(temp)
            
            return self.tfidf_matrix
        else:  
            raise Exception("TfidfVectorizer instance is not fitted yet.")
          
    
    def fit_transform(self, sequences):
        self.fit(sequences)
        return self.transform(sequences)

In [2]:
lst = ['I go to school.', 'I LIKE pizza!']

In [3]:
#Tokenizer 1번 문제
tok = Tokenizer()
tok.preprocessing(lst)

[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza']]

In [4]:
#Tokenizer 2번 문제
tok = Tokenizer()
tok.fit(lst)
tok.word_dict

{'go': 4, 'i': 1, 'like': 3, 'oov': 0, 'pizza': 5, 'school': 6, 'to': 2}

In [5]:
#Tokenizer 3번 문제
tok = Tokenizer()
tok.fit(lst)
tok.transform(lst)

[[1, 4, 2, 6], [1, 3, 5]]

In [6]:
#TF-IDF 1번 문제
tok = Tokenizer()
tfidf = TfidfVectorizer(tok)
tfidf.fit(lst)

[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]


In [7]:
#TF-IDF 2번 문제
tok = Tokenizer()
tfidf = TfidfVectorizer(tok)
tfidf.fit(lst)
tfidf.transform(lst)


[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]


[[-0.40546510810816444, 0.0, 0.0, 0.0], [-0.40546510810816444, 0.0, 0.0]]