# 원티드 프리온보딩 코스 선발과제

## 문제 1) Tokenizer 생성하기

In [None]:
import re

class Tokenizer():
  """Word-level tokenizer that maps lowercase alphabetic words to integer ids.

  Id 0 is reserved under the key ``'oov'`` and is returned by ``transform``
  for every word that is not in the vocabulary. Ids for real words are
  assigned in order of first appearance, starting at 1.
  """

  # Strips everything except lowercase ASCII letters and whitespace. The
  # whole whitespace class is kept (not just ' ') so that tabs and newlines
  # still separate words instead of gluing their neighbours together.
  _NON_ALPHA = re.compile(r'[^a-z\s]')

  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Lowercase each sequence, drop non-letters, and split it into words.

    Assumes the input contains only English text and punctuation; digits and
    characters from other scripts are simply removed.

    Args:
      sequences: iterable of strings.

    Returns:
      A list with one list of word strings per input sequence.
    """
    return [
      self._NON_ALPHA.sub('', sequence.lower()).split()
      for sequence in sequences
    ]

  def fit(self, sequences):
    """Add every unseen word in ``sequences`` to the vocabulary.

    The vocabulary is not reset, so calling ``fit`` again extends
    ``word_dict`` with any new words while keeping existing ids stable.

    Args:
      sequences: iterable of strings.
    """
    self.fit_checker = False
    for words in self.preprocessing(sequences):
      for word in words:
        if word not in self.word_dict:
          # The next free id is always the current vocabulary size.
          self.word_dict[word] = len(self.word_dict)

    self.fit_checker = True

  def transform(self, sequences):
    """Convert each sequence into a list of vocabulary ids.

    Args:
      sequences: iterable of strings.

    Returns:
      A list with one list of integer ids per input sequence; words missing
      from the vocabulary are mapped to the ``'oov'`` id.

    Raises:
      Exception: if ``fit`` has not been called yet.
    """
    # Check the fitted state first so no work is done on an unusable instance.
    if not self.fit_checker:
      raise Exception("Tokenizer instance is not fitted yet.")

    oov_id = self.word_dict['oov']
    return [
      [self.word_dict.get(word, oov_id) for word in words]
      for words in self.preprocessing(sequences)
    ]

  def fit_transform(self, sequences):
    """Fit the vocabulary on ``sequences`` and return their id encoding."""
    self.fit(sequences)
    return self.transform(sequences)

In [None]:
# Demo for problem 1: run each Tokenizer stage on a two-sentence sample and
# print the result followed by a 100-character divider line.
tokenizer = Tokenizer()
input1 = ['I go to school.', 'I LIKE pizza!']
divider = '-' * 100

# 1-1: raw preprocessing (lowercasing, punctuation removal, splitting).
print('문제 1-1', tokenizer.preprocessing(input1), divider, sep='\n')

# 1-2: build the vocabulary, then show the word -> id mapping.
tokenizer.fit(input1)
print('문제 1-2', tokenizer.word_dict, divider, sep='\n')

# 1-3: encode the same sentences with the fitted vocabulary.
print('문제 1-3', tokenizer.transform(input1), divider, sep='\n')

문제 1-1
[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza']]
----------------------------------------------------------------------------------------------------
문제 1-2
{'oov': 0, 'i': 1, 'go': 2, 'to': 3, 'school': 4, 'like': 5, 'pizza': 6}
----------------------------------------------------------------------------------------------------
문제 1-3
[[1, 2, 3, 4], [1, 5, 6]]
----------------------------------------------------------------------------------------------------


## 문제 2) TfidfVectorizer 생성하기

In [None]:
import math

class TfidfVectorizer:
  """TF-IDF vectorizer built on top of a tokenizer object.

  The tokenizer must provide ``fit_transform(sequences)`` and
  ``transform(sequences)``, each returning one list of integer token ids per
  sequence. Token id 0 is skipped: column ``j`` of every output row
  corresponds to token id ``j + 1``.
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False

  def fit(self, sequences):
    """Fit the tokenizer and compute the IDF weight of every token id.

    The weight of a token is ``log(n / (1 + df))`` where ``n`` is the number
    of sequences and ``df`` the number of sequences containing the token.
    The result is stored in ``self.idf_matrix`` (a flat list indexed by
    ``token_id - 1``). An empty corpus, or one without any tokens, yields an
    empty ``idf_matrix`` instead of raising.

    Args:
      sequences: sized collection of strings.
    """
    tokenized = self.tokenizer.fit_transform(sequences)
    n = len(sequences)

    # Document frequency: each document counts once per distinct token id.
    df_counts = {}
    for doc in tokenized:
      for token_id in set(doc):
        df_counts[token_id] = df_counts.get(token_id, 0) + 1

    # The largest id seen fixes the vector width; default=0 covers the case
    # where no token was produced at all.
    vocab_size = max(df_counts, default=0)
    self.idf_matrix = [
      math.log(n / (1 + df_counts.get(token_id, 0)))
      for token_id in range(1, vocab_size + 1)
    ]
    self.fit_checker = True

  def transform(self, sequences):
    """Return the TF-IDF row of every sequence.

    Term frequency is the raw count of the token in the sequence. The result
    is also stored in ``self.tfidf_matrix``.

    Args:
      sequences: iterable of strings.

    Returns:
      A list of rows, each a list of floats with ``len(self.idf_matrix)``
      entries.

    Raises:
      Exception: if ``fit`` has not been called yet.
    """
    if not self.fit_checker:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

    tokenized = self.tokenizer.transform(sequences)
    self.tfidf_matrix = []

    for doc in tokenized:
      # Count every token id once instead of rescanning the document per id.
      term_counts = {}
      for token_id in doc:
        term_counts[token_id] = term_counts.get(token_id, 0) + 1

      self.tfidf_matrix.append([
        idf * term_counts.get(token_id, 0)
        for token_id, idf in enumerate(self.idf_matrix, start=1)
      ])

    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Fit on ``sequences`` and return their TF-IDF rows."""
    self.fit(sequences)
    return self.transform(sequences)

In [None]:
# Demo for problem 2: fit the vectorizer on a two-sentence sample, then print
# the IDF weights and the TF-IDF rows, each followed by a divider line.
tfidf = TfidfVectorizer(Tokenizer())
input2 = ['I go to school.', 'I LIKE pizza!']
divider = '-' * 100

# 2-1: IDF weight per vocabulary entry.
tfidf.fit(input2)
print('문제 2-1', tfidf.idf_matrix, divider, sep='\n')

# 2-2: TF-IDF rows for the same sentences.
print('문제 2-2', tfidf.transform(input2), divider, sep='\n')

문제 2-1
[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]
----------------------------------------------------------------------------------------------------
문제 2-2
[[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0], [-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]]
----------------------------------------------------------------------------------------------------


In [None]:
# Additional example: four short documents, one of which repeats a word, so
# the output shows a term frequency above 1.
input3 = ['i want apple', 'i want banana', 'long yellow banana banana', 'i like frute']

tfidf = TfidfVectorizer(Tokenizer())
tfidf.fit_transform(input3)

[[0.0, 0.28768207245178085, 0.6931471805599453, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.28768207245178085, 0.0, 0.28768207245178085, 0.0, 0.0, 0.0, 0.0],
 [0.0,
  0.0,
  0.0,
  0.5753641449035617,
  0.6931471805599453,
  0.6931471805599453,
  0.0,
  0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6931471805599453, 0.6931471805599453]]