In [1]:
from sklearn.preprocessing import LabelEncoder

# Toy vocabulary; 'anh' and 'em' appear twice so repeated labels are visible.
words = ['anh', 'em', 'gia đình', 'bạn bè', 'anh', 'em']

# fit() returns the encoder itself, so construction and fitting are chained.
le = LabelEncoder().fit(words)
print('Class of words:', le.classes_)

# Convert the words to their integer labels.
x = le.transform(words)
print('Convert to number: ', x)

# Convert the integer labels back to the original class names.
print('Convert to classes: ', le.inverse_transform(x))

Class of words: ['anh' 'bạn bè' 'em' 'gia đình']
Convert to number:  [0 2 3 1 0 2]
Convert to classes:  ['anh' 'em' 'gia đình' 'bạn bè' 'anh' 'em']


## OneHotEncoder

In [2]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Pair every class learned by the label encoder with its integer position.
classes_indices = [
    (label, index)
    for label, index in zip(le.classes_, np.arange(len(le.classes_)))
]
print('classes_indices: ', classes_indices)

# Fit on the (class, index) pairs. Each pair is a two-column row, so the
# encoder learns two category lists (one for the words, one for the indices).
# NOTE(review): this yields 8 one-hot columns for 4 classes -- confirm that
# encoding the index alongside the word is intended.
oh = OneHotEncoder().fit(classes_indices)
print('One-hot categories and indices:', oh.categories_)

# Turn the original word list into (word, label) rows and one-hot encode them.
words_indices = [(word, label) for word, label in zip(words, x)]
print('words and corresponding indices: ', words_indices)

# transform() returns a sparse matrix; densify it for display.
one_hot = oh.transform(words_indices).toarray()
print('Tranform words into one-hot matricesL \n', one_hot)

# Recover the (word, index) rows from the one-hot matrix.
print('Inverse transform to categories from one hot matrices: \n', oh.inverse_transform(one_hot))

classes_indices:  [('anh', 0), ('bạn bè', 1), ('em', 2), ('gia đình', 3)]
One-hot categories and indices: [array(['anh', 'bạn bè', 'em', 'gia đình'], dtype=object), array([0, 1, 2, 3], dtype=object)]
words and corresponding indices:  [('anh', 0), ('em', 2), ('gia đình', 3), ('bạn bè', 1), ('anh', 0), ('em', 2)]
Tranform words into one-hot matricesL 
 [[1. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 1. 0.]]
Inverse transform to categories from one hot matrices: 
 [['anh' 0]
 ['em' 2]
 ['gia đình' 3]
 ['bạn bè' 1]
 ['anh' 0]
 ['em' 2]]


In [3]:
import scipy.linalg as ln 
import numpy as np 
from underthesea import word_tokenize

# Sample Vietnamese text: two sentences that both mention 'Khoa học dữ liệu'.
sentence = 'Khoa học dữ liệu là một lĩnh vực đòi hỏi kiến thức về toán và lập trình. Tôi rất yêu thích Khoa học dữ liệu.'

# Split the text into tokens with underthesea's word_tokenize; the recorded
# output shows multi-syllable words such as 'Khoa học' kept as single tokens.
token = word_tokenize(sentence)
print('tokenization of sentences: ', token)

tokenization of sentences:  ['Khoa học', 'dữ liệu', 'là', 'một', 'lĩnh vực', 'đòi hỏi', 'kiến thức', 'về', 'toán', 'và', 'lập trình', '.', 'Tôi', 'rất', 'yêu thích', 'Khoa học', 'dữ liệu', '.']


In [4]:
from scipy.sparse import coo_matrix
# Create the sparse matrix in COO (coordinate) form, then densify it.
# Every stored entry sits on the first superdiagonal: rows 0-9 and 11-13 each
# point at the next column, while rows 10 and 14 stay empty.
# NOTE(review): presumably these indices follow the tokens' order of first
# appearance in the tokenized sentence -- confirm against the token list.
row = [*range(10), 11, 12, 13]
col = [r + 1 for r in row]
# The (0, 1) entry carries weight 2; every other stored entry carries weight 1.
data = [2] + [1] * (len(row) - 1)
X = coo_matrix((data, (row, col)), shape=(15, 15)).toarray()
X

array([[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [5]:
# Singular value decomposition of X.
# NOTE(review): scipy.linalg.svd documents its third return value as the
# already-transposed factor (Vh) -- confirm that `V` is meant to hold that.
U, S_diag, V = ln.svd(X)

# Report the size of each factor.
for _label, _value in (
    ('Shape of U: ', U.shape),
    ('Length of diagonal: ', len(S_diag)),
    ('Shape of V: ', V.shape),
):
    print(_label, _value)

Shape of U:  (15, 15)
Length of diagonal:  15
Shape of V:  (15, 15)


In [7]:
import numpy as np
# Number of leading singular values (embedding dimensions) to keep.
N_COMPONENTS = 6

# Build the truncated singular-value matrix. Its width is taken from V rather
# than hard-coded, so the product with V below stays well-defined if the
# input matrix changes size.
S_truncate = np.zeros(shape=(N_COMPONENTS, V.shape[0]))
# fill_diagonal works in place: it writes the first N_COMPONENTS singular
# values onto the main diagonal and leaves every other entry at zero.
np.fill_diagonal(S_truncate, S_diag[:N_COMPONENTS])
print('S truncate: \n', S_truncate)

# Scale the first N_COMPONENTS rows of V by their singular values; each column
# of the result is an N_COMPONENTS-dimensional vector.
print(f'Word Embeding {N_COMPONENTS} dimensionality: \n', np.dot(S_truncate, V))

S truncate: 
 [[2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Word Embeding 6 dimensionality: 
 [[0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]
