In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import mysql.connector

# Establish a connection to the MySQL database
connection = mysql.connector.connect(
    host='127.0.0.1',
    port=13306,
    user='root',
    password='root',
    database='pyml'
)

# Read the table data using pandas
query = "SELECT contents, JSON_UNQUOTE(JSON_EXTRACT(tags, '$[0].slug')) AS slug FROM viblo_interview"
df = pd.read_sql(query, connection)

# Close the database connection
connection.close()

  df = pd.read_sql(query, connection)


In [2]:
df.head()
df['slug'] = df['slug'].apply(lambda x: str(x).replace("b'", "").replace("'", ""))
print(df['slug'][0])
str(df['slug'][0]) == 'frontend-development'

frontend-development


True

In [3]:
from keras.models import Sequential
from keras import backend as K, initializers, regularizers, constraints, Model
from keras.layers import Embedding, Flatten, MaxPooling1D, Dense, LSTM, Bidirectional, Attention, Layer, Input, Activation, Dropout, SpatialDropout1D
from keras_preprocessing.text import Tokenizer
from keras_preprocessing import sequence
from gensim import corpora, models
from gensim.models import KeyedVectors
import numpy as np
from keras.utils import to_categorical
import tensorflow as tf

# Add attention layer to the deep learning network
class attention(Layer):
    def __init__(self,**kwargs):
        super(attention,self).__init__(**kwargs)

    def build(self,input_shape):
        self.W=self.add_weight(name='attention_weight', shape=(input_shape[-1],1), 
                               initializer='random_normal', trainable=True)
        self.b=self.add_weight(name='attention_bias', shape=(input_shape[1],1), 
                               initializer='zeros', trainable=True)        
        super(attention, self).build(input_shape)

    def call(self,x):
        print('current stack:', x.shape, self.W.shape, self.b.shape)
        # Alignment scores. Pass them through tanh function
        e = K.tanh(K.dot(x,self.W)+self.b)
        # Remove dimension of size 1
        e = K.squeeze(e, axis=-1)
        print('call: e', e.shape)
        # Compute the weights
        alpha = K.softmax(e)
        print('call: alpha', alpha.shape)
        # Reshape to tensorFlow format
        alpha = K.expand_dims(alpha, axis=-1)
        print('call: alpha expand_dimp', alpha.shape)
        # Compute the context vector
        context = x * alpha
        context = K.sum(context, axis=1)
        print('call: context', context.shape)
        return context
    
w2v_model = KeyedVectors.load_word2vec_format('../model/wiki.vi.model.bin', binary=True)

class ModelBuild:
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.model = None
        self.label_mapping = []
        self.flipped_label_mapping = {}

    def convertToCategories(self, y_class):
        # Example list of label texts
        labels = y_class

        # Create a dictionary to map unique labels to integers
        self.label_mapping = {label: i for i, label in enumerate(set(labels))}
        self.flipped_label_mapping = {value: key for key, value in self.label_mapping.items()}
        # Convert labels to corresponding integers
        integer_labels = [self.label_mapping[label] for label in labels]

        # One-hot encode the integer labels
        encoded_labels = to_categorical(integer_labels)
        return encoded_labels

    def build_model_word2vec(self, X_text, y_class):        
        self.tokenizer.fit_on_texts(X_text)
        sequences = self.tokenizer.texts_to_sequences(X_text)

        maxlen = 100
        X = sequence.pad_sequences(sequences, maxlen=maxlen)

        num_classes = y_class.nunique()
        y_train = self.convertToCategories(y_class)

        print('num_classes', num_classes)
        embedding_dim = 400
        word_index = self.tokenizer.word_index
        num_words = min(len(word_index) + 1, len(w2v_model.index_to_key))
        embedding_matrix = np.zeros((num_words, embedding_dim))

        print('num_words:', num_words)
        for word, i in word_index.items():
            if i >= num_words:
                continue
            if word in w2v_model.index_to_key:
                embedding_matrix[i] = w2v_model.get_vector(word)
        model = Sequential()
        model.add(Embedding(num_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
        model.add(Bidirectional(LSTM(64, return_sequences=True, input_shape=(maxlen, ))))
        # model.add(Flatten())
        model.add(attention()) # pass a list of two tensors 
        model.add(Dense(num_classes, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
        model.summary()
        model.fit(X, y_train, epochs=10, batch_size=32, validation_split=0.2)
        self.model = model

        return model

    def build_model_seq_word2vec(self, X_text, y_class):        
        self.tokenizer.fit_on_texts(X_text)
        sequences = self.tokenizer.texts_to_sequences(X_text)

        maxlen = 100
        X = sequence.pad_sequences(sequences, maxlen=maxlen)

        num_classes = y_class.nunique()
        y_train = self.convertToCategories(y_class)

        print('num_classes', num_classes)
        embedding_dim = 400
        word_index = self.tokenizer.word_index
        num_words = min(len(word_index) + 1, len(w2v_model.index_to_key))
        embedding_matrix = np.zeros((num_words, embedding_dim))

        print('num_words:', num_words)
        for word, i in word_index.items():
            if i >= num_words:
                continue
            if word in w2v_model.index_to_key:
                embedding_matrix[i] = w2v_model.get_vector(word)

        inputs = Input(name='inputs', shape=[maxlen])
        layer = Embedding(num_words, embedding_dim, input_length=maxlen)(inputs)
        layer = LSTM(64, return_sequences=True)(layer)
        layer = Bidirectional(LSTM(64, return_sequences=True))(layer)
        layer = Attention()([layer, layer])
        # layer = Flatten()(layer)
        layer = MaxPooling1D()(layer)
        layer = Dense(num_classes, activation='sigmoid')(layer)
        # layer = Activation('sigmoid')(layer)
        model = Model(inputs=inputs,outputs=layer)
        # model = Sequential()
        # model.add(Embedding(num_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
        # model.add(Bidirectional(LSTM(64, return_sequences=True, input_shape=(maxlen, ))))
        # # model.add(Flatten())
        # model.add(attention()) # pass a list of two tensors 
        # model.add(Dense(num_classes, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
        model.summary()
        model.fit(X, y_train, epochs=10, batch_size=32, validation_split=0.2)
        self.model = model

        return model

    def predict(self, X_text):
        sequences = self.tokenizer.texts_to_sequences(X_text)
        maxlen = 100
        X = sequence.pad_sequences(sequences, maxlen=maxlen)
        t = self.model.predict(X)
        max_indices = np.argmax(t, axis=1)
        print('predict', max_indices, self.flipped_label_mapping)
        return np.vectorize(self.flipped_label_mapping.get)(max_indices)


model = ModelBuild()
model.build_model_seq_word2vec(df['contents'], df['slug'])

num_classes 104
num_words: 3701
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inputs (InputLayer)            [(None, 100)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 100, 400)     1480400     ['inputs[0][0]']                 
                                                                                                  
 lstm (LSTM)                    (None, 100, 64)      119040      ['embedding[0][0]']              
                                                                                                  
 bidirectional (Bidirectional)  (None, 100, 128)     66048       ['lstm[0][0]']                   
                                                              

2023-07-01 15:12:38.533786: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


ValueError: in user code:

    File "/Users/ngocp/.pyenv/versions/3.10.3/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/Users/ngocp/.pyenv/versions/3.10.3/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/ngocp/.pyenv/versions/3.10.3/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/Users/ngocp/.pyenv/versions/3.10.3/lib/python3.10/site-packages/keras/engine/training.py", line 1024, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/ngocp/.pyenv/versions/3.10.3/lib/python3.10/site-packages/keras/engine/training.py", line 1082, in compute_loss
        return self.compiled_loss(
    File "/Users/ngocp/.pyenv/versions/3.10.3/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/ngocp/.pyenv/versions/3.10.3/lib/python3.10/site-packages/keras/losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Users/ngocp/.pyenv/versions/3.10.3/lib/python3.10/site-packages/keras/losses.py", line 284, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/ngocp/.pyenv/versions/3.10.3/lib/python3.10/site-packages/keras/losses.py", line 2176, in binary_crossentropy
        backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
    File "/Users/ngocp/.pyenv/versions/3.10.3/lib/python3.10/site-packages/keras/backend.py", line 5680, in binary_crossentropy
        return tf.nn.sigmoid_cross_entropy_with_logits(

    ValueError: `logits` and `labels` must have the same shape, received ((None, 50, 104) vs (None, 104)).


In [None]:
X_test_sentence = [
    'Git Submodule trong trường hợp nào ?',
    'Có sự thừa kế theo cấp bậc giữa các module trong không? Giải thích.',
    'Sự khác biệt giữa từ khóa break và continue trong Java?',
    'Trình bày về Output Buffering trong PHP?',
    'RESTful API là gì?',
    'Viết tắt của php có nghĩa là gì ?',
    'Phân biệt POST và GET trong php?',
    'Cờ HttpOnly có tác dụng gì cho cookie?',
    'so sánh sự khác nhau giữa Mysql và MongoDB',
    'Tại sao phải sử dụng hàm khởi tạo?',
]

results = model.predict(X_test_sentence)
print(results)

predict [21 10 37 21 71 10 71 21 71 21] {0: 'asp-net', 1: 'laravel', 2: 'content-creator', 3: 'data-structures-and-algorithms', 4: 'android-os', 5: 'ai', 6: 'data-science', 7: 'typescript', 8: 'c-2', 9: 'ml', 10: 'ruby', 11: 'rails', 12: 'devops', 13: 'python', 14: 'design-pattern', 15: 'None', 16: 'frontend-development', 17: 'google-cloud-platform', 18: 'jquery', 19: 'mobile-development', 20: 'programming', 21: 'git', 22: 'wordpress', 23: 'java', 24: 'non-tech-jo', 25: 'css', 26: 'chatbot', 27: 'computer-network', 28: 'external-communication', 29: 'nosql', 30: 'artificial-intelligence', 31: 'postgresql', 32: 'nuxt-js', 33: 'dart', 34: 'malware-analysis', 35: 'web-security', 36: 'network-security', 37: 'net', 38: 'security-tools', 39: 'react-native', 40: 'secure-coding', 41: 'redis', 42: 'risk-management', 43: 'source-code', 44: 'cyber-security', 45: 'golang', 46: 'linux', 47: 'agile', 48: 'mysql', 49: 'cloud-security', 50: 'marketing', 51: 'business-analyst', 52: 'project-management',

2023-06-04 18:19:10.312372: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" num_cores: 8 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
