In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: emoji
Successfully installed emoji-2.12.1


# Cách xử lý dữ liệu

# Loại bỏ stopword, emoji và các mã cần loại bỏ trong đoạn text

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus that remove_stopwords reads via stopwords.words('english').
nltk.download('stopwords')


def remove_stopwords(text):
    """Lower-case ``text`` and drop English stop words.

    The text is lower-cased, split on whitespace, filtered against NLTK's
    English stop-word list and re-joined with single spaces.

    Args:
        text: Input string.

    Returns:
        The lower-cased string without stop words; ``''`` if nothing is left.
    """
    # Build the stop-word set once and keep it on the function object. The
    # previous version rebuilt it on every call, which is wasted work when the
    # function is applied to a whole column row by row.
    stop_words = getattr(remove_stopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    return ' '.join(word for word in text.lower().split() if word not in stop_words)

import emoji

def remove_emoji(text):
    """Return ``text`` with every emoji removed.

    Args:
        text: Input string.

    Returns:
        The string with each emoji replaced by the empty string.
    """
    # The previous call, emoji.demojize, does not remove anything: it rewrites
    # each emoji as a ``:short_name:`` alias. Once the later cleaning step strips
    # ':' and '_', those aliases would survive as made-up words.
    return emoji.replace_emoji(text, replace='')

import re

def remove_dates(text):
    """Delete numeric date-like tokens from ``text``.

    A match is two or three groups of digits joined by ``/`` or ``-``
    (``N/N/NN..NNNN`` or ``N/N``, first groups 1-2 digits long), bounded by word
    boundaries. Each match is replaced with the empty string; surrounding
    whitespace is left as it was.
    """
    date_pattern = re.compile(
        r'\b(?:\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}|\d{1,2}[/\-]\d{1,2})\b'
    )
    return date_pattern.sub('', text)

def remove_numbers_and_special_characters(text):
    """Keep only ASCII letters and whitespace.

    Every character that is neither ``a-z``/``A-Z`` nor whitespace (digits,
    punctuation, accented letters, symbols) is deleted from ``text``.
    """
    disallowed = re.compile(r'[^a-zA-Z\s]')
    return disallowed.sub('', text)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Build the NLTK English stop-word set and display it for inspection.
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

# Xử lý dữ liệu cơ bản

In [None]:
import pandas as pd
from collections import Counter

# Read the raw CSV, drop the "Unnamed: 0" column and discard rows with missing values.
data = (
    pd.read_csv("data train and test hashtag.csv")
    .drop(columns=["Unnamed: 0"])
    .dropna()
)
data.isna().sum()  # not the last expression of the cell, so the result is not displayed

# Number the topics by descending frequency: the most frequent topic gets label 0.
topic_counts = Counter(data['Topic'])
sorted_topics = [topic for topic, _ in topic_counts.most_common()]
label_mapping = dict(zip(sorted_topics, range(len(sorted_topics))))
data['Label'] = data['Topic'].map(label_mapping)


In [None]:
label_mapping

{'Fiction': 0,
 'other_1': 1,
 'others': 2,
 'Biography & Autobiography': 3,
 'other_2': 4,
 'Fiction Classics': 5,
 'History': 6,
 'Juvenile Fiction': 7,
 'Sci-fi Fantasy': 8,
 'Philosophy': 9,
 'Romance': 10,
 'Short Stories': 11,
 'Ebook': 12,
 'Drama': 13,
 'Health And Fitness': 14,
 'Literary Criticism': 15,
 'Marketing': 16,
 'Religion': 17,
 'Self-Improvement': 18,
 'Erotica': 19,
 'Business': 20,
 'Poetry': 21,
 'Humor': 22,
 'Comics & Graphic Novels': 23,
 'Mystery': 24,
 'Biography': 25,
 "Children's Classics": 26,
 'Health': 27,
 'Horror-Gothic': 28,
 'Juvenile Nonfiction': 29,
 'Religious': 30,
 'Food/Recipes': 31}

In [None]:

# Clean the descriptions by feeding every step the output of the previous one.
# Previously each of the first four steps re-read data['Description'] and overwrote
# data['Descr'], so the stop-word, emoji and date passes were thrown away and only
# the last two steps had any effect. The order below is the one used by
# preprocess_data_real_time.
# NOTE(review): anything fitted on the old 'Descr' column (tokenizer, models) may
# need to be refitted - confirm.
data['Descr'] = (
    data['Description']
    .apply(remove_stopwords)
    .apply(remove_emoji)
    .apply(remove_dates)
    .apply(remove_numbers_and_special_characters)
    .apply(remove_stopwords)
)


In [None]:
import numpy as np

# Count the distinct label ids in the Label column
num_labels = data['Label'].unique().size

# Start from an all-zero matrix of shape (rows, distinct labels) ...
label_ids = data['Label'].to_numpy()
one_hot_labels = np.zeros((label_ids.shape[0], num_labels))

# ... and switch on, in every row, the column that matches the row's label
one_hot_labels[np.arange(label_ids.shape[0]), label_ids] = 1

# Store one vector per row in the DataFrame
data['one_hot_label'] = [row for row in one_hot_labels]

# Show the DataFrame with the new one-hot column
data.head()


Unnamed: 0,Title,Description,Topic,Descr,Label,one_hot_label
0,The Devil's Disciple,Set in Colonial America during the Revolutiona...,Fiction,set colonial america revolutionary era play te...,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Taking Chances,Spice-o-meter Rating: This fun romance is a so...,Romance,spiceometer rating fun romance solid maybe sca...,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Verdi : The Story of the Little Boy who Loved ...,This time Tapper moves his focus to Italy in h...,Biography & Autobiography,time tapper moves focus italy voyage among gre...,3,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Success,The contents of this volume originally appeare...,Business,contents volume originally appeared weekly art...,20,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Triplanetary,"ONE MAN DISCOVERED THE TRUTH—The Fall of Rome,...",Fiction,one man discovered truththe fall rome wars rac...,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:

# Prepare the features and targets
X = data["Descr"]  # cleaned description text is the model input
y = data["Label"]  # integer topic id is the target

# Binarise the labels. The ids come from enumerate() over the topics, so they run
# from 0 to N-1 and N is max id + 1. The previous hard-coded np.arange(31) covered
# only ids 0..30, although label_mapping contains 32 ids (0..31); rows with label 31
# were therefore encoded as all zeros.
# NOTE(review): a model trained against the old 31-column targets has to be
# retrained for the corrected width - confirm.
num_classes = int(y.max()) + 1
mlb = MultiLabelBinarizer(classes=np.arange(num_classes))
y_binary = mlb.fit_transform([[label] for label in y])

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Tokenise the text and convert it to integer sequences
max_words = 1000  # maximum vocabulary size kept by the tokenizer
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences so that they all have the same length
max_len = 100  # maximum length of each description
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Xây dựng và huấn luyện mô hình MLP cho mỗi nhãn (giống như trong ví dụ trước)
# ...




In [None]:
y_test

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

# Xử lý dữ liệu cho model 1

In [None]:
import os

# Report the on-disk size of each pickled model, or say that the file is missing.
# One loop replaces the two copy-pasted if/else blocks; the printed text is unchanged.
for model_file in ('mlp_model_hashtag1.pkl', 'mlp_model_hashtag2.pkl'):
    if os.path.exists(model_file):
        print(f"Size of {model_file}: {os.path.getsize(model_file)} bytes")
    else:
        print(f"{model_file} does not exist.")

Size of mlp_model_hashtag1.pkl: 10777106 bytes
Size of mlp_model_hashtag2.pkl: 10818993 bytes


In [None]:
def weighted_binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy whose positive-label term is weighted by 2.

    Computes ``mean(-(w * y * log(p + eps) + (1 - y) * log(1 - p + eps)))`` with
    ``w = 2.0`` and ``eps = K.epsilon()``.

    Args:
        y_true: Ground-truth targets; cast to the dtype of ``y_pred``.
        y_pred: Predicted values (presumably probabilities in [0, 1], since the
            logarithms are only finite there - confirm against the model).

    Returns:
        The mean of the element-wise weighted loss.
    """
    # No visible cell of the notebook binds ``K``, so on a fresh kernel the
    # original body raised NameError. Importing it here keeps the function
    # self-contained.
    from tensorflow.keras import backend as K

    positive_weight = 2.0  # weight applied to the label-1 term
    epsilon = K.epsilon()

    y_true = K.cast(y_true, y_pred.dtype)

    positive_term = positive_weight * y_true * K.log(y_pred + epsilon)
    negative_term = (1 - y_true) * K.log(1 - y_pred + epsilon)
    return K.mean(-(positive_term + negative_term))

def precision(y_true, y_pred):
    """Share of rounded positive predictions that are true positives.

    Args:
        y_true: Ground-truth targets; cast to the dtype of ``y_pred``.
        y_pred: Predicted values; clipped to [0, 1] and rounded before counting.

    Returns:
        ``true_positives / (predicted_positives + K.epsilon())``; the epsilon
        keeps the division defined when nothing is predicted positive.
    """
    # No visible cell of the notebook binds ``K``; import it locally so the
    # metric works on a fresh kernel.
    from tensorflow.keras import backend as K

    # Same cast as weighted_binary_crossentropy, so integer targets can be
    # multiplied with float predictions.
    y_true = K.cast(y_true, y_pred.dtype)

    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # Returned directly: the original bound a local called ``precision``,
    # shadowing the function's own name.
    return true_positives / (predicted_positives + K.epsilon())


In [None]:
import pickle

# Load the first model from its HDF5 file
from keras.models import load_model

# Load the model; the custom loss and metric are handed over through custom_objects.
# NOTE(review): the next cell rebinds model_1 from 'mlp_model_hashtag1.pkl' - confirm
# which of the two files is the intended first model.
model_1= load_model('mlp_model_with_custom_loss1.h5',
                          custom_objects={'weighted_binary_crossentropy': weighted_binary_crossentropy,
                                          'precision': precision})

# Load the second model from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code - only unpickle trusted files.
with open('mlp_model_hashtag2.pkl', 'rb') as file: # Changed to 'rb' mode
    model_2 = pickle.load(file)


In [None]:
import pickle

# Load the first model
# NOTE(review): this rebinds model_1, replacing the Keras model loaded from
# 'mlp_model_with_custom_loss1.h5' in the previous cell - confirm which one is intended.
# NOTE(review): pickle.load can execute arbitrary code - only unpickle trusted files.
with open('mlp_model_hashtag1.pkl', 'rb') as file: # Changed to 'rb' mode
    model_1 = pickle.load(file)

# Load the second model (the previous cell already loads this same file)
with open('mlp_model_hashtag2.pkl', 'rb') as file: # Changed to 'rb' mode
    model_2 = pickle.load(file)

In [None]:
import nltk
from nltk.corpus import stopwords
import emoji
import re
import pandas as pd
import numpy as np
from collections import Counter

# Download the NLTK stop-word corpus
nltk.download('stopwords')

# Text-cleaning helpers
def remove_stopwords(text):
    """Lower-case ``text`` and drop English stop words.

    The text is lower-cased, split on whitespace, filtered against NLTK's
    English stop-word list and re-joined with single spaces.

    Args:
        text: Input string.

    Returns:
        The lower-cased string without stop words; ``''`` if nothing is left.
    """
    # Build the stop-word set once and keep it on the function object. The
    # previous version rebuilt it on every call, which is wasted work when the
    # function is applied to a whole column row by row.
    stop_words = getattr(remove_stopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    return ' '.join(word for word in text.lower().split() if word not in stop_words)

def remove_emoji(text):
    """Return ``text`` with every emoji removed.

    Args:
        text: Input string.

    Returns:
        The string with each emoji replaced by the empty string.
    """
    # The previous call, emoji.demojize, does not remove anything: it rewrites
    # each emoji as a ``:short_name:`` alias. Once the later cleaning step strips
    # ':' and '_', those aliases would survive as made-up words.
    return emoji.replace_emoji(text, replace='')

def remove_dates(text):
    """Delete numeric date-like tokens from ``text``.

    A match is two or three groups of digits joined by ``/`` or ``-``
    (``N/N/NN..NNNN`` or ``N/N``, first groups 1-2 digits long), bounded by word
    boundaries. Each match is replaced with the empty string; surrounding
    whitespace is left as it was.
    """
    date_pattern = re.compile(
        r'\b(?:\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}|\d{1,2}[/\-]\d{1,2})\b'
    )
    return date_pattern.sub('', text)

def remove_numbers_and_special_characters(text):
    """Keep only ASCII letters and whitespace.

    Every character that is neither ``a-z``/``A-Z`` nor whitespace (digits,
    punctuation, accented letters, symbols) is deleted from ``text``.
    """
    disallowed = re.compile(r'[^a-zA-Z\s]')
    return disallowed.sub('', text)

# Data-preparation entry point
def preprocess_data(file_path):
    """Load the CSV at ``file_path`` and add the label and cleaned-text columns.

    Steps:
      1. Read the file, drop the ``Unnamed: 0`` column and every row that has a
         missing value.
      2. Map each ``Topic`` to an integer ``Label``; topics are numbered by
         descending frequency, so the most frequent topic is 0.
      3. Build ``Descr`` from ``Description`` by chaining remove_stopwords,
         remove_emoji, remove_dates, remove_numbers_and_special_characters and
         remove_stopwords again.
      4. Add ``one_hot_label``: one vector per row with a 1 at the row's label.

    Args:
        file_path: Path of the CSV file. It must provide the ``Unnamed: 0``,
            ``Topic`` and ``Description`` columns.

    Returns:
        The DataFrame with the ``Label``, ``Descr`` and ``one_hot_label``
        columns added.
    """
    # Read the data
    data = pd.read_csv(file_path).drop(columns=["Unnamed: 0"]).dropna()

    # Map topics to integer ids
    topic_counts = Counter(data['Topic'])
    sorted_topics = sorted(topic_counts, key=topic_counts.get, reverse=True)
    label_mapping = {topic: i for i, topic in enumerate(sorted_topics)}
    data['Label'] = data['Topic'].map(label_mapping)

    # Apply the text-cleaning steps as one chain. Previously the first four steps
    # each re-read data['Description'] and overwrote data['Descr'], so only the
    # last two steps had any effect. The chain matches preprocess_data_real_time.
    data['Descr'] = (
        data['Description']
        .apply(remove_stopwords)
        .apply(remove_emoji)
        .apply(remove_dates)
        .apply(remove_numbers_and_special_characters)
        .apply(remove_stopwords)
    )

    # Build the one-hot vectors with a single indexed assignment, not a Python loop
    num_labels = len(data['Label'].unique())
    row_positions = np.arange(len(data))
    one_hot_labels = np.zeros((len(data), num_labels))
    one_hot_labels[row_positions, data['Label'].to_numpy()] = 1

    data['one_hot_label'] = list(one_hot_labels)

    return data

# Run the preprocessing on the project CSV (this rebinds the module-level `data`)
file_path = "data train and test hashtag.csv"
data = preprocess_data(file_path)

# Print the first rows of the DataFrame after the one-hot column was added
print(data.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                               Title  \
0                               The Devil's Disciple   
1                                     Taking Chances   
2  Verdi : The Story of the Little Boy who Loved ...   
3                                            Success   
4                                       Triplanetary   

                                         Description  \
0  Set in Colonial America during the Revolutiona...   
1  Spice-o-meter Rating: This fun romance is a so...   
2  This time Tapper moves his focus to Italy in h...   
3  The contents of this volume originally appeare...   
4  ONE MAN DISCOVERED THE TRUTH—The Fall of Rome,...   

                       Topic  \
0                    Fiction   
1                    Romance   
2  Biography & Autobiography   
3                   Business   
4                    Fiction   

                                               Descr  Label  \
0  set colonial america revolutionary era play te...      0   
1  spic

In [None]:
data

Unnamed: 0,Title,Description,Topic,Descr,Label,one_hot_label
0,The Devil's Disciple,Set in Colonial America during the Revolutiona...,Fiction,set colonial america revolutionary era play te...,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Taking Chances,Spice-o-meter Rating: This fun romance is a so...,Romance,spiceometer rating fun romance solid maybe sca...,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Verdi : The Story of the Little Boy who Loved ...,This time Tapper moves his focus to Italy in h...,Biography & Autobiography,time tapper moves focus italy voyage among gre...,3,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Success,The contents of this volume originally appeare...,Business,contents volume originally appeared weekly art...,20,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Triplanetary,"ONE MAN DISCOVERED THE TRUTH—The Fall of Rome,...",Fiction,one man discovered truththe fall rome wars rac...,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...
18095,Journey to the East,This book tells the tale of a man who goes on ...,other_1,book tells tale man goes wonderful amazing jou...,1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
18096,The Monk Who Sold His Ferrari: A Fable About F...,"Wisdom to Create a Life of Passion, Purpose, a...",Biography & Autobiography,wisdom create life passion purpose peace inspi...,3,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
18097,I Am that,This collection of the timeless teachings of o...,Philosophy,collection timeless teachings one greatest sag...,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
18098,The Berlin Phenomenology,Since the three volume edition ofHegel's Philo...,History,since three volume edition ofhegels philosophy...,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


In [None]:

import pickle
def load_tokenize_data(path='tokenizer.pkl'):
  """Unpickle and return the object stored at ``path``.

  Args:
    path: Pickle file to read. Defaults to ``'tokenizer.pkl'``, the file the
      original hard-coded, so existing calls without arguments are unaffected.

  Returns:
    The unpickled object (the caller uses it as a tokenizer).
  """
  # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
  with open(path, 'rb') as file:
    return pickle.load(file)


In [None]:
def preprocess_data_real_time(text, tokenizer=None, maxlen=100):
  """Clean one raw text and turn it into a padded integer sequence.

  The text goes through remove_stopwords, remove_emoji, remove_dates,
  remove_numbers_and_special_characters and remove_stopwords again, is
  converted with ``tokenizer.texts_to_sequences`` and post-padded to ``maxlen``.

  Args:
    text: Raw input string.
    tokenizer: Object providing ``texts_to_sequences``. When ``None`` (default)
      it is loaded with load_tokenize_data(), as before. Passing one in avoids
      unpickling the tokenizer on every call.
    maxlen: Length passed to pad_sequences. Defaults to the previously
      hard-coded 100.

  Returns:
    The result of ``pad_sequences`` for the single cleaned text.
  """
  text = remove_stopwords(text)
  text = remove_emoji(text)
  text = remove_dates(text)
  text = remove_numbers_and_special_characters(text)
  # Second pass: removing punctuation can expose further stop words.
  text = remove_stopwords(text)
  if tokenizer is None:
    tokenizer = load_tokenize_data()
  sequences = tokenizer.texts_to_sequences([text])
  return pad_sequences(sequences, maxlen=maxlen, padding='post')



In [None]:
text = "Eighteen-year-old Ryohei Arisu is sick of his life. School sucks, his love life is a joke, and his future feels like impending doom. As he struggles to exist in a world that can’t be bothered with him, Ryohei feels like everything would be better if he were anywhere else. When a strange fireworks show transports him and his friends to a parallel world, Ryohei thinks all his wishes have come true. But this new world isn’t an empty paradise, it’s a vicious game. And the only way to survive is to play.The first game starts with a bang, but Ryohei manages to beat the clock and save his friends. It’s a short-lived victory, however, as they discover that winning only earns them a few days’ grace period. If they want to get home, they’re going to have to start playing a lot harder."

In [None]:
padd = preprocess_data_real_time(text)

In [None]:
padd

array([[  2, 112,   8,   2, 141,  26, 630,   5, 346,  26, 155,  40, 190,
        870, 193, 342,  86,   5,  98,  61,   4,   5, 552, 245,  22, 529,
          7, 245, 278,  86, 299, 127, 178, 580, 116,  32,  45, 225, 255,
        568,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [None]:
pred_1 = model_1.predict(padd)
pred_1



array([[1.660143e-06]], dtype=float32)

In [None]:
# Round model 1's prediction to the nearest integer (this rebinds pred_1).
pred_1 = np.round(pred_1)
pred_1
value = pred_1[0, 0]  # Take the first element of the 2-D array
print(value)  # Print the rounded value taken from pred_1


0.0


In [None]:
pred_2 = model_2.predict(padd)
pred_2



array([[1.71500451e-05, 7.51414409e-05, 8.52227509e-01, 6.71688095e-02,
        4.05855104e-02, 1.89447419e-05, 3.33799832e-02, 3.07483645e-03,
        2.05468107e-03, 5.35931205e-03, 1.35560369e-03, 1.00999244e-03,
        1.19240567e-05, 3.99695663e-03, 1.45728874e-03, 1.00206948e-04,
        3.81757782e-05, 2.18993300e-04, 2.55937121e-05, 1.12904864e-03,
        1.67034916e-04, 1.05575244e-04, 5.34162464e-05, 7.09888199e-03,
        9.13054373e-06, 9.61292442e-03, 3.10747913e-04, 3.47294321e-04,
        6.11247469e-06, 2.94754398e-03, 8.80501611e-05]], dtype=float32)

In [None]:
# Sort the class indices by score and keep the three highest, best first
class_scores = pred_2[0]
top_indices = np.argsort(class_scores)[-3:][::-1]

# Scores that belong to those three indices
top_values = class_scores[top_indices]

print("Top 3 values:", top_values)
print("Indices of top 3 values:", top_indices)

Top 3 values: [0.8522275  0.06716881 0.04058551]
Indices of top 3 values: [2 3 4]


In [None]:
# Lookup from integer label id to topic name; the entries are the inverse of the
# label_mapping displayed earlier in this notebook.
# NOTE(review): the table has 32 entries (ids 0..31) while the recorded model_2
# output holds 31 scores, so id 31 ('Food/Recipes') cannot be predicted - confirm.
label_to_topic = {
    0: 'Fiction',
    1: 'other_1',
    2: 'others',
    3: 'Biography & Autobiography',
    4: 'other_2',
    5: 'Fiction Classics',
    6: 'History',
    7: 'Juvenile Fiction',
    8: 'Sci-fi Fantasy',
    9: 'Philosophy',
    10: 'Romance',
    11: 'Short Stories',
    12: 'Ebook',
    13: 'Drama',
    14: 'Health And Fitness',
    15: 'Literary Criticism',
    16: 'Marketing',
    17: 'Religion',
    18: 'Self-Improvement',
    19: 'Erotica',
    20: 'Business',
    21: 'Poetry',
    22: 'Humor',
    23: 'Comics & Graphic Novels',
    24: 'Mystery',
    25: 'Biography',
    26: "Children's Classics",
    27: 'Health',
    28: 'Horror-Gothic',
    29: 'Juvenile Nonfiction',
    30: 'Religious',
    31: 'Food/Recipes'
}