<a href="https://colab.research.google.com/github/oilportrait/test_colab/blob/main/TF_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import wordnet
import nltk
import random
import re
from sklearn.utils import resample
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.utils import to_categorical
from transformers import BertTokenizer
# Fetch the WordNet corpus that `wordnet.synsets` needs for the synonym lookup below.
nltk.download('wordnet')
# NOTE(review): no part-of-speech tagging appears in the visible cells, so this
# tagger download may be unnecessary — confirm before removing.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list of strings.

    Every lemma of every synset that WordNet reports for ``word`` is collected.
    Multi-word lemma names use underscores in WordNet (e.g. ``ice_cream``);
    they are converted to spaces so they read as normal text once inserted
    into a sentence. The word itself is excluded (case-insensitively), since
    picking it would be a no-op replacement.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of distinct synonym strings; empty when WordNet knows
        no synset for ``word`` or only lists the word itself.
    """
    target = word.replace('_', ' ').lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace('_', ' ')
            if candidate.lower() != target:
                synonyms.add(candidate)
    # Sorting gives a stable order: iterating a set of strings is not
    # repeatable across interpreter runs, which would defeat seeded sampling.
    return sorted(synonyms)

In [None]:
def synonym_replacement(sentence, n=2):
    """Replace up to ``n`` randomly chosen words of ``sentence`` with synonyms.

    Word *positions* (not word values) are sampled, so a word that occurs
    several times is replaced only at the sampled positions and never more
    than ``n`` replacements are made. A chosen word with no known synonym is
    kept unchanged.

    Args:
        sentence: Text to augment; it is split on whitespace.
        n: Maximum number of words to replace. Values below zero are treated
            as zero; values above the word count are capped.

    Returns:
        The augmented sentence with words joined by single spaces.
    """
    words = sentence.split()
    how_many = max(0, min(n, len(words)))
    chosen_positions = set(random.sample(range(len(words)), how_many))

    new_words = []
    for position, word in enumerate(words):
        if position in chosen_positions:
            synonyms = get_synonyms(word)
            new_words.append(random.choice(synonyms) if synonyms else word)
        else:
            new_words.append(word)
    return ' '.join(new_words)

In [None]:
def preprocess_text(text):
    """Strip URLs and non-letter characters from ``text``.

    Removed spans are replaced by a space rather than deleted, so words that
    were separated only by punctuation or a URL (e.g. ``hello|||world``) stay
    separate instead of being fused into one token. Runs of whitespace are
    then collapsed and the ends trimmed.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string containing only ASCII letters and single spaces.
    """
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
from google.colab import drive

# Make Google Drive visible to this Colab runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Read the MBTI dataset from the mounted Drive; later cells use its
# 'type' and 'posts' columns.
MBTI_CSV_PATH = "/content/drive/MyDrive/refer/mbti_1.csv"
mbti_data = pd.read_csv(MBTI_CSV_PATH)

Mounted at /content/drive


In [None]:
# Seed Python's RNG so the random word choices made by synonym_replacement
# are repeatable between runs of this cell.
random.seed(123)

# Only the ten least-represented MBTI types are augmented.
types_with_least_data = mbti_data['type'].value_counts().tail(10).index
amplified_data = []

for mbti_type in types_with_least_data:
    type_mask = mbti_data['type'] == mbti_type

    # Clean first, then augment the *cleaned* text. Taking a copy of the rows
    # before cleaning would feed the raw posts (URLs, punctuation) to the
    # synonym replacement, because the copy does not see the write-back below.
    cleaned_posts = mbti_data.loc[type_mask, 'posts'].apply(preprocess_text)
    mbti_data.loc[type_mask, 'posts'] = cleaned_posts

    for post in cleaned_posts:
        amplified_data.append([mbti_type, synonym_replacement(post)])

amplified_df = pd.DataFrame(amplified_data, columns=['type', 'amplified_posts'])

# Upsample every augmented type to the size of the largest class in the raw data.
max_size = mbti_data['type'].value_counts().max()
balanced_data = []

for mbti_type in mbti_data['type'].unique():
    type_data = amplified_df[amplified_df['type'] == mbti_type]
    # Types outside the ten augmented ones have no rows here and are skipped.
    if not type_data.empty and max_size > 0:
        resampled_data = resample(type_data, replace=True, n_samples=max_size, random_state=123)
        balanced_data.append(resampled_data)

# NOTE(review): sampling with replacement produces exact duplicate rows, so any
# later row-wise train/test split must keep copies of a post on the same side.
balanced_df = pd.concat(balanced_data)

In [None]:
# TF Hub handle of the encoder and the name of the matching tokenizer vocabulary.
BERT_HUB_URL = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1"
BERT_TOKENIZER_NAME = 'bert-base-uncased'

# trainable=True lets the encoder weights be updated during training.
bert_layer = hub.KerasLayer(BERT_HUB_URL, trainable=True)
tokenizer = BertTokenizer.from_pretrained(BERT_TOKENIZER_NAME)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT input arrays.

    Each text is tokenized, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: Length of every encoded row.

    Returns:
        A 3-tuple of arrays ``(token_ids, pad_masks, segment_ids)`` with one
        row per text: masks are 1 on real tokens and 0 on padding, segment
        ids are all 0.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    body_budget = max_len - 2  # room left once [CLS] and [SEP] are added

    for raw_text in texts:
        pieces = ["[CLS]", *tokenizer.tokenize(raw_text)[:body_budget], "[SEP]"]
        n_real = len(pieces)
        n_pad = max_len - n_real

        token_rows.append(tokenizer.convert_tokens_to_ids(pieces) + [0] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)


In [None]:
# bert_encode returns a 3-tuple of arrays: X[0] token ids, X[1] padding masks,
# X[2] segment ids, each with one row per post.
X = bert_encode(balanced_df['amplified_posts'].values, tokenizer)
# MBTI type labels, in the same row order as the encoded posts.
y = balanced_df['type'].values

In [None]:
# Learn the set of MBTI labels, then map every label to its indicator row.
# `encoder` is kept so predictions can be mapped back through encoder.classes_.
encoder = LabelBinarizer()
encoder.fit(y)
y_encoded = encoder.transform(y)

In [None]:
print("Shape of X[0]:", X[0].shape)
print("Shape of y_encoded:", y_encoded.shape)

# Fail loudly: merely printing here would leave X_train / X_test undefined and
# surface later as a confusing NameError.
if X[0].shape[0] != y_encoded.shape[0]:
    raise ValueError("Mismatch in the number of samples between X and y_encoded")

# balanced_df was built with resample(..., replace=True), so many rows are exact
# copies of each other. A plain row-wise split puts copies of the same post in
# both train and test, which inflates test accuracy. Rows with identical token
# ids are therefore grouped, and whole groups are assigned to one side.
_, first_row_of_group, group_of_row = np.unique(
    X[0], axis=0, return_index=True, return_inverse=True)
group_of_row = np.ravel(group_of_row)

# One class index per group, used to stratify the split
# (assumes y_encoded is one-hot with one column per class — see the printed shape).
group_labels = y_encoded[first_row_of_group].argmax(axis=1)

train_groups, test_groups = train_test_split(
    np.arange(first_row_of_group.shape[0]),
    test_size=0.2, random_state=42, stratify=group_labels)

# Shuffle row order inside each side; balanced_df is ordered class by class.
split_rng = np.random.RandomState(42)
train_rows = split_rng.permutation(np.flatnonzero(np.isin(group_of_row, train_groups)))
test_rows = split_rng.permutation(np.flatnonzero(np.isin(group_of_row, test_groups)))

# Sanity check: no group of identical posts may appear on both sides.
assert np.intersect1d(group_of_row[train_rows], group_of_row[test_rows]).size == 0

# The same row indices are applied to all three input arrays and the labels,
# so they stay aligned by construction.
X_train_tokens, X_test_tokens = X[0][train_rows], X[0][test_rows]
X_train_masks, X_test_masks = X[1][train_rows], X[1][test_rows]
X_train_segments, X_test_segments = X[2][train_rows], X[2][test_rows]
y_train, y_test = y_encoded[train_rows], y_encoded[test_rows]

X_train = (X_train_tokens, X_train_masks, X_train_segments)
X_test = (X_test_tokens, X_test_masks, X_test_segments)


Shape of X[0]: (18320, 512)
Shape of y_encoded: (18320, 10)


In [None]:
# 5-fold cross-validation splitter; shuffling is seeded so fold membership is repeatable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# bert_layer is trainable and shared by every model built below. Without a reset,
# fold k would start from weights already fine-tuned in folds 1..k-1 — including
# on rows that are now in its validation split — so the folds would not be
# independent. Snapshot the weights once and restore them at the start of each fold.
# (If this cell is re-run in the same kernel, the snapshot is whatever the layer
# holds at that moment; reload bert_layer first for a clean run.)
pretrained_bert_weights = bert_layer.get_weights()
fold_val_accuracies = []

# NOTE(review): X_train comes from data upsampled with replace=True, so a
# validation fold can still contain copies of posts in its training folds;
# treat the validation accuracy as optimistic.
fold_no = 1
for train, val in kfold.split(X_train[0]):
    bert_layer.set_weights(pretrained_bert_weights)

    # Three fixed-length (512) integer inputs, matching bert_encode's default max_len.
    input_word_ids = Input(shape=(512,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(512,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(512,), dtype=tf.int32, name="segment_ids")

    bert_inputs = {
        'input_word_ids': input_word_ids,
        'input_mask': input_mask,
        'input_type_ids': segment_ids
    }

    outputs = bert_layer(bert_inputs)
    pooled_output = outputs['pooled_output']

    # Fresh softmax head per fold, one unit per MBTI class.
    clf_output = pooled_output
    out = Dense(len(encoder.classes_), activation='softmax')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])

    X_train_fold = [X_train[0][train], X_train[1][train], X_train[2][train]]
    y_train_fold = y_train[train]
    X_val_fold = [X_train[0][val], X_train[1][val], X_train[2][val]]
    y_val_fold = y_train[val]

    print(f'Training for fold {fold_no} ...')
    history = model.fit(
        X_train_fold,
        y_train_fold,
        epochs=5,
        batch_size=8,
        validation_data=(X_val_fold, y_val_fold)
    )

    # Keep the last-epoch validation accuracy so the folds can be summarised.
    fold_val_accuracies.append(history.history['val_accuracy'][-1])
    fold_no = fold_no + 1

print(f'Mean validation accuracy over {len(fold_val_accuracies)} folds: {np.mean(fold_val_accuracies)}')
# `model` now refers to the model trained in the final fold; later cells use it.

dict_keys(['sequence_output', 'default', 'encoder_outputs', 'pooled_output'])
Training for fold 1 ...
Epoch 1/3
Epoch 2/3
Epoch 3/3
dict_keys(['sequence_output', 'default', 'encoder_outputs', 'pooled_output'])
Training for fold 2 ...
Epoch 1/3
Epoch 2/3
Epoch 3/3
dict_keys(['sequence_output', 'default', 'encoder_outputs', 'pooled_output'])
Training for fold 3 ...
Epoch 1/3
Epoch 2/3
Epoch 3/3
dict_keys(['sequence_output', 'default', 'encoder_outputs', 'pooled_output'])
Training for fold 4 ...
Epoch 1/3
Epoch 2/3
Epoch 3/3
dict_keys(['sequence_output', 'default', 'encoder_outputs', 'pooled_output'])
Training for fold 5 ...
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Evaluate `model` (the one left over from the last cross-validation fold) on the
# held-out split.
# NOTE(review): the data was upsampled with replacement before splitting, so a
# near-perfect score here may reflect duplicated posts shared between train and
# test rather than real generalisation — verify.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

Test Loss: 8.44070891616866e-05, Test Accuracy: 1.0


In [None]:
def predict_mbti_probabilities(sentence):
    """Score one sentence with the trained model.

    The sentence is encoded with the notebook-level ``tokenizer`` via
    ``bert_encode`` and passed to the notebook-level ``model``.

    Args:
        sentence: The text to classify.

    Returns:
        A dict mapping each label in ``encoder.classes_`` to the model's
        output value for that class.
    """
    model_inputs = bert_encode([sentence], tokenizer)
    class_scores = model.predict(model_inputs)[0]  # single-item batch
    return {label: score for label, score in zip(encoder.classes_, class_scores)}

In [None]:
# Smoke test: print the per-type scores the model assigns to one short sentence.
sentence = "I love you"
probabilities = predict_mbti_probabilities(sentence)
print(probabilities)

{'ENFJ': 0.6445151, 'ENTJ': 0.0028850692, 'ESFJ': 0.005440955, 'ESFP': 0.0020330818, 'ESTJ': 0.0010679086, 'ESTP': 0.0007763918, 'ISFJ': 0.26039895, 'ISFP': 0.07188682, 'ISTJ': 0.007228388, 'ISTP': 0.0037673926}


In [None]:
!sudo apt-get update
!sudo apt-get install git-lfs
!git lfs install

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
0% [Connecting to archive.ubuntu.com] [1 InRelease 8,379 B/110 kB 8%] [Connecte                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com] [1 InRelease 110 kB/110 kB 100%] [2 InRel0% [Connecting to archive.ubuntu.com] [Waiting for headers] [Waiting for header                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
                                                                               0% [Waiting for headers] [Waiting for headers]                                              Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
0% [Waiting for headers] [Waiting for headers] [4 InRelease 1,581 B/1,581 B 100

In [None]:
# Write the tokenizer files to a local directory; a later cell moves this
# directory into the Hub repository clone.
tokenizer_save_path = "./my_saved_tokenizer"
tokenizer.save_pretrained(tokenizer_save_path)

('./my_saved_tokenizer/tokenizer_config.json',
 './my_saved_tokenizer/special_tokens_map.json',
 './my_saved_tokenizer/vocab.txt',
 './my_saved_tokenizer/added_tokens.json')

In [None]:
# Export `model` to a local directory without optimizer state
# (include_optimizer=False); a later cell moves this directory into the Hub
# repository clone.
saved_model_path = "./my_saved_model"
model.save(saved_model_path, include_optimizer=False)

In [None]:
# Interactive login: prompts for a Hugging Face access token. Do not paste the
# token into a cell — it would be saved with the notebook.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'stor

In [None]:
import os
import shutil
from huggingface_hub import Repository

repo_name = "mbti-classification"
username = "purotae"

# Clone (or reuse) the Hub repository in a local directory named after the repo.
repo = Repository(local_dir=repo_name, clone_from=f"{username}/{repo_name}", use_auth_token=True)

# Move the freshly saved artifacts into the clone. If the clone already holds a
# directory of the same name (from an earlier push or a re-run of this cell),
# shutil.move would raise "Destination path already exists", so the stale copy
# is removed first.
for artifact_path in (tokenizer_save_path, saved_model_path):
    destination = os.path.join(repo_name, os.path.basename(os.path.normpath(artifact_path)))
    if os.path.isdir(destination):
        shutil.rmtree(destination)
    shutil.move(artifact_path, destination)

# auto_lfs_track=True lets large files be tracked with Git LFS before committing.
repo.git_add(auto_lfs_track=True)
repo.git_commit("Add fine-tuned BERT model and tokenizer")
repo.git_push()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/purotae/mbti-classification into local empty directory.
Adding files tracked by Git LFS: ['my_saved_model/variables/variables.data-00000-of-00001', 'my_saved_model/variables/variables.index']. This may take a bit of time if the files are large.


Upload file my_saved_model/variables/variables.data-00000-of-00001:   0%|          | 1.00/420M [00:00<?, ?B/s]

Upload file my_saved_model/saved_model.pb:   0%|          | 1.00/14.3M [00:00<?, ?B/s]

Upload file my_saved_model/fingerprint.pb:   2%|1         | 1.00/58.0 [00:00<?, ?B/s]

Upload file my_saved_model/keras_metadata.pb:   0%|          | 1.00/9.16k [00:00<?, ?B/s]

Upload file my_saved_model/variables/variables.index:   0%|          | 1.00/15.4k [00:00<?, ?B/s]

To https://huggingface.co/purotae/mbti-classification
   ecc809c..704fab2  main -> main

   ecc809c..704fab2  main -> main



'https://huggingface.co/purotae/mbti-classification/commit/704fab2465138c8e02900fffb9bcca24cd7c964d'