<a href="https://colab.research.google.com/github/oilportrait/test_colab/blob/main/TF_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import wordnet
import nltk
import random
import re
from sklearn.utils import resample
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.utils import to_categorical
from transformers import BertTokenizer
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
def get_synonyms(word):
    """ Get synonyms of a word """
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

In [None]:
def synonym_replacement(sentence, n=2):
    """ Replace n words in the sentence with their synonyms """
    words = sentence.split()
    random_words = random.sample(words, min(n, len(words)))
    new_words = []
    for word in words:
        if word in random_words:
            synonyms = get_synonyms(word)
            synonym = word if not synonyms else random.choice(synonyms)
            new_words.append(synonym)
        else:
            new_words.append(word)
    return ' '.join(new_words)

In [None]:
def preprocess_text(text):
    """ Preprocess the text by removing URLs and special characters """
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    return text

In [None]:
from google.colab import drive
drive.mount('/content/drive')
mbti_data  = pd.read_csv("/content/drive/MyDrive/refer/mbti_1.csv")

Mounted at /content/drive


In [None]:
mbti_data['type'].value_counts()

INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: type, dtype: int64

In [None]:

# URL을 감지하고 제거하는 정규 표현식
url_pattern = r'https?://\S+|www\.\S+'

# 문장에서 URL 제거
mbti_data['posts'] = mbti_data['posts'].replace(url_pattern, '', regex=True)



In [None]:
types_with_least_data = mbti_data['type'].value_counts().tail(12).index
amplified_data = []

additional_types_to_amplify = ['ESTP', 'ESFP', 'ESFJ', 'ESTJ']  # 추가적인 증강이 필요한 유형들
additional_amplification_factor = 4  # 추가 4배 증강

for mbti_type in types_with_least_data:
    type_data = mbti_data[mbti_data['type'] == mbti_type]
    mbti_data.loc[mbti_data['type'] == mbti_type, 'posts'] = type_data['posts'].apply(preprocess_text)

    amplification_factor = additional_amplification_factor if mbti_type in additional_types_to_amplify else 1
    for _ in range(amplification_factor):
        for _, row in type_data.iterrows():
            amplified_sentence = synonym_replacement(row['posts'])
            amplified_data.append([mbti_type, amplified_sentence])

amplified_df = pd.DataFrame(amplified_data, columns=['type', 'amplified_posts'])
combined_data = pd.concat([mbti_data, amplified_df])

In [None]:
amplified_df['type'].value_counts()

ENTP    685
ENFP    675
ESTP    356
ISTP    337
ISFP    271
ENTJ    231
ISTJ    205
ESFP    192
ENFJ    190
ESFJ    168
ISFJ    166
ESTJ    156
Name: type, dtype: int64

In [None]:
combined_data['type'].value_counts()

INFP    1832
INFJ    1470
ENTP    1370
ENFP    1350
INTP    1304
INTJ    1091
ISTP     674
ISFP     542
ENTJ     462
ESTP     445
ISTJ     410
ENFJ     380
ISFJ     332
ESFP     240
ESFJ     210
ESTJ     195
Name: type, dtype: int64

In [None]:
combined_data

Unnamed: 0,type,posts,amplified_posts
0,INFJ,' and intj moments sportscenter not top ten...,
1,ENTP,Im finding the lack of me in these posts very ...,
2,INTP,"'Good one _____ course, to which I say I k...",
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",
4,ENTJ,Youre firedThats another silly misconception T...,
...,...,...,...
3627,ESTJ,,'I wouldn't want to be on an elevator for a we...
3628,ESTJ,,"'As a personal challenge, I like to pick up ch..."
3629,ESTJ,,' support deleting the Trash section. That tra...
3630,ESTJ,,"hitler was what he was,and i am estj or esfj. ..."


In [None]:
combined_data['amplified_posts'] = combined_data.apply(
    lambda row: row['amplified_posts'] if pd.isna(row['posts']) else row['posts'],
    axis=1
)

In [None]:
combined_data

Unnamed: 0,type,posts,amplified_posts
0,INFJ,' and intj moments sportscenter not top ten...,' and intj moments sportscenter not top ten...
1,ENTP,Im finding the lack of me in these posts very ...,Im finding the lack of me in these posts very ...
2,INTP,"'Good one _____ course, to which I say I k...","'Good one _____ course, to which I say I k..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,Youre firedThats another silly misconception T...,Youre firedThats another silly misconception T...
...,...,...,...
3627,ESTJ,,'I wouldn't want to be on an elevator for a we...
3628,ESTJ,,"'As a personal challenge, I like to pick up ch..."
3629,ESTJ,,' support deleting the Trash section. That tra...
3630,ESTJ,,"hitler was what he was,and i am estj or esfj. ..."


In [None]:
amplified_df = combined_data.drop('posts', axis=1)

In [None]:
amplified_df

Unnamed: 0,type,amplified_posts
0,INFJ,' and intj moments sportscenter not top ten...
1,ENTP,Im finding the lack of me in these posts very ...
2,INTP,"'Good one _____ course, to which I say I k..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,Youre firedThats another silly misconception T...
...,...,...
3627,ESTJ,'I wouldn't want to be on an elevator for a we...
3628,ESTJ,"'As a personal challenge, I like to pick up ch..."
3629,ESTJ,' support deleting the Trash section. That tra...
3630,ESTJ,"hitler was what he was,and i am estj or esfj. ..."


In [None]:
target_sample_size = 2000 # 모든 클래스를 2000개의 샘플로 설정
balanced_data = []

for mbti_type in mbti_data['type'].unique():
    type_data = amplified_df[amplified_df['type'] == mbti_type]
    num_samples = len(type_data)

    if num_samples > 0:
        if num_samples < target_sample_size:
            # 오버샘플링: 샘플 수가 목표보다 적은 경우
            resampled_data = resample(type_data, replace=True, n_samples=target_sample_size, random_state=123)
        else:
            # 언더샘플링: 샘플 수가 목표보다 많은 경우
            resampled_data = resample(type_data, replace=False, n_samples=target_sample_size, random_state=123)
        balanced_data.append(resampled_data)

balanced_df = pd.concat(balanced_data)

In [None]:
balanced_df['type'].value_counts()

INFJ    2000
ENTP    2000
INTP    2000
INTJ    2000
ENTJ    2000
ENFJ    2000
INFP    2000
ENFP    2000
ISFP    2000
ISTP    2000
ISFJ    2000
ISTJ    2000
ESTP    2000
ESFP    2000
ESTJ    2000
ESFJ    2000
Name: type, dtype: int64

In [None]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1", trainable=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        text = tokenizer.tokenize(text)
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len

        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)


In [None]:
# Prepare data for BERT
X = bert_encode(balanced_df['amplified_posts'].values, tokenizer)
y = balanced_df['type'].values

In [None]:
num_unique_values = balanced_df['type'].nunique()
confirmed_mbti_types = balanced_df['type'].unique()
print("고유한 값들의 수:", num_unique_values)
print("고유한 값들:", confirmed_mbti_types)
all_mbti_types = ['ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ISTP', 'ISFP', 'INFP', 'INTP',
                  'ESTP', 'ESFP', 'ENFP', 'ENTP', 'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ']
all_types_present = all(mbti in confirmed_mbti_types for mbti in all_mbti_types)

all_types_present

고유한 값들의 수: 16
고유한 값들: ['INFJ' 'ENTP' 'INTP' 'INTJ' 'ENTJ' 'ENFJ' 'INFP' 'ENFP' 'ISFP' 'ISTP'
 'ISFJ' 'ISTJ' 'ESTP' 'ESFP' 'ESTJ' 'ESFJ']


True

In [None]:
# Label encoding for MBTI types
encoder = LabelBinarizer()
y_encoded = encoder.fit_transform(y)

In [None]:

print("y_encoded의 형태:", y_encoded.shape)

print("y_encoded의 첫 5행:", y_encoded[:5])

print("인코딩된 클래스 레이블:", encoder.classes_)

y_encoded의 형태: (32000, 16)
y_encoded의 첫 5행: [[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]]
인코딩된 클래스 레이블: ['ENFJ' 'ENFP' 'ENTJ' 'ENTP' 'ESFJ' 'ESFP' 'ESTJ' 'ESTP' 'INFJ' 'INFP'
 'INTJ' 'INTP' 'ISFJ' 'ISFP' 'ISTJ' 'ISTP']


In [None]:
# Checking the shapes
print("Shape of X[0]:", X[0].shape)
print("Shape of y_encoded:", y_encoded.shape)

# If the shapes are consistent, proceed with the split
if X[0].shape[0] == y_encoded.shape[0]:
    # Split each component of X separately
    X_train_tokens, X_test_tokens, y_train, y_test = train_test_split(
        X[0], y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
    X_train_masks, X_test_masks, _, _ = train_test_split(
        X[1], np.zeros(y_encoded.shape[0]), test_size=0.2, random_state=42, stratify=y_encoded)
    X_train_segments, X_test_segments, _, _ = train_test_split(
        X[2], np.zeros(y_encoded.shape[0]), test_size=0.2, random_state=42, stratify=y_encoded)
    # Now, X_train and X_test are tuples of arrays
    X_train = (X_train_tokens, X_train_masks, X_train_segments)
    X_test = (X_test_tokens, X_test_masks, X_test_segments)
else:
    print("Mismatch in the number of samples between X and y_encoded")


Shape of X[0]: (32000, 512)
Shape of y_encoded: (32000, 16)


In [None]:
# Define k-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
# K-fold Cross Validation model evaluation
fold_no = 1
# EarlyStopping 콜백 정의
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',    # 검증 손실을 모니터링
    patience=3,            # 3 에폭 동안 개선되지 않으면 중단
    restore_best_weights=True # 가장 좋은 모델의 가중치를 복원
)
for train, val in kfold.split(X_train[0]):

    # Define the model architecture inside the loop
    input_word_ids = Input(shape=(512,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(512,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(512,), dtype=tf.int32, name="segment_ids")

    # Correctly formatted input for the BERT layer
    bert_inputs = {
        'input_word_ids': input_word_ids,
        'input_mask': input_mask,
        'input_type_ids': segment_ids  # 'segment_ids' might be named 'input_type_ids' in some models
    }

    # BERT layer
    outputs = bert_layer(bert_inputs)
    print(outputs.keys())
    pooled_output = outputs['pooled_output'] # 'pooled_output'만 사용

    # 분류를 위한 출력 계층
    clf_output = pooled_output
    out = Dense(len(encoder.classes_), activation='softmax')(clf_output)

    # 모델 정의
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])


    # Generate data for this fold
    X_train_fold = [X_train[0][train], X_train[1][train], X_train[2][train]]
    y_train_fold = y_train[train]
    X_val_fold = [X_train[0][val], X_train[1][val], X_train[2][val]]
    y_val_fold = y_train[val]

    # Train the model
    print(f'Training for fold {fold_no} ...')
    history = model.fit(
        X_train_fold,
        y_train_fold,
        epochs=5,
        batch_size=8,
        validation_data=(X_val_fold, y_val_fold),
        callbacks=[early_stopping]
    )

    # Increase the fold number
    fold_no = fold_no + 1

dict_keys(['default', 'encoder_outputs', 'pooled_output', 'sequence_output'])
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
dict_keys(['default', 'encoder_outputs', 'pooled_output', 'sequence_output'])
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
dict_keys(['default', 'encoder_outputs', 'pooled_output', 'sequence_output'])
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
dict_keys(['default', 'encoder_outputs', 'pooled_output', 'sequence_output'])
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
dict_keys(['default', 'encoder_outputs', 'pooled_output', 'sequence_output'])
Training for fold 5 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


In [None]:
# Final evaluation on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')


Test Loss: 0.20981699228286743, Test Accuracy: 0.9581249952316284


In [None]:
from sklearn.metrics import f1_score
def calculate_f1_score(y_true, y_pred):
    # 실제 레이블과 예측 레이블 사이의 F1 점수 계산
    return f1_score(y_true, y_pred, average='weighted')

In [None]:
y_pred = model.predict(X_test)
y_pred_classes = tf.argmax(y_pred, axis=1)

# 실제 레이블
y_true_classes = tf.argmax(y_test, axis=1)

# F1 점수 계산
f1 = calculate_f1_score(y_true_classes, y_pred_classes)
print("F1 Score:", f1)

F1 Score: 0.958041058385522


In [None]:
# Predict probabilities function remains the same
def predict_mbti_probabilities(sentence):
    encoded_sentence = bert_encode([sentence], tokenizer)
    prediction = model.predict(encoded_sentence)
    return dict(zip(encoder.classes_, prediction[0]))

In [None]:
# Example Usage
sentence = "I love you"
probabilities = predict_mbti_probabilities(sentence)
print(probabilities)

{'ENFJ': 0.05035022, 'ENFP': 0.029673353, 'ENTJ': 0.010324592, 'ENTP': 0.0012459122, 'ESFJ': 0.0018080978, 'ESFP': 0.0038081668, 'ESTJ': 0.0074395295, 'ESTP': 0.00088908407, 'INFJ': 0.037549503, 'INFP': 0.7075851, 'INTJ': 0.008131627, 'INTP': 0.0014476663, 'ISFJ': 0.073002495, 'ISFP': 0.06258215, 'ISTJ': 0.001667325, 'ISTP': 0.0024952546}


In [None]:
sentence = "I cried while watching the movie because I was so sad."
probabilities = predict_mbti_probabilities(sentence)
print(probabilities)

{'ENFJ': 0.00015294302, 'ENFP': 0.0005073159, 'ENTJ': 0.000133497, 'ENTP': 3.1984175e-05, 'ESFJ': 6.617442e-05, 'ESFP': 0.00013976844, 'ESTJ': 6.8490495e-05, 'ESTP': 3.6546655e-05, 'INFJ': 0.00061548455, 'INFP': 0.997456, 'INTJ': 0.00020513423, 'INTP': 0.000115973475, 'ISFJ': 8.604079e-05, 'ISFP': 0.00027192617, 'ISTJ': 3.9314167e-05, 'ISTP': 7.335025e-05}


In [None]:
sentence = "I screwed up this test."
probabilities = predict_mbti_probabilities(sentence)
print(probabilities)

{'ENFJ': 0.001507009, 'ENFP': 0.00093468284, 'ENTJ': 0.00622011, 'ENTP': 0.011261168, 'ESFJ': 0.00033207168, 'ESFP': 0.00081737875, 'ESTJ': 0.0011217702, 'ESTP': 0.0012109146, 'INFJ': 0.0028877729, 'INFP': 0.0041965228, 'INTJ': 0.015448331, 'INTP': 0.91712826, 'ISFJ': 0.00065356254, 'ISFP': 0.0014492847, 'ISTJ': 0.007589445, 'ISTP': 0.0272418}


In [None]:
!sudo apt-get update
!sudo apt-get install git-lfs
!git lfs install


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Connecting to security.ub0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Connecting to security.ub                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:4 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:5 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:6 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:7 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:8 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:10 http://archive.ubuntu.com/ub

In [None]:
# 토크나이저 저장 경로
tokenizer_save_path = "./my_saved_tokenizer"
tokenizer.save_pretrained(tokenizer_save_path)

('./my_saved_tokenizer/tokenizer_config.json',
 './my_saved_tokenizer/special_tokens_map.json',
 './my_saved_tokenizer/vocab.txt',
 './my_saved_tokenizer/added_tokens.json')

In [None]:
# 모델 저장 경로
saved_model_path = "./my_saved_model"
model.save(saved_model_path, include_optimizer=False)


In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'stor

In [None]:
import shutil
from huggingface_hub import Repository

# 허깅페이스 저장소 초기화
repo_name = "mbti-classification-16"  # 저장소 이름
username = "purotae"  # 허깅페이스 사용자명

repo = Repository(local_dir=repo_name, clone_from=f"{username}/{repo_name}", use_auth_token=True)

# 토크나이저와 모델을 저장소로 이동
shutil.move(tokenizer_save_path, repo_name)
shutil.move(saved_model_path, repo_name)

# 저장소에 변경 사항 추가, 커밋 및 푸시
repo.git_add(auto_lfs_track=True)
repo.git_commit("Add fine-tuned BERT model and tokenizer")
repo.git_push()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/purotae/mbti-classification-16 into local empty directory.
Adding files tracked by Git LFS: ['my_saved_model/variables/variables.data-00000-of-00001', 'my_saved_model/variables/variables.index']. This may take a bit of time if the files are large.


Upload file my_saved_model/variables/variables.data-00000-of-00001:   0%|          | 1.00/420M [00:00<?, ?B/s]

Upload file my_saved_model/saved_model.pb:   0%|          | 1.00/14.4M [00:00<?, ?B/s]

Upload file my_saved_model/fingerprint.pb:   2%|1         | 1.00/55.0 [00:00<?, ?B/s]

Upload file my_saved_model/keras_metadata.pb:   0%|          | 1.00/9.16k [00:00<?, ?B/s]

Upload file my_saved_model/variables/variables.index:   0%|          | 1.00/15.4k [00:00<?, ?B/s]

To https://huggingface.co/purotae/mbti-classification-16
   952e576..0af3bac  main -> main

   952e576..0af3bac  main -> main



'https://huggingface.co/purotae/mbti-classification-16/commit/0af3bac103fdf8097c64b66405df3bd4fc30454b'