<a href="https://colab.research.google.com/github/naisargi0903/WT/blob/main/personality_with_questionnaire.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pin both packages so a fresh runtime resolves the same environment every time.
# TensorFlow Addons 0.23.0 is pinned alongside TensorFlow 2.15.0; installing them
# in one command lets pip resolve the pair together.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install -q tensorflow==2.15.0 tensorflow-addons==0.23.0

In [None]:
import pandas as pd
from transformers import TFBertModel, BertTokenizer
seed_value = 29
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)
import random
random.seed(seed_value)
import numpy as np
np.random.seed(seed_value)
np.set_printoptions(precision=2)
import tensorflow as tf
tf.random.set_seed(seed_value)
import tensorflow_addons as tfa
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
from tensorflow.keras.callbacks import ModelCheckpoint
import re
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve

In [None]:
# --- Notebook-wide configuration -------------------------------------------
N_AXIS = 4
MAX_SEQ_LEN = 128
BERT_NAME = 'bert-base-uncased'

# EMOTIONAL AXES:
#   Introversion (I) – Extroversion (E)
#   Intuition (N) – Sensing (S)
#   Thinking (T) – Feeling (F)
#   Judging (J) – Perceiving (P)
axes = ["I-E", "N-S", "T-F", "J-P"]

# Letter -> binary label: the first letter of each axis maps to 0 and the
# second letter maps to 1 (I/N/T/J -> 0, E/S/F/P -> 1).
classes = {
    letter: position
    for axis in axes
    for position, letter in enumerate(axis.split("-"))
}

In [None]:
def text_preprocessing(text):
    """Normalize a raw post string before tokenization.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URLs; drop
    ``<...>`` tags; drop newlines; drop every token that contains a digit;
    drop non-ASCII characters; finally strip a wrapping pair of single quotes.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) out of Python's own
    # string-escape processing, which otherwise warns about invalid escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # str is immutable: the encode/decode round trip only has an effect when
    # its result is assigned back.
    text = text.encode('ascii', 'ignore').decode('ascii')
    if text.startswith("'"):
        text = text[1:]
        # Only remove the final character when it really is the closing quote,
        # so text without one does not lose its last letter.
        if text.endswith("'"):
            text = text[:-1]
    return text

In [None]:
# Sizes of the three consecutive slices taken from the shuffled dataset.
train_n = 6624
val_n = 1024
test_n = 1024

data = pd.read_csv("/content/mbti_1.csv")
# Shuffle with an explicit seed so that re-running this cell reproduces the
# same train/validation/test split instead of depending on how much of the
# global NumPy random state has already been consumed.
data = data.sample(frac=1, random_state=seed_value)
# Show only a preview rather than dumping the whole frame.
print(data.head())

# One row per sample, one 0/1 column per letter of the type string, looked up
# through `classes`.
labels = np.array(
    [[classes[letter] for letter in personality] for personality in data["type"]],
    dtype="float32",
)
sentences = data["posts"].apply(str).apply(text_preprocessing)

# Slice boundaries for the validation and test partitions.
val_end = train_n + val_n
test_end = val_end + test_n

train_sentences = list(sentences.iloc[:train_n])
y_train = labels[:train_n]
val_sentences = list(sentences.iloc[train_n:val_end])
y_val = labels[train_n:val_end]
test_sentences = list(sentences.iloc[val_end:test_end])
y_test = labels[val_end:test_end]

In [None]:
# Tokenizers already loaded in this session, keyed by pretrained model name, so
# repeated calls do not reload the vocabulary each time.
_TOKENIZER_CACHE = {}


def prepare_bert_input(sentences, seq_len, bert_name):
    """Tokenize sentences into the three arrays the model takes as input.

    Args:
        sentences: List of strings to encode.
        seq_len: Length every sequence is truncated or padded to.
        bert_name: Pretrained model name used to load the tokenizer.

    Returns:
        A list ``[input_ids, attention_mask, token_type_ids]`` of NumPy arrays.
        If the tokenizer output has no ``token_type_ids``, an all-zero array
        shaped like ``input_ids`` is used instead.
    """
    tokenizer = _TOKENIZER_CACHE.get(bert_name)
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(bert_name)
        _TOKENIZER_CACHE[bert_name] = tokenizer

    encodings = tokenizer(sentences, truncation=True, padding='max_length', max_length=seq_len)

    input_ids = np.array(encodings["input_ids"])
    attention_mask = np.array(encodings["attention_mask"])
    token_type_ids = encodings.get("token_type_ids")
    if token_type_ids is None:
        token_type_ids = np.zeros_like(input_ids)
    else:
        token_type_ids = np.array(token_type_ids)

    # Returned under a local name that does not shadow the builtin `input`.
    return [input_ids, attention_mask, token_type_ids]

In [None]:
# Encode the three splits with identical tokenizer settings.
X_train, X_val, X_test = (
    prepare_bert_input(split_sentences, MAX_SEQ_LEN, BERT_NAME)
    for split_sentences in (train_sentences, val_sentences, test_sentences)
)


In [None]:
import tensorflow as tf
from transformers import TFBertModel, BertConfig
from tensorflow.keras import layers, Model, Input

# Constants (repeat the values assigned in the configuration cell earlier in the notebook)
MAX_SEQ_LEN = 128  # length of each Input below; also the seq_len used when encoding X_train / X_val / X_test
BERT_NAME = "bert-base-uncased"  # pretrained name handed to CustomBERTModel and to prepare_bert_input
N_AXIS = 4 # passed to CustomBERTModel as num_classes, i.e. the number of sigmoid outputs

class CustomBERTModel(Model):
    """BERT encoder followed by mean pooling and a sigmoid classification head.

    Args:
        bert_name: Pretrained model name passed to ``TFBertModel.from_pretrained``.
        num_classes: Number of independent sigmoid outputs.
    """

    def __init__(self, bert_name, num_classes):
        super().__init__()
        self.bert = TFBertModel.from_pretrained(bert_name)
        self.pooling = layers.GlobalAveragePooling1D()
        self.classifier = layers.Dense(num_classes, activation="sigmoid")

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Sequence ``(input_ids, attention_mask, token_type_ids)``.

        Returns:
            The sigmoid activations produced by the classification head.
        """
        input_ids, attention_mask, token_type_ids = inputs
        bert_outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Average only over real tokens. Without the mask the mean also covers
        # padding positions, which dominate short inputs padded to full length.
        token_mask = tf.cast(attention_mask, tf.bool)
        pooled_output = self.pooling(bert_outputs.last_hidden_state, mask=token_mask)
        return self.classifier(pooled_output)

# Instantiate the subclassed network; this is the object the later cells
# compile, train and query.
model = CustomBERTModel(BERT_NAME, N_AXIS)

# One symbolic int32 input of length MAX_SEQ_LEN per tokenizer output.
input_ids, attention_mask, token_type_ids = (
    Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name=tensor_name)
    for tensor_name in ('input_ids', 'attention_mask', 'token_type_ids')
)
model_inputs = [input_ids, attention_mask, token_type_ids]

# Functional wrapper around `model`, built by calling it on the symbolic inputs.
output = model(model_inputs)
final_model = Model(inputs=model_inputs, outputs=output)

# Compile the wrapper (optional, depends on your use case) and print its summary.
final_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
final_model.summary()

In [None]:
# Training hyper-parameters.
max_epochs = 5
batch_size = 32
opt = tfa.optimizers.RectifiedAdam(learning_rate=3e-5)
loss = keras.losses.BinaryCrossentropy()
best_weights_file = "weights.h5"

# Deliberately NOT named `auc`: that name is the function imported from
# sklearn.metrics at the top of the notebook, and plot_roc_auc calls it.
# Rebinding it to a Keras metric object would break the ROC plotting helper.
auc_metric = keras.metrics.AUC(multi_label=True, curve="ROC")

# Keep only the weights from the epoch with the highest validation AUC.
m_ckpt = ModelCheckpoint(best_weights_file, monitor='val_' + auc_metric.name, mode='max', verbose=2,
                         save_weights_only=True, save_best_only=True)
model.compile(loss=loss, optimizer=opt, metrics=[auc_metric, keras.metrics.BinaryAccuracy()])
model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=max_epochs,
    batch_size=batch_size,
    callbacks=[m_ckpt],
    verbose=2
)

In [None]:
# Reload the checkpoint written during training.
loss = keras.losses.BinaryCrossentropy()
best_weights_file = "weights.h5"
model.load_weights(best_weights_file)

# Re-compile with a fresh optimizer and fresh metric objects before scoring.
opt = tfa.optimizers.RectifiedAdam(learning_rate=3e-5)
eval_metrics = [
    keras.metrics.AUC(multi_label=True, curve="ROC"),
    keras.metrics.BinaryAccuracy(),
]
model.compile(loss=loss, optimizer=opt, metrics=eval_metrics)

# Per-axis scores for the held-out split, then the aggregate loss and metrics.
predictions = model.predict(X_test)
model.evaluate(X_test, y_test, batch_size=32)

In [None]:
def plot_roc_auc(y_test, y_score, classes):
    """Plot ROC curves with their areas for binary or multi-label scores.

    Args:
        y_test: Ground-truth labels. With more than two classes it is indexed
            as ``y_test[:, i]``, one column per class.
        y_score: Predicted scores, indexed the same way as ``y_test``.
        classes: Sequence of class names used in the legend; must contain
            more than one entry.

    Raises:
        AssertionError: If ``classes`` has fewer than two entries.
    """
    # Import under a private alias: the training cell of this notebook binds
    # the module-level name `auc` to a Keras metric, so looking up the global
    # here would not reach scikit-learn's area-under-curve function.
    from sklearn.metrics import auc as sk_auc

    assert len(classes) > 1, "len classes must be > 1"
    plt.figure()
    if len(classes) > 2:  # multi-label
        # One ROC curve per class column.
        for i in range(len(classes)):
            fpr, tpr, _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc = sk_auc(fpr, tpr)
            plt.plot(fpr, tpr, label='ROC curve of class {0} (area = {1:0.2f})'.format(classes[i], roc_auc))
        # Micro-average: pool every (label, score) pair into a single curve.
        fpr, tpr, _ = roc_curve(y_test.ravel(), y_score.ravel())
        roc_auc = sk_auc(fpr, tpr)
        plt.plot(fpr, tpr, label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc))
    else:
        fpr, tpr, _ = roc_curve(y_test, y_score)
        roc_auc = sk_auc(fpr, tpr)
        plt.plot(fpr, tpr, label='ROC curve (area = {0:0.2f})'.format(roc_auc))
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
import numpy as np

# The 20 questionnaire prompts. Each one is shown to the user in turn by the
# loop further down, and the free-text answer is scored by the model.
questions = [
    "How do you prefer to spend your free time—socializing or enjoying time alone?",
    "When making decisions, do you rely more on logic or personal values?",
    "Do you find it easier to focus on details or the bigger picture?",
    "How comfortable are you with making decisions quickly without all the information?",
    "Do you prefer planning everything out or being spontaneous?",
    "How do you handle conflict—do you try to avoid it, or confront it directly?",
    "Do you rely more on past experiences or new data when making decisions?",
    "Do you find interacting with others energizing or draining?",
    "When working on a project, do you prefer to follow instructions or come up with your own approach?",
    "How often do you seek feedback from others when working on something?",
    "Do you prefer a structured schedule or a more flexible approach to your day?",
    "Are you more comfortable expressing your emotions or keeping them to yourself?",
    "When faced with a problem, do you prefer to brainstorm ideas or research proven solutions?",
    "Do you tend to focus on what’s happening right now or think ahead to the future?",
    "How do you usually approach challenges—by seeking guidance or trying to figure it out on your own?",
    "Do you feel more motivated by external rewards (recognition, praise) or internal satisfaction?",
    "How do you deal with new information—do you analyze it carefully or trust your gut instinct?",
    "Do you enjoy discussing abstract ideas or prefer to stick to practical topics?",
    "When working with a team, do you prefer taking the lead or being a supportive member?",
    "Do you tend to base your decisions more on logic or how they’ll affect others emotionally?"
]

# Function to get the MBTI axis result for a given answer
def analyze_answer(sentence, model, axes):
    """Score one free-text answer and return a 0/1 vote per output.

    Args:
        sentence: The answer text to classify.
        model: Object whose ``predict`` method accepts the encoded input and
            returns one row of scores per input sentence.
        axes: Axis names. Not used by the computation; kept so existing
            callers can keep passing it.

    Returns:
        Integer NumPy array with 1 where the score is above 0.5 and 0 elsewhere.
    """
    # Tokenize and prepare input for BERT
    enc_sentences = prepare_bert_input([sentence], MAX_SEQ_LEN, BERT_NAME)

    # Only one sentence is submitted, so the first row holds its scores.
    predictions = model.predict(enc_sentences)

    # Threshold at 0.5 and convert the boolean mask straight to 0/1 integers.
    return (predictions[0] > 0.5).astype(int)

# Running per-axis tally of the 0/1 votes returned for each answer.
total_predictions = np.zeros((N_AXIS,))

# Ask every question, score the typed answer, and add its votes to the tally.
for question in questions:
    answer = input(question + "\n")
    total_predictions += analyze_answer(answer, model, axes)

# Fraction of answers that voted 1 on each axis.
average_predictions = total_predictions / len(questions)

# Strictly more than half the votes selects the second letter of the axis
# name; otherwise (ties included) the first letter is used.
final_personality = [
    axis_name[2] if axis_score > 0.5 else axis_name[0]
    for axis_name, axis_score in zip(axes, average_predictions)
]

# Output the final predicted MBTI personality
print("\nFinal predicted MBTI personality: " + "".join(final_personality))
