In [2]:
import pandas as pd

In [3]:
text_df = pd.read_csv('mbti_1.csv')
text_df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


## text_df Pre-Processing

In [4]:
import re
import nltk
from nltk.corpus import stopwords

#Create a function that will remove HTML tags, non-alphanumeric characters, and symbols using regex, and remove stopwords with nltk
def clean_text(text):
    #Remove HTML tags
    ct = re.sub('<.*?>', '', text)
    
    #Remove non-alphanumeric characters and symbols
    ct = re.sub(r'[^a-zA-Z\s]', '', ct)
    
    #Convert to lowercase
    #ct = ct.lower()
    
    #Import stop words from the English language
    stop_words = set(stopwords.words('english'))
    #Tokenize the current review text
    words = nltk.word_tokenize(ct)
    #Keep word if it's not in stop words that were imported
    filtered_words = [word for word in words if word.lower() not in stop_words]
    #Return cleaned string
    return ' '.join(filtered_words)

#Apply the function to each row of the DataFrame to clean the data
text_df['posts'] = text_df['posts'].apply(clean_text)


### Do one-hot encoding to identify if personality type contains I or E (Introvert or Extrovert), N or S, etc.

In [5]:
#Function to check for 'I' or 'E'
def check_IE(text):
    if 'I' in text:
        return 0
    elif 'E' in text:
        return 1

#Function to check for 'N' or 'S'
def check_NS(text):
    if 'N' in text:
        return 0
    elif 'S' in text:
        return 1

#Function to check for 'F' or 'T'  
def check_FT(text):
    if 'F' in text:
        return 0
    elif 'T' in text:
        return 1
    
#Function to check for 'J' or 'P'
def check_JP(text):
    if 'J' in text:
        return 0
    elif 'P' in text:
        return 1

#Apply the function to the 'type' column
text_df['IE'] = text_df['type'].apply(check_IE)
text_df['NS'] = text_df['type'].apply(check_NS)
text_df['FT'] = text_df['type'].apply(check_FT)
text_df['JP'] = text_df['type'].apply(check_JP)

text_df.head()

Unnamed: 0,type,posts,IE,NS,FT,JP
0,INFJ,httpwwwyoutubecomwatchvqsXHcwekrwhttpmediatumb...,0,0,0,0
1,ENTP,Im finding lack posts alarmingSex boring posit...,1,0,1,1
2,INTP,Good one httpswwwyoutubecomwatchvfHiGbolFFGwOf...,0,0,1,1
3,INTJ,Dear INTP enjoyed conversation day Esoteric ga...,0,0,1,0
4,ENTJ,Youre firedThats another silly misconception a...,1,0,1,0


## Label each personality type from 0-15

In [6]:
pd.set_option('future.no_silent_downcasting', True)

text_df['type'] = text_df['type'].replace(
    ['INTP', 'ISTP', 'ENTP', 'ESTP', 'INFP', 'ISFP', 'ENFP', 'ESFP', 
     'INTJ', 'ISTJ', 'ENTJ', 'ESTJ', 'INFJ', 'ISFJ', 'ENFJ', 'ESFJ'], 
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
)

text_df.head()

Unnamed: 0,type,posts,IE,NS,FT,JP
0,12,httpwwwyoutubecomwatchvqsXHcwekrwhttpmediatumb...,0,0,0,0
1,2,Im finding lack posts alarmingSex boring posit...,1,0,1,1
2,0,Good one httpswwwyoutubecomwatchvfHiGbolFFGwOf...,0,0,1,1
3,8,Dear INTP enjoyed conversation day Esoteric ga...,0,0,1,0
4,10,Youre firedThats another silly misconception a...,1,0,1,0


## Separate data into training and testing

In [7]:
from sklearn.model_selection import train_test_split
#Separate data into train and test
#Assuming you want to split into 80% train and 20% test
train_df, test_df = train_test_split(text_df, test_size=0.2, random_state=42)

#Optional: Reset index if needed
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [8]:
train_df.to_csv('train.csv')
test_df.to_csv('test.csv')

## Feature Engineering with TF-IDF to predict personality type with XGBoost

In [19]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
import sklearn.metrics as metrics
import pickle

#Prepare training data
X_train = train_df['posts']
y_train = train_df['type']

#Prepare testing data
X_test = test_df['posts']
y_test = test_df['type']

#Feature extraction using TF-IDF
#Initialize TF-IDF vectorizer assigning number of max features
vectorizer = TfidfVectorizer(max_features=10000)

#Fit and transform training data
X_train_tfidf = vectorizer.fit_transform(X_train)

#Transform validation data
X_test_tfidf = vectorizer.transform(X_test)

#Initialize XGBoost classifier
xgb_model = XGBClassifier()

# Train the classifier on the entire training dataset
xgb_model.fit(X_train_tfidf, y_train)

# Save the model to a file using pickle
with open('xgboost_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)
    
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Load the model from the file
with open('xgboost_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Use the loaded model to make predictions
y_pred = loaded_model.predict(X_test_tfidf)

y_test = y_test.astype(int)
y_pred = y_pred.astype(int)

# Evaluate model performance on validation data
print("Validation Classification Report:")
print(classification_report(y_test, y_pred))

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.76      0.71       293
           1       0.76      0.55      0.64        67
           2       0.59      0.56      0.58       135
           3       0.62      0.33      0.43        15
           4       0.66      0.81      0.73       370
           5       0.63      0.45      0.53        53
           6       0.62      0.59      0.60       125
           7       1.00      0.12      0.22         8
           8       0.64      0.67      0.65       193
           9       0.80      0.45      0.58        44
          10       0.57      0.39      0.46        44
          11       0.50      0.14      0.22         7
          12       0.62      0.67      0.64       288
          13       0.77      0.38      0.51        45
          14       0.50      0.12      0.20        41
          15       0.33      0.14      0.20         7

    accuracy                           0.65   

In [16]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
import sklearn.metrics as metrics
import pickle

# Load the model from the file
with open('xgboost_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)
    
#Prepare testing data
X_test = test_df['posts']

y_test = test_df['type']

# Feature extraction using TF-IDF (using the same fitted vectorizer)
X_test_tfidf = loaded_vectorizer.transform(X_test)

# Use the loaded model to make predictions
y_pred = loaded_model.predict(X_test_tfidf)

y_test = y_test.astype(int)
y_pred = y_pred.astype(int)

# Evaluate model performance on validation data
print("Validation Classification Report:")
print(classification_report(y_test, y_pred))

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.76      0.71       293
           1       0.76      0.55      0.64        67
           2       0.59      0.56      0.58       135
           3       0.62      0.33      0.43        15
           4       0.66      0.81      0.73       370
           5       0.63      0.45      0.53        53
           6       0.62      0.59      0.60       125
           7       1.00      0.12      0.22         8
           8       0.64      0.67      0.65       193
           9       0.80      0.45      0.58        44
          10       0.57      0.39      0.46        44
          11       0.50      0.14      0.22         7
          12       0.62      0.67      0.64       288
          13       0.77      0.38      0.51        45
          14       0.50      0.12      0.20        41
          15       0.33      0.14      0.20         7

    accuracy                           0.65   

## Predict each of the four categories separately

In [None]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import pandas as pd

#Columns 'IE', 'NS', 'FT', and 'JP' are your target columns
target_columns = ['IE', 'NS', 'FT', 'JP']

#Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=10000)

#Prepare training data
X_train = train_df['posts']
X_test = test_df['posts']

#Fit and transform training data
X_train_tfidf = vectorizer.fit_transform(X_train)

#Transform testing data
X_test_tfidf = vectorizer.transform(X_test)

#Loop over each target column to train and evaluate a classifier
for target in target_columns:
    y_train = train_df[target]
    y_test = test_df[target]
    
    #Initialize XGBoost classifier
    xgb_model = XGBClassifier()
    
    #Train the classifier on the training data
    xgb_model.fit(X_train_tfidf, y_train)
    
    #Predictions on the test data
    y_pred = xgb_model.predict(X_test_tfidf)
    
    #Evaluate model performance on the test data
    print(f"Validation Classification Report for {target}:")
    print(classification_report(y_test, y_pred))

Validation Classification Report for IE:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1353
           1       0.73      0.50      0.59       382

    accuracy                           0.85      1735
   macro avg       0.80      0.72      0.75      1735
weighted avg       0.84      0.85      0.84      1735

Validation Classification Report for NS:
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      1489
           1       0.79      0.38      0.51       246

    accuracy                           0.90      1735
   macro avg       0.85      0.68      0.73      1735
weighted avg       0.89      0.90      0.88      1735

Validation Classification Report for FT:
              precision    recall  f1-score   support

           0       0.83      0.84      0.84       937
           1       0.81      0.80      0.81       798

    accuracy                           0.82      1735
   ma

## Using the 10%-70% rule for the TF-IDF

In [None]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import pandas as pd

#Columns 'IE', 'NS', 'FT', and 'JP' are your target columns
target_columns = ['IE', 'NS', 'FT', 'JP']

#Initialize TF-IDF vectorizer with 10-70% rule
vectorizer = TfidfVectorizer(max_features=10000, min_df=0.1, max_df=0.7)

#Prepare training data
X_train = train_df['posts']
X_test = test_df['posts']

#Fit and transform training data
X_train_tfidf = vectorizer.fit_transform(X_train)

#Transform testing data
X_test_tfidf = vectorizer.transform(X_test)

#Loop over each target column to train and evaluate a classifier
for target in target_columns:
    y_train = train_df[target]
    y_test = test_df[target]
    
    #Initialize XGBoost classifier
    xgb_model = XGBClassifier()
    
    #Train the classifier on the training data
    xgb_model.fit(X_train_tfidf, y_train)
    
    #Predictions on the test data
    y_pred = xgb_model.predict(X_test_tfidf)
    
    #Evaluate model performance on the test data
    print(f"Validation Classification Report for {target}:")
    print(classification_report(y_test, y_pred))


Validation Classification Report for IE:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1353
           1       0.72      0.50      0.59       382

    accuracy                           0.85      1735
   macro avg       0.80      0.72      0.75      1735
weighted avg       0.84      0.85      0.84      1735

Validation Classification Report for NS:
              precision    recall  f1-score   support

           0       0.90      0.99      0.94      1489
           1       0.81      0.34      0.48       246

    accuracy                           0.90      1735
   macro avg       0.85      0.66      0.71      1735
weighted avg       0.89      0.90      0.88      1735

Validation Classification Report for FT:
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       937
           1       0.80      0.79      0.79       798

    accuracy                           0.81      1735
   ma

## Bring LLM to the task

In [7]:
test_df['type'] = test_df['type'] + 1
test_df.head()

Unnamed: 0,type,posts,IE,NS,FT,JP
0,1,lie avoid unreasonable response something comp...,0,0,1,1
1,9,said zero flexibility little time dating Going...,0,0,1,0
2,1,written bias something Im sure could even say ...,0,0,1,1
3,7,HAuhuHAuh might right Muhicz Im wondering step...,1,0,0,1
4,11,parents kind go college good job people want s...,1,0,1,0


In [8]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

tokenizer = AutoTokenizer.from_pretrained("JanSt/albert-base-v2_mbti-classification")
#model = AutoModelForSequenceClassification.from_pretrained("JanSt/albert-base-v2_mbti-classification")
model = torch.load('albertV1.pth')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

### Sample as to how the model predicts

In [9]:
# List of personality types in the order corresponding to the indices
personality_types = ['INTP', 'ISTP', 'ENTP', 'ESTP', 'INFP', 'ISFP', 'ENFP', 'ESFP', 
                     'INTJ', 'ISTJ', 'ENTJ', 'ESTJ', 'INFJ', 'ISFJ', 'ENFJ', 'ESFJ']

# Encode the input text
tokens = tokenizer.encode(r"I am introverted and rarely talk to people.", return_tensors='pt').to(device)

# Get the model's output
result = model(tokens)

# Apply softmax to get probabilities
probabilities = F.softmax(result.logits, dim=-1)

# Convert to percentages
percentages = probabilities * 100

# Print out the percentages for all classes
for idx, percentage in enumerate(percentages[0]):
    personality_type = personality_types[idx]
    print(f"{personality_type}: {percentage.item():.2f}%")

# Print the index of the maximum percentage
print(f"Predicted Personality Type Index: {int(torch.argmax(result.logits))}")

INTP: 2.99%
ISTP: 0.58%
ENTP: 0.90%
ESTP: 0.17%
INFP: 35.76%
ISFP: 6.67%
ENFP: 14.21%
ESFP: 0.20%
INTJ: 9.28%
ISTJ: 2.45%
ENTJ: 0.70%
ESTJ: 0.26%
INFJ: 23.04%
ISFJ: 1.90%
ENFJ: 0.75%
ESFJ: 0.14%
Predicted Personality Type Index: 4


In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Initialize LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(personality_types)  # Fit encoder to personality types

# Function for MBTI prediction
def mbti_prediction(text):
    tokens = tokenizer.encode(text, return_tensors='pt').to(device)
    result = model(tokens)
    return int(torch.argmax(result.logits))+1

# Convert predictions and true labels to integer indices
y_pred_LLM = test_df['posts'].apply(lambda x: mbti_prediction(x[:512]))
y_test_LLM = test_df['type']

# Print the classification report
print(classification_report(y_test_LLM.to_list(), y_pred_LLM.to_list(), target_names=personality_types))

              precision    recall  f1-score   support

        INTP       0.53      0.40      0.46       293
        ISTP       0.44      0.30      0.36        67
        ENTP       0.46      0.35      0.39       135
        ESTP       0.33      0.07      0.11        15
        INFP       0.41      0.59      0.49       370
        ISFP       0.00      0.00      0.00        53
        ENFP       0.29      0.38      0.33       125
        ESFP       0.00      0.00      0.00         8
        INTJ       0.32      0.50      0.39       193
        ISTJ       0.19      0.20      0.20        44
        ENTJ       0.44      0.34      0.38        44
        ESTJ       0.00      0.00      0.00         7
        INFJ       0.41      0.35      0.38       288
        ISFJ       0.44      0.24      0.31        45
        ENFJ       0.54      0.17      0.26        41
        ESFJ       0.00      0.00      0.00         7

    accuracy                           0.40      1735
   macro avg       0.30   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
from datasets import DatasetDict, Dataset

valid_df = test_df

#List of columns to remove
columns_to_remove = ['IE', 'NS', 'FT', 'JP']

#Remove columns from DataFrame
train_df = train_df.drop(columns=columns_to_remove)

#Convert pandas DataFrame to Dataset
train_dataset = Dataset.from_pandas(train_df)

#Rename columns
train_dataset = train_dataset.rename_column('posts', 'text')
train_dataset = train_dataset.rename_column('type', 'label')

valid_df = valid_df.drop(columns=columns_to_remove)

#Convert pandas DataFrame to Dataset
valid_dataset = Dataset.from_pandas(valid_df)

#Rename columns
valid_dataset = valid_dataset.rename_column('posts', 'text')
valid_dataset = valid_dataset.rename_column('type', 'label')

#Create a DatasetDict object
dataset = DatasetDict({
    'train': train_dataset,
    'test': valid_dataset
})

#Print combined dataset summary
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 6940
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1735
    })
})


In [None]:
import evaluate
import torch
import numpy as np
from transformers import TrainingArguments, Trainer

In [None]:
#Function obtained from: https://huggingface.co/docs/transformers/en/training
#Tokenizes each row of the dataset 'text' vector, indicating padding type and truncation.
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

#Tokenize data with function defined above
tokenized_datasets = dataset.map(tokenize_function, batched=True)

#This variable is to monitor evaluation metrics while training and indicating which evaluation strategy which was set to epoch.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

#Move model and tokenizer to GPU if available (Models takes too much time to train in cpu)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

#Function obtained from https://huggingface.co/docs/transformers/en/training
#Function to calculate predictions accuracy while training.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    #Calculate prediction from logits
    predictions = np.argmax(logits, axis=-1)
    metric = evaluate.load("accuracy")
    #Convert the logits to predictions
    return metric.compute(predictions=predictions, references=labels)

#Create a trainer object which contains the following atributes, model, monitor variable, datasets and evaluation metrics. They were all previously defined.
#Function obtained from: https://huggingface.co/docs/transformers/en/training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

#Use trainer object to train the model with stablished parameters.
trainer.train()

torch.save(model, 'albert.pth')

Map:   0%|          | 0/6940 [00:00<?, ? examples/s]

Map:   0%|          | 0/1735 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/2604 [00:00<?, ?it/s]

{'loss': 2.3861, 'grad_norm': 5.999273777008057, 'learning_rate': 4.0399385560675886e-05, 'epoch': 0.58}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_loss': 2.028566360473633, 'eval_accuracy': 0.3648414985590778, 'eval_runtime': 44.5526, 'eval_samples_per_second': 38.943, 'eval_steps_per_second': 4.871, 'epoch': 1.0}
{'loss': 2.1822, 'grad_norm': 26.257640838623047, 'learning_rate': 3.079877112135177e-05, 'epoch': 1.15}
{'loss': 2.0068, 'grad_norm': 31.12150764465332, 'learning_rate': 2.1198156682027652e-05, 'epoch': 1.73}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_loss': 1.8431850671768188, 'eval_accuracy': 0.4449567723342939, 'eval_runtime': 44.8238, 'eval_samples_per_second': 38.707, 'eval_steps_per_second': 4.841, 'epoch': 2.0}
{'loss': 1.8417, 'grad_norm': 31.13799476623535, 'learning_rate': 1.1597542242703534e-05, 'epoch': 2.3}
{'loss': 1.7305, 'grad_norm': 68.17082977294922, 'learning_rate': 1.996927803379416e-06, 'epoch': 2.88}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_loss': 1.7948306798934937, 'eval_accuracy': 0.45878962536023055, 'eval_runtime': 45.359, 'eval_samples_per_second': 38.25, 'eval_steps_per_second': 4.784, 'epoch': 3.0}
{'train_runtime': 2693.6345, 'train_samples_per_second': 7.729, 'train_steps_per_second': 0.967, 'train_loss': 2.0174466497887114, 'epoch': 3.0}
