**without data augmentation & feature extraction**

# import data

In [None]:
# Mount Google Drive at /content/drive so the dataset files stored there can be read.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Confirm that the raw MBTI CSV exists on the mounted Drive before trying to load it
!ls /content/drive/MyDrive/TI14_Personal/mbti_1.csv

/content/drive/MyDrive/TI14_Personal/mbti_1.csv


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from gensim.models import KeyedVectors
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from sklearn.preprocessing import StandardScaler

import nltk
import re
import os
import random
from collections import Counter
from google.colab import files
from nltk import pos_tag, ne_chunk
from nltk.util import ngrams

# Download necessary NLTK data
nltk.download('punkt')  # tokenizer models; presumably what word_tokenize relies on
nltk.download('stopwords')  # stop-word lists read through stopwords.words(...)
nltk.download('wordnet')  # WordNet data; presumably backs WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # POS tagger model (pos_tag is imported above)
nltk.download('maxent_ne_chunker')  # named-entity chunker model (ne_chunk is imported above)
nltk.download('words')  # word list; presumably needed by the named-entity chunker

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
# Load the raw MBTI dataset from Drive into `df`
df = pd.read_csv('/content/drive/MyDrive/TI14_Personal/mbti_1.csv')

In [None]:
# Preview the raw frame
df

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


# Data Preprocess

In [None]:
# Strip MBTI type labels from a post so the target label cannot be read from the text
def remove_mbti_words(text):
    """Return `text` with every occurrence of the 16 MBTI type codes deleted.

    Matching is case-insensitive and not restricted to whole words, so a code
    embedded in a longer token (e.g. the "INFJ" in "INFJs") is removed as well.
    The codes are applied one after another, in the order listed below.
    """
    type_codes = (
        'INFJ', 'INTJ', 'ENFJ', 'ENTJ',
        'INFP', 'INTP', 'ENFP', 'ENTP',
        'ISFJ', 'ISTJ', 'ESFJ', 'ESTJ',
        'ISFP', 'ISTP', 'ESFP', 'ESTP',
    )
    cleaned = text
    for code in type_codes:
        cleaned = re.compile(code, re.IGNORECASE).sub('', cleaned)
    return cleaned

In [None]:
# Built once and reused for every post instead of being rebuilt inside each call
_STOP_WORDS = set(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Clean one row of concatenated posts and return a space-joined token string.

    Steps, in order: remove MBTI type codes, lowercase, split the '|||' post
    separator, drop URLs, turn underscores and punctuation into spaces, drop
    digits, collapse whitespace, tokenize, remove English stop words, lemmatize.

    Args:
        text: Raw `posts` string; individual posts are joined with '|||'.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    text = remove_mbti_words(text)  # Remove MBTI words first
    text = text.lower()
    # Posts are joined by '|||' with no surrounding whitespace. Separate them
    # before URL removal: otherwise `\S+` runs through the separator and also
    # deletes the first word of the post that follows a URL.
    text = text.replace('|||', ' ')
    # Remove URLs ('http\S+' already covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)
    # Underscores count as word characters, so they need their own replacement
    text = re.sub(r'_', ' ', text)
    # Replace punctuation with a space
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Collapse runs of whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    # Stop words are filtered on the surface form, then the survivors are lemmatized
    tokens = [_LEMMATIZER.lemmatize(word) for word in tokens if word not in _STOP_WORDS]
    return ' '.join(tokens)

In [None]:
# Apply preprocessing: run preprocess_text on every row of `posts` and keep the
# result in a new column so the raw text stays available
df['posts_cleaned'] = df['posts'].apply(preprocess_text)  # Apply the preprocessing to each post

In [None]:
# Split the four-letter type into one column per MBTI dimension.
# Each column receives the letter found at the matching position of `type`.
for position, column in enumerate(['I/E', 'N/S', 'F/T', 'J/P']):
    df[column] = df['type'].apply(lambda code, position=position: code[position])

In [None]:
# Tokenize the cleaned posts: `tokens` holds the word_tokenize output for each row
df['tokens'] = df['posts_cleaned'].apply(word_tokenize)

In [None]:
# Inspect the first rows after preprocessing
df.head()

Unnamed: 0,type,posts,posts_cleaned,I/E,N/S,F/T,J/P
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,moment sportscenter top ten play prank life ch...,I,N,F,J
1,ENTP,'I'm finding the lack of me in these posts ver...,finding lack post alarming sex boring position...,E,N,T,P
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one course say know blessing curse absolu...,I,N,T,P
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed conversation day esoteric gabbing...,I,N,T,J
4,ENTJ,'You're fired.|||That's another silly misconce...,fired another silly misconception approaching ...,E,N,T,J


In [None]:
# Check that a processed export exists on Drive.
# NOTE(review): this lists `processed_mbti.csv`, while the next cell reads
# `preprocessed_mbti1(1).csv` - confirm which file is the intended one.
!ls /content/drive/MyDrive/TI14_Personal/processed_mbti.csv

/content/drive/MyDrive/TI14_Personal/processed_mbti.csv


In [None]:
# Reload an already preprocessed export; this replaces the `df` built by the cells above.
# NOTE(review): no cell visible in this notebook writes this file, and its name differs
# from the `processed_mbti.csv` listed just before - confirm where it comes from.
# NOTE(review): after a CSV round trip `tokens` presumably holds the text form of each
# list rather than a Python list - verify before treating it as a list.
df = pd.read_csv('/content/drive/MyDrive/TI14_Personal/preprocessed_mbti1(1).csv')

In [None]:
# Preview the reloaded, preprocessed frame
df

Unnamed: 0,type,posts,posts_cleaned,I/E,N/S,F/T,J/P,tokens
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,moment sportscenter top ten play prank life ch...,I,N,F,J,"['moment', 'sportscenter', 'top', 'ten', 'play..."
1,ENTP,'I'm finding the lack of me in these posts ver...,finding lack post alarming sex boring position...,E,N,T,P,"['finding', 'lack', 'post', 'alarming', 'sex',..."
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one course say know blessing curse absolu...,I,N,T,P,"['good', 'one', 'course', 'say', 'know', 'bles..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed conversation day esoteric gabbing...,I,N,T,J,"['dear', 'enjoyed', 'conversation', 'day', 'es..."
4,ENTJ,'You're fired.|||That's another silly misconce...,fired another silly misconception approaching ...,E,N,T,J,"['fired', 'another', 'silly', 'misconception',..."
...,...,...,...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,always think cat fi doms reason website become...,I,S,F,P,"['always', 'think', 'cat', 'fi', 'doms', 'reas..."
8671,ENFP,'So...if this thread already exists someplace ...,thread already exists someplace else heck dele...,E,N,F,P,"['thread', 'already', 'exists', 'someplace', '..."
8672,INTP,'So many questions when i do these things. I ...,many question thing would take purple pill pic...,I,N,T,P,"['many', 'question', 'thing', 'would', 'take',..."
8673,INFP,'I am very conflicted right now when it comes ...,conflicted right come wanting child honestly m...,I,N,F,P,"['conflicted', 'right', 'come', 'wanting', 'ch..."


# Split data

In [None]:
# Check distribution before splitting: share of each of the 16 types in the full data,
# used as the reference for the stratified split below
print("Distribution before splitting:")
print(df['type'].value_counts(normalize=True))

Distribution before splitting:
type
INFP    0.211182
INFJ    0.169452
INTP    0.150317
INTJ    0.125764
ENTP    0.078963
ENFP    0.077810
ISTP    0.038847
ISFP    0.031239
ENTJ    0.026628
ISTJ    0.023631
ENFJ    0.021902
ISFJ    0.019135
ESTP    0.010259
ESFP    0.005533
ESFJ    0.004841
ESTJ    0.004496
Name: proportion, dtype: float64


In [None]:
# Split data into training and testing sets (80% train, 20% test).
# stratify=df['type'] keeps the type proportions the same in both parts;
# random_state=42 makes the shuffle repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['type'], random_state=42)

In [None]:
# Check distribution after splitting: both parts should show nearly the same
# per-type shares as the full data because the split was stratified on `type`
print("\nDistribution in training set:")
print(train_df['type'].value_counts(normalize=True))

print("\nDistribution in test set:")
print(test_df['type'].value_counts(normalize=True))


Distribution in training set:
type
INFP    0.211239
INFJ    0.169452
INTP    0.150288
INTJ    0.125793
ENTP    0.078963
ENFP    0.077810
ISTP    0.038905
ISFP    0.031268
ENTJ    0.026657
ISTJ    0.023631
ENFJ    0.021902
ISFJ    0.019164
ESTP    0.010231
ESFP    0.005476
ESFJ    0.004755
ESTJ    0.004467
Name: proportion, dtype: float64

Distribution in test set:
type
INFP    0.210951
INFJ    0.169452
INTP    0.150432
INTJ    0.125648
ENTP    0.078963
ENFP    0.077810
ISTP    0.038617
ISFP    0.031124
ENTJ    0.026513
ISTJ    0.023631
ENFJ    0.021902
ISFJ    0.019020
ESTP    0.010375
ESFP    0.005764
ESFJ    0.005187
ESTJ    0.004611
Name: proportion, dtype: float64


In [None]:
# Preview the training split (the dimension columns still hold letters at this point)
train_df

Unnamed: 0,type,posts,posts_cleaned,I/E,N/S,F/T,J/P,tokens
8331,INFP,'this is actually exactly what i expected! :l...,actually exactly expected laughing introversio...,I,N,F,P,"['actually', 'exactly', 'expected', 'laughing'..."
1290,ISTP,"'Nope. Not now, not ever. I'm too busy with ...",nope ever busy work cause adrenaline rush acti...,I,S,T,P,"['nope', 'ever', 'busy', 'work', 'cause', 'adr..."
1982,ENFJ,'Yes peace is the absence of conflict - your I...,yes peace absence conflict friend suxx hardd i...,E,N,F,J,"['yes', 'peace', 'absence', 'conflict', 'frien..."
769,INFP,"'I apologize for the delayed response, but tha...",apologize delayed response thank taking time s...,I,N,F,P,"['apologize', 'delayed', 'response', 'thank', ..."
8339,INFP,"'Nightglow, I can't even imagine what you must...",nightglow even imagine must struggling right d...,I,N,F,P,"['nightglow', 'even', 'imagine', 'must', 'stru..."
...,...,...,...,...,...,...,...,...
4273,INFP,'I'm annoyed. I'm sick of negative associatio...,annoyed sick negative association uncontrolled...,I,N,F,P,"['annoyed', 'sick', 'negative', 'association',..."
2698,INFP,'My dad just told me that he loved me for I th...,dad told loved think first time life mean said...,I,N,F,P,"['dad', 'told', 'loved', 'think', 'first', 'ti..."
7435,ENTJ,"'I have dated a few INFJs, including my curren...",dated including current partner year probably ...,E,N,T,J,"['dated', 'including', 'current', 'partner', '..."
1843,INTP,'People who are unable to replace social norms...,people unable replace social norm rational eff...,I,N,T,P,"['people', 'unable', 'replace', 'social', 'nor..."


# Model

In [None]:
# Decode a four-letter MBTI type into four binary flags
def get_mbti_dimensions(mbti_type):
    """Return (I/E, N/S, F/T, J/P) flags for a four-letter MBTI type string.

    A flag is 1 when the letter at that position is 'I', 'N', 'F' or 'J'
    respectively, and 0 for any other character.
    """
    return tuple(
        1 if mbti_type[position] == positive_letter else 0
        for position, positive_letter in enumerate('INFJ')
    )

# Encode the four binary targets on both splits, overwriting the letter columns.
# All rows of a split are decoded into a single DataFrame instead of building one
# pd.Series per row; keeping the split's own index makes the assignment line up
# row for row.
dimension_columns = ['I/E', 'N/S', 'F/T', 'J/P']
for split_df in (train_df, test_df):
    split_df[dimension_columns] = pd.DataFrame(
        [get_mbti_dimensions(mbti_type) for mbti_type in split_df['type']],
        index=split_df.index,
        columns=dimension_columns,
    )

In [None]:
# Define the RandomForest model: fixed seed for repeatable results, tree depth capped at 10
rf_model = RandomForestClassifier(random_state=42, max_depth=10)

# Alternative without the depth cap (kept for reference, not executed):
# rf_model = RandomForestClassifier(random_state=42)


In [None]:
# Preview the training split again; the dimension columns now hold 0/1 flags
train_df

Unnamed: 0,type,posts,posts_cleaned,I/E,N/S,F/T,J/P,tokens
8331,INFP,'this is actually exactly what i expected! :l...,actually exactly expected laughing introversio...,1,1,1,0,"['actually', 'exactly', 'expected', 'laughing'..."
1290,ISTP,"'Nope. Not now, not ever. I'm too busy with ...",nope ever busy work cause adrenaline rush acti...,1,0,0,0,"['nope', 'ever', 'busy', 'work', 'cause', 'adr..."
1982,ENFJ,'Yes peace is the absence of conflict - your I...,yes peace absence conflict friend suxx hardd i...,0,1,1,1,"['yes', 'peace', 'absence', 'conflict', 'frien..."
769,INFP,"'I apologize for the delayed response, but tha...",apologize delayed response thank taking time s...,1,1,1,0,"['apologize', 'delayed', 'response', 'thank', ..."
8339,INFP,"'Nightglow, I can't even imagine what you must...",nightglow even imagine must struggling right d...,1,1,1,0,"['nightglow', 'even', 'imagine', 'must', 'stru..."
...,...,...,...,...,...,...,...,...
4273,INFP,'I'm annoyed. I'm sick of negative associatio...,annoyed sick negative association uncontrolled...,1,1,1,0,"['annoyed', 'sick', 'negative', 'association',..."
2698,INFP,'My dad just told me that he loved me for I th...,dad told loved think first time life mean said...,1,1,1,0,"['dad', 'told', 'loved', 'think', 'first', 'ti..."
7435,ENTJ,"'I have dated a few INFJs, including my curren...",dated including current partner year probably ...,0,1,0,1,"['dated', 'including', 'current', 'partner', '..."
1843,INTP,'People who are unable to replace social norms...,people unable replace social norm rational eff...,1,1,0,0,"['people', 'unable', 'replace', 'social', 'nor..."


In [None]:
# Pick the `tokens` column of each split as the only model input
string_to_float_train = train_df['tokens']
string_to_float_test = test_df['tokens']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn each row's whole `tokens` value into one number: LabelEncoder gives every
# distinct value an integer code, which is then cast to float.
# NOTE(review): the test split is encoded by its own, independently fitted encoder, so
# a given code does not stand for the same text in train and test - confirm this is
# the intended "no feature extraction" baseline.
XTrain_float_tokens = LabelEncoder().fit_transform(string_to_float_train).astype(float)
XTest_float_tokens = LabelEncoder().fit_transform(string_to_float_test).astype(float)


In [None]:
# In this variant the "combined" features are just the encoded token codes
train_combined_features = XTrain_float_tokens
test_combined_features = XTest_float_tokens

In [None]:
# Reshape each 1-D code array into a single-column 2-D array (one row per sample),
# the layout the estimators' fit/predict calls below are given
train_combined_features = train_combined_features.reshape(-1, 1)
test_combined_features = test_combined_features.reshape(-1, 1)

In [None]:
# Train the model on class type: the target is the full 16-way `type` label,
# not the individual dimensions
rf_model.fit(train_combined_features, train_df['type'])

In [None]:
train_combined_features  # this is only the string-to-float encoding

array([[  48.],
       [3831.],
       [6867.],
       ...,
       [1034.],
       [4190.],
       [6817.]])

In [None]:
# NOTE(review): the displayed array is the same as in the previous cell - confirm
# whether a feature-extraction variant was meant to be shown here.
train_combined_features  # this is the one for when feature extraction is used

array([[  48.],
       [3831.],
       [6867.],
       ...,
       [1034.],
       [4190.],
       [6817.]])

In [None]:
# Predict the 16-way type for every training row, then decode each predicted type
# into its four binary flags. Rows keep the order of train_combined_features.
train_predictions = rf_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame(
    list(map(get_mbti_dimensions, train_predictions)),
    columns=['I/E', 'N/S', 'F/T', 'J/P'],
)

In [None]:

# Predict the 16-way type for every test row, then decode each predicted type
# into its four binary flags. Rows keep the order of test_combined_features.
test_predictions = rf_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame(
    list(map(get_mbti_dimensions, test_predictions)),
    columns=['I/E', 'N/S', 'F/T', 'J/P'],
)

In [None]:
# Per-dimension evaluation of the current train/test predictions.
# Accuracy is reported for train and test plus their mean; precision, recall and F1
# are computed on the test split with the class encoded as 1 as the positive class.
average_accuracies = []
precisions = []
recalls = []
f1_scores = []

for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    # Look the columns up once per dimension
    y_train_true, y_train_pred = train_df[dimension], train_pred_dimensions[dimension]
    y_test_true, y_test_pred = test_df[dimension], test_pred_dimensions[dimension]

    train_accuracy = accuracy_score(y_train_true, y_train_pred)
    test_accuracy = accuracy_score(y_test_true, y_test_pred)

    # Mean of train and test accuracy for this dimension
    average_accuracy = (train_accuracy + test_accuracy) / 2
    average_accuracies.append(average_accuracy)

    # zero_division=0 states the intended value when a class is never predicted,
    # instead of getting the same 0.0 together with an UndefinedMetricWarning
    precision = precision_score(y_test_true, y_test_pred, zero_division=0)
    recall = recall_score(y_test_true, y_test_pred, zero_division=0)
    f1 = f1_score(y_test_true, y_test_pred, zero_division=0)

    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Train Accuracy for {dimension}: {train_accuracy}")
    print(f"Test Accuracy for {dimension}: {test_accuracy}")
    print(f"Average Accuracy for {dimension}: {average_accuracy}")
    print(f"Precision for {dimension}: {precision}")
    print(f"Recall for {dimension}: {recall}")
    print(f"F1-Score for {dimension}: {f1}\n")

# Unweighted means over the four dimensions
overall_average_accuracy = sum(average_accuracies) / len(average_accuracies)
overall_precision = sum(precisions) / len(precisions)
overall_recall = sum(recalls) / len(recalls)
overall_f1_score = sum(f1_scores) / len(f1_scores)

print(f"Overall Average Accuracy: {overall_average_accuracy}")
print(f"Overall Precision: {overall_precision}")
print(f"Overall Recall: {overall_recall}")
print(f"Overall F1-Score: {overall_f1_score}")

Train Accuracy for I/E: 0.7802593659942363
Test Accuracy for I/E: 0.7613832853025937
Average Accuracy for I/E: 0.770821325648415
Precision for I/E: 0.7690058479532164
Recall for I/E: 0.9857571214392804
F1-Score for I/E: 0.8639947437582128

Train Accuracy for N/S: 0.8632564841498559
Test Accuracy for N/S: 0.8593659942363112
Average Accuracy for N/S: 0.8613112391930835
Precision for N/S: 0.8613518197573656
Recall for N/S: 0.9973244147157191
F1-Score for N/S: 0.9243645381277124

Train Accuracy for F/T: 0.6190201729106628
Test Accuracy for F/T: 0.5268011527377522
Average Accuracy for F/T: 0.5729106628242076
Precision for F/T: 0.5419034090909091
Recall for F/T: 0.8125665601703941
F1-Score for F/T: 0.6501917341286748

Train Accuracy for J/P: 0.6736311239193083
Test Accuracy for J/P: 0.5544668587896253
Average Accuracy for J/P: 0.6140489913544669
Precision for J/P: 0.39086294416243655
Recall for J/P: 0.22416302765647744
F1-Score for J/P: 0.28492136910268273

Overall Average Accuracy: 0.704773

# SVM

In [None]:
# Define the SVM model with library defaults apart from the fixed seed
svm_model = SVC(random_state=42)

In [None]:
# Train the model on class type: same single-column features and 16-way `type`
# target as the random forest above
svm_model.fit(train_combined_features, train_df['type'])

In [None]:
# Predict the 16-way type for every training row with the SVM, then decode each
# predicted type into its four binary flags (row order follows the feature array).
train_predictions = svm_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame(
    list(map(get_mbti_dimensions, train_predictions)),
    columns=['I/E', 'N/S', 'F/T', 'J/P'],
)

In [None]:
# Predict the 16-way type for every test row with the SVM, then decode each
# predicted type into its four binary flags (row order follows the feature array).
test_predictions = svm_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame(
    list(map(get_mbti_dimensions, test_predictions)),
    columns=['I/E', 'N/S', 'F/T', 'J/P'],
)

In [None]:
# Per-dimension evaluation of the current train/test predictions.
# Accuracy is reported for train and test plus their mean; precision, recall and F1
# are computed on the test split with the class encoded as 1 as the positive class.
average_accuracies = []
precisions = []
recalls = []
f1_scores = []

for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    # Look the columns up once per dimension
    y_train_true, y_train_pred = train_df[dimension], train_pred_dimensions[dimension]
    y_test_true, y_test_pred = test_df[dimension], test_pred_dimensions[dimension]

    train_accuracy = accuracy_score(y_train_true, y_train_pred)
    test_accuracy = accuracy_score(y_test_true, y_test_pred)

    # Mean of train and test accuracy for this dimension
    average_accuracy = (train_accuracy + test_accuracy) / 2
    average_accuracies.append(average_accuracy)

    # When the positive class is never predicted, precision and F1 are undefined.
    # zero_division=0 reports them as 0.0 explicitly rather than returning the same
    # value together with an UndefinedMetricWarning.
    precision = precision_score(y_test_true, y_test_pred, zero_division=0)
    recall = recall_score(y_test_true, y_test_pred, zero_division=0)
    f1 = f1_score(y_test_true, y_test_pred, zero_division=0)

    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Train Accuracy for {dimension}: {train_accuracy}")
    print(f"Test Accuracy for {dimension}: {test_accuracy}")
    print(f"Average Accuracy for {dimension}: {average_accuracy}")
    print(f"Precision for {dimension}: {precision}")
    print(f"Recall for {dimension}: {recall}")
    print(f"F1-Score for {dimension}: {f1}\n")

# Unweighted means over the four dimensions
overall_average_accuracy = sum(average_accuracies) / len(average_accuracies)
overall_precision = sum(precisions) / len(precisions)
overall_recall = sum(recalls) / len(recalls)
overall_f1_score = sum(f1_scores) / len(f1_scores)

print(f"Overall Average Accuracy: {overall_average_accuracy}")
print(f"Overall Precision: {overall_precision}")
print(f"Overall Recall: {overall_recall}")
print(f"Overall F1-Score: {overall_f1_score}")

Train Accuracy for I/E: 0.7697406340057636
Test Accuracy for I/E: 0.7688760806916427
Average Accuracy for I/E: 0.7693083573487032
Precision for I/E: 0.7688760806916427
Recall for I/E: 1.0
F1-Score for I/E: 0.8693385467579016

Train Accuracy for N/S: 0.8621037463976945
Test Accuracy for N/S: 0.861671469740634
Average Accuracy for N/S: 0.8618876080691642
Precision for N/S: 0.861671469740634
Recall for N/S: 1.0
F1-Score for N/S: 0.9256965944272446

Train Accuracy for F/T: 0.5410662824207493
Test Accuracy for F/T: 0.5412103746397694
Average Accuracy for F/T: 0.5411383285302593
Precision for F/T: 0.5412103746397694
Recall for F/T: 1.0
F1-Score for F/T: 0.7023186237845923

Train Accuracy for J/P: 0.604178674351585
Test Accuracy for J/P: 0.6040345821325649
Average Accuracy for J/P: 0.604106628242075
Precision for J/P: 0.0
Recall for J/P: 0.0
F1-Score for J/P: 0.0

Overall Average Accuracy: 0.6941102305475504
Overall Precision: 0.5429394812680115
Overall Recall: 0.75
Overall F1-Score: 0.624338

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Re-score J/P for the current predictions with P (encoded as 0) as the positive class.
# Everything printed here is computed in this cell, so it does not depend on loop
# variables left over from the previous cell, and nothing is appended to the
# per-dimension lists whose overall averages have already been reported.
dimension = 'J/P'

train_accuracy = accuracy_score(train_df[dimension], train_pred_dimensions[dimension])
test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])
average_accuracy = (train_accuracy + test_accuracy) / 2

# Precision, recall and F1 on the test split, positive class = 0
precision = precision_score(test_df[dimension], test_pred_dimensions[dimension], pos_label=0, zero_division=0)
recall = recall_score(test_df[dimension], test_pred_dimensions[dimension], pos_label=0, zero_division=0)
f1 = f1_score(test_df[dimension], test_pred_dimensions[dimension], pos_label=0, zero_division=0)

print(f"Train Accuracy for {dimension}: {train_accuracy}")
print(f"Test Accuracy for {dimension}: {test_accuracy}")
print(f"Average Accuracy for {dimension}: {average_accuracy}")
print(f"Precision for {dimension}: {precision}")
print(f"Recall for {dimension}: {recall}")
print(f"F1-Score for {dimension}: {f1}\n")

Train Accuracy for J/P: 0.604178674351585
Test Accuracy for J/P: 0.6040345821325649
Average Accuracy for J/P: 0.604106628242075
Precision for J/P: 0.6040345821325649
Recall for J/P: 1.0
F1-Score for J/P: 0.7531440891124687



# lightGBM

In [None]:
# Define the LightGBM model with library defaults apart from the fixed seed
lgbm_model = LGBMClassifier(random_state=42)

In [None]:
# Train the model on class type: same single-column features and 16-way `type`
# target as the previous models
lgbm_model.fit(train_combined_features, train_df['type'])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000452 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 6940, number of used features: 1
[LightGBM] [Info] Start training from score -3.821177
[LightGBM] [Info] Start training from score -2.553488
[LightGBM] [Info] Start training from score -3.624701
[LightGBM] [Info] Start training from score -2.538782
[LightGBM] [Info] Start training from score -5.348549
[LightGBM] [Info] Start training from score -5.207471
[LightGBM] [Info] Start training from score -5.411070
[LightGBM] [Info] Start training from score -4.582377
[LightGBM] [Info] Start training from score -1.775183
[LightGBM] [Info] Start training from score -1.554764
[LightGBM] [Info] Start training from score -2.073121
[LightGBM] [Info] Start training from score -1.895201
[LightGBM] [Info] Start training from score -3.954708
[LightGBM] [

In [None]:
# Predict the 16-way type for every training row with LightGBM, then decode each
# predicted type into its four binary flags (row order follows the feature array).
train_predictions = lgbm_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame(
    list(map(get_mbti_dimensions, train_predictions)),
    columns=['I/E', 'N/S', 'F/T', 'J/P'],
)

In [None]:
# Predict the 16-way type for every test row with LightGBM, then decode each
# predicted type into its four binary flags (row order follows the feature array).
test_predictions = lgbm_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame(
    list(map(get_mbti_dimensions, test_predictions)),
    columns=['I/E', 'N/S', 'F/T', 'J/P'],
)

In [None]:
# Per-dimension evaluation of the current train/test predictions.
# Accuracy is reported for train and test plus their mean; precision, recall and F1
# are computed on the test split with the class encoded as 1 as the positive class.
average_accuracies = []
precisions = []
recalls = []
f1_scores = []

for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    # Look the columns up once per dimension
    y_train_true, y_train_pred = train_df[dimension], train_pred_dimensions[dimension]
    y_test_true, y_test_pred = test_df[dimension], test_pred_dimensions[dimension]

    train_accuracy = accuracy_score(y_train_true, y_train_pred)
    test_accuracy = accuracy_score(y_test_true, y_test_pred)

    # Mean of train and test accuracy for this dimension
    average_accuracy = (train_accuracy + test_accuracy) / 2
    average_accuracies.append(average_accuracy)

    # zero_division=0 states the intended value when a class is never predicted,
    # instead of getting the same 0.0 together with an UndefinedMetricWarning
    precision = precision_score(y_test_true, y_test_pred, zero_division=0)
    recall = recall_score(y_test_true, y_test_pred, zero_division=0)
    f1 = f1_score(y_test_true, y_test_pred, zero_division=0)

    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Train Accuracy for {dimension}: {train_accuracy}")
    print(f"Test Accuracy for {dimension}: {test_accuracy}")
    print(f"Average Accuracy for {dimension}: {average_accuracy}")
    print(f"Precision for {dimension}: {precision}")
    print(f"Recall for {dimension}: {recall}")
    print(f"F1-Score for {dimension}: {f1}\n")

# Unweighted means over the four dimensions
overall_average_accuracy = sum(average_accuracies) / len(average_accuracies)
overall_precision = sum(precisions) / len(precisions)
overall_recall = sum(recalls) / len(recalls)
overall_f1_score = sum(f1_scores) / len(f1_scores)

print(f"Overall Average Accuracy: {overall_average_accuracy}")
print(f"Overall Precision: {overall_precision}")
print(f"Overall Recall: {overall_recall}")
print(f"Overall F1-Score: {overall_f1_score}")

Train Accuracy for I/E: 0.7654178674351585
Test Accuracy for I/E: 0.7688760806916427
Average Accuracy for I/E: 0.7671469740634006
Precision for I/E: 0.7688760806916427
Recall for I/E: 1.0
F1-Score for I/E: 0.8693385467579016

Train Accuracy for N/S: 0.8621037463976945
Test Accuracy for N/S: 0.861671469740634
Average Accuracy for N/S: 0.8618876080691642
Precision for N/S: 0.861671469740634
Recall for N/S: 1.0
F1-Score for N/S: 0.9256965944272446

Train Accuracy for F/T: 0.5753602305475505
Test Accuracy for F/T: 0.500864553314121
Average Accuracy for F/T: 0.5381123919308357
Precision for F/T: 0.5310110450297366
Recall for F/T: 0.6656017039403621
F1-Score for F/T: 0.5907372400756143

Train Accuracy for J/P: 0.5894812680115273
Test Accuracy for J/P: 0.5146974063400577
Average Accuracy for J/P: 0.5520893371757924
Precision for J/P: 0.3798449612403101
Recall for J/P: 0.35662299854439594
F1-Score for J/P: 0.3678678678678679

Overall Average Accuracy: 0.6798090778097983
Overall Precision: 0.63

# tes


# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [None]:
# Search space for the RandomForest randomized search.
# scipy's randint(low, high) draws integers from low up to, but not including, high.
param_dist_rf = {
    'n_estimators': randint(50, 150),  # Fewer trees
    'max_depth': [3, 5, 7, 10],  # Further restrict depth
    'min_samples_split': randint(5, 15),  # Increase min_samples_split
    'min_samples_leaf': randint(2, 5),  # Increase min_samples_leaf
    # 'auto' is rejected by the installed scikit-learn (InvalidParameterError), which
    # made every candidate that sampled it fail; only valid options are listed now
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

In [None]:
# Create a RandomForest model to serve as the base estimator of the search;
# this rebinds `rf_model`, replacing the depth-capped model fitted earlier
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Randomized search on hyperparameters: 15 sampled configurations, each scored with
# 4-fold cross-validation (60 fits in total). No `scoring` is passed, so the
# estimator's own default score is used. random_state fixes which configurations
# are sampled.
rf_random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist_rf,
                                      n_iter=15, cv=4, verbose=1, random_state=42, n_jobs=-1)

In [None]:
# Fit the random search model on the same single-column features and 16-way `type` target
rf_random_search.fit(train_combined_features, train_df['type'])

Fitting 4 folds for each of 15 candidates, totalling 60 fits


32 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
26 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

In [None]:
# Best hyperparameters: the sampled configuration with the highest mean cross-validation score
print("Best RF Parameters:", rf_random_search.best_params_)

# Use the best model: the estimator exposed by the search for that configuration
best_rf_model = rf_random_search.best_estimator_

Best RF Parameters: {'bootstrap': True, 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 11, 'n_estimators': 124}


In [None]:
# Evaluate on dimensions using the best_rf_model
print("Random Forest Results:")

# The predictions do not depend on which dimension is being scored, so they are
# computed and decoded once instead of once per dimension.
train_predictions = best_rf_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in train_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])
test_predictions = best_rf_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in test_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    # Train metrics (positive class = the class encoded as 1).
    # zero_division=0 reports undefined scores as 0.0 without a warning.
    train_accuracy = accuracy_score(train_df[dimension], train_pred_dimensions[dimension])
    train_precision = precision_score(train_df[dimension], train_pred_dimensions[dimension], zero_division=0)
    train_recall = recall_score(train_df[dimension], train_pred_dimensions[dimension], zero_division=0)
    train_f1 = f1_score(train_df[dimension], train_pred_dimensions[dimension], zero_division=0)

    # Test metrics
    test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])
    test_precision = precision_score(test_df[dimension], test_pred_dimensions[dimension], zero_division=0)
    test_recall = recall_score(test_df[dimension], test_pred_dimensions[dimension], zero_division=0)
    test_f1 = f1_score(test_df[dimension], test_pred_dimensions[dimension], zero_division=0)

    # Print results
    print(f"Train Accuracy for {dimension}: {train_accuracy}")
    print(f"Test Accuracy for {dimension}: {test_accuracy}")
    print(f"Average Accuracy for {dimension}: {(train_accuracy + test_accuracy) / 2}\n")
    print(f"Train Precision for {dimension}: {train_precision}")
    print(f"Test Precision for {dimension}: {test_precision}")
    print(f"Train Recall for {dimension}: {train_recall}")
    print(f"Test Recall for {dimension}: {test_recall}")
    print(f"Train F1 Score for {dimension}: {train_f1}")
    print(f"Test F1 Score for {dimension}: {test_f1}\n\n")

Random Forest Results:
Train Accuracy for I/E: 0.7706051873198847
Test Accuracy for I/E: 0.7688760806916427
Average Accuracy for I/E: 0.7697406340057638

Train Precision for I/E: 0.7704847085978073
Test Precision for I/E: 0.7688760806916427
Train Recall for I/E: 0.9998128041931861
Test Recall for I/E: 1.0
Train F1 Score for I/E: 0.870294932377383
Test F1 Score for I/E: 0.8693385467579016


Train Accuracy for N/S: 0.8621037463976945
Test Accuracy for N/S: 0.861671469740634
Average Accuracy for N/S: 0.8618876080691642

Train Precision for N/S: 0.8621037463976945
Test Precision for N/S: 0.861671469740634
Train Recall for N/S: 1.0
Test Recall for N/S: 1.0
Train F1 Score for N/S: 0.9259459877737367
Test F1 Score for N/S: 0.9256965944272446


Train Accuracy for F/T: 0.5564841498559078
Test Accuracy for F/T: 0.5429394812680115
Average Accuracy for F/T: 0.5497118155619596

Train Precision for F/T: 0.5506964205481504
Test Precision for F/T: 0.5440821256038647
Train Recall for F/T: 0.97922769640

## SVM

In [None]:
from scipy.stats import uniform

In [None]:
# Search space sampled by RandomizedSearchCV for the SVM.
# The notebook's import cell exposes scipy.stats.randint under the alias
# `sp_randint`; the bare name `randint` is not imported there, so use the alias.
param_dist_svm = {
    'C': uniform(0.1, 10),  # Regularization parameter, uniform on [0.1, 10.1]
    'kernel': ['linear', 'rbf'],  # Kernel type
    'gamma': ['scale', 'auto'],  # Kernel coefficient
    # NOTE(review): per the SVC docs 'degree' only affects the 'poly' kernel and
    # 'coef0' only 'poly'/'sigmoid'; neither kernel is listed above, so these two
    # entries currently have no influence on the fitted models.
    'degree': sp_randint(2, 5),  # Integer drawn from {2, 3, 4}
    'coef0': uniform(0, 1)  # Uniform on [0, 1]
}

In [None]:
# Base estimator: a support-vector classifier with a fixed seed
svm_model = SVC(random_state=42)

# Randomized hyperparameter search over `param_dist_svm`:
# 15 sampled candidates, each scored with 4-fold cross-validation.
svm_random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_dist_svm,
    n_iter=15,
    cv=4,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Run the SVM search: inputs are `train_combined_features`, the target is
# the `type` column of the training frame.
svm_train_labels = train_df['type']
svm_random_search.fit(train_combined_features, svm_train_labels)

Fitting 4 folds for each of 15 candidates, totalling 60 fits


In [None]:
# Report the hyperparameter combination the search selected
svm_best_params = svm_random_search.best_params_
print("Best SVM Parameters:", svm_best_params)

# Keep a handle on the search's best estimator for the evaluation cell
best_svm_model = svm_random_search.best_estimator_

In [None]:
# Evaluate the tuned SVM on each MBTI dimension, for both the train and test split.
print("SVM Results:")
dimension_names = ['I/E', 'N/S', 'F/T', 'J/P']

# The predicted types do not depend on which dimension is being scored, so
# predict once per split and decompose once, instead of repeating the (costly)
# SVM inference on every loop pass.
# NOTE(review): assumes get_mbti_dimensions(pred) yields four values ordered as
# `dimension_names` — confirm against its definition.
train_predictions = best_svm_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame(
    [get_mbti_dimensions(pred) for pred in train_predictions],
    columns=dimension_names,
)
test_predictions = best_svm_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame(
    [get_mbti_dimensions(pred) for pred in test_predictions],
    columns=dimension_names,
)

for dimension in dimension_names:
    # Train metrics for this dimension
    train_accuracy = accuracy_score(train_df[dimension], train_pred_dimensions[dimension])
    train_precision = precision_score(train_df[dimension], train_pred_dimensions[dimension])
    train_recall = recall_score(train_df[dimension], train_pred_dimensions[dimension])
    train_f1 = f1_score(train_df[dimension], train_pred_dimensions[dimension])

    # Test metrics for this dimension
    test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])
    test_precision = precision_score(test_df[dimension], test_pred_dimensions[dimension])
    test_recall = recall_score(test_df[dimension], test_pred_dimensions[dimension])
    test_f1 = f1_score(test_df[dimension], test_pred_dimensions[dimension])

    # Print results
    print(f"Train Accuracy for {dimension}: {train_accuracy}")
    print(f"Test Accuracy for {dimension}: {test_accuracy}")
    print(f"Average Accuracy for {dimension}: {(train_accuracy + test_accuracy) / 2}\n")
    print(f"Train Precision for {dimension}: {train_precision}")
    print(f"Test Precision for {dimension}: {test_precision}")
    print(f"Train Recall for {dimension}: {train_recall}")
    print(f"Test Recall for {dimension}: {test_recall}")
    print(f"Train F1 Score for {dimension}: {train_f1}")
    print(f"Test F1 Score for {dimension}: {test_f1}\n\n")

## LightGBM

In [None]:
# Candidate values for the LightGBM randomized search
param_dist_lgbm = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.1, 0.01, 0.05],
    'n_estimators': [100, 200, 500],
    'min_child_samples': [20, 30],
    'subsample': [0.8, 0.9, 1.0],
    # LightGBM applies row subsampling only when the bagging frequency is
    # non-zero (its default is 0), so pin it to 1; otherwise the 'subsample'
    # values above would be sampled by the search but never used in training.
    'subsample_freq': [1]
}

In [None]:
# Base estimator: a LightGBM classifier with a fixed seed.
# NOTE(review): the output saved with this cell is a NameError for
# LGBMClassifier, presumably from a run made before the import cell was
# executed — rerun the notebook top to bottom to refresh it.
lgbm_model = LGBMClassifier(random_state=42)

# Randomized hyperparameter search over `param_dist_lgbm`:
# 10 sampled candidates, each scored with 3-fold cross-validation.
lgbm_random_search = RandomizedSearchCV(
    estimator=lgbm_model,
    param_distributions=param_dist_lgbm,
    n_iter=10,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

NameError: name 'LGBMClassifier' is not defined

In [None]:
# Run the LightGBM search: inputs are `train_combined_features`, the target
# is the `type` column of the training frame.
# NOTE(review): the saved training log reports "number of used features: 1" —
# confirm that train_combined_features carries the intended feature columns.
lgbm_train_labels = train_df['type']
lgbm_random_search.fit(train_combined_features, lgbm_train_labels)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 6940, number of used features: 1
[LightGBM] [Info] Start training from score -3.821177
[LightGBM] [Info] Start training from score -2.553488
[LightGBM] [Info] Start training from score -3.624701
[LightGBM] [Info] Start training from score -2.538782
[LightGBM] [Info] Start training from score -5.348549
[LightGBM] [Info] Start training from score -5.207471
[LightGBM] [Info] Start training from score -5.411070
[LightGBM] [Info] Start training from score -4.582377
[LightGBM] [Info] Start training from score -1.775183
[LightGBM] [Info] Start training from score -1.554764
[LightGBM] [Info] Start training from score -2.073121
[LightGBM] [Info] Start training from score -1.895201
[Ligh

In [None]:
# Report the hyperparameter combination the search selected
lgbm_best_params = lgbm_random_search.best_params_
print("Best LGBM Parameters:", lgbm_best_params)

# Keep a handle on the search's best estimator for the evaluation cell
best_lgbm_model = lgbm_random_search.best_estimator_

Best LGBM Parameters: {'subsample': 0.9, 'num_leaves': 31, 'n_estimators': 100, 'min_child_samples': 20, 'learning_rate': 0.01}


In [None]:
# Evaluate the tuned LightGBM model's accuracy on each MBTI dimension,
# for both the train and test split.
dimension_names = ['I/E', 'N/S', 'F/T', 'J/P']

# The predicted types are the same for every dimension, so predict and
# decompose once per split rather than once per loop pass.
# NOTE(review): assumes get_mbti_dimensions(pred) yields four values ordered as
# `dimension_names` — confirm against its definition.
train_predictions = best_lgbm_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame(
    [get_mbti_dimensions(pred) for pred in train_predictions],
    columns=dimension_names,
)
test_predictions = best_lgbm_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame(
    [get_mbti_dimensions(pred) for pred in test_predictions],
    columns=dimension_names,
)

for dimension in dimension_names:
    # Score the current dimension on each split
    train_accuracy = accuracy_score(train_df[dimension], train_pred_dimensions[dimension])
    test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])

    print(f"LightGBM Train Accuracy for {dimension}: {train_accuracy}")
    print(f"LightGBM Test Accuracy for {dimension}: {test_accuracy}")
    print(f"Average Accuracy for {dimension}: {(train_accuracy + test_accuracy) / 2}\n")

LightGBM Train Accuracy for I/E: 0.7697406340057636
LightGBM Test Accuracy for I/E: 0.7688760806916427
Average Accuracy for I/E: 0.7693083573487032

LightGBM Train Accuracy for N/S: 0.8621037463976945
LightGBM Test Accuracy for N/S: 0.861671469740634
Average Accuracy for N/S: 0.8618876080691642

LightGBM Train Accuracy for F/T: 0.556628242074928
LightGBM Test Accuracy for F/T: 0.5285302593659942
Average Accuracy for F/T: 0.5425792507204611

LightGBM Train Accuracy for J/P: 0.6021613832853026
LightGBM Test Accuracy for J/P: 0.5573487031700288
Average Accuracy for J/P: 0.5797550432276657

