**Data augmentation✅ without feature extraction**

# Import data

In [None]:
# Mount Google Drive at /content/drive so the dataset files are reachable from this Colab runtime
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Confirm the raw MBTI CSV is present on the mounted Drive before loading it
!ls /content/drive/MyDrive/TI14_Personal/mbti_1.csv

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from gensim.models import KeyedVectors
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from sklearn.preprocessing import StandardScaler

import nltk
import re
import os
import random
from collections import Counter
from google.colab import files
from nltk import pos_tag, ne_chunk
from nltk.util import ngrams

# Fetch every NLTK corpus/model this notebook needs, one download call per resource
for nltk_resource in (
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(nltk_resource)

In [None]:
# Load the raw MBTI dataset from the mounted Drive folder
df = pd.read_csv('/content/drive/MyDrive/TI14_Personal/mbti_1.csv')

In [None]:
# Display the freshly loaded DataFrame for a quick visual check
df

# Data Preprocessing

In [None]:
# Strip MBTI type codes from a post so the label text cannot leak into the features
def remove_mbti_words(text):
    """Return ``text`` with every occurrence of the 16 MBTI type codes deleted.

    Matching is case-insensitive and is not anchored to word boundaries, so a
    code is removed wherever it appears, including inside a longer token
    (``INFJs`` becomes ``s``). Codes are removed one after another in the
    order listed below.
    """
    type_codes = (
        'INFJ', 'INTJ', 'ENFJ', 'ENTJ', 'INFP', 'INTP', 'ENFP', 'ENTP',
        'ISFJ', 'ISTJ', 'ESFJ', 'ESTJ', 'ISFP', 'ISTP', 'ESFP', 'ESTP',
    )
    cleaned = text
    for code in type_codes:
        cleaned = re.compile(code, re.IGNORECASE).sub('', cleaned)
    return cleaned

In [None]:
# NLTK resources shared by every preprocess_text call; filled on first use so the
# stop-word list is not re-read and the lemmatizer not re-created for each post.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean one raw post and return it as a single space-separated string.

    Steps, in order: remove MBTI type codes, lower-case, drop URLs, turn
    underscores and punctuation into spaces, drop digits, collapse whitespace,
    tokenize, remove English stop words, and lemmatize each remaining token.

    Parameters
    ----------
    text : str
        The raw post text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    text = remove_mbti_words(text)  # Remove MBTI words first, before case is lost
    # Lowercase
    text = text.lower()
    # Remove URLs (the pattern is unchanged; 'http\S+' already covers https links)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Replace underscores with space ('_' counts as a word character, so the
    # punctuation rule below would not catch it)
    text = re.sub(r'_', ' ', text)
    # Replace punctuation with a space
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize
    tokens = word_tokenize(text)
    # Drop stop words, then lemmatize what is left
    stop_words, lemmatizer = _get_preprocess_resources()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [None]:
# Clean every raw post and store the result in a new 'posts_cleaned' column
df['posts_cleaned'] = df['posts'].apply(preprocess_text)  # one preprocess_text call per post

In [None]:
# Add one column per MBTI dimension, each holding the matching letter of the 4-letter type code
for letter_position, dimension_name in enumerate(['I/E', 'N/S', 'F/T', 'J/P']):
    df[dimension_name] = df['type'].apply(lambda mbti, pos=letter_position: mbti[pos])

In [None]:
# Tokenize the cleaned posts into a new 'tokens' column (nothing is written to disk here)
df['tokens'] = df['posts_cleaned'].apply(word_tokenize)

In [None]:
# Show the first rows of the raw and cleaned posts side by side to eyeball the cleaning
print("Original vs Cleaned Data Sample:")
print(df[['posts', 'posts_cleaned']].head())

Original vs Cleaned Data Sample:
                                               posts  \
0  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...   
1  'I'm finding the lack of me in these posts ver...   
2  'Good one  _____   https://www.youtube.com/wat...   
3  'Dear INTP,   I enjoyed our conversation the o...   
4  'You're fired.|||That's another silly misconce...   

                                       posts_cleaned  
0  moment sportscenter top ten play prank life ch...  
1  finding lack post alarming sex boring position...  
2  good one course say know blessing curse absolu...  
3  dear enjoyed conversation day esoteric gabbing...  
4  fired another silly misconception approaching ...  


In [None]:
# Display the DataFrame with the cleaned text, dimension letters and tokens added
df

Unnamed: 0,type,posts,posts_cleaned,I/E,N/S,F/T,J/P,tokens
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,moment sportscenter top ten play prank life ch...,I,N,F,J,"[moment, sportscenter, top, ten, play, prank, ..."
1,ENTP,'I'm finding the lack of me in these posts ver...,finding lack post alarming sex boring position...,E,N,T,P,"[finding, lack, post, alarming, sex, boring, p..."
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one course say know blessing curse absolu...,I,N,T,P,"[good, one, course, say, know, blessing, curse..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed conversation day esoteric gabbing...,I,N,T,J,"[dear, enjoyed, conversation, day, esoteric, g..."
4,ENTJ,'You're fired.|||That's another silly misconce...,fired another silly misconception approaching ...,E,N,T,J,"[fired, another, silly, misconception, approac..."
...,...,...,...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,always think cat fi doms reason website become...,I,S,F,P,"[always, think, cat, fi, doms, reason, website..."
8671,ENFP,'So...if this thread already exists someplace ...,thread already exists someplace else heck dele...,E,N,F,P,"[thread, already, exists, someplace, else, hec..."
8672,INTP,'So many questions when i do these things. I ...,many question thing would take purple pill pic...,I,N,T,P,"[many, question, thing, would, take, purple, p..."
8673,INFP,'I am very conflicted right now when it comes ...,conflicted right come wanting child honestly m...,I,N,F,P,"[conflicted, right, come, wanting, child, hone..."


In [None]:
# Class balance of the F/T dimension over the whole dataset
df['F/T'].value_counts()

Unnamed: 0_level_0,count
F/T,Unnamed: 1_level_1
F,4694
T,3981


In [None]:
# Save the DataFrame to a CSV file in the runtime's current working directory.
# NOTE: the 'tokens' column holds Python lists; CSV stores them as text, so they
# come back as strings (not lists) when the file is read again.
csv_filename = 'preprocessed_mbti1.csv'
df.to_csv(csv_filename, index=False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Trigger a browser download of the CSV written in the previous cell
files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# List files in the directory to verify existence
!ls /content/drive/MyDrive/TI14_Personal/preprocessed_mbti1(1).csv

/bin/bash: -c: line 1: syntax error near unexpected token `('
/bin/bash: -c: line 1: `ls /content/drive/MyDrive/TI14_Personal/preprocessed_mbti1(1).csv'


In [None]:
# Reload the preprocessed data from Drive, replacing the in-memory df.
# NOTE(review): no cell in this notebook writes 'preprocessed_mbti1(1).csv' to Drive;
# presumably the downloaded file was uploaded there by hand - confirm before a fresh run.
df = pd.read_csv('/content/drive/MyDrive/TI14_Personal/preprocessed_mbti1(1).csv')

In [None]:
# Display the reloaded DataFrame (the 'tokens' column is now text, as read from CSV)
df

Unnamed: 0,type,posts,posts_cleaned,I/E,N/S,F/T,J/P,tokens
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,moment sportscenter top ten play prank life ch...,I,N,F,J,"['moment', 'sportscenter', 'top', 'ten', 'play..."
1,ENTP,'I'm finding the lack of me in these posts ver...,finding lack post alarming sex boring position...,E,N,T,P,"['finding', 'lack', 'post', 'alarming', 'sex',..."
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one course say know blessing curse absolu...,I,N,T,P,"['good', 'one', 'course', 'say', 'know', 'bles..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed conversation day esoteric gabbing...,I,N,T,J,"['dear', 'enjoyed', 'conversation', 'day', 'es..."
4,ENTJ,'You're fired.|||That's another silly misconce...,fired another silly misconception approaching ...,E,N,T,J,"['fired', 'another', 'silly', 'misconception',..."
...,...,...,...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,always think cat fi doms reason website become...,I,S,F,P,"['always', 'think', 'cat', 'fi', 'doms', 'reas..."
8671,ENFP,'So...if this thread already exists someplace ...,thread already exists someplace else heck dele...,E,N,F,P,"['thread', 'already', 'exists', 'someplace', '..."
8672,INTP,'So many questions when i do these things. I ...,many question thing would take purple pill pic...,I,N,T,P,"['many', 'question', 'thing', 'would', 'take',..."
8673,INFP,'I am very conflicted right now when it comes ...,conflicted right come wanting child honestly m...,I,N,F,P,"['conflicted', 'right', 'come', 'wanting', 'ch..."


# Split data

In [None]:
# Relative frequency of each of the 16 MBTI types before splitting
print("Distribution before splitting:")
print(df['type'].value_counts(normalize=True))

Distribution before splitting:
type
INFP    0.211182
INFJ    0.169452
INTP    0.150317
INTJ    0.125764
ENTP    0.078963
ENFP    0.077810
ISTP    0.038847
ISFP    0.031239
ENTJ    0.026628
ISTJ    0.023631
ENFJ    0.021902
ISFJ    0.019135
ESTP    0.010259
ESFP    0.005533
ESFJ    0.004841
ESTJ    0.004496
Name: proportion, dtype: float64


In [None]:
# Split data into training and testing sets (80% train, 20% test).
# Stratifying on 'type' keeps the 16-type proportions the same in both parts;
# the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['type'], random_state=42)

In [None]:
# Check that the stratified split preserved the type proportions in the training set ...
print("\nDistribution in training set:")
print(train_df['type'].value_counts(normalize=True))

# ... and in the test set
print("\nDistribution in test set:")
print(test_df['type'].value_counts(normalize=True))


Distribution in training set:
type
INFP    0.211239
INFJ    0.169452
INTP    0.150288
INTJ    0.125793
ENTP    0.078963
ENFP    0.077810
ISTP    0.038905
ISFP    0.031268
ENTJ    0.026657
ISTJ    0.023631
ENFJ    0.021902
ISFJ    0.019164
ESTP    0.010231
ESFP    0.005476
ESFJ    0.004755
ESTJ    0.004467
Name: proportion, dtype: float64

Distribution in test set:
type
INFP    0.210951
INFJ    0.169452
INTP    0.150432
INTJ    0.125648
ENTP    0.078963
ENFP    0.077810
ISTP    0.038617
ISFP    0.031124
ENTJ    0.026513
ISTJ    0.023631
ENFJ    0.021902
ISFJ    0.019020
ESTP    0.010375
ESFP    0.005764
ESFJ    0.005187
ESTJ    0.004611
Name: proportion, dtype: float64


In [None]:
# Display the training split
train_df

Unnamed: 0,type,posts,posts_cleaned,I/E,N/S,F/T,J/P,tokens
8331,INFP,'this is actually exactly what i expected! :l...,actually exactly expected laughing introversio...,I,N,F,P,"['actually', 'exactly', 'expected', 'laughing'..."
1290,ISTP,"'Nope. Not now, not ever. I'm too busy with ...",nope ever busy work cause adrenaline rush acti...,I,S,T,P,"['nope', 'ever', 'busy', 'work', 'cause', 'adr..."
1982,ENFJ,'Yes peace is the absence of conflict - your I...,yes peace absence conflict friend suxx hardd i...,E,N,F,J,"['yes', 'peace', 'absence', 'conflict', 'frien..."
769,INFP,"'I apologize for the delayed response, but tha...",apologize delayed response thank taking time s...,I,N,F,P,"['apologize', 'delayed', 'response', 'thank', ..."
8339,INFP,"'Nightglow, I can't even imagine what you must...",nightglow even imagine must struggling right d...,I,N,F,P,"['nightglow', 'even', 'imagine', 'must', 'stru..."
...,...,...,...,...,...,...,...,...
4273,INFP,'I'm annoyed. I'm sick of negative associatio...,annoyed sick negative association uncontrolled...,I,N,F,P,"['annoyed', 'sick', 'negative', 'association',..."
2698,INFP,'My dad just told me that he loved me for I th...,dad told loved think first time life mean said...,I,N,F,P,"['dad', 'told', 'loved', 'think', 'first', 'ti..."
7435,ENTJ,"'I have dated a few INFJs, including my curren...",dated including current partner year probably ...,E,N,T,J,"['dated', 'including', 'current', 'partner', '..."
1843,INTP,'People who are unable to replace social norms...,people unable replace social norm rational eff...,I,N,T,P,"['people', 'unable', 'replace', 'social', 'nor..."


# Data Augmentation

In [None]:
def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list of plain strings.

    Every lemma name of every synset of ``word`` is collected. Lemma names are
    normalised to match the cleaned posts: underscores (used for multi-word
    lemmas) become spaces and the text is lower-cased. The word itself is
    excluded.

    The result is sorted so that ``random.choice`` over it is reproducible
    under a fixed seed instead of depending on set iteration order.

    Parameters
    ----------
    word : str
        The word to look up.

    Returns
    -------
    list[str]
        Distinct synonyms in alphabetical order; empty when none are found.
    """
    target = word.replace('_', ' ').lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace('_', ' ').lower()
            if candidate != target:
                synonyms.add(candidate)
    return sorted(synonyms)

In [None]:
def random_insertion(text, n=1):
    """Insert synonyms of randomly chosen words at random positions.

    Makes ``n`` attempts. Each attempt picks a random word from the current
    word list (earlier insertions included), looks up its synonyms and, if
    there are any, inserts one at a random position. An attempt whose word has
    no synonyms inserts nothing. Text without any words is returned unchanged;
    otherwise the words are re-joined with single spaces.
    """
    tokens = text.split()
    if not tokens:
        return text
    for _attempt in range(n):
        source_word = random.choice(tokens)
        candidates = get_synonyms(source_word)
        if not candidates:
            continue
        chosen = random.choice(candidates)
        slot = random.randint(0, len(tokens))
        tokens.insert(slot, chosen)
    return ' '.join(tokens)

In [None]:
def random_synonym_replacement(text):
    """Replace every occurrence of one randomly chosen word with a synonym.

    A single word is drawn at random; if it has synonyms, one of them is drawn
    and substituted for all occurrences of that word, and the words are
    re-joined with single spaces. If the text has no words, or the drawn word
    has no synonyms, ``text`` is returned unchanged.
    """
    tokens = text.split()
    if not tokens:
        return text
    target = random.choice(tokens)
    candidates = get_synonyms(target)
    if not candidates:
        return text
    replacement = random.choice(candidates)
    return ' '.join(replacement if token == target else token for token in tokens)

In [None]:
def random_deletion(text, p=0.1):
    """Drop each word of ``text`` independently with probability ``p``.

    Text with zero or one word is returned unchanged. If every word happens to
    be dropped, one randomly chosen original word is returned instead, so the
    augmented sample is never an empty string.

    NOTE: a later cell defines ``random_deletion`` again; that later
    definition is the one in effect once the notebook has run top to bottom.

    Parameters
    ----------
    text : str
        Space-separated words.
    p : float, default 0.1
        Per-word deletion probability.

    Returns
    -------
    str
        The surviving words joined by single spaces.
    """
    words = text.split()
    if len(words) <= 1:
        return text
    kept_words = [word for word in words if random.uniform(0, 1) > p]
    if not kept_words:
        # Everything was deleted: keep one word rather than emit an empty sample
        return random.choice(words)
    return ' '.join(kept_words)

In [None]:
def random_deletion(text, p=0.1):
    """Drop each word of ``text`` independently with probability ``p``.

    Text with zero or one word is returned unchanged. If every word happens to
    be dropped, one randomly chosen original word is returned instead, so the
    augmented sample is never an empty string.

    NOTE: this cell redefines ``random_deletion`` from an earlier cell; being
    the later definition, it is the one the augmentation code actually calls.

    Parameters
    ----------
    text : str
        Space-separated words.
    p : float, default 0.1
        Per-word deletion probability.

    Returns
    -------
    str
        The surviving words joined by single spaces.
    """
    words = text.split()
    if len(words) <= 1:
        return text
    kept_words = [word for word in words if random.uniform(0, 1) > p]
    if not kept_words:
        # Everything was deleted: keep one word rather than emit an empty sample
        return random.choice(words)
    return ' '.join(kept_words)

In [None]:
def random_swap(text, n=1):
    """Swap two distinct, randomly chosen word positions, ``n`` times in a row.

    Text with fewer than two words is returned unchanged; otherwise the words
    are re-joined with single spaces after the swaps.
    """
    tokens = text.split()
    token_count = len(tokens)
    if token_count < 2:
        return text
    for _round in range(n):
        first, second = random.sample(range(token_count), 2)
        tokens[second], tokens[first] = tokens[first], tokens[second]
    return ' '.join(tokens)

In [None]:
# Replace missing cleaned posts with '' BEFORE casting to str: casting first would
# turn NaN into the literal text 'nan', leaving nothing for fillna to replace.
train_df['posts_cleaned'] = train_df['posts_cleaned'].fillna('').astype(str)

In [None]:
def augment_dimension(train_df, dimension_col, target_samples):
    """Oversample the minority class of one MBTI dimension with augmented posts.

    The minority class of ``dimension_col`` is topped up to ``target_samples``
    rows. Minority posts are cycled through in order and each one yields an
    augmented copy produced by ONE randomly chosen augmentation (deletion,
    swap, insertion or synonym replacement).

    Every augmented row keeps the labels of the post it was derived from
    ('type' and whichever of the dimension columns are present), so the 16-type
    labels and the other dimensions stay correct. All other columns (for
    example the raw post and its tokens) are left missing, because they would
    no longer match the augmented text.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain 'posts_cleaned', 'type' and ``dimension_col``.
    dimension_col : str
        The dimension column to balance, e.g. 'I/E'.
    target_samples : int
        Row count the minority class should reach.

    Returns
    -------
    pandas.DataFrame
        ``train_df`` itself when the classes are already balanced or the
        minority class already has at least ``target_samples`` rows; otherwise
        ``train_df`` with the augmented rows appended (original index kept).
    """
    class_counts = train_df[dimension_col].value_counts()
    majority_class = class_counts.idxmax()
    minority_class = class_counts.idxmin()

    # If classes are already balanced, skip augmentation
    if class_counts[minority_class] == class_counts[majority_class]:
        return train_df

    # Nothing to add when the minority class already meets the target
    num_samples_needed = target_samples - class_counts[minority_class]
    if num_samples_needed <= 0:
        return train_df

    # Label columns that are copied from the source row onto each augmented row
    label_columns = [
        column for column in train_df.columns
        if column != 'posts_cleaned'
        and (column == dimension_col or column in ('type', 'I/E', 'N/S', 'F/T', 'J/P'))
    ]
    minority_rows = train_df.loc[
        train_df[dimension_col] == minority_class,
        ['posts_cleaned'] + label_columns,
    ]
    source_records = minority_rows.to_dict('records')

    # Choose the augmentation first, then run only that one
    augmenters = (random_deletion, random_swap, random_insertion, random_synonym_replacement)

    augmented_records = []
    while num_samples_needed > 0:
        for record in source_records:
            if num_samples_needed <= 0:
                break
            augment = random.choice(augmenters)
            augmented_records.append(
                {**record, 'posts_cleaned': augment(record['posts_cleaned'])}
            )
            num_samples_needed -= 1

    # Append the augmented rows to the original DataFrame
    augmented_df = pd.DataFrame(augmented_records, columns=minority_rows.columns)
    return pd.concat([train_df, augmented_df])

In [None]:
# Prepare the per-dimension augmentation: the target is the row count of the most
# frequent 16-type class, and the work happens on a copy of the training split.
# NOTE(review): this target is a per-TYPE count but augment_dimension compares it with
# per-DIMENSION class counts - confirm this is the intended balancing target.
target_samples = train_df['type'].value_counts().max()  # Balance to the max class count
train_df_balanced = train_df.copy()

In [None]:
# Augment one dimension after another; each pass starts from the previous pass's output
for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    train_df_balanced = augment_dimension(train_df_balanced, dimension, target_samples)

In [None]:
# Compute per-type target counts for growing the training set to total_target_samples rows.
# NOTE: each type's target is its CURRENT share of the data times the new total, so this
# step enlarges the set while keeping the type proportions - it does not equalise them.
total_target_samples = 25000
current_total_samples = len(train_df_balanced)
class_counts = train_df_balanced['type'].value_counts()

current_proportions = class_counts / current_total_samples
target_class_counts = (current_proportions * total_target_samples).astype(int)  # truncates toward zero

In [None]:
# Generate the extra posts needed to bring every type up to its target count
augmented_texts = []
augmented_labels = []

# Pick ONE augmentation per sample and run only that one, instead of computing all
# four variants and throwing three of them away.
augmenters = (random_deletion, random_swap, random_insertion, random_synonym_replacement)

for label in class_counts.index:
    current_size = class_counts[label]
    target_size = target_class_counts[label]
    texts_to_augment = train_df_balanced[train_df_balanced['type'] == label]['posts_cleaned']
    num_samples_needed = target_size - current_size

    # Cycle through this type's posts until enough augmented copies exist
    while num_samples_needed > 0:
        for text in texts_to_augment:
            if num_samples_needed <= 0:
                break
            augment = random.choice(augmenters)
            augmented_texts.append(augment(text))
            augmented_labels.append(label)
            num_samples_needed -= 1

In [None]:
# Collect the augmented posts and their type labels in a DataFrame
# (only 'posts_cleaned' and 'type' are set; nothing is appended to the training set yet)
final_augmented_df = pd.DataFrame({
    'posts_cleaned': augmented_texts,
    'type': augmented_labels
})

In [None]:
# Combine with the existing balanced data; columns absent from final_augmented_df are NaN
# on the appended rows, and the original index labels are kept (not renumbered)
train_df_final = pd.concat([train_df_balanced, final_augmented_df])

In [None]:
# Verify the final class distribution over the 16 MBTI types
print("Final MBTI Type Distribution:")
print(train_df_final['type'].value_counts())

Final MBTI Type Distribution:
type
INFP    4920
INFJ    3946
INTP    3500
INTJ    2929
ISTP    2614
ENTP    1839
ENFP    1812
ISFP     728
ENTJ     620
ISTJ     550
ENFJ     510
ISFJ     446
ESTP     238
ESFP     127
ESFJ     110
ESTJ     104
Name: count, dtype: int64


In [None]:
# Class counts per dimension. value_counts() skips NaN, so rows whose dimension labels
# are missing are not counted here.
# NOTE(review): the saved totals differ between dimensions, which suggests augmented rows
# still lack some dimension labels at this point - confirm.
for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    print(f"Class distribution for {dimension} dimension:")
    print(train_df_final[dimension].value_counts())

Class distribution for I/E dimension:
I/E
I    5342
E    1598
Name: count, dtype: int64
Class distribution for N/S dimension:
N/S
N    5983
S    1466
Name: count, dtype: int64
Class distribution for F/T dimension:
F/T
F    3755
T    3185
Name: count, dtype: int64
Class distribution for J/P dimension:
J/P
P    4193
J    2747
Name: count, dtype: int64


## Beda (different approach)

In [None]:
# Convert all entries in the 'posts_cleaned' column to strings, replacing NaNs with empty
# strings. fillna must run BEFORE astype(str): casting first would turn NaN into the
# literal text 'nan', leaving nothing for fillna to replace.
train_df['posts_cleaned'] = train_df['posts_cleaned'].fillna('').astype(str)

In [None]:
# Augmentation process: per-type target counts for growing train_df to total_target_samples rows.
# Each type's target is its current share of train_df times the new total, so the type
# proportions are preserved rather than balanced.
total_target_samples = 25000
current_total_samples = len(train_df)
class_counts = train_df['type'].value_counts()

current_proportions = class_counts / current_total_samples
target_class_counts = (current_proportions * total_target_samples).astype(int)  # truncates toward zero

In [None]:
# Display the per-type target counts computed above
target_class_counts

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
INFP,5280
INFJ,4236
INTP,3757
INTJ,3144
ENTP,1974
ENFP,1945
ISTP,972
ISFP,781
ENTJ,666
ISTJ,590


In [None]:
# Accumulators for the augmented posts and their type labels
augmented_texts = []
augmented_labels = []

In [None]:
# Start from empty accumulators so that re-running this cell does not append a second
# batch of augmented samples on top of the first one.
augmented_texts = []
augmented_labels = []

# Pick ONE augmentation per sample and run only that one, instead of computing all
# four variants and throwing three of them away.
augmenters = (random_deletion, random_swap, random_insertion, random_synonym_replacement)

for label in class_counts.index:
    current_size = class_counts[label]
    target_size = target_class_counts[label]
    texts_to_augment = train_df[train_df['type'] == label]['posts_cleaned']
    num_samples_needed = target_size - current_size

    # Cycle through this type's posts until enough augmented copies exist
    while num_samples_needed > 0:
        for text in texts_to_augment:
            if num_samples_needed <= 0:
                break
            augment = random.choice(augmenters)
            augmented_texts.append(augment(text))
            augmented_labels.append(label)
            num_samples_needed -= 1

In [None]:
# Collect the augmented posts and their type labels in a DataFrame
# (only 'posts_cleaned' and 'type' are set; nothing is appended to train_df yet)
augmented_df = pd.DataFrame({
    'posts_cleaned': augmented_texts,
    'type': augmented_labels
})

In [None]:
# Append the augmented rows to train_df.
# NOTE: train_df is overwritten with its own extension, so running this cell twice
# appends augmented_df twice.
train_df = pd.concat([train_df, augmented_df])

In [None]:
# Relative frequency of each type after the proportional augmentation
print(train_df['type'].value_counts(normalize=True))

type
INFP    0.211276
INFJ    0.169501
INTP    0.150334
INTJ    0.125805
ENTP    0.078988
ENFP    0.077828
ISTP    0.038894
ISFP    0.031251
ENTJ    0.026650
ISTJ    0.023608
ENFJ    0.021888
ISFJ    0.019167
ESTP    0.010204
ESFP    0.005442
ESFJ    0.004722
ESTJ    0.004442
Name: proportion, dtype: float64


# Encode tokens

In [None]:
# Display the combined training set; appended augmented rows only have 'type' and 'posts_cleaned'
train_df_final

Unnamed: 0,type,posts,posts_cleaned,I/E,N/S,F/T,J/P,tokens
8331,INFP,'this is actually exactly what i expected! :l...,actually exactly expected laughing introversio...,I,N,F,P,"['actually', 'exactly', 'expected', 'laughing'..."
1290,ISTP,"'Nope. Not now, not ever. I'm too busy with ...",nope ever busy work cause adrenaline rush acti...,I,S,T,P,"['nope', 'ever', 'busy', 'work', 'cause', 'adr..."
1982,ENFJ,'Yes peace is the absence of conflict - your I...,yes peace absence conflict friend suxx hardd i...,E,N,F,J,"['yes', 'peace', 'absence', 'conflict', 'frien..."
769,INFP,"'I apologize for the delayed response, but tha...",apologize delayed response thank taking time s...,I,N,F,P,"['apologize', 'delayed', 'response', 'thank', ..."
8339,INFP,"'Nightglow, I can't even imagine what you must...",nightglow even imagine must struggling right d...,I,N,F,P,"['nightglow', 'even', 'imagine', 'must', 'stru..."
...,...,...,...,...,...,...,...,...
17539,ESTJ,,mausi girl awesome reading really hit home man...,,,,,
17540,ESTJ,,people priority sound like know say enjoy frie...,,,,,
17541,ESTJ,,negotiation style hm manipulative terrible und...,,,,,
17542,ESTJ,,hey queen welcome back yes coworker favorite r...,,,,,


In [None]:
# Fill in 'tokens' for rows where it is missing by tokenizing their 'posts_cleaned' text;
# rows that already have a 'tokens' value keep it unchanged.
# NOTE(review): existing values were read from CSV as text while the new ones are lists,
# so the column presumably holds mixed types after this cell - confirm.
train_df_final['tokens'] = train_df_final.apply(
    lambda row: word_tokenize(row['posts_cleaned']) if pd.isna(row['tokens']) else row['tokens'],
    axis=1
)

In [None]:
# from nltk.tokenize import word_tokenize

# # Tokenize the 'posts_cleaned' column
# train_df['posts_tokenized'] = train_df['posts_cleaned'].apply(word_tokenize)

In [None]:
# (Re)derive the four dimension-letter columns from 'type' for every row of the final
# training set, each column holding the matching letter of the 4-letter type code
for letter_position, dimension_name in enumerate(['I/E', 'N/S', 'F/T', 'J/P']):
    train_df_final[dimension_name] = train_df_final['type'].apply(
        lambda mbti, pos=letter_position: mbti[pos]
    )

In [None]:
# Cast every 'tokens' value to its string form so the column has a single type
train_df_final['tokens'] = train_df_final['tokens'].astype(str)

In [None]:
# Confirm the augmented, tokenized training CSV is present on the mounted Drive
!ls /content/drive/MyDrive/TI14_Personal/augmented_train_post_tokenized.csv

/content/drive/MyDrive/TI14_Personal/augmented_train_post_tokenized.csv


In [None]:
# Load a previously saved augmented training set from Drive, replacing train_df.
# NOTE(review): no cell visible here writes this file, and the model cells below use
# train_df_final rather than train_df - confirm which frame is meant to be trained on.
train_df = pd.read_csv('/content/drive/MyDrive/TI14_Personal/augmented_train_post_tokenized.csv')

In [None]:
# Display the training set loaded from the saved CSV
train_df

Unnamed: 0,type,posts,posts_cleaned,I/E,N/S,F/T,J/P,tokens
8331,INFP,'this is actually exactly what i expected! :l...,actually exactly expected laughing introversio...,I,N,F,P,"['actually', 'exactly', 'expected', 'laughing'..."
1290,ISTP,"'Nope. Not now, not ever. I'm too busy with ...",nope ever busy work cause adrenaline rush acti...,I,S,T,P,"['nope', 'ever', 'busy', 'work', 'cause', 'adr..."
1982,ENFJ,'Yes peace is the absence of conflict - your I...,yes peace absence conflict friend suxx hardd i...,E,N,F,J,"['yes', 'peace', 'absence', 'conflict', 'frien..."
769,INFP,"'I apologize for the delayed response, but tha...",apologize delayed response thank taking time s...,I,N,F,P,"['apologize', 'delayed', 'response', 'thank', ..."
8339,INFP,"'Nightglow, I can't even imagine what you must...",nightglow even imagine must struggling right d...,I,N,F,P,"['nightglow', 'even', 'imagine', 'must', 'stru..."
...,...,...,...,...,...,...,...,...
964,ESTJ,,hitler u mind trolling calling hitler trolling...,E,S,T,J,"['hitler', 'u', 'mind', 'trolling', 'calling',..."
965,ESTJ,,never speak politics btw democratic socialist ...,E,S,T,J,"['never', 'speak', 'politics', 'btw', 'democra..."
966,ESTJ,,let try answer point one one bold know real si...,E,S,T,J,"['let', 'try', 'answer', 'point', 'one', 'one'..."
967,ESTJ,,girl reading really hit home way especially da...,E,S,T,J,"['girl', 'reading', 'really', 'hit', 'home', '..."


# Model

In [None]:
# Binary-encode an MBTI type code: one 0/1 flag per dimension
def get_mbti_dimensions(mbti_type):
    """Return the ``(ie, ns, ft, jp)`` flags for a 4-letter MBTI type code.

    A flag is 1 when the letter at that position is 'I', 'N', 'F' or 'J'
    respectively, and 0 for any other letter. The comparison is
    case-sensitive.
    """
    positive_letters = 'INFJ'
    return tuple(
        1 if mbti_type[position] == letter else 0
        for position, letter in enumerate(positive_letters)
    )

# Apply the function to both train and test sets.
# This overwrites the letter-valued dimension columns with 0/1 flags
# (1 = I, N, F, J respectively), derived from each row's 'type'.
train_df_final[['I/E', 'N/S', 'F/T', 'J/P']] = train_df_final['type'].apply(lambda x: pd.Series(get_mbti_dimensions(x)))
test_df[['I/E', 'N/S', 'F/T', 'J/P']] = test_df['type'].apply(lambda x: pd.Series(get_mbti_dimensions(x)))

## Random Forest

In [None]:
# Define the RandomForest model: depth-limited trees, fixed seed for repeatable results
rf_model = RandomForestClassifier(random_state=42, max_depth=10)

# Alternative kept for reference: class-weighted forest without a depth limit
# rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')


In [None]:
# Display the final training set with the dimension columns now encoded as 0/1
train_df_final

Unnamed: 0,type,posts,posts_cleaned,I/E,N/S,F/T,J/P,tokens
8331,INFP,'this is actually exactly what i expected! :l...,actually exactly expected laughing introversio...,1,1,1,0,"['actually', 'exactly', 'expected', 'laughing'..."
1290,ISTP,"'Nope. Not now, not ever. I'm too busy with ...",nope ever busy work cause adrenaline rush acti...,1,0,0,0,"['nope', 'ever', 'busy', 'work', 'cause', 'adr..."
1982,ENFJ,'Yes peace is the absence of conflict - your I...,yes peace absence conflict friend suxx hardd i...,0,1,1,1,"['yes', 'peace', 'absence', 'conflict', 'frien..."
769,INFP,"'I apologize for the delayed response, but tha...",apologize delayed response thank taking time s...,1,1,1,0,"['apologize', 'delayed', 'response', 'thank', ..."
8339,INFP,"'Nightglow, I can't even imagine what you must...",nightglow even imagine must struggling right d...,1,1,1,0,"['nightglow', 'even', 'imagine', 'must', 'stru..."
...,...,...,...,...,...,...,...,...
17539,ESTJ,,mausi girl awesome reading really hit home man...,0,0,0,1,"['mausi', 'girl', 'awesome', 'reading', 'reall..."
17540,ESTJ,,people priority sound like know say enjoy frie...,0,0,0,1,"['people', 'priority', 'sound', 'like', 'know'..."
17541,ESTJ,,negotiation style hm manipulative terrible und...,0,0,0,1,"['negotiation', 'style', 'hm', 'manipulative',..."
17542,ESTJ,,hey queen welcome back yes coworker favorite r...,0,0,0,1,"['hey', 'queen', 'welcome', 'back', 'yes', 'co..."


In [None]:
# Take the 'tokens' column (one text value per post) of each split as the model input
string_to_float_train = train_df_final['tokens']
string_to_float_test = test_df['tokens']

In [None]:
# Make sure every training value is a string before label-encoding it
string_to_float_train = string_to_float_train.astype(str)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each distinct test 'tokens' string to an integer code, stored as float.
# NOTE(review): this encoder is fitted on the test strings only, independently of the one
# used for the training set, so a given code does not mean the same thing in both sets;
# the code is also just the rank of the whole string, not a representation of its content.
XTest_float_tokens = LabelEncoder().fit_transform(string_to_float_test).astype(float)


In [None]:
# Map each distinct training 'tokens' string to an integer code, stored as float.
# NOTE(review): fitted separately from the test encoder, so train and test codes are not comparable.
XTrain_float_tokens = LabelEncoder().fit_transform(string_to_float_train).astype(float)

In [None]:
# The label-encoded tokens are the only feature in this notebook (no feature extraction)
train_combined_features = XTrain_float_tokens
test_combined_features = XTest_float_tokens

In [None]:
# Reshape the 1-D code vectors into single-column 2-D arrays, one row per sample
train_combined_features = train_combined_features.reshape(-1, 1)
test_combined_features = test_combined_features.reshape(-1, 1)

In [None]:
# Train the model on the 16-class 'type' label, using the single encoded-token feature
rf_model.fit(train_combined_features, train_df_final['type'])

In [None]:
# NOTE(review): the saved output of this cell is a multi-column float array, which does not
# match the single-column array built above - it appears to come from a run that used
# feature extraction.
train_combined_features # this is what it looks like when feature extraction is used

array([[ 0.00950791, -0.11958562, -0.0200958 , ...,  0.09692195,
         0.05050966,  0.        ],
       [ 0.08754832, -0.06622712,  0.01812679, ...,  0.09196591,
         0.03195126,  0.        ],
       [ 0.01326174, -0.11912014,  0.12532912, ...,  0.        ,
         0.16925002,  0.04852553],
       ...,
       [-0.04320237, -0.11863177, -0.09634498, ...,  0.13781697,
         0.02872864,  0.        ],
       [-0.11235117, -0.13667928,  0.10928326, ...,  0.03355358,
         0.        ,  0.        ],
       [ 0.00991569, -0.25300655,  0.10154124, ...,  0.04856195,
         0.05061491,  0.        ]])

In [None]:
# Predict the 16-class type on the train set, then convert each predicted type code
# into its four 0/1 dimension flags (result has a fresh 0..n-1 index)
train_predictions = rf_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in train_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

In [None]:
# Predict the 16-class type on the test set, then convert each predicted type code
# into its four 0/1 dimension flags (result has a fresh 0..n-1 index)
test_predictions = rf_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in test_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# J/P dimension on the test split, scored with label 1 as the positive class
jp_true_J = test_df['J/P']
jp_pred_J = test_pred_dimensions['J/P']

precision_J = precision_score(jp_true_J, jp_pred_J, pos_label=1)
recall_J = recall_score(jp_true_J, jp_pred_J, pos_label=1)
f1_J = f1_score(jp_true_J, jp_pred_J, pos_label=1)

# Report the three scores, one per line
print(
    f"Precision for 'J': {precision_J}",
    f"Recall for 'J': {recall_J}",
    f"F1-Score for 'J': {f1_J}",
    sep="\n",
)


Precision for 'J': 0.3977987421383648
Recall for 'J': 0.3682678311499272
F1-Score for 'J': 0.38246409674981097


In [None]:
# Same scores for 'P': label 0 is treated as the positive class via pos_label=0.
# (To get both classes in one call, scikit-learn's average=None returns per-class scores.)
jp_true_P = test_df['J/P']
jp_pred_P = test_pred_dimensions['J/P']

precision_P = precision_score(jp_true_P, jp_pred_P, pos_label=0)
recall_P = recall_score(jp_true_P, jp_pred_P, pos_label=0)
f1_P = f1_score(jp_true_P, jp_pred_P, pos_label=0)

print(
    f"Precision for 'P': {precision_P}",
    f"Recall for 'P': {recall_P}",
    f"F1-Score for 'P': {f1_P}",
    sep="\n",
)

Precision for 'P': 0.6050955414012739
Recall for 'P': 0.6345419847328244
F1-Score for 'P': 0.6194690265486725


In [None]:
# Requires rf_model to have been fitted in an earlier cell.

# Recompute the test predictions and expand each one into its four dimension values
test_predictions = rf_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame(
    list(map(get_mbti_dimensions, test_predictions)),
    columns=['I/E', 'N/S', 'F/T', 'J/P'],
)

# Raw arrays of the true and predicted 'J/P' values
actual_jp = test_df['J/P'].values
predicted_jp = test_pred_dimensions['J/P'].values

# One line per test row: true value next to predicted value
print("Actual vs Predicted for 'J/P' Dimension:")
for i in range(actual_jp.shape[0]):
    print(f"Actual: {actual_jp[i]} | Predicted: {predicted_jp[i]}")

Actual vs Predicted for 'J/P' Dimension:
Actual: 0 | Predicted: 0
Actual: 1 | Predicted: 0
Actual: 1 | Predicted: 1
Actual: 1 | Predicted: 1
Actual: 1 | Predicted: 0
Actual: 1 | Predicted: 1
Actual: 1 | Predicted: 1
Actual: 1 | Predicted: 1
Actual: 0 | Predicted: 1
Actual: 0 | Predicted: 0
Actual: 0 | Predicted: 1
Actual: 1 | Predicted: 0
Actual: 0 | Predicted: 0
Actual: 0 | Predicted: 0
Actual: 0 | Predicted: 0
Actual: 0 | Predicted: 0
Actual: 1 | Predicted: 1
Actual: 0 | Predicted: 0
Actual: 1 | Predicted: 1
Actual: 1 | Predicted: 1
Actual: 1 | Predicted: 0
Actual: 1 | Predicted: 0
Actual: 0 | Predicted: 0
Actual: 0 | Predicted: 1
Actual: 1 | Predicted: 0
Actual: 1 | Predicted: 1
Actual: 0 | Predicted: 0
Actual: 0 | Predicted: 0
Actual: 0 | Predicted: 0
Actual: 1 | Predicted: 1
Actual: 1 | Predicted: 0
Actual: 0 | Predicted: 1
Actual: 0 | Predicted: 1
Actual: 0 | Predicted: 0
Actual: 0 | Predicted: 0
Actual: 0 | Predicted: 0
Actual: 0 | Predicted: 0
Actual: 0 | Predicted: 0
Actual: 0

In [None]:
# Accumulators, one entry per dimension in loop order
average_accuracies, precisions, recalls, f1_scores = [], [], [], []

for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    # Accuracy on each split, plus the plain mean of the two
    train_accuracy = accuracy_score(train_df_final[dimension], train_pred_dimensions[dimension])
    test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])
    average_accuracy = (train_accuracy + test_accuracy) / 2

    # Test-split precision / recall / F1 (no pos_label passed, so the library default applies)
    precision = precision_score(test_df[dimension], test_pred_dimensions[dimension])
    recall = recall_score(test_df[dimension], test_pred_dimensions[dimension])
    f1 = f1_score(test_df[dimension], test_pred_dimensions[dimension])

    # Record this dimension's values for the cross-dimension averages
    for bucket, value in (
        (average_accuracies, average_accuracy),
        (precisions, precision),
        (recalls, recall),
        (f1_scores, f1),
    ):
        bucket.append(value)

    print(
        f"Train Accuracy for {dimension}: {train_accuracy}",
        f"Test Accuracy for {dimension}: {test_accuracy}",
        f"Average Accuracy for {dimension}: {average_accuracy}",
        f"Precision for {dimension}: {precision}",
        f"Recall for {dimension}: {recall}",
        f"F1-Score for {dimension}: {f1}\n",
        sep="\n",
    )

# Unweighted means over the four dimensions
overall_average_accuracy = sum(average_accuracies) / len(average_accuracies)
overall_precision = sum(precisions) / len(precisions)
overall_recall = sum(recalls) / len(recalls)
overall_f1_score = sum(f1_scores) / len(f1_scores)

print(
    f"Overall Average Accuracy: {overall_average_accuracy}",
    f"Overall Precision: {overall_precision}",
    f"Overall Recall: {overall_recall}",
    f"Overall F1-Score: {overall_f1_score}",
    sep="\n",
)

Train Accuracy for I/E: 0.8010242868003041
Test Accuracy for I/E: 0.7452449567723343
Average Accuracy for I/E: 0.7731346217863192
Precision for I/E: 0.766109785202864
Recall for I/E: 0.9625187406296851
F1-Score for I/E: 0.8531561461794019

Train Accuracy for N/S: 0.8251110310887049
Test Accuracy for N/S: 0.8265129682997118
Average Accuracy for N/S: 0.8258119996942084
Precision for N/S: 0.8609431680773881
Recall for N/S: 0.9525083612040134
F1-Score for N/S: 0.904414099714195

Train Accuracy for F/T: 0.6094506461809307
Test Accuracy for F/T: 0.5244956772334294
Average Accuracy for F/T: 0.56697316170718
Precision for F/T: 0.5406562054208274
Recall for F/T: 0.8072417465388712
F1-Score for F/T: 0.6475865014950876

Train Accuracy for J/P: 0.7053975113031649
Test Accuracy for J/P: 0.5291066282420749
Average Accuracy for J/P: 0.6172520697726198
Precision for J/P: 0.3977987421383648
Recall for J/P: 0.3682678311499272
F1-Score for J/P: 0.38246409674981097

Overall Average Accuracy: 0.69579296324

## SVM

In [None]:
# Define the SVM model: an SVC where only random_state is set explicitly
svm_model = SVC(random_state=42)

# Disabled alternative kept for reference: C=10 with class_weight='balanced'
# svm_model = SVC(C=10, random_state=42, class_weight='balanced')


In [None]:
# Fit the SVC on the training features; the target is the 'type' column
svm_model.fit(X=train_combined_features, y=train_df_final['type'])

In [None]:
# SVM predictions for the training rows, expanded to one row of four dimension values each
train_predictions = svm_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame(
    list(map(get_mbti_dimensions, train_predictions)),
    columns=['I/E', 'N/S', 'F/T', 'J/P'],
)

In [None]:
# SVM predictions for the test rows, expanded to one row of four dimension values each
test_predictions = svm_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame(
    list(map(get_mbti_dimensions, test_predictions)),
    columns=['I/E', 'N/S', 'F/T', 'J/P'],
)

In [None]:
# Initialize lists to store metrics for each dimension
average_accuracies = []
precisions = []
recalls = []
f1_scores = []

# Calculate metrics for each dimension and their averages
for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    train_accuracy = accuracy_score(train_df_final[dimension], train_pred_dimensions[dimension])
    test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])

    # Calculate the average accuracy for this dimension (plain mean of train and test)
    average_accuracy = (train_accuracy + test_accuracy) / 2
    average_accuracies.append(average_accuracy)

    # Calculate precision, recall, and f1-score for the test set.
    # zero_division=0 states explicitly what to report when a score is undefined
    # (e.g. the positive label is never predicted for a dimension): the value is the
    # same 0.0 scikit-learn falls back to, but without the UndefinedMetricWarning.
    precision = precision_score(test_df[dimension], test_pred_dimensions[dimension], zero_division=0)
    recall = recall_score(test_df[dimension], test_pred_dimensions[dimension], zero_division=0)
    f1 = f1_score(test_df[dimension], test_pred_dimensions[dimension], zero_division=0)

    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Train Accuracy for {dimension}: {train_accuracy}")
    print(f"Test Accuracy for {dimension}: {test_accuracy}")
    print(f"Average Accuracy for {dimension}: {average_accuracy}")
    print(f"Precision for {dimension}: {precision}")
    print(f"Recall for {dimension}: {recall}")
    print(f"F1-Score for {dimension}: {f1}\n")

# Unweighted averages of each metric across the four dimensions
overall_average_accuracy = sum(average_accuracies) / len(average_accuracies)
overall_precision = sum(precisions) / len(precisions)
overall_recall = sum(recalls) / len(recalls)
overall_f1_score = sum(f1_scores) / len(f1_scores)

print(f"Overall Average Accuracy: {overall_average_accuracy}")
print(f"Overall Precision: {overall_precision}")
print(f"Overall Recall: {overall_recall}")
print(f"Overall F1-Score: {overall_f1_score}")

Train Accuracy for I/E: 0.7855399511863321
Test Accuracy for I/E: 0.7688760806916427
Average Accuracy for I/E: 0.7772080159389874
Precision for I/E: 0.7688760806916427
Recall for I/E: 1.0
F1-Score for I/E: 0.8693385467579016

Train Accuracy for N/S: 0.8032649141759692
Test Accuracy for N/S: 0.861671469740634
Average Accuracy for N/S: 0.8324681919583017
Precision for N/S: 0.861671469740634
Recall for N/S: 1.0
F1-Score for N/S: 0.9256965944272446

Train Accuracy for F/T: 0.5041011483215301
Test Accuracy for F/T: 0.5412103746397694
Average Accuracy for F/T: 0.5226557614806497
Precision for F/T: 0.5412103746397694
Recall for F/T: 1.0
F1-Score for F/T: 0.7023186237845923

Train Accuracy for J/P: 0.6312967630936662
Test Accuracy for J/P: 0.6040345821325649
Average Accuracy for J/P: 0.6176656726131156
Precision for J/P: 0.0
Recall for J/P: 0.0
F1-Score for J/P: 0.0

Overall Average Accuracy: 0.6874994104977635
Overall Precision: 0.5429394812680115
Overall Recall: 0.75
Overall F1-Score: 0.6243

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Score the J/P dimension with label 0 as the positive class (pos_label=0).
# The dimension is set explicitly here rather than read from a loop variable left
# over from another cell, so this cell gives the right label regardless of run order.
dimension = 'J/P'

precision = precision_score(test_df[dimension], test_pred_dimensions[dimension], pos_label=0)
recall = recall_score(test_df[dimension], test_pred_dimensions[dimension], pos_label=0)
f1 = f1_score(test_df[dimension], test_pred_dimensions[dimension], pos_label=0)

print(f"Precision for {dimension}: {precision}")
print(f"Recall for {dimension}: {recall}")
print(f"F1-Score for {dimension}: {f1}\n")

Precision for J/P: 0.6040345821325649
Recall for J/P: 1.0
F1-Score for J/P: 0.7531440891124687



In [None]:
# Count how many rows of train_df (not train_df_final) fall into each 'J/P' value
train_df['J/P'].value_counts()

Unnamed: 0_level_0,count
J/P,Unnamed: 1_level_1
P,4193
J,2747


# LightGBM


In [None]:
# Define the LightGBM model: an LGBMClassifier where only random_state is set explicitly
lgbm_model = LGBMClassifier(random_state=42)

In [None]:
# Fit the LightGBM classifier on the training features; the target is the 'type' column
lgbm_model.fit(X=train_combined_features, y=train_df_final['type'])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 24993, number of used features: 1
[LightGBM] [Info] Start training from score -3.891940
[LightGBM] [Info] Start training from score -2.624165
[LightGBM] [Info] Start training from score -3.696632
[LightGBM] [Info] Start training from score -2.609374
[LightGBM] [Info] Start training from score -5.425871
[LightGBM] [Info] Start training from score -5.282164
[LightGBM] [Info] Start training from score -5.481960
[LightGBM] [Info] Start training from score -4.654080
[LightGBM] [Info] Start training from score -1.845893
[LightGBM] [Info] Start training from score -1.625287
[LightGBM] [Info] Start training from score -2.143935
[LightGBM] [Info] Start training from score -1.965833
[LightGBM] [Info] Start training from score -4.026032
[LightGBM] 

In [None]:
# LightGBM predictions for the training rows, expanded to one row of four dimension values each
train_predictions = lgbm_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame(
    list(map(get_mbti_dimensions, train_predictions)),
    columns=['I/E', 'N/S', 'F/T', 'J/P'],
)

In [None]:
# LightGBM predictions for the test rows, expanded to one row of four dimension values each
test_predictions = lgbm_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame(
    list(map(get_mbti_dimensions, test_predictions)),
    columns=['I/E', 'N/S', 'F/T', 'J/P'],
)

In [None]:
# Accumulators, one entry per dimension in loop order
average_accuracies, precisions, recalls, f1_scores = [], [], [], []

for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    # Accuracy on each split, plus the plain mean of the two
    train_accuracy = accuracy_score(train_df_final[dimension], train_pred_dimensions[dimension])
    test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])
    average_accuracy = (train_accuracy + test_accuracy) / 2

    # Test-split precision / recall / F1 (no pos_label passed, so the library default applies)
    precision = precision_score(test_df[dimension], test_pred_dimensions[dimension])
    recall = recall_score(test_df[dimension], test_pred_dimensions[dimension])
    f1 = f1_score(test_df[dimension], test_pred_dimensions[dimension])

    # Record this dimension's values for the cross-dimension averages
    for bucket, value in (
        (average_accuracies, average_accuracy),
        (precisions, precision),
        (recalls, recall),
        (f1_scores, f1),
    ):
        bucket.append(value)

    print(
        f"Train Accuracy for {dimension}: {train_accuracy}",
        f"Test Accuracy for {dimension}: {test_accuracy}",
        f"Average Accuracy for {dimension}: {average_accuracy}",
        f"Precision for {dimension}: {precision}",
        f"Recall for {dimension}: {recall}",
        f"F1-Score for {dimension}: {f1}\n",
        sep="\n",
    )

# Unweighted means over the four dimensions
overall_average_accuracy = sum(average_accuracies) / len(average_accuracies)
overall_precision = sum(precisions) / len(precisions)
overall_recall = sum(recalls) / len(recalls)
overall_f1_score = sum(f1_scores) / len(f1_scores)

print(
    f"Overall Average Accuracy: {overall_average_accuracy}",
    f"Overall Precision: {overall_precision}",
    f"Overall Recall: {overall_recall}",
    f"Overall F1-Score: {overall_f1_score}",
    sep="\n",
)

Train Accuracy for I/E: 0.7793782259032529
Test Accuracy for I/E: 0.7688760806916427
Average Accuracy for I/E: 0.7741271532974479
Precision for I/E: 0.7688760806916427
Recall for I/E: 1.0
F1-Score for I/E: 0.8693385467579016

Train Accuracy for N/S: 0.7863001640459328
Test Accuracy for N/S: 0.8219020172910663
Average Accuracy for N/S: 0.8041010906684996
Precision for N/S: 0.8615853658536585
Recall for N/S: 0.9451505016722408
F1-Score for N/S: 0.9014354066985646

Train Accuracy for F/T: 0.5630776617452887
Test Accuracy for F/T: 0.5123919308357349
Average Accuracy for F/T: 0.5377347962905118
Precision for F/T: 0.5368171021377672
Recall for F/T: 0.7220447284345048
F1-Score for F/T: 0.6158038147138964

Train Accuracy for J/P: 0.5995678790061217
Test Accuracy for J/P: 0.5037463976945245
Average Accuracy for J/P: 0.551657138350323
Precision for J/P: 0.38430851063829785
Recall for J/P: 0.42066957787481807
F1-Score for J/P: 0.40166782487838776

Overall Average Accuracy: 0.6669050446516955
Over

# Hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [None]:
# Candidate values per RandomForest hyperparameter, sampled by the randomized search
param_dist_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)

In [None]:
# Create a RandomForest model with a fixed seed to serve as the search's base estimator.
# Note: this rebinds the name rf_model, replacing whatever it referred to before.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Randomized hyperparameter search for the random forest:
# 10 sampled candidates, 3-fold cross-validation, all available cores
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_rf,
    n_iter=10,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Run the search on the training features; the target is the 'type' column
rf_random_search.fit(X=train_combined_features, y=train_df_final['type'])

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Keep the search's best estimator under its own name
best_rf_model = rf_random_search.best_estimator_

# Show the winning hyperparameter combination
print("Best RF Parameters:", rf_random_search.best_params_)

Best RF Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}


In [None]:
# Evaluate on dimensions.
# Predictions do not depend on the dimension being scored, so compute them once per
# split instead of repeating the predict call and DataFrame construction in every
# loop iteration.
train_predictions = best_rf_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in train_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

test_predictions = best_rf_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in test_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    # Accuracy of the predicted dimension value against the true one, per split
    train_accuracy = accuracy_score(train_df_final[dimension], train_pred_dimensions[dimension])
    test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])

    print(f"Random Forest Train Accuracy for {dimension}: {train_accuracy}")
    print(f"Random Forest Test Accuracy for {dimension}: {test_accuracy}")
    print(f"Average Accuracy for {dimension}: {(train_accuracy + test_accuracy) / 2}\n")

Random Forest Train Accuracy for I/E: 0.999679910374905
Random Forest Test Accuracy for I/E: 0.6489913544668587
Average Accuracy for I/E: 0.8243356324208819

Random Forest Train Accuracy for N/S: 1.0
Random Forest Test Accuracy for N/S: 0.7141210374639769
Average Accuracy for N/S: 0.8570605187319884

Random Forest Train Accuracy for F/T: 0.9991597647341256
Random Forest Test Accuracy for F/T: 0.4979827089337176
Average Accuracy for F/T: 0.7485712368339216

Random Forest Train Accuracy for J/P: 0.9993998319529468
Random Forest Test Accuracy for J/P: 0.5319884726224784
Average Accuracy for J/P: 0.7656941522877125



## SVM

In [None]:
# Candidate values per SVC hyperparameter, sampled by the randomized search
param_dist_svm = dict(
    C=[0.1, 1, 10],
    gamma=[1, 0.1, 0.01],
    kernel=['rbf', 'linear'],
)

In [None]:
# Base estimator for the search: SVC with probability estimates enabled and a fixed seed.
# This rebinds svm_model, replacing the estimator defined earlier in the notebook.
svm_model = SVC(probability=True, random_state=42)

# Randomized hyperparameter search: 10 sampled candidates, 3-fold cross-validation, all available cores
svm_random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_dist_svm,
    n_iter=10,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Run the search on the training features; the target is the 'type' column
svm_random_search.fit(X=train_combined_features, y=train_df_final['type'])

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Keep the search's best estimator under its own name
best_svm_model = svm_random_search.best_estimator_

# Show the winning hyperparameter combination
print("Best SVM Parameters:", svm_random_search.best_params_)

In [None]:
# Evaluate on dimensions.
# Predictions do not depend on the dimension being scored, so compute them once per
# split instead of repeating the predict call and DataFrame construction in every
# loop iteration.
train_predictions = best_svm_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in train_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

test_predictions = best_svm_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in test_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    # Score the train predictions against train_df_final -- the frame the model was
    # fitted on and the one the Random Forest evaluation uses. Scoring against
    # train_df compares against a different frame than the one the predictions
    # were made for.
    train_accuracy = accuracy_score(train_df_final[dimension], train_pred_dimensions[dimension])
    test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])

    print(f"SVM Train Accuracy for {dimension}: {train_accuracy}")
    print(f"SVM Test Accuracy for {dimension}: {test_accuracy}")
    print(f"Average Accuracy for {dimension}: {(train_accuracy + test_accuracy) / 2}\n")