**TFIDF+W2V✅ | AUGMENT✅**

# Import Library + Data

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# List files in the directory to verify existence
!ls /content/drive/MyDrive/TI14_Personal/mbti_1.csv

/content/drive/MyDrive/TI14_Personal/mbti_1.csv


In [None]:
!ls /content/drive/MyDrive/TI14_Personal/augmented_train_set1.csv

/content/drive/MyDrive/TI14_Personal/augmented_train_set1.csv


In [None]:
!ls /content/drive/MyDrive/TI14_Personal/test_set.csv

/content/drive/MyDrive/TI14_Personal/test_set.csv


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from gensim.models import KeyedVectors
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from sklearn.preprocessing import StandardScaler

import nltk
import re
import os
import random
from collections import Counter
from google.colab import files
from nltk import pos_tag, ne_chunk
from nltk.util import ngrams

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
df = pd.read_csv('/content/drive/MyDrive/TI14_Personal/mbti_1.csv')

In [None]:
df.dtypes

Unnamed: 0,0
type,object
posts,object


# Data Preprocessing

In [None]:
# Function to remove MBTI type words from the posts
def remove_mbti_words(text):
    """Strip every occurrence of the 16 MBTI type codes from ``text``.

    Matching ignores case and is not restricted to whole words. The codes are
    removed one after another in a fixed order, so a code that only appears
    after an earlier removal is stripped as well.
    """
    type_codes = (
        'INFJ', 'INTJ', 'ENFJ', 'ENTJ', 'INFP', 'INTP', 'ENFP', 'ENTP',
        'ISFJ', 'ISTJ', 'ESFJ', 'ESTJ', 'ISFP', 'ISTP', 'ESFP', 'ESTP',
    )
    cleaned = text
    for code in type_codes:
        cleaned = re.compile(code, re.IGNORECASE).sub('', cleaned)
    return cleaned

In [None]:
# Stopword set and lemmatizer are created once and shared by every call;
# rebuilding them for each post repeats identical work for every row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean one raw post string and return it as space-joined lemmas.

    Steps, in order: remove MBTI type codes, lowercase, drop URLs, turn
    underscores and punctuation into spaces, drop digits, collapse whitespace,
    tokenize, remove English stopwords, lemmatize.

    Args:
        text: Raw post text (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    stop_words, lemmatizer = _get_preprocess_resources()

    text = remove_mbti_words(text)  # Remove MBTI words first so the label does not leak
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Replace underscores with space (\w would otherwise keep them)
    text = re.sub(r'_', ' ', text)
    # Replace punctuation with a space
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize, drop stopwords, then lemmatize what is left
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

In [None]:
# Apply preprocessing
df['posts_cleaned'] = df['posts'].apply(preprocess_text)  # Apply the preprocessing to each post

In [None]:
# Derive one column per MBTI axis from the matching letter of the type code
for letter_position, axis_column in enumerate(['I/E', 'N/S', 'F/T', 'J/P']):
    df[axis_column] = df['type'].apply(lambda mbti, i=letter_position: mbti[i])

In [None]:
# Tokenize the cleaned posts into a new 'tokens' column
df['tokens'] = df['posts_cleaned'].apply(word_tokenize)

In [None]:
# Verify the cleaned posts
print("Original vs Cleaned Data Sample:")
print(df[['posts', 'posts_cleaned']].head())

Original vs Cleaned Data Sample:
                                               posts  \
0  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...   
1  'I'm finding the lack of me in these posts ver...   
2  'Good one  _____   https://www.youtube.com/wat...   
3  'Dear INTP,   I enjoyed our conversation the o...   
4  'You're fired.|||That's another silly misconce...   

                                       posts_cleaned  
0  moment sportscenter top ten play prank life ch...  
1  finding lack post alarming sex boring position...  
2  good one course say know blessing curse absolu...  
3  dear enjoyed conversation day esoteric gabbing...  
4  fired another silly misconception approaching ...  


In [None]:
df.head()

Unnamed: 0,type,posts,I/E,N/S,F/T,J/P
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,I,N,F,J
1,ENTP,'I'm finding the lack of me in these posts ver...,E,N,T,P
2,INTP,'Good one _____ https://www.youtube.com/wat...,I,N,T,P
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",I,N,T,J
4,ENTJ,'You're fired.|||That's another silly misconce...,E,N,T,J


In [None]:
df['F/T'].value_counts()

Unnamed: 0_level_0,count
F/T,Unnamed: 1_level_1
F,4694
T,3981


In [None]:
# Save the DataFrame to a CSV file locally
csv_filename = 'preprocessed_mbti1.csv'
df.to_csv(csv_filename, index=False)

# Download the file to your local machine
files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the preprocessed data from disk when the file is available.
# The recorded run of this cell aborted with FileNotFoundError because the CSV
# was not present in the working directory, so the read is guarded and the
# DataFrame already held in memory is kept when the file is missing.
preprocessed_csv_path = 'preprocessed_mbti1.csv'
if os.path.exists(preprocessed_csv_path):
    df = pd.read_csv(preprocessed_csv_path)
else:
    print(f"'{preprocessed_csv_path}' not found; continuing with the DataFrame already in memory.")

FileNotFoundError: [Errno 2] No such file or directory: 'preprocessed_mbti1.csv'

# Split Data

In [None]:
# Check distribution before splitting
print("Distribution before splitting:")
print(df['type'].value_counts(normalize=True))

Distribution before splitting:
type
INFP    0.211182
INFJ    0.169452
INTP    0.150317
INTJ    0.125764
ENTP    0.078963
ENFP    0.077810
ISTP    0.038847
ISFP    0.031239
ENTJ    0.026628
ISTJ    0.023631
ENFJ    0.021902
ISFJ    0.019135
ESTP    0.010259
ESFP    0.005533
ESFJ    0.004841
ESTJ    0.004496
Name: proportion, dtype: float64


In [None]:
# Split data into training and testing sets (80% train, 20% test)
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['type'], random_state=42)

In [None]:
# Check distribution after splitting
print("\nDistribution in training set:")
print(train_df['type'].value_counts(normalize=True))

print("\nDistribution in test set:")
print(test_df['type'].value_counts(normalize=True))


Distribution in training set:
type
INFP    0.211239
INFJ    0.169452
INTP    0.150288
INTJ    0.125793
ENTP    0.078963
ENFP    0.077810
ISTP    0.038905
ISFP    0.031268
ENTJ    0.026657
ISTJ    0.023631
ENFJ    0.021902
ISFJ    0.019164
ESTP    0.010231
ESFP    0.005476
ESFJ    0.004755
ESTJ    0.004467
Name: proportion, dtype: float64

Distribution in test set:
type
INFP    0.210951
INFJ    0.169452
INTP    0.150432
INTJ    0.125648
ENTP    0.078963
ENFP    0.077810
ISTP    0.038617
ISFP    0.031124
ENTJ    0.026513
ISTJ    0.023631
ENFJ    0.021902
ISFJ    0.019020
ESTP    0.010375
ESFP    0.005764
ESFJ    0.005187
ESTJ    0.004611
Name: proportion, dtype: float64


In [None]:
# Save train and test sets to CSV
train_df.to_csv('train_set.csv', index=False)
test_df.to_csv('test_set.csv', index=False)

In [None]:
# Download the file to your local machine
files.download('test_set.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load the CSV file into a new DataFrame
train_df = pd.read_csv('train_set.csv')
test_df = pd.read_csv('test_set.csv')

# Data Augmentation for Training Set

In [None]:
def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list of plain strings.

    Lemma names are normalised to match the cleaned posts: underscores that
    join multi-word lemmas become spaces and the text is lowercased. The word
    itself is excluded. The result is sorted because iterating a set of
    strings has no stable order between interpreter runs, which would make
    ``random.choice`` over the result irreproducible even with a fixed seed.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of synonym strings; empty when WordNet has none.
    """
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace('_', ' ').lower())
    # Names were lowercased above, so compare against the lowercased word
    synonyms.discard(word.lower())
    return sorted(synonyms)

In [None]:
def random_insertion(text, n=1):
    """Insert up to ``n`` synonyms of randomly picked words at random positions.

    Each round picks one word; if it has no synonyms that round adds nothing.
    Text without any whitespace-separated words is returned unchanged.
    """
    tokens = text.split()
    if not tokens:
        return text
    for _ in range(n):
        source_word = random.choice(tokens)
        candidates = get_synonyms(source_word)
        if not candidates:
            continue
        chosen = random.choice(candidates)
        # randint is inclusive, so len(tokens) means "append at the end"
        position = random.randint(0, len(tokens))
        tokens.insert(position, chosen)
    return ' '.join(tokens)

In [None]:
def random_synonym_replacement(text):
    """Replace every occurrence of one randomly chosen word with one of its synonyms.

    The text is returned unchanged when it has no words or when the chosen
    word has no synonyms.
    """
    tokens = text.split()
    if not tokens:
        return text
    target = random.choice(tokens)
    candidates = get_synonyms(target)
    if not candidates:
        return text
    replacement = random.choice(candidates)
    return ' '.join(replacement if token == target else token for token in tokens)

In [None]:
def random_deletion(text, p=0.1):
    """Drop each word of ``text`` independently with probability ``p``.

    Text with zero or one word is returned unchanged. If every word happens to
    be dropped, one randomly chosen original word is returned instead, so the
    function never produces an empty augmented sample from non-empty input.

    Args:
        text: Whitespace-separated words.
        p: Per-word deletion probability.

    Returns:
        The surviving words joined by single spaces.
    """
    words = text.split()
    if len(words) <= 1:
        return text
    kept_words = [word for word in words if random.uniform(0, 1) > p]
    if not kept_words:
        # Everything was deleted: fall back to a single original word
        return random.choice(words)
    return ' '.join(kept_words)

In [None]:
def random_swap(text, n=1):
    """Swap two randomly chosen, distinct word positions, ``n`` times over.

    Text with fewer than two words is returned unchanged.
    """
    tokens = text.split()
    if len(tokens) < 2:
        return text
    positions = range(len(tokens))
    for _ in range(n):
        first, second = random.sample(positions, 2)
        tokens[first], tokens[second] = tokens[second], tokens[first]
    return ' '.join(tokens)

In [None]:
# Replace missing posts with empty strings, then make sure every entry is a str.
# fillna has to run first: astype(str) turns NaN into the literal text 'nan',
# after which there is nothing left for fillna to replace.
train_df['posts_cleaned'] = train_df['posts_cleaned'].fillna('').astype(str)

In [None]:
# Augmentation targets: grow the training set towards `total_target_samples`
# rows while keeping each type's share of the data
total_target_samples = 25000
class_counts = train_df['type'].value_counts()
current_total_samples = train_df.shape[0]

# Current share of each type, scaled to the target size; astype(int) truncates
# the fractional part of every per-type target
current_proportions = class_counts.div(current_total_samples)
target_class_counts = current_proportions.mul(total_target_samples).astype(int)

In [None]:
target_class_counts

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
INFP,5280
INFJ,4236
INTP,3757
INTJ,3144
ENTP,1974
ENFP,1945
ISTP,972
ISFP,781
ENTJ,666
ISTJ,590


In [None]:
augmented_texts = []
augmented_labels = []

In [None]:
# The four augmentation operations; exactly one is applied per new sample
augmentation_ops = (random_deletion, random_swap, random_insertion, random_synonym_replacement)

for label in class_counts.index:
    current_size = class_counts[label]
    target_size = target_class_counts[label]
    texts_to_augment = train_df.loc[train_df['type'] == label, 'posts_cleaned']
    # Number of synthetic posts this type still needs to reach its target size
    num_samples_needed = target_size - current_size

    # Skip types that need nothing; with no source texts the while-loop below
    # could never make progress, so that case is skipped as well
    if num_samples_needed <= 0 or texts_to_augment.empty:
        continue

    # Cycle over the type's posts as often as needed to reach the target
    while num_samples_needed > 0:
        for text in texts_to_augment:
            if num_samples_needed <= 0:
                break
            # Choose the operation first so only one augmentation is computed,
            # instead of running all four and discarding three of the results
            augment = random.choice(augmentation_ops)
            augmented_texts.append(augment(text))
            augmented_labels.append(label)
            num_samples_needed -= 1

In [None]:
# Append the augmented data to the original training set
augmented_df = pd.DataFrame({
    'posts_cleaned': augmented_texts,
    'type': augmented_labels
})

In [None]:
# ignore_index gives the combined frame a fresh 0..n-1 index; without it the
# augmented rows repeat index labels that the original training rows already use
train_df = pd.concat([train_df, augmented_df], ignore_index=True)

In [None]:
print(train_df['type'].value_counts(normalize=True))

type
INFP    0.211276
INFJ    0.169501
INTP    0.150334
INTJ    0.125805
ENTP    0.078988
ENFP    0.077828
ISTP    0.038894
ISFP    0.031251
ENTJ    0.026650
ISTJ    0.023608
ENFJ    0.021888
ISFJ    0.019167
ESTP    0.010204
ESFP    0.005442
ESFJ    0.004722
ESTJ    0.004442
Name: proportion, dtype: float64


In [None]:
train_df.to_csv('augmented_train_set1.csv', index=False)

In [None]:
files.download('augmented_train_set1.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/TI14_Personal/augmented_train_set1.csv')

In [None]:
test_df = pd.read_csv('/content/drive/MyDrive/TI14_Personal/test_set.csv')

# Feature Extraction

In [None]:
# Replace missing posts with empty strings, then make sure every entry is a str.
# fillna has to run first: astype(str) turns NaN into the literal text 'nan',
# which would then be tokenized and vectorized like a real word.
train_df['posts_cleaned'] = train_df['posts_cleaned'].fillna('').astype(str)
test_df['posts_cleaned'] = test_df['posts_cleaned'].fillna('').astype(str)

In [None]:
# Tokenize the data
train_tokens = train_df['posts_cleaned'].apply(word_tokenize)
test_tokens = test_df['posts_cleaned'].apply(word_tokenize)

In [None]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=train_tokens, vector_size=400, window=5, min_count=1, workers=4)

In [None]:
# Function to get Word2Vec features
def get_word2vec_features(text, model, num_features):
    """Return the mean Word2Vec vector of the in-vocabulary words of ``text``.

    Only words present in ``model.wv`` contribute to the sum and to the
    divisor. Dividing by the total word count instead would shrink the vector
    towards zero for any text that contains out-of-vocabulary words.

    Args:
        text: Whitespace-separated words.
        model: Object whose ``wv`` attribute supports ``word in wv`` and ``wv[word]``.
        num_features: Length of the word vectors.

    Returns:
        A float32 array of length ``num_features``; all zeros when no word of
        ``text`` is in the vocabulary.
    """
    keyed_vectors = model.wv
    feature_vector = np.zeros((num_features,), dtype="float32")
    known_word_count = 0
    for word in text.split():
        if word in keyed_vectors:
            feature_vector += keyed_vectors[word]
            known_word_count += 1
    if known_word_count > 0:
        feature_vector /= known_word_count
    return feature_vector

In [None]:
# Extract Word2Vec features
train_word2vec_features = np.array([get_word2vec_features(text, word2vec_model, 400) for text in train_df['posts_cleaned']])
test_word2vec_features = np.array([get_word2vec_features(text, word2vec_model, 400) for text in test_df['posts_cleaned']])

In [None]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=400)
train_tfidf_features = tfidf_vectorizer.fit_transform(train_df['posts_cleaned']).toarray()
test_tfidf_features = tfidf_vectorizer.transform(test_df['posts_cleaned']).toarray()

In [None]:
# Combine Word2Vec and TF-IDF features
train_combined_features = np.hstack((train_word2vec_features, train_tfidf_features))
test_combined_features = np.hstack((test_word2vec_features, test_tfidf_features))

# Model

In [None]:
# Define a mapping from MBTI type to dimensions
def get_mbti_dimensions(mbti_type):
    """Encode a four-letter MBTI code as four 0/1 flags.

    Returns ``(ie, ns, ft, jp)`` where each flag is 1 when the letter at that
    position is 'I', 'N', 'F' or 'J' respectively, and 0 otherwise.
    """
    return (
        int(mbti_type[0] == 'I'),
        int(mbti_type[1] == 'N'),
        int(mbti_type[2] == 'F'),
        int(mbti_type[3] == 'J'),
    )

# Encode the type label of both splits into the four binary axis columns
for split_frame in (train_df, test_df):
    split_frame[['I/E', 'N/S', 'F/T', 'J/P']] = split_frame['type'].apply(
        lambda mbti: pd.Series(get_mbti_dimensions(mbti))
    )

In [None]:
# Define the RandomForest model
rf_model = RandomForestClassifier(random_state=42, max_depth=10)

In [None]:
# Train the model on class type
rf_model.fit(train_combined_features, train_df['type'])

In [None]:
# Predict on the train set and map to dimensions
train_predictions = rf_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in train_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

In [None]:
# Predict on the test set and map to dimensions
test_predictions = rf_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in test_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

In [None]:
# Per-dimension score collectors, averaged after the loop
average_accuracies, precisions, recalls, f1_scores = [], [], [], []

# Score each MBTI axis as its own binary problem
for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    # Accuracy on both splits, plus their mean
    train_accuracy = accuracy_score(train_df[dimension], train_pred_dimensions[dimension])
    test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])
    average_accuracy = (train_accuracy + test_accuracy) / 2

    # Precision, recall and F1 are measured on the test split only
    test_truth, test_guess = test_df[dimension], test_pred_dimensions[dimension]
    precision = precision_score(test_truth, test_guess)
    recall = recall_score(test_truth, test_guess)
    f1 = f1_score(test_truth, test_guess)

    average_accuracies.append(average_accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(
        f"Train Accuracy for {dimension}: {train_accuracy}",
        f"Test Accuracy for {dimension}: {test_accuracy}",
        f"Average Accuracy for {dimension}: {average_accuracy}",
        f"Precision for {dimension}: {precision}",
        f"Recall for {dimension}: {recall}",
        f"F1-Score for {dimension}: {f1}\n",
        sep="\n",
    )

# Unweighted mean of each metric over the four dimensions
overall_average_accuracy = sum(average_accuracies) / len(average_accuracies)
overall_precision = sum(precisions) / len(precisions)
overall_recall = sum(recalls) / len(recalls)
overall_f1_score = sum(f1_scores) / len(f1_scores)

print(
    f"Overall Average Accuracy: {overall_average_accuracy}",
    f"Overall Precision: {overall_precision}",
    f"Overall Recall: {overall_recall}",
    f"Overall F1-Score: {overall_f1_score}",
    sep="\n",
)

Train Accuracy for I/E: 0.9262934656476332
Test Accuracy for I/E: 0.7659942363112392
Average Accuracy for I/E: 0.8461438509794361
Precision for I/E: 0.7697674418604651
Recall for I/E: 0.992503748125937
F1-Score for I/E: 0.8670595939751146

Train Accuracy for N/S: 0.9221719819134888
Test Accuracy for N/S: 0.861671469740634
Average Accuracy for N/S: 0.8919217258270614
Precision for N/S: 0.861671469740634
Recall for N/S: 1.0
F1-Score for N/S: 0.9256965944272446

Train Accuracy for F/T: 0.9428994437997679
Test Accuracy for F/T: 0.7089337175792507
Average Accuracy for F/T: 0.8259165806895092
Precision for F/T: 0.683277027027027
Recall for F/T: 0.8615548455804047
F1-Score for F/T: 0.7621290626471974

Train Accuracy for J/P: 0.9360969949181706
Test Accuracy for J/P: 0.6230547550432277
Average Accuracy for J/P: 0.7795758749806991
Precision for J/P: 0.534020618556701
Recall for J/P: 0.37700145560407566
F1-Score for J/P: 0.44197952218430037

Overall Average Accuracy: 0.8358895081191764
Overall P

In [None]:
# Define the SVM model
svm_model = SVC(random_state=42)

In [None]:
# Train the model on class type
svm_model.fit(train_combined_features, train_df['type'])

In [None]:
# Predict on the train set and map to dimensions
train_predictions = svm_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in train_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

In [None]:
# Predict on the test set and map to dimensions
test_predictions = svm_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in test_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

In [None]:
# Per-dimension score collectors for the SVM predictions, averaged after the loop
average_accuracies, precisions, recalls, f1_scores = [], [], [], []

# Score each MBTI axis as its own binary problem
for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    # Accuracy on both splits, plus their mean
    train_accuracy = accuracy_score(train_df[dimension], train_pred_dimensions[dimension])
    test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])
    average_accuracy = (train_accuracy + test_accuracy) / 2

    # Precision, recall and F1 are measured on the test split only
    test_truth, test_guess = test_df[dimension], test_pred_dimensions[dimension]
    precision = precision_score(test_truth, test_guess)
    recall = recall_score(test_truth, test_guess)
    f1 = f1_score(test_truth, test_guess)

    average_accuracies.append(average_accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(
        f"Train Accuracy for {dimension}: {train_accuracy}",
        f"Test Accuracy for {dimension}: {test_accuracy}",
        f"Average Accuracy for {dimension}: {average_accuracy}",
        f"Precision for {dimension}: {precision}",
        f"Recall for {dimension}: {recall}",
        f"F1-Score for {dimension}: {f1}\n",
        sep="\n",
    )

# Unweighted mean of each metric over the four dimensions
overall_average_accuracy = sum(average_accuracies) / len(average_accuracies)
overall_precision = sum(precisions) / len(precisions)
overall_recall = sum(recalls) / len(recalls)
overall_f1_score = sum(f1_scores) / len(f1_scores)

print(
    f"Overall Average Accuracy: {overall_average_accuracy}",
    f"Overall Precision: {overall_precision}",
    f"Overall Recall: {overall_recall}",
    f"Overall F1-Score: {overall_f1_score}",
    sep="\n",
)

Train Accuracy for I/E: 0.8311392101156416
Test Accuracy for I/E: 0.7746397694524496
Average Accuracy for I/E: 0.8028894897840456
Precision for I/E: 0.8075668623613829
Recall for I/E: 0.9280359820089955
F1-Score for I/E: 0.8636205092431113

Train Accuracy for N/S: 0.8829978792365252
Test Accuracy for N/S: 0.8680115273775216
Average Accuracy for N/S: 0.8755047033070233
Precision for N/S: 0.8710433763188745
Recall for N/S: 0.9939799331103679
F1-Score for N/S: 0.9284598562949077

Train Accuracy for F/T: 0.8401424512824617
Test Accuracy for F/T: 0.7648414985590778
Average Accuracy for F/T: 0.8024919749207697
Precision for F/T: 0.7565217391304347
Recall for F/T: 0.8338658146964856
F1-Score for F/T: 0.7933130699088146

Train Accuracy for J/P: 0.7631147212996678
Test Accuracy for J/P: 0.6570605187319885
Average Accuracy for J/P: 0.7100876200158281
Precision for J/P: 0.5793103448275863
Recall for J/P: 0.4890829694323144
F1-Score for J/P: 0.5303867403314917

Overall Average Accuracy: 0.79774344

In [None]:
# Define the LightGBM model
lgbm_model = LGBMClassifier(random_state=42)

In [None]:
# Train the model on class type
lgbm_model.fit(train_combined_features, train_df['type'])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.282899 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 204000
[LightGBM] [Info] Number of data points in the train set: 24991, number of used features: 800
[LightGBM] [Info] Start training from score -3.821822
[LightGBM] [Info] Start training from score -2.553254
[LightGBM] [Info] Start training from score -3.624981
[LightGBM] [Info] Start training from score -2.538454
[LightGBM] [Info] Start training from score -5.355586
[LightGBM] [Info] Start training from score -5.213616
[LightGBM] [Info] Start training from score -5.416741
[LightGBM] [Info] Start training from score -4.585007
[LightGBM] [Info] Start training from score -1.774896
[LightGBM] [Info] Start training from score -1.554590
[LightGBM] [Info] Start training from score -2.073020
[LightGBM] [Info] Start training from score -1.894895
[LightGBM] [Info] Start training from score -3.954570
[Light

In [None]:
# Predict on the train set and map to dimensions
train_predictions = lgbm_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in train_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

In [None]:
# Predict on the test set and map to dimensions
test_predictions = lgbm_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in test_predictions], columns=['I/E', 'N/S', 'F/T', 'J/P'])

In [None]:
# Per-dimension score collectors for the LightGBM predictions, averaged after the loop
average_accuracies, precisions, recalls, f1_scores = [], [], [], []

# Score each MBTI axis as its own binary problem
for dimension in ['I/E', 'N/S', 'F/T', 'J/P']:
    # Accuracy on both splits, plus their mean
    train_accuracy = accuracy_score(train_df[dimension], train_pred_dimensions[dimension])
    test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])
    average_accuracy = (train_accuracy + test_accuracy) / 2

    # Precision, recall and F1 are measured on the test split only
    test_truth, test_guess = test_df[dimension], test_pred_dimensions[dimension]
    precision = precision_score(test_truth, test_guess)
    recall = recall_score(test_truth, test_guess)
    f1 = f1_score(test_truth, test_guess)

    average_accuracies.append(average_accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(
        f"Train Accuracy for {dimension}: {train_accuracy}",
        f"Test Accuracy for {dimension}: {test_accuracy}",
        f"Average Accuracy for {dimension}: {average_accuracy}",
        f"Precision for {dimension}: {precision}",
        f"Recall for {dimension}: {recall}",
        f"F1-Score for {dimension}: {f1}\n",
        sep="\n",
    )

# Unweighted mean of each metric over the four dimensions
overall_average_accuracy = sum(average_accuracies) / len(average_accuracies)
overall_precision = sum(precisions) / len(precisions)
overall_recall = sum(recalls) / len(recalls)
overall_f1_score = sum(f1_scores) / len(f1_scores)

print(
    f"Overall Average Accuracy: {overall_average_accuracy}",
    f"Overall Precision: {overall_precision}",
    f"Overall Recall: {overall_recall}",
    f"Overall F1-Score: {overall_f1_score}",
    sep="\n",
)

Train Accuracy for I/E: 1.0
Test Accuracy for I/E: 0.7717579250720461
Average Accuracy for I/E: 0.885878962536023
Precision for I/E: 0.7945979899497487
Recall for I/E: 0.9482758620689655
F1-Score for I/E: 0.8646616541353384

Train Accuracy for N/S: 1.0
Test Accuracy for N/S: 0.8622478386167147
Average Accuracy for N/S: 0.9311239193083574
Precision for N/S: 0.8651162790697674
Recall for N/S: 0.9953177257525083
F1-Score for N/S: 0.9256609642301711

Train Accuracy for F/T: 1.0
Test Accuracy for F/T: 0.7400576368876081
Average Accuracy for F/T: 0.8700288184438041
Precision for F/T: 0.7293233082706767
Recall for F/T: 0.8264110756123536
F1-Score for F/T: 0.7748377433849226

Train Accuracy for J/P: 1.0
Test Accuracy for J/P: 0.6443804034582132
Average Accuracy for J/P: 0.8221902017291066
Precision for J/P: 0.5583333333333333
Recall for J/P: 0.487627365356623
F1-Score for J/P: 0.5205905205905206

Overall Average Accuracy: 0.8773054755043228
Overall Precision: 0.7368427276558815
Overall Recall:

# Hyperparameter Tuning (Random Forest)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [None]:
# Search space for the RandomForest randomized search.
# List entries are sampled uniformly; randint(a, b) draws integers in [a, b).
param_dist_rf = {
    'n_estimators': randint(50, 150),  # Fewer trees
    'max_depth': [3, 5, 7, 10],  # Further restrict depth
    'min_samples_split': randint(5, 15),  # Increase min_samples_split
    'min_samples_leaf': randint(2, 5),  # Increase min_samples_leaf
    # 'auto' is rejected by the installed scikit-learn (InvalidParameterError in
    # the recorded run, which made every candidate that drew it fail), so only
    # currently valid options are listed
    'max_features': ['sqrt', 'log2'],  # Limit the number of features
    'bootstrap': [True, False]
}

In [None]:
# Create a RandomForest model
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Randomized search on hyperparameters
rf_random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist_rf,
                                      n_iter=15, cv=4, verbose=1, random_state=42, n_jobs=-1)

In [None]:
# Fit the random search model
rf_random_search.fit(train_combined_features, train_df['type'])

Fitting 4 folds for each of 15 candidates, totalling 60 fits


32 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

In [None]:
# Best hyperparameters
print("Best RF Parameters:", rf_random_search.best_params_)

# Use the best model
best_rf_model = rf_random_search.best_estimator_

Best RF Parameters: {'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 139}


In [None]:
# Evaluate on dimensions using the best_rf_model
print("Random Forest Results:")

# Predict once per split and map the predicted types to the four binary axes.
# The predictions do not depend on the dimension being scored, so computing
# them inside the loop would repeat the same work four times.
dimension_columns = ['I/E', 'N/S', 'F/T', 'J/P']
train_predictions = best_rf_model.predict(train_combined_features)
train_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in train_predictions], columns=dimension_columns)
test_predictions = best_rf_model.predict(test_combined_features)
test_pred_dimensions = pd.DataFrame([get_mbti_dimensions(pred) for pred in test_predictions], columns=dimension_columns)

for dimension in dimension_columns:
    # Train-split scores
    train_accuracy = accuracy_score(train_df[dimension], train_pred_dimensions[dimension])
    train_precision = precision_score(train_df[dimension], train_pred_dimensions[dimension])
    train_recall = recall_score(train_df[dimension], train_pred_dimensions[dimension])
    train_f1 = f1_score(train_df[dimension], train_pred_dimensions[dimension])

    # Test-split scores
    test_accuracy = accuracy_score(test_df[dimension], test_pred_dimensions[dimension])
    test_precision = precision_score(test_df[dimension], test_pred_dimensions[dimension])
    test_recall = recall_score(test_df[dimension], test_pred_dimensions[dimension])
    test_f1 = f1_score(test_df[dimension], test_pred_dimensions[dimension])

    # Print results
    print(f"Train Accuracy for {dimension}: {train_accuracy}")
    print(f"Test Accuracy for {dimension}: {test_accuracy}")
    print(f"Average Accuracy for {dimension}: {(train_accuracy + test_accuracy) / 2}\n")
    print(f"Train Precision for {dimension}: {train_precision}")
    print(f"Test Precision for {dimension}: {test_precision}")
    print(f"Train Recall for {dimension}: {train_recall}")
    print(f"Test Recall for {dimension}: {test_recall}")
    print(f"Train F1 Score for {dimension}: {train_f1}")
    print(f"Test F1 Score for {dimension}: {test_f1}\n\n")

Random Forest Results:
Train Accuracy for I/E: 0.9351366491937098
Test Accuracy for I/E: 0.7706051873198847
Average Accuracy for I/E: 0.8528709182567973

Train Precision for I/E: 0.9224534816804143
Test Precision for I/E: 0.7727272727272727
Train Recall for I/E: 0.999792088985914
Test Recall for I/E: 0.9940029985007496
Train F1 Score for I/E: 0.9595669851088773
Test F1 Score for I/E: 0.8695081967213115


Train Accuracy for N/S: 0.9254131487335441
Test Accuracy for N/S: 0.861671469740634
Average Accuracy for N/S: 0.8935423092370891

Train Precision for N/S: 0.9203861102806133
Test Precision for N/S: 0.861671469740634
Train Recall for N/S: 1.0
Test Recall for N/S: 1.0
Train F1 Score for N/S: 0.9585427694497576
Test F1 Score for N/S: 0.9256965944272446


Train Accuracy for F/T: 0.949781921491737
Test Accuracy for F/T: 0.7129682997118155
Average Accuracy for F/T: 0.8313751106017763

Train Precision for F/T: 0.9324543467531552
Test Precision for F/T: 0.6876595744680851
Train Recall for F/T: