In [20]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report



In [21]:
# Loading the dataset
df = pd.read_csv('mbti_1.csv')

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['posts'], df['type'], test_size=0.2, random_state=42)

# Converting text data into numerical features using TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



In [22]:
# Initializing and training the KNN model
knn_model = KNeighborsClassifier(n_neighbors=21) 
knn_model.fit(X_train_tfidf, y_train)

# Predicting personality types on the test set
y_pred = knn_model.predict(X_test_tfidf)

In [24]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, zero_division=1)

# Printing results
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

Accuracy: 0.45994236311239195
Classification Report:
               precision    recall  f1-score   support

        ENFJ       0.50      0.20      0.28        41
        ENFP       0.47      0.50      0.48       125
        ENTJ       0.82      0.32      0.46        44
        ENTP       0.68      0.30      0.41       135
        ESFJ       0.20      0.29      0.24         7
        ESFP       1.00      0.00      0.00         8
        ESTJ       0.33      0.14      0.20         7
        ESTP       1.00      0.33      0.50        15
        INFJ       0.38      0.58      0.46       288
        INFP       0.39      0.76      0.52       370
        INTJ       0.50      0.27      0.35       193
        INTP       0.71      0.39      0.50       293
        ISFJ       0.94      0.38      0.54        45
        ISFP       0.75      0.11      0.20        53
        ISTJ       0.92      0.25      0.39        44
        ISTP       0.81      0.19      0.31        67

    accuracy              