Import Libraries

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

Import Dataset

In [32]:
df = pd.read_csv('mbti_1.csv')

Split into Training and Test sets

In [33]:
train_data, test_data, train_labels, test_labels = train_test_split(
    df['posts'], df['type'], test_size=0.2, random_state=42
)

Convert text data to feature vectors using CountVectorizer

In [34]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data)
X_test = vectorizer.transform(test_data)

Define parameter grid for hyperparameter tuning

In [35]:
param_grid = {
    'alpha': [0.05, 0.1, 0.2, 0.5, 1.0, 2.0],
}

Initialize and train the Multinomial Naive Bayes model

In [36]:
naive_bayes_model = MultinomialNB()

Use GridSearchCV to find the best hyperparameters

In [37]:
grid_search = GridSearchCV(estimator=naive_bayes_model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, train_labels)
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')

Best Hyperparameters: {'alpha': 0.1}


Make predictions on the test set using the best model

In [38]:
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

Evaluate the model

In [39]:
accuracy = accuracy_score(test_labels, predictions)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.48


Display additional evaluation metrics

In [40]:
print(classification_report(test_labels, predictions))

              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        41
        ENFP       0.49      0.50      0.49       125
        ENTJ       0.50      0.05      0.08        44
        ENTP       0.51      0.32      0.39       135
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00         8
        ESTJ       0.00      0.00      0.00         7
        ESTP       0.00      0.00      0.00        15
        INFJ       0.43      0.61      0.51       288
        INFP       0.49      0.68      0.57       370
        INTJ       0.40      0.50      0.45       193
        INTP       0.57      0.61      0.59       293
        ISFJ       0.00      0.00      0.00        45
        ISFP       0.08      0.02      0.03        53
        ISTJ       1.00      0.05      0.09        44
        ISTP       0.50      0.18      0.26        67

    accuracy                           0.48      1735
   macro avg       0.31   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
