In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Assuming your dataset is in a CSV file named 'personality_data.csv'
df = pd.read_csv('mbti_1.csv')
print(df.head())


   type                                              posts
0  INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1  ENTP  'I'm finding the lack of me in these posts ver...
2  INTP  'Good one  _____   https://www.youtube.com/wat...
3  INTJ  'Dear INTP,   I enjoyed our conversation the o...
4  ENTJ  'You're fired.|||That's another silly misconce...


In [3]:
# Extract features and target variable
X = df['posts']
y = df['type']


In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Convert text data into numerical format using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features based on your dataset
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [6]:
# Create and train the logistic regression model
model = LogisticRegression(max_iter=1000)  # You can adjust hyperparameters based on your dataset
model.fit(X_train_tfidf, y_train)


LogisticRegression(max_iter=1000)

In [7]:
# Use the trained model to make predictions on the test data
predictions = model.predict(X_test_tfidf)


In [8]:
# Assess the model's performance using accuracy and other relevant metrics
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


Accuracy: 0.6190201729106628
Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.40      0.05      0.09        41
        ENFP       0.73      0.58      0.65       125
        ENTJ       0.77      0.23      0.35        44
        ENTP       0.68      0.53      0.59       135
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00         8
        ESTJ       0.00      0.00      0.00         7
        ESTP       1.00      0.07      0.12        15
        INFJ       0.62      0.66      0.64       288
        INFP       0.55      0.88      0.68       370
        INTJ       0.60      0.69      0.64       193
        INTP       0.66      0.80      0.72       293
        ISFJ       1.00      0.13      0.24        45
        ISFP       0.57      0.08      0.13        53
        ISTJ       0.80      0.09      0.16        44
        ISTP       0.79      0.33      0.46        67

    accuracy                

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# creating parameter grid that will try different params 
param_grid = {'C': [0.1, 1, 10], 'max_iter': [100, 500, 1000]}


In [11]:
# testing to find best parameters
grid_search = GridSearchCV(LogisticRegression(solver = 'liblinear', max_iter = 100), param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)



GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': [0.1, 1, 10], 'max_iter': [100, 500, 1000]})

In [12]:
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


This is the result of training the model using the best possible parameters. 

In [13]:
# Use the best model to make predictions
predictions = best_model.predict(X_test_tfidf)

# Assess the model's performance using accuracy and other relevant metrics
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
print(f"Best Parameters: {best_params}")
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


Best Parameters: {'C': 10, 'max_iter': 100}
Accuracy: 0.6570605187319885
Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.60      0.22      0.32        41
        ENFP       0.70      0.65      0.68       125
        ENTJ       0.69      0.45      0.55        44
        ENTP       0.66      0.57      0.61       135
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00         8
        ESTJ       0.00      0.00      0.00         7
        ESTP       0.60      0.20      0.30        15
        INFJ       0.65      0.68      0.66       288
        INFP       0.65      0.82      0.72       370
        INTJ       0.60      0.69      0.64       193
        INTP       0.68      0.80      0.73       293
        ISFJ       0.92      0.49      0.64        45
        ISFP       0.68      0.32      0.44        53
        ISTJ       0.75      0.34      0.47        44
        ISTP       0.72      0.49      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# increase options inside parameter grid and rerun steps above
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'max_iter': [100, 500, 1000, 2000], 'solver': ['lbfgs', 'saga', 'newton-cg']}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

After giving the param_grid more options to pick from, and training the model on those, it seems to platoue around 66% accuracy. 

In [17]:
final_model = LogisticRegression(C=best_params['C'], max_iter=best_params['max_iter'], solver=best_params['solver'])
final_model.fit(X_train_tfidf, y_train)

# Evaluate on the test set
test_predictions = final_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, test_predictions)
test_report = classification_report(y_test, test_predictions)

print(f"Final Model Test Accuracy: {test_accuracy}")
print(f"Final Model Test Classification Report:\n{test_report}")


Final Model Test Accuracy: 0.6553314121037463
Final Model Test Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.59      0.24      0.34        41
        ENFP       0.69      0.64      0.66       125
        ENTJ       0.70      0.52      0.60        44
        ENTP       0.68      0.57      0.62       135
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00         8
        ESTJ       0.50      0.14      0.22         7
        ESTP       0.83      0.33      0.48        15
        INFJ       0.61      0.66      0.63       288
        INFP       0.65      0.79      0.72       370
        INTJ       0.58      0.72      0.64       193
        INTP       0.69      0.77      0.73       293
        ISFJ       0.92      0.49      0.64        45
        ISFP       0.68      0.36      0.47        53
        ISTJ       0.75      0.41      0.53        44
        ISTP       0.68      0.51      0.58      

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
# Function to check if actual vs. predicted MBTI types are only a letter different
def is_one_letter_different(actual, predicted):
    differences = sum(c1 != c2 for c1, c2 in zip(actual, predicted))
    return differences == 1

# Iterate through the confusion matrix and identify instances
total_one_letter_away_count = 0

for i in range(len(mbti_types)):
    for j in range(len(mbti_types)):
        if i != j and is_one_letter_different(mbti_types[i], mbti_types[j]):
            total_one_letter_away_count += conf_matrix[i, j]

total_incorrect_count = conf_matrix.sum() - np.trace(conf_matrix)  # Total incorrect predictions

percentage_one_letter_away = (total_one_letter_away_count / total_incorrect_count) * 100

print(f'Percentage of incorrect classifications that are only 1 MBTI letter away: {percentage_one_letter_away:.2f}%')

NameError: name 'mbti_types' is not defined