### Base Model using `sklearn`

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re, nltk

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Read the posts CSV, then hold out 20% of the rows for testing.
# The split is stratified on the `type` column and uses a fixed seed so it is repeatable.
data = pd.read_csv('dataset/mbti_1.csv')
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=2021,
    stratify=data['type'],
)

In [4]:
def clear_text(data):
    """Normalise the raw posts of ``data`` before vectorisation.

    Every entry of ``data.posts`` is lower-cased, has links (anything starting
    with ``http://``, ``https://`` or ``www.``) replaced by a single space, and
    then has every character outside ``0-9`` / ``a-z`` replaced by a space.

    Parameters
    ----------
    data : object with an iterable ``posts`` attribute of strings
        For example a DataFrame that has a ``posts`` column.

    Returns
    -------
    cleaned_text : list of str
        The cleaned posts, in input order.
    data_length : list of int
        Number of whitespace-separated tokens in each cleaned post.
    """
    # Raw strings: in a plain literal "\s" and "\." are invalid escape
    # sequences and trigger a warning on recent Python versions.
    # Compiled once here instead of being re-parsed for every post.
    link_pattern = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
    symbol_pattern = re.compile(r'[^0-9a-z]')

    data_length = []
    cleaned_text = []

    for sentence in data.posts:
        sentence = sentence.lower()
        # remove links from text
        sentence = link_pattern.sub(' ', sentence)
        # remove other symbols
        sentence = symbol_pattern.sub(' ', sentence)

        data_length.append(len(sentence.split()))
        cleaned_text.append(sentence)
    return cleaned_text, data_length

In [None]:
# Clean the posts of each split, store the result in a new `cleaned` column,
# and keep the per-post token counts alongside.
train_cleaned, train_length = clear_text(train_data)
train_data['cleaned'] = train_cleaned

test_cleaned, test_length = clear_text(test_data)
test_data['cleaned'] = test_cleaned

### Tokenizer

In [6]:
class Lemmatizer(object):
    """Callable that turns a sentence into a list of lemmatized words.

    The sentence is split on whitespace, words of two characters or fewer are
    dropped, and each remaining word is passed through a ``WordNetLemmatizer``.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, sentence):
        """Return the lemmas of all words in ``sentence`` longer than two characters."""
        lemmas = []
        for word in sentence.split():
            if len(word) <= 2:
                # too short to keep
                continue
            lemmas.append(self.lemmatizer.lemmatize(word))
        return lemmas

In [None]:
# TF-IDF vectorizer using the Lemmatizer as its tokenizer, with English stop
# words and max_features capped at 5000; fitted on the training posts only.
vectorizer = TfidfVectorizer(
    tokenizer=Lemmatizer(),
    stop_words='english',
    max_features=5000,
)
vectorizer.fit(train_data['cleaned'])

In [12]:
# Project both splits with the vectorizer fitted on the training posts,
# then convert each result to an array via .toarray().
train_tfidf = vectorizer.transform(train_data['cleaned'])
train_post = train_tfidf.toarray()

test_tfidf = vectorizer.transform(test_data['cleaned'])
test_post = test_tfidf.toarray()

In [13]:
# Inspect the dimensions of the training feature matrix
train_post.shape

(6940, 5000)

In [14]:
# Learn the label -> integer mapping from the training labels only, then reuse
# that same mapping for the test labels. Re-fitting on the test split would
# overwrite the encoder's learned classes and could assign different integers
# to the same label whenever the two splits do not hold an identical label set.
target_encoder = LabelEncoder()
train_target = target_encoder.fit_transform(train_data.type)
test_target = target_encoder.transform(test_data.type)

## Model Selection

In [28]:
# Collects the test-set accuracy of each model below, keyed by model name
model_accuracy = dict()

### Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features:
# C=0.5 (inverse regularisation strength), up to 3000 solver iterations, n_jobs=-1.
logit_model = LogisticRegression(
    C=0.5,
    max_iter=3000,
    n_jobs=-1,
)
logit_model.fit(train_post, train_target)

LogisticRegression(C=0.5, max_iter=3000, n_jobs=-1)

In [29]:
# Accuracy of the logistic-regression predictions on the test split
logit_test_pred = logit_model.predict(test_post)
model_accuracy['logistic'] = accuracy_score(test_target, logit_test_pred)

### Linear Support Vector Classifier

In [23]:
from sklearn.svm import LinearSVC

# Linear support vector classifier with C=0.1, trained on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
linear_SVC = LinearSVC(C=0.1).fit(train_post, train_target)
linear_SVC

LinearSVC(C=0.1)

In [30]:
# Accuracy of the linear SVC predictions on the test split
svc_test_pred = linear_SVC.predict(test_post)
model_accuracy['Linear SVM'] = accuracy_score(test_target, svc_test_pred)

### Multinomial Naive Bayes

In [25]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, trained on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
multinominal_nb = MultinomialNB().fit(train_post, train_target)
multinominal_nb

MultinomialNB()

In [31]:
# Accuracy of the naive Bayes predictions on the test split
nb_test_pred = multinominal_nb.predict(test_post)
model_accuracy['MultinomialNB'] = accuracy_score(test_target, nb_test_pred)

### Decision Tree Classifier

In [32]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree limited to depth 14, trained on the TF-IDF features.
# random_state is pinned (same seed as the train/test split) because the tree
# builder breaks ties between equally good splits at random; without a seed,
# repeated runs can build different trees and report different accuracies.
decision_tree = DecisionTreeClassifier(max_depth=14, random_state=2021)
decision_tree.fit(train_post, train_target)

DecisionTreeClassifier(max_depth=14)

In [33]:
# Accuracy of the decision-tree predictions on the test split
tree_test_pred = decision_tree.predict(test_post)
model_accuracy['Decision Tree'] = accuracy_score(test_target, tree_test_pred)

### Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with trees limited to depth 14, trained on the TF-IDF features.
# random_state is pinned (same seed as the train/test split) because the forest
# draws bootstrap samples and feature subsets at random; without a seed every
# run trains a different forest and the reported accuracy is not reproducible.
random_forest = RandomForestClassifier(max_depth=14, random_state=2021)
random_forest.fit(train_post, train_target)

RandomForestClassifier(max_depth=14)

In [36]:
# Accuracy of the random-forest predictions on the test split
forest_test_pred = random_forest.predict(test_post)
model_accuracy['Random Forest'] = accuracy_score(test_target, forest_test_pred)

In [40]:
# Display the test-set accuracy collected for each model
model_accuracy

{'logistic': 0.624207492795389,
 'Linear SVM': 0.6622478386167147,
 'MultinomialNB': 0.37579250720461094,
 'Decision Tree': 0.5002881844380404,
 'Random Forest': 0.48818443804034584}